In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation,BatchNormalization,Input,Embedding,Dot,Dense,Flatten
from tensorflow.keras.callbacks import ModelCheckpoint,LearningRateScheduler,TensorBoard,EarlyStopping

from wordcloud import WordCloud
%matplotlib inline

import os

### READING ANIMELIST.CSV

In [2]:
INPUT_DIR = os.path.join("..","artifacts","raw")

In [3]:
INPUT_DIR

'..\\artifacts\\raw'

In [6]:
# Load only the three columns used by the rest of the notebook.
# The path is built with os.path.join so it matches how INPUT_DIR itself was built
# (no mixed "\\" and "/" separators).
rating_df = pd.read_csv(
    os.path.join(INPUT_DIR, "animelist.csv"),
    low_memory=True,
    usecols=["user_id", "anime_id", "rating"],
)
rating_df.shape

(26135607, 3)

In [None]:
rating_df.isna().sum()

In [None]:
rating_df.head(1)

In [None]:
#

#### DATA PROCESSING

In [None]:
n_ratings = rating_df["user_id"].value_counts()
rating_df = rating_df[rating_df["user_id"].isin(n_ratings[n_ratings>=1000].index)].copy()
rating_df.shape

In [None]:
#rating_df=rating_df.drop_duplicates()
#rating_df.shape

In [None]:
#rating_df.to_csv(r"C:\Users\hrith\OneDrive\Documents\mlops_tutorial\mlops_beginner_to_advanced\anime_recommender_system\artifacts\raw\filtered_animelist.csv", index=False)
#rating_df.shape

In [2]:
# Reload the pre-filtered ratings from the project-relative INPUT_DIR instead of a
# machine-specific absolute path.
# NOTE(review): assumes INPUT_DIR resolves to the same artifacts/raw folder the
# filtered file was written to — confirm against the notebook's working directory.
# usecols keeps only the three data columns, so a saved index column
# ("Unnamed: 0") is not loaded.
rating_df = pd.read_csv(
    os.path.join(INPUT_DIR, "filtered_animelist.csv"),
    usecols=["user_id", "anime_id", "rating"],
)
rating_df.shape

(26135607, 3)

In [3]:
rating_df.head(1)

Unnamed: 0,user_id,anime_id,rating
0,17,34572,0


In [5]:
rating_df['user_id'].unique()

array([    17,     42,     60, ..., 353365, 353390, 353395],
      shape=(16636,))

In [None]:
rating_df.shape[0]

In [None]:
len(rating_df)

In [None]:
# Series.min() is vectorised; the builtin min() iterates the Series element by element.
min_rating = rating_df["rating"].min()
min_rating

In [None]:
# Series.max() is vectorised; the builtin max() iterates the Series element by element.
max_rating = rating_df["rating"].max()
max_rating

In [None]:
rating_df['rating'].value_counts(dropna=False)

In [None]:
avg_rating =np.mean(rating_df["rating"])
avg_rating

In [None]:
########### min max scaling ###############

In [None]:
# Min-max scale the ratings to [0, 1] with one vectorised expression rather than a
# Python-level lambda applied to every row.
# NOTE: this cell is not idempotent — re-running it rescales already-scaled values
# with the previously computed min_rating / max_rating.
rating_df["rating"] = (
    (rating_df["rating"] - min_rating) / (max_rating - min_rating)
).astype(np.float64)

In [None]:
rating_df.duplicated().sum()

In [None]:
avg_rating =np.mean(rating_df["rating"])
avg_rating

In [None]:
rating_df.isnull().sum()

In [None]:
user_ids = rating_df["user_id"].unique().tolist()
len(user_ids)

In [None]:
#######################################################################################

In [None]:
user2user_encoded = {x : i for i , x in enumerate(user_ids)}
#user2user_encoded

In [None]:
user2user_decoded = {i : x for i , x in enumerate(user_ids)}
#user2user_decoded

In [None]:
rating_df["user"] = rating_df["user_id"].map(user2user_encoded)

In [None]:
n_users = len(user2user_encoded)
n_users

In [None]:
#### Example:
## 11054 : 12  -> user2user_encoded (raw user_id -> encoded index)
## 12 : 11054  -> user2user_decoded (encoded index -> raw user_id)

In [None]:
anime_ids = rating_df["anime_id"].unique().tolist()
anime2anime_encoded = {x : i for i , x in enumerate(anime_ids)}
anime2anime_decoded = {i : x for i , x in enumerate(anime_ids)}
rating_df["anime"] = rating_df["anime_id"].map(anime2anime_encoded)

In [None]:
n_anime = len(anime2anime_encoded)
n_anime

In [None]:
############################### shuffling the data ######################################

In [None]:
rating_df = rating_df.sample(frac=1,random_state=43).reset_index(drop=True)

In [None]:
rating_df.head()

In [None]:
#####################################################################################

In [None]:
rating_df.shape

In [None]:
#rating_df.drop_duplicates().shape

In [None]:
X = rating_df[["user","anime"]].values
y = rating_df["rating"]

In [None]:
############################ splitting into test train #####################################

In [None]:
X.shape, y.shape

In [None]:
test_size = int(y.shape[0]*0.2)
test_size

In [None]:
train_indices = rating_df.shape[0] - test_size

In [None]:
X_train , X_test , y_train , y_test = (
    X[:train_indices],
    X[train_indices :],
    y[:train_indices],
    y[train_indices:],
)

In [None]:
X_train.shape, X_test.shape

In [None]:
type(X_train), type(X_test)

In [None]:
X_train[:5]

In [None]:
X_test[:5]

In [None]:
X_train_array = [X_train[: , 0] , X_train[: ,1]]
X_test_array = [X_test[: , 0] , X_test[: ,1]]

In [None]:
type(X_train_array), type(X_test_array)

In [None]:
len(X_train_array), len(X_train_array[0])

In [None]:
X_test_array[0]

In [None]:
n_anime

#### MODEL ARCHITECTURE

In [None]:
def RecommenderNet():
    """Build and compile the embedding-based rating model.

    A user index and an anime index are each looked up in their own embedding
    table (sizes taken from the module-level ``n_users`` and ``n_anime``). The
    normalised dot product of the two embeddings is flattened and passed
    through Dense(1) -> BatchNormalization -> sigmoid.

    Returns:
        The compiled Keras ``Model`` (MSE loss, MAE and MSE metrics, Adam).
    """
    embedding_size = 128

    # User branch.
    user_in = Input(name="user", shape=[1])
    user_vec = Embedding(
        name="user_embedding",
        input_dim=n_users,
        output_dim=embedding_size,
    )(user_in)

    # Anime branch.
    anime_in = Input(name="anime", shape=[1])
    anime_vec = Embedding(
        name="anime_embedding",
        input_dim=n_anime,
        output_dim=embedding_size,
    )(anime_in)

    # Similarity of the two embeddings, then the small prediction head.
    similarity = Dot(name="dot_product", normalize=True, axes=2)([user_vec, anime_vec])
    flat = Flatten()(similarity)
    dense_out = Dense(1, kernel_initializer='he_normal')(flat)
    normed = BatchNormalization()(dense_out)
    rating_out = Activation("sigmoid")(normed)

    net = Model(inputs=[user_in, anime_in], outputs=rating_out)
    net.compile(loss="mse", metrics=["mae", "mse"], optimizer='Adam')
    return net

In [None]:
model = RecommenderNet()

In [None]:
model.summary()

In [None]:
# Learning-rate schedule and batch configuration.
start_lr = 1e-5
min_lr = 1e-5
max_lr = 1e-4
batch_size = 10_000

ramup_epochs = 10
sustain_epochs = 5
exp_decay = 0.8


def lrfn(epoch):
    """Return the learning rate for ``epoch``.

    Linear ramp from ``start_lr`` to ``max_lr`` over ``ramup_epochs``, then
    ``max_lr`` held for ``sustain_epochs``, then exponential decay towards
    ``min_lr`` with factor ``exp_decay`` per epoch.
    """
    if epoch < ramup_epochs:
        # Ramp-up phase.
        return (max_lr-start_lr)/ramup_epochs*epoch + start_lr
    if epoch < ramup_epochs+sustain_epochs:
        # Plateau phase.
        return max_lr
    # Decay phase.
    return (max_lr-min_lr) * exp_decay ** (epoch-ramup_epochs-sustain_epochs)+min_lr

In [None]:
checkpoint_filepath = './weights.weights.h5'

# Per-epoch learning rate comes from lrfn.
lr_callback = LearningRateScheduler(lrfn, verbose=0)

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

# Stop after 5 epochs without val_loss improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    patience=5,
    monitor="val_loss",
    mode="min",
    restore_best_weights=True,
)

In [None]:
my_callbacks = [model_checkpoint,lr_callback,early_stopping]

In [None]:
history = model.fit(
    x=X_train_array,
    y=y_train,
    batch_size=batch_size,
    epochs=50,
    verbose=1,
    validation_data = (X_test_array,y_test),
    callbacks=my_callbacks
)

In [None]:
model.load_weights(checkpoint_filepath)

In [None]:
metrics = ["loss", "mae", "mse"]

# One stacked panel per metric, each showing the train and validation curves.
fig, axes = plt.subplots(len(metrics), 1, figsize=(8, len(metrics) * 4))

for ax, metric in zip(axes, metrics):
    # NOTE(review): the last two recorded epochs are dropped from both curves —
    # confirm this is intended.
    train_curve = history.history[metric][:-2]
    val_curve = history.history[f"val_{metric}"][:-2]

    ax.plot(train_curve, marker="o", label=f"train {metric}")
    ax.plot(val_curve, marker="o", label=f"test {metric}")
    ax.set_title(f"Model {metric.capitalize()}")
    ax.set_ylabel(metric.capitalize())
    ax.set_xlabel("Epoch")
    ax.legend(loc="upper left")
    ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
def extract_weights(name,model):
    """Return the first weight array of layer ``name`` with every row scaled to unit L2 norm.

    Args:
        name: Name of the layer to read from ``model``.
        model: Object exposing ``get_layer(name)``, whose result exposes ``get_weights()``.

    Returns:
        The row-normalised weight array.
    """
    matrix = model.get_layer(name).get_weights()[0]
    # Column vector of per-row norms so the division broadcasts across each row.
    row_norms = np.linalg.norm(matrix, axis=1).reshape((-1, 1))
    return matrix / row_norms

In [None]:
anime_weights = extract_weights("anime_embedding",model)

In [None]:
user_weights = extract_weights("user_embedding",model)

#### READING ANIME.CSV

In [None]:
# Build the path with os.path.join, consistent with how INPUT_DIR was built.
df = pd.read_csv(os.path.join(INPUT_DIR, "anime.csv"), low_memory=True)
df.head(2)

In [None]:
df = df.replace("Unknown",np.nan)

In [None]:
def getAnimeName(anime_id, anime_df=None):
    """Return the display name for ``anime_id``.

    The English title (``eng_version``) is preferred; when it is missing the
    ``Name`` column is used instead, if that column is present.

    Args:
        anime_id: Value to match against the ``anime_id`` column.
        anime_df: Frame to search. Defaults to the notebook-level ``df``.

    Returns:
        The name, or ``None`` when no row matches (``"Error"`` is printed in
        that case, as before, but the function no longer fails with an
        unbound-variable error afterwards).
    """
    frame = df if anime_df is None else anime_df
    rows = frame[frame.anime_id == anime_id]
    if rows.empty:
        print("Error")
        return None

    name = rows.eng_version.values[0]
    # pd.isna catches every missing-value flavour, not only the np.nan singleton.
    if pd.isna(name) and "Name" in rows.columns:
        name = rows.Name.values[0]
    return name

In [None]:
df["anime_id"] = df["MAL_ID"]
# Prefer the English title and fall back to the original Name where it is missing
# ("Unknown" has already been replaced by NaN). A single vectorised fillna replaces
# a per-row lookup that rescanned the whole frame for every anime.
# NOTE(review): equivalent to the per-row lookup as long as MAL_ID is unique — confirm.
df["eng_version"] = df["English name"].fillna(df["Name"])

In [None]:
getAnimeName(6702)

In [None]:
df.sort_values(by=["Score"],
               inplace=True,
               ascending=False,
               kind="quicksort",
               na_position="last")

In [None]:
df.columns

In [None]:
df = df[["anime_id" , "eng_version","Score","Genres","Episodes","Type","Premiered","Members"]]

In [None]:
df.head()

In [None]:
def getAnimeFrame(anime,df):
    """Return the rows of ``df`` describing ``anime``.

    Args:
        anime: An anime id (Python or numpy integer, matched against
            ``anime_id``) or a title string (matched against ``eng_version``).
        df: Anime metadata frame with ``anime_id`` and ``eng_version`` columns.

    Returns:
        The matching rows (possibly empty), or ``None`` for any other input type.
    """
    # np.integer is accepted too: ids read back from pandas/numpy (e.g. ``.values[0]``)
    # are numpy scalars and are not instances of the builtin int.
    if isinstance(anime, (int, np.integer)):
        return df[df.anime_id == anime]
    if isinstance(anime, str):
        return df[df.eng_version == anime]
    return None
    

In [None]:
getAnimeFrame(40028 , df)

In [None]:
getAnimeFrame("Steins;Gate",df)

#### ANIME_WITH_SYNOPSIS.CSV 

In [None]:
cols = ["MAL_ID","Name","Genres","sypnopsis"]

In [None]:
# Build the path with os.path.join, consistent with how INPUT_DIR was built.
synopsis_df = pd.read_csv(os.path.join(INPUT_DIR, "anime_with_synopsis.csv"), usecols=cols)

In [None]:
synopsis_df.head(1)

In [None]:
synopsis_df.columns

In [None]:
def getSynopsis(anime,synopsis_df):
    """Return the synopsis text for ``anime``.

    Args:
        anime: An anime id (Python or numpy integer, matched against ``MAL_ID``)
            or a title string (matched against ``Name``).
        synopsis_df: Frame with ``MAL_ID``, ``Name`` and ``sypnopsis`` columns
            (the column name is spelled that way in the data).

    Returns:
        The synopsis of the first matching row, or ``None`` when nothing matches
        or ``anime`` is neither an integer nor a string.
    """
    if isinstance(anime, (int, np.integer)):
        matches = synopsis_df[synopsis_df.MAL_ID == anime]
    elif isinstance(anime, str):
        matches = synopsis_df[synopsis_df.Name == anime]
    else:
        return None

    # Not every title is guaranteed to have a synopsis row; report that as None
    # instead of failing on ``.values[0]`` of an empty selection.
    if matches.empty:
        return None
    return matches.sypnopsis.values[0]
    

In [None]:
getSynopsis(40028 , synopsis_df)

In [None]:
getSynopsis("Steins;Gate",synopsis_df)

### CONTENT/ITEM BASED RECOMMENDATION

In [None]:
pd.set_option("max_colwidth",None)

In [None]:
def find_similar_animes(name, anime_weights, anime2anime_encoded, anime2anime_decoded, df, synopsis_df, n=10, return_dist=False, neg=False):
    """Find the anime whose embeddings are most (or least) similar to ``name``.

    Args:
        name: Title (or id) of the query anime, resolved through ``getAnimeFrame``.
        anime_weights: Embedding matrix, one row per encoded anime index.
        anime2anime_encoded: Mapping raw anime id -> encoded index.
        anime2anime_decoded: Mapping encoded index -> raw anime id.
        df: Anime metadata frame (``anime_id``, ``eng_version``, ``Genres``).
        synopsis_df: Unused; kept for interface compatibility.
        n: Number of neighbours requested. ``n + 1`` indices are selected so the
            query itself can be dropped from the result.
        return_dist: When true, return ``(dists, closest)`` instead of a frame.
        neg: When true, select the least similar anime instead of the most similar.

    Returns:
        A frame with ``name``, ``similarity`` and ``genre`` columns sorted by
        descending similarity, or ``(dists, closest)`` when ``return_dist`` is set.

    Raises:
        ValueError: If ``name`` is not in ``df`` or its id has no encoded index.
    """
    # Resolve the query to a raw anime id, failing with a clear message.
    query_frame = getAnimeFrame(name, df)
    if query_frame is None or query_frame.empty:
        raise ValueError(f"Anime not found in metadata: {name!r}")
    index = query_frame.anime_id.values[0]

    encoded_index = anime2anime_encoded.get(index)
    if encoded_index is None:
        raise ValueError(f"Encoded index not found for anime ID: {index}")

    # Dot product of every embedding with the query embedding.
    dists = np.dot(anime_weights, anime_weights[encoded_index])
    sorted_dists = np.argsort(dists)

    n = n + 1
    closest = sorted_dists[:n] if neg else sorted_dists[-n:]

    if return_dist:
        return dists, closest

    rows = []
    for close in closest:
        decoded_id = anime2anime_decoded.get(close)
        anime_frame = getAnimeFrame(decoded_id, df)
        # A rated anime may be absent from the metadata frame; skip it rather
        # than fail on ``.values[0]`` of an empty selection.
        if anime_frame is None or anime_frame.empty:
            continue
        rows.append({
            "anime_id": decoded_id,
            "name": anime_frame.eng_version.values[0],
            "similarity": dists[close],
            "genre": anime_frame.Genres.values[0],
        })

    # Explicit columns keep sort/filter valid even when ``rows`` is empty.
    Frame = pd.DataFrame(
        rows, columns=["anime_id", "name", "similarity", "genre"]
    ).sort_values(by="similarity", ascending=False)
    return Frame[Frame.anime_id != index].drop(['anime_id'], axis=1)


In [None]:
find_similar_animes(
    "Steins;Gate",
    anime_weights,
    anime2anime_encoded,
    anime2anime_decoded,
    df,
    synopsis_df,
)

### USER BASED RECOMMENDATION

In [None]:
def find_similar_users(item_input , user_weights , user2user_encoded , user2user_decoded, n=10 , return_dist=False,neg=False):
    """Find the users whose embeddings are most (or least) similar to ``item_input``.

    Args:
        item_input: Raw user id (Python or numpy integer).
        user_weights: Embedding matrix, one row per encoded user index.
        user2user_encoded: Mapping raw user id -> encoded index.
        user2user_decoded: Mapping encoded index -> raw user id.
        n: Number of neighbours requested. ``n + 1`` indices are selected so the
            query user can be dropped from the result.
        return_dist: When true, return ``(dists, closest)`` instead of a frame.
        neg: When true, select the least similar users instead of the most similar.

    Returns:
        A frame with ``similar_users`` and ``similarity`` columns sorted by
        descending similarity, ``(dists, closest)`` when ``return_dist`` is set,
        or ``None`` (after printing a message) when the lookup fails.
    """
    try:
        encoded_index = user2user_encoded.get(item_input)
        if encoded_index is None:
            # Report the real cause instead of a downstream numpy shape error.
            print("Error Occured", f"user id {item_input!r} not found in user2user_encoded")
            return None

        dists = np.dot(user_weights, user_weights[encoded_index])
        sorted_dists = np.argsort(dists)

        n = n + 1
        closest = sorted_dists[:n] if neg else sorted_dists[-n:]

        if return_dist:
            return dists, closest

        # Decode every neighbour regardless of the query id's exact integer type
        # (numpy integers previously produced an empty result).
        rows = [
            {"similar_users": user2user_decoded.get(close), "similarity": dists[close]}
            for close in closest
        ]
        similar_users = pd.DataFrame(
            rows, columns=["similar_users", "similarity"]
        ).sort_values(by="similarity", ascending=False)
        return similar_users[similar_users.similar_users != item_input]
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print("Error Occured", e)
        return None

        
        

In [None]:
find_similar_users(int(11880),user_weights,user2user_encoded,user2user_decoded)

In [None]:
def showWordCloud(all_genres):
    """Display a word cloud generated from the ``all_genres`` frequency mapping."""
    cloud_builder = WordCloud(
        width=700,
        height=400,
        background_color='white',
        colormap='gnuplot',
    )
    genres_cloud = cloud_builder.generate_from_frequencies(all_genres)

    # Render the cloud as an image without axes.
    plt.figure(figsize=(10, 8))
    plt.imshow(genres_cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
from collections import defaultdict

In [None]:
df.head(1)

In [None]:
def getFavGenre(frame , plot=False):
    """Collect the genres listed in ``frame["Genres"]``.

    Each entry is treated as a comma-separated list; non-string entries (e.g.
    NaN) and empty tokens are skipped.

    Args:
        frame: Frame with a ``Genres`` column.
        plot: When true, also draw a word cloud of the genre frequencies.

    Returns:
        List of genre names, one element per occurrence, with surrounding
        whitespace stripped (so they match the keys used for the frequencies).
    """
    all_genres = defaultdict(int)
    genres_list = []

    for genres in frame["Genres"]:
        if not isinstance(genres, str):
            continue
        for raw_genre in genres.split(','):
            genre = raw_genre.strip()
            if not genre:
                continue
            genres_list.append(genre)
            all_genres[genre] += 1

    # An empty frequency map cannot be drawn, so only plot when there is data.
    if plot and all_genres:
        showWordCloud(all_genres)

    return genres_list



In [None]:
rating_df.head(2)

In [None]:
def get_user_preferences(user_id , rating_df , df ,plot=False):
    """Return the anime a user rated in their own top quartile.

    Args:
        user_id: Raw user id, matched against ``rating_df.user_id``.
        rating_df: Ratings frame with ``user_id``, ``anime_id`` and ``rating``.
        df: Anime metadata frame with ``anime_id``, ``eng_version`` and ``Genres``.
        plot: When true, also draw a genre word cloud via ``getFavGenre``.

    Returns:
        Rows of ``df`` (columns ``eng_version`` and ``Genres``) for anime whose
        rating is at or above the user's 75th percentile. The frame is empty
        when the user has no ratings.
    """
    columns = ["eng_version", "Genres"]

    watched = rating_df[rating_df.user_id == user_id]
    if watched.empty:
        # np.percentile cannot be evaluated on an empty selection.
        return df.iloc[0:0][columns]

    threshold = np.percentile(watched.rating, 75)
    top_anime_ids = watched.loc[watched.rating >= threshold, "anime_id"]

    anime_df_rows = df.loc[df["anime_id"].isin(top_anime_ids), columns]

    if plot:
        getFavGenre(anime_df_rows, plot)

    return anime_df_rows




In [None]:
get_user_preferences(11880 , rating_df, df , plot=True)

In [None]:
def get_user_recommendations(similar_users , user_pref ,df , synopsis_df, rating_df, n=10):
    """Recommend anime that similar users like and the target user has not listed.

    Args:
        similar_users: Frame with a ``similar_users`` column of raw user ids
            (``None`` or empty yields an empty result).
        user_pref: Target user's preferences; its ``eng_version`` values are excluded.
        df: Anime metadata frame.
        synopsis_df: Synopsis frame passed to ``getSynopsis``.
        rating_df: Ratings frame passed to ``get_user_preferences``.
        n: Maximum number of recommendations.

    Returns:
        Frame with columns ``n`` (how many similar users prefer the title),
        ``anime_name``, ``Genres`` and ``Synopsis``; at most ``n`` rows. The
        columns are present even when there is nothing to recommend.
    """
    columns = ["n", "anime_name", "Genres", "Synopsis"]
    if similar_users is None or similar_users.empty:
        return pd.DataFrame(columns=columns)

    already_preferred = set(user_pref.eng_version.values)

    # Flat list of candidate titles, one entry per (similar user, title) pair.
    candidate_names = []
    for user_id in similar_users.similar_users.values:
        pref_list = get_user_preferences(int(user_id), rating_df, df)
        candidate_names.extend(
            title
            for title in pref_list.eng_version.values
            if isinstance(title, str) and title not in already_preferred
        )

    if not candidate_names:
        return pd.DataFrame(columns=columns)

    top_counts = pd.Series(candidate_names).value_counts().head(n)

    recommended_animes = []
    for anime_name, n_user_pref in top_counts.items():
        frame = getAnimeFrame(anime_name, df)
        if frame is None or frame.empty:
            continue
        anime_id = frame.anime_id.values[0]
        try:
            synopsis = getSynopsis(int(anime_id), synopsis_df)
        except IndexError:
            # No synopsis row for this title; still recommend it.
            synopsis = None

        recommended_animes.append({
            "n": n_user_pref,
            "anime_name": anime_name,
            "Genres": frame.Genres.values[0],
            "Synopsis": synopsis,
        })

    return pd.DataFrame(recommended_animes, columns=columns).head(n)
            



    

In [None]:
similar_users =find_similar_users(int(11880),user_weights,user2user_encoded,user2user_decoded)

In [None]:
user_pref = get_user_preferences(11880 , rating_df, df , plot=False)

In [None]:
get_user_recommendations(similar_users,user_pref,df, synopsis_df,rating_df,n=1)

### HYBRID RECOMMENDER SYSTEM

In [None]:
def hybrid_recommendation(user_id , user_weight=0.5, content_weight =0.5, n=10):
    """Combine user-based and item-based recommendations for ``user_id``.

    Titles recommended through similar users each score ``user_weight``; every
    title returned by ``find_similar_animes`` for one of those titles adds
    ``content_weight``. Uses the notebook-level embeddings, encoders and frames.

    Args:
        user_id: Raw user id.
        user_weight: Score added for each user-based recommendation.
        content_weight: Score added for each content-based hit.
        n: Number of titles to return (default 10, as before).

    Returns:
        Up to ``n`` titles ordered by descending combined score; an empty list
        when no similar users or no user-based recommendations are found.
    """
    # --- User-based recommendations ---
    similar_users = find_similar_users(user_id, user_weights, user2user_encoded, user2user_decoded)
    if similar_users is None or similar_users.empty:
        print(f"No similar users found {user_id}")
        return []

    user_pref = get_user_preferences(user_id, rating_df, df)
    user_recommended_animes = get_user_recommendations(similar_users, user_pref, df, synopsis_df, rating_df)
    if user_recommended_animes.empty:
        # An empty result may not even carry an "anime_name" column.
        return []

    user_recommended_anime_list = user_recommended_animes["anime_name"].tolist()
    print(user_recommended_anime_list)

    # --- Content-based recommendations ---
    content_recommended_animes = []
    for anime in user_recommended_anime_list:
        try:
            similar_animes = find_similar_animes(anime, anime_weights, anime2anime_encoded, anime2anime_decoded, df, synopsis_df)
        except (ValueError, IndexError, KeyError):
            # One unresolvable title must not abort the whole recommendation.
            similar_animes = None

        if similar_animes is not None and not similar_animes.empty:
            content_recommended_animes.extend(similar_animes["name"].tolist())
        else:
            print(f"No similar anime found {anime}")

    # --- Weighted combination ---
    combined_scores = {}
    for anime in user_recommended_anime_list:
        combined_scores[anime] = combined_scores.get(anime, 0) + user_weight
    for anime in content_recommended_animes:
        combined_scores[anime] = combined_scores.get(anime, 0) + content_weight

    sorted_animes = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return [anime for anime, score in sorted_animes[:n]]



In [None]:
hybrid_recommendation(11880)