In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation, BatchNormalization, Input, Embedding, Dot, Dense, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping

from wordcloud import WordCloud

%matplotlib inline

## Reading animelist.parquet

In [57]:
import os
from config.paths_config import RAW_DIR

In [58]:
# Load only the three columns the model needs from the raw parquet dump.
animelist_path = os.path.join("..", RAW_DIR, "animelist.parquet")
rating_df = pd.read_parquet(animelist_path, columns=["user_id", "anime_id", "rating"])

In [59]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


## Data Processing

In [60]:
# Keep only heavy users: those with at least 400 rows in the rating table.
n_ratings = rating_df["user_id"].value_counts()
high_consumers = n_ratings[n_ratings >= 400].index
# .copy() makes the filtered frame an independent object, so assigning to its
# columns afterwards does not raise pandas' SettingWithCopyWarning.
rating_df = rating_df.loc[rating_df["user_id"].isin(high_consumers)].copy()

In [62]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
213,2,24833,0
214,2,235,10
215,2,36721,0
216,2,40956,0
217,2,31933,0


In [63]:
# Summary statistics of the raw rating column; min/max feed the scaling step.
ratings = rating_df["rating"]
min_rating, avg_rating, max_rating = ratings.min(), ratings.mean(), ratings.max()
min_rating, avg_rating, max_rating

(np.int64(0), np.float64(4.122732695114736), np.int64(10))

- Min-max scaling: rescale ratings to the [0, 1] range

In [64]:

# Min-max scale: shift by the minimum, then divide by the observed range.
rating_df["rating"] = rating_df["rating"].sub(min_rating).div(max_rating - min_rating)

- Check for duplicate rows

In [65]:
# Flag rows that repeat an earlier row on every column, then count them.
duplicate_mask = rating_df.duplicated()
duplicate_mask.sum()

np.int64(0)

In [80]:
# Distinct raw ids, in order of first appearance, as plain Python lists.
user_ids, anime_ids = (
    rating_df[column].unique().tolist() for column in ("user_id", "anime_id")
)

In [107]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
rating_df = (
    rating_df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Users: list position -> raw id, and the inverse raw id -> list position.
user_id_decoding = dict(enumerate(user_ids))
user_id_encoding = {raw_id: idx for idx, raw_id in user_id_decoding.items()}
rating_df["user"] = rating_df["user_id"].map(user_id_encoding)

# Animes: same pair of lookups.
anime_id_decoding = dict(enumerate(anime_ids))
anime_id_encoding = {raw_id: idx for idx, raw_id in anime_id_decoding.items()}
rating_df["anime"] = rating_df["anime_id"].map(anime_id_encoding)

In [108]:
# Features are the two encoded index columns; the target is the rating column.
feature_columns = ["user", "anime"]
y = rating_df["rating"]
X = rating_df[feature_columns].to_numpy()
X

array([[3533, 4037],
       [2073, 5078],
       [ 147,  349],
       ...,
       [1250,  578],
       [1405, 3376],
       [ 602, 3278]])

In [109]:
# Hold out the last `test_frac` of the rows as the test split.
test_frac = 0.1
test_size = int(len(X) * test_frac)

# Slice with an explicit boundary index rather than negative offsets:
# `X[:-0]` is empty and `X[-0:]` is everything, so the negative form would
# swap the two splits whenever test_size rounds down to 0.
split_at = len(X) - test_size

# Each split is a two-element list: [user indices, anime indices].
X_train = [X[:split_at, 0], X[:split_at, 1]]
X_test = [X[split_at:, 0], X[split_at:, 1]]
y_train = y[:split_at]
y_test = y[split_at:]

test_size

324664

## Model Architecture

In [123]:
def recommender_net(n_users: int, n_animes: int, embedding_size: int = 128) -> Model:
    """Build and compile the embedding-based rating model.

    A user index and an anime index are each looked up in their own embedding
    table. The normalized dot product of the two vectors is flattened and fed
    through Dense(2) -> BatchNorm -> ReLU -> Dense(1) -> BatchNorm -> sigmoid.

    Args:
        n_users: ``input_dim`` of the user embedding table.
        n_animes: ``input_dim`` of the anime embedding table.
        embedding_size: ``output_dim`` shared by both embedding tables.

    Returns:
        A compiled Keras ``Model`` with inputs ``[user, anime]``, using
        binary cross-entropy loss, the Adam optimizer, and MAE/MSE metrics.
    """
    user = Input(name="user", shape=[1])
    user_embedding = Embedding(name="user_embedding",
                               input_dim=n_users,
                               output_dim=embedding_size)(user)

    anime = Input(name="anime", shape=[1])
    anime_embedding = Embedding(name="anime_embedding",
                                input_dim=n_animes,
                                output_dim=embedding_size)(anime)

    # normalize=True L2-normalizes both vectors before the dot product.
    x = Dot(name="dot_product", normalize=True, axes=2)(inputs=[user_embedding, anime_embedding])
    x = Flatten()(x)

    # Small dense head on top of the similarity score.
    x = Dense(units=2)(x)
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Dense(units=1)(x)
    x = BatchNormalization()(x)
    # Sigmoid keeps the prediction within (0, 1).
    x = Activation("sigmoid")(x)

    model = Model(inputs=[user, anime], outputs=x)
    model.compile(loss="binary_crossentropy",
                  metrics=["mae", "mse"],
                  optimizer="Adam")
    return model


In [124]:
# Size each embedding table by the number of distinct ids collected earlier.
table_sizes = {"n_users": len(user_ids), "n_animes": len(anime_ids)}
model = recommender_net(**table_sizes)
model.summary()

In [125]:
# Weights file written next to the notebook.
checkpoint_fp = os.path.join(".", "weights.weights.h5")

# Both callbacks watch validation loss, where lower is better.
# Stop after 3 epochs without improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=3,
    restore_best_weights=True,
)
# Save weights only, and only when validation loss improves.
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_fp,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
)

callbacks = [model_checkpoint, early_stopping]

In [126]:
batch_size = 10000

# Train for up to 20 epochs, validating on the held-out split each epoch;
# the callbacks handle checkpointing and early stopping.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=20,
    batch_size=batch_size,
    callbacks=callbacks,
    verbose=True,
)

Epoch 1/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 62ms/step - loss: 0.7384 - mae: 0.3700 - mse: 0.1726 - val_loss: 0.6787 - val_mae: 0.3639 - val_mse: 0.1484
Epoch 2/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 62ms/step - loss: 0.6786 - mae: 0.3639 - mse: 0.1484 - val_loss: 0.6814 - val_mae: 0.3629 - val_mse: 0.1497
Epoch 3/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 60ms/step - loss: 0.6280 - mae: 0.3162 - mse: 0.1265 - val_loss: 0.5719 - val_mae: 0.2637 - val_mse: 0.1016
Epoch 4/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 58ms/step - loss: 0.5139 - mae: 0.2237 - mse: 0.0774 - val_loss: 0.5668 - val_mae: 0.2428 - val_mse: 0.1009
Epoch 5/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 58ms/step - loss: 0.4673 - mae: 0.1838 - mse: 0.0592 - val_loss: 0.5641 - val_mae: 0.2281 - val_mse: 0.0967
Epoch 6/20
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m