In [1]:
!pip install tensorflow-recommenders

Collecting tensorflow-recommenders
  Downloading tensorflow_recommenders-0.7.3-py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<3.20,>=3.9.2
  Downloading protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: protobuf, tensorflow-recommenders
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.83.0 w

In [2]:
# import required packages
# utility
import html
import pprint
import numpy as np
import pandas as pd
from typing import Dict, Text

# tensorflow
import tensorflow as tf
import tensorflow_recommenders as tfrs

In [3]:
# # configure gpu
# gpus = tf.config.list_physical_devices("GPU")

# print("Num GPUs Available: ", len(gpus))

# # if gpus:
# #     # Create 2 virtual GPUs with 4GB memory each
# #     try:
# #         tf.config.set_logical_device_configuration(gpus[0],[tf.config.LogicalDeviceConfiguration(memory_limit=4096), tf.config.LogicalDeviceConfiguration(memory_limit=4096)])
# #         logical_gpus = tf.config.list_logical_devices("GPU")
# #         print(len(gpus), "Physical GPU,", len(logical_gpus), "Logical GPUs")
# #     except RuntimeError as e:
# #         # Virtual devices must be set before GPUs have been initialized
# #         print(e)

# strategy = tf.distribute.MirroredStrategy()

## EDA and Preprocessing: Anime Recommendations Dataset

In [4]:
# load ratings dataset
rating_df = pd.read_csv('/kaggle/input/anime-recommendations-database/rating.csv')

rating_df.shape

(7813737, 3)

In [5]:
# first 5 data points
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
# info
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


In [7]:
# convert id columns to string
rating_df['user_id'] = rating_df['user_id'].map(lambda x: str(x))
rating_df['anime_id'] = rating_df['anime_id'].map(lambda x: str(x))

In [8]:
rating_df.isna().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [9]:
# load anime metadata
anime_df = pd.read_csv('/kaggle/input/anime-recommendations-database/anime.csv')

anime_df.shape

(12294, 7)

In [10]:
# first 5 data points
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [11]:
# info
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [12]:
# convert id column to string
anime_df['anime_id'] = anime_df['anime_id'].apply(lambda x: str(x))

In [13]:
# sample anime name with invalid characters
anime_df['name'].loc[8441]

'Dennou Sentai Voogie&#039;s★Angel: Forever and Ever'

In [14]:
# clean anime name column
# remove html encodings
# convert to regular characters
anime_df['name'] = anime_df['name'].apply(html.unescape)

# remove emojis and other miscellaneous unicode characters
# encode as ascii to remove miscellaneous unicode characters
# replaces the characters with '?'
# converts str to array of bytes
anime_df['name'] = anime_df['name'].map(lambda x: x.encode('ascii', 'replace'))

# decode to convert to str
anime_df['name'] = anime_df['name'].map(lambda x: x.decode())

# replace '?' with a space
anime_df['name'] = anime_df['name'].map(lambda x: x.replace('?', ' '))

In [15]:
# verify cleaning
anime_df['name'].loc[8441]

"Dennou Sentai Voogie's Angel: Forever and Ever"

In [16]:
# add anime name column from anime dataframe to rating dataframe
rating_df = pd.merge(rating_df, anime_df.loc[:, ['anime_id', 'name']], on='anime_id')

rating_df.shape

(7813727, 4)

In [17]:
# first 5 data points
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,name
0,1,20,-1,Naruto
1,3,20,8,Naruto
2,5,20,6,Naruto
3,6,20,-1,Naruto
4,10,20,-1,Naruto


In [18]:
# convert rating data from dataframe to tensorflow Dataset
ratings = tf.data.Dataset.from_tensor_slices(dict(rating_df))

# keep only user_id and anime_name column
ratings = ratings.map(lambda x: {
    'anime_title': x['name'],
    'user_id': x['user_id']
})

for x in ratings.take(5).as_numpy_iterator():
    pprint.pprint(x)

{'anime_title': b'Naruto', 'user_id': b'1'}
{'anime_title': b'Naruto', 'user_id': b'3'}
{'anime_title': b'Naruto', 'user_id': b'5'}
{'anime_title': b'Naruto', 'user_id': b'6'}
{'anime_title': b'Naruto', 'user_id': b'10'}


In [19]:
# convert anime metadata from dataframe to tensorflow Dataset
# keep only anime name
animes = tf.data.Dataset.from_tensor_slices(anime_df['name'])

for x in animes.take(5).as_numpy_iterator():
    pprint.pprint(x)

b'Kimi no Na wa.'
b'Fullmetal Alchemist: Brotherhood'
b'Gintama '
b'Steins;Gate'
b"Gintama'"


In [20]:
# shuffle and split data: train, valid, test
# set seed
tf.random.set_seed(42)

# total data points
N = rating_df.shape[0]

# total train data points
N_train = int(0.6 * N)

# total valid data points
N_valid = int(0.2 * N)

# total test data points
N_test = N - (N_train + N_valid)

# shuffle data
shuffled = ratings.shuffle(N, seed=42, reshuffle_each_iteration=False)

# # split data
train = shuffled.take(N_train)
valid = shuffled.skip(N_train).take(N_valid)
test = shuffled.skip((N_train + N_valid)).take(N_test)

In [21]:
# extract list of unique anime titles
unique_anime_titles = np.concatenate(list(animes.apply(tf.data.experimental.unique()).batch(1000)))

unique_anime_titles[:10]

array([b'Kimi no Na wa.', b'Fullmetal Alchemist: Brotherhood',
       b'Gintama ', b'Steins;Gate', b"Gintama'",
       b'Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou',
       b'Hunter x Hunter (2011)', b'Ginga Eiyuu Densetsu',
       b'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
       b"Gintama': Enchousen"], dtype=object)

In [22]:
# extract list of unique user ids
user_ids = ratings.batch(1_000_000).map(lambda x: x['user_id'])
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_user_ids[:4]

array([b'1', b'10', b'100', b'1000'], dtype=object)

## Retrieval Model: Two Tower Architecture
- Query Tower
- Candidate Tower

### Model Architecture

In [23]:
# query and candidate embedding dimension
embedding_dimension = 32

- ### Query Tower

In [24]:
# query tower
# with strategy.scope():

user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),

    # add an additional embedding to account for unknown tokens
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

- ### Candidate Tower

In [25]:
# candidate tower
# with strategy.scope():

anime_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_anime_titles, mask_token=None),

    # add an additional embedding to account for unknown tokens
    tf.keras.layers.Embedding(len(unique_anime_titles) + 1, embedding_dimension)
])

- ### Metrics and Loss

In [26]:
# with strategy.scope():

# metrics
metrics = tfrs.metrics.FactorizedTopK(
  candidates=animes.batch(1024).map(anime_model)
)

# loss
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

- ### Retrieval Model

In [27]:
# retrieval model
class AnimeRetrievalModel(tfrs.Model):

    def __init__(self, user_model, anime_model):
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.anime_model: tf.keras.Model = anime_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        # pass the user features into the user model
        # returns the embeddings
        user_embeddings = self.user_model(features["user_id"])
        
        # pass the movie features into the movie model
        # returns the embeddings
        positive_anime_embeddings = self.anime_model(features["anime_title"])

        # the task computes the loss and the metrics
        # compute_metrics=not training:
        # - turns metric calculation off while training
        # - speeds up training
        return self.task(user_embeddings, positive_anime_embeddings, compute_metrics=not training)

In [28]:
# with strategy.scope():

# initialize model
model = AnimeRetrievalModel(user_model, anime_model)

# compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

In [29]:
# prepare train and validation sets for training
cached_train = train.shuffle(N_train).batch(8192).cache()
cached_valid = valid.batch(4096).cache()

### Train and Evaluate Model

In [30]:
# train model
history = model.fit(cached_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [31]:
# evaluate model
model.evaluate(cached_valid, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.000883701432030648,
 'factorized_top_k/top_5_categorical_accuracy': 0.008170238696038723,
 'factorized_top_k/top_10_categorical_accuracy': 0.018187228590250015,
 'factorized_top_k/top_50_categorical_accuracy': 0.08765153586864471,
 'factorized_top_k/top_100_categorical_accuracy': 0.15612207353115082,
 'loss': 14864.486328125,
 'regularization_loss': 0,
 'total_loss': 14864.486328125}

### Making Predictions

In [32]:
# use fatorized_top_k.BruteForce layer to make predictions
# facilitates input raw query features,
# embedding it and recommending movies out of the entire dataset

# create the single layer model
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# generate recommendations
index.index_from_dataset(
  tf.data.Dataset.zip((animes.batch(100), animes.batch(100).map(model.anime_model)))
)

# get recommendation for specific user
# user_id: 42
_, titles = index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")

Recommendations for user 42: [b'Blue Drop: Tenshi-tachi no Gikyoku' b'Kyoufu Shinbun (2014)'
 b'Maria-sama ga Miteru 3rd']


## Ranking Model

### Prepare data for Ranking Model:
- Add the rating column to the data used for Retrieval Model

In [33]:
# convert rating data from dataframe to tensorflow Dataset
ratings = tf.data.Dataset.from_tensor_slices(dict(rating_df))

# keep only user_id and anime_name column
ratings = ratings.map(lambda x: {
    'anime_title': x['name'],
    'user_id': x['user_id'],
    'user_rating': x['rating']
})

for x in ratings.take(5).as_numpy_iterator():
    pprint.pprint(x)

{'anime_title': b'Naruto', 'user_id': b'1', 'user_rating': -1}
{'anime_title': b'Naruto', 'user_id': b'3', 'user_rating': 8}
{'anime_title': b'Naruto', 'user_id': b'5', 'user_rating': 6}
{'anime_title': b'Naruto', 'user_id': b'6', 'user_rating': -1}
{'anime_title': b'Naruto', 'user_id': b'10', 'user_rating': -1}


In [34]:
# shuffle and split data: train, valid, test
# total data points
N = rating_df.shape[0]

# total train data points
N_train = int(0.6 * N)

# total valid data points
N_valid = int(0.2 * N)

# total test data points
N_test = N - (N_train + N_valid)

# shuffle data
shuffled = ratings.shuffle(N, seed=42, reshuffle_each_iteration=False)

# # split data
train = shuffled.take(N_train)
valid = shuffled.skip(N_train).take(N_valid)
test = shuffled.skip((N_train + N_valid)).take(N_test)

### Model Architecture

#### Base Ranking Model

In [35]:
class RankingModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        embedding_dimension = 32

        # user model
        # computes user embeddings
        self.user_embeddings = tf.keras.Sequential([
          tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

        # anime model
        # computes anime embeddings
        self.anime_embeddings = tf.keras.Sequential([
          tf.keras.layers.StringLookup(vocabulary=unique_anime_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_anime_titles) + 1, embedding_dimension)
        ])

        # ratings model
        # predicts ratings
        self.ratings = tf.keras.Sequential([
          tf.keras.layers.Dense(256, activation="relu"),
          tf.keras.layers.Dense(64, activation="relu"),
          tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        user_id, anime_title = inputs
        
        # generate embeddings for the user id
        user_embedding = self.user_embeddings(user_id)
        
        # generate embeddings for the anime title
        anime_embedding = self.anime_embeddings(anime_title)
        
        # predict and return the ratings for user id and anime title pair
        return self.ratings(tf.concat([user_embedding, anime_embedding], axis=1))

#### Loss and Metrics

In [36]:
# with strategy.scope():

task = tfrs.tasks.Ranking(
    loss = tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

#### Anime Ranking Model

In [37]:
class AnimeRankingModel(tfrs.models.Model):

    def __init__(self):
        super().__init__()
        
        self.ranking_model: tf.keras.Model = RankingModel()
            
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
          loss = tf.keras.losses.MeanSquaredError(),
          metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features: Dict[str, tf.Tensor]):
        return self.ranking_model((features["user_id"], features["anime_title"]))

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False):
        labels = features.pop("user_rating")

        rating_predictions = self(features)

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)

In [38]:
# with strategy.scope():

# initialize model
model = AnimeRankingModel()

# compile model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

In [39]:
# prepare train and test sets for training
cached_train = train.shuffle(N_train).batch(8192).cache()
cached_valid = valid.batch(4096).cache()

### Train and Evaluate Model

In [40]:
# train the model
history = model.fit(cached_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [41]:
# evaluate the model
model.evaluate(cached_valid, return_dict=True)



{'root_mean_squared_error': 2.179445505142212,
 'loss': 4.694512367248535,
 'regularization_loss': 0,
 'total_loss': 4.694512367248535}

### Making Predictions

In [42]:
for x in test.take(5).as_numpy_iterator():
    pprint.pprint(x)

{'anime_title': b'Zettai Bouei Leviathan',
 'user_id': b'58604',
 'user_rating': 7}
{'anime_title': b'Zero no Tsukaima F', 'user_id': b'40390', 'user_rating': 10}
{'anime_title': b'Steins;Gate', 'user_id': b'4262', 'user_rating': 10}
{'anime_title': b'Higashi no Eden', 'user_id': b'47739', 'user_rating': 10}
{'anime_title': b'Trigun', 'user_id': b'45901', 'user_rating': 10}


In [43]:
test_ratings = {}
test_anime_titles = ['Oppai Infinity ! The Animation', 'Steins;Gate', 'Higashi no Eden']
for anime_title in test_anime_titles:
    test_ratings[anime_title] = model({
        "user_id": np.array(["4262"]),
        "anime_title": np.array([anime_title])
    })

print("Ratings:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

Ratings:
Steins;Gate: [[10.153532]]
Higashi no Eden: [[8.536829]]
Oppai Infinity ! The Animation: [[5.9849634]]
