# Netflix Recommendation System
## Background



## 1.Library import

In [8]:
import os
import logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from wordcloud import WordCloud
import random
import pandas as pd
import numpy as np
import collections
import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow.keras.metrics import MeanAbsoluteError, RootMeanSquaredError, MeanSquaredError
import sklearn
from matplotlib import pyplot as plt
from typing import Dict, Text

tf.get_logger().setLevel('ERROR')
# tf.logging.set_verbosity(tf.logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.FATAL)

ModuleNotFoundError: No module named 'wordcloud'

## 2. Data Preprocessing

In [9]:
movies_path = '../dataset/movies_metadata.csv'
rating_path = '../dataset/ratings_small.csv'

df_movies = pd.read_csv(movies_path).drop([19730, 29503, 35587])
df_rating = pd.read_csv(rating_path)

df_movies_cleaned = df_movies[['id', 'original_title']]
df_rating_cleaned = df_rating[['movieId', 'userId', 'rating']]
display(df_movies_cleaned.head(5))
display(df_rating_cleaned.head(5))
df_movies_cleaned['id'] = df_movies_cleaned['id'].astype('int64')

df_movies_cleaned.loc[:, 'id'] = pd.to_numeric(df_movies_cleaned['id'], errors='coerce').astype('Int64')
df_movies_cleaned = df_movies_cleaned.dropna(subset=['id']) # remove any line with NaN
display(df_movies_cleaned.head(5))

merged_dataset = pd.merge(df_rating_cleaned, df_movies_cleaned[['id', 'original_title']], left_on='movieId', right_on='id', how='left')

merged_dataset = merged_dataset[~merged_dataset['id'].isna()]
merged_dataset.dropna(inplace=True)
merged_dataset.drop('movieId', axis=1, inplace=True)
display(merged_dataset['id'].describe())
dataset_size = merged_dataset.shape[0] # count final size for dataset

merged_dataset.reset_index(drop=True, inplace=True)

display(merged_dataset.head())

df_movies_cleaned = df_movies_cleaned[~df_movies_cleaned['original_title'].duplicated()]
df_movies_cleaned['original_title'].head()

merged_dataset['userId'] = merged_dataset['userId'].astype(str)
# df_movies_cleaned['original_title'] = df_movies_cleaned['original_title'].astype(str)

rating_dict = dict(merged_dataset[['userId', 'original_title', 'rating']])
movies_dict = dict(df_movies_cleaned[['original_title']])

# Transfer to tf.tensor_slice
ratings_tf = tf.data.Dataset.from_tensor_slices(rating_dict)
movies_tf = tf.data.Dataset.from_tensor_slices(movies_dict)

movies_tf = movies_tf.map(lambda x: x["original_title"])

ratings_tf = ratings_tf.map(lambda x: {
    "original_title": x["original_title"],
    "rating": float(x["rating"]),
    "userId": x["userId"]
})
def slice_df_data(data_tf, total_size, test_rate=0.2):
    test_size = int(total_size * test_rate)
    train_size = total_size - test_size
    return data_tf.take(train_size), data_tf.skip(train_size).take(test_size)

train_ds, test_ds = slice_df_data(ratings_tf, dataset_size)
usrID_lookup = np.unique(np.concatenate([merged_dataset['userId'].to_numpy()]))
title_lookup = np.unique(np.concatenate([df_movies_cleaned['original_title']]))

KeyError: '[19730, 29503, 35587] not found in axis'

## Data Visualization

In [None]:
#visulaize overall ratings
plt.figure(figsize= (25, 10))
plt.hist(x=[merged_dataset.userId], bins = 500)
plt.title("Overall rating by users")
plt.show()

In [None]:
#Filter the ratingings for one a specific movie, e.g. "Rocky III"
movieData = merged_dataset[merged_dataset['original_title'] == "Rocky III"]

# Create the histogram plot
plt.figure(figsize=(8, 6))
plt.hist(x=movieData['rating'], bins=np.arange(0, 5.5, 0.5), edgecolor='black', alpha=0.7, color='steelblue')
plt.xlabel('Rating', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Distribution of Ratings - Rocky III', fontsize=14)
plt.xticks(np.arange(0, 5.5, 0.5), fontsize=10)
plt.yticks(fontsize=10)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
#the scatter plot visualizes the relationship between user IDs and ratings for a random subset of 100 users

# Select a random sample of 100 unique user IDs
random_users = np.random.choice(merged_dataset['userId'].unique(), size=100, replace=False)

# Filter the dataset to include ratings from the random users
random_subset = merged_dataset[merged_dataset['userId'].isin(random_users)]

# Create the scatter plot
plt.figure(figsize=(12, 8))
plt.scatter(x=random_subset['userId'], y=random_subset['rating'], s=50, alpha=0.7, color='dodgerblue', edgecolor='k', linewidth=0.5)
plt.xlabel('')  # Leave x-axis label blank
plt.xlabel('User ID', fontsize=12)
plt.ylabel('Rating', fontsize=12)
plt.title('User Ratings by Movie ID (Random 100 Users)', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
# generate a word cloud visualization that represents the top 25 most highly rated movies with their realtion to rating

# Calculate the average rating for each movie
avg_rating_data = merged_dataset.groupby('original_title')['rating'].mean().reset_index()

# Sort the data to get the top 25 most rated movies
top_rated_movies = avg_rating_data.nlargest(25, 'rating')

# Create a dictionary with movie titles as keys and average ratings as values
rating_dict = dict(zip(top_rated_movies['original_title'], top_rated_movies['rating']))

# Generate the word cloud
wordcloud = WordCloud(background_color='white', colormap='YlGnBu', max_words=50).generate_from_frequencies(rating_dict)


def vibrant_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    return "rgb({}, {}, {})".format(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))

# Plot the word cloud with vibrant colors
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud.recolor(color_func=vibrant_color_func), interpolation='bilinear')
plt.axis('off')
plt.title('Top 25 Most Rated Movies - Word Cloud', fontsize=16)
plt.show()


## 3. Model construction

In [11]:
# customized model for collaborative-based system
class CollabModel(tfrs.models.Model):
  def __init__(self, rating_w, retrieval_w, embedding_dim=64, L1_num=256, L2_num=128, L3_num=32, act_func="relu") -> None:
    super().__init__()
    embedding_dim = embedding_dim # number of dimension for embedding

    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=title_lookup, mask_token=None),
      tf.keras.layers.Embedding(len(title_lookup) + 1, embedding_dim)
    ])
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=usrID_lookup, mask_token=None),
      tf.keras.layers.Embedding(len(usrID_lookup) + 1, embedding_dim)
    ])
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(L1_num, activation=act_func), # first layer
        tf.keras.layers.Dense(L2_num, activation=act_func), # second layer
        tf.keras.layers.Dense(L3_num, activation=act_func),# third layer
        tf.keras.layers.Dense(1), # output layer
    ])

    self.rating_eval: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss = MeanSquaredError(),
        metrics = [RootMeanSquaredError(), MeanAbsoluteError()],
    )
    self.retrieval_eval: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies_tf.batch(128).map(self.movie_model)
        )
    )

    self.rating_weight = rating_w
    self.retrieval_weight = retrieval_w

  # overloading call function
  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    user_embeddings = self.user_model(features["userId"])
    movie_embeddings = self.movie_model(features["original_title"])
    
    return (
        user_embeddings,
        movie_embeddings,
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  # overloading compute_loss function
  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor: 
    label_r = features.pop("rating")
    user_embeddings, movie_embeddings, rating_pred = self(features)
    rating_loss = self.rating_eval(labels=label_r, predictions=rating_pred)
    retrieval_loss = self.retrieval_eval(user_embeddings, movie_embeddings)

    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

## 4. Model training

In [14]:
model = CollabModel(rating_w=1.0, retrieval_w=1.0, embedding_dim=32, L1_num=256, L2_num=128, L3_num=32, act_func="relu")
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

train_ds_cache = train_ds.shuffle(10_000).batch(1_000).cache()
test_ds_cache = test_ds.batch(1_000).cache()

model.fit(train_ds_cache, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f3ff67cd7f0>

In [15]:
save_path = '../model/MRS_v2_e32.h5'
model.save_weights(save_path)

## 5. Model evaluation