In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Data collection and preprocessing

In [11]:
# Load the datasets
title_basics_df = pd.read_csv('datasets/title.basics.tsv', delimiter='\t')
ratings_df = pd.read_csv('datasets/ml-1m/ratings.dat', delimiter='::', engine='python', header=None, names=['userId', 'movieId', 'rating', 'timestamp'])
ratings_df.set_index('userId', inplace=True)
users_df = pd.read_csv('datasets/ml-1m/users.dat', delimiter='::', engine='python', header=None, names=['userId', 'gender', 'age', 'occupation', 'zipcode'])
users_df.set_index('userId', inplace=True)
movies_df = pd.read_csv('datasets/ml-1m/movies.dat', delimiter='::', engine='python', header=None, names=['movieId', 'title', 'genres'], encoding="ISO-8859-1")
movies_df.set_index('movieId', inplace=True)

  title_basics_df = pd.read_csv('datasets/title.basics.tsv', delimiter='\t')


### Normalize text data (lower case, remove -, accents...)

In [12]:
from unidecode import unidecode

def reformat_movie_title_movielens(df, column='title'):
    ''' 
    Reformat the movie title to match the movielens dataset 
        - Remove the year from the title
        - Remove any - or : from the title
        - Remove all accent characters
        - Normalize the title to lowercase
    '''
    df[column] = df[column].astype(str)
    # Strip year and extra whitespace
    if column == 'title':
        df[column] = df[column].str.extract(r'^(.*?)\s*\(')[0]
    # Remove hyphens, colons
    df[column] = df[column].replace({'-': '', ':': ''}, regex=True)
    # Remove accent characters
    df[column] = df[column].apply(unidecode)
    # Convert to lowercase
    df[column] = df[column].str.lower()
    # Trim any leading or trailing whitespace
    df[column] = df[column].str.strip()

    return df


In [13]:
movies_df = reformat_movie_title_movielens(movies_df, column='title')
movies_df['movieId'] = movies_df.index

### Merging

We merge the data from imdb and movielens using the title

In [14]:
title_basics_df.drop(title_basics_df[title_basics_df['titleType'] != 'movie'].index, inplace=True)
title_basics_df.drop(columns=['titleType'], inplace=True)
title_basics_df.dropna(subset=['primaryTitle'], inplace=True)
title_basics_df = reformat_movie_title_movielens(title_basics_df, column='primaryTitle')

In [15]:
# We merge the movies_df with the merged_imdb_df, if the title matches, we expect to have at maximum len(movies_df) rows
movies_merged_df = pd.merge(movies_df, title_basics_df, left_on='title', right_on='primaryTitle', how='inner')

Uniformize the genres, we transform genre in a Python List format

In [16]:
# genre_x is in this format "Action|Adventure|Sci-Fi" genre_y is in this format "Action,Adventure,Sci-Fi" we need to see if the genres in x are in y if not we drop the row
movies_merged_df['genres_x'] = movies_merged_df['genres_x'].str.replace('|', ',')
movies_merged_df['genres_x'] = movies_merged_df['genres_x'].astype(str).str.lower()
movies_merged_df['genres_y'] = movies_merged_df['genres_y'].str.lower()
movies_merged_df['genres_x'] = movies_merged_df['genres_x'].str.split(',')
movies_merged_df['genres_y'] = movies_merged_df['genres_y'].str.split(',')
# We should check if at least one of the genre in x is in y if not we drop the row


In [17]:
def check_genre(genres_x, genres_y):
    for genre in genres_x:
        if genre in genres_y:
            return True
    return False

movies_merged_df['genre_match'] = movies_merged_df.apply(lambda x: check_genre(x['genres_x'], x['genres_y']), axis=1)
movies_merged_df = movies_merged_df[movies_merged_df['genre_match']]

In some case, we happen to have movies with the same title and genres but a different start year. We decide to drop the duplicates and only keep the most recent one

In [18]:
# Drop duplicate rows with the same title and genres_x

merged_df = movies_merged_df.drop_duplicates(subset=['movieId'])
merged_df.drop(columns=['genres_x', 'genre_match', 'primaryTitle', 'tconst'], inplace=True)
merged_df.rename(columns={'genres_y': 'genres', 'primaryName': 'directors'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df.drop(columns=['genres_x', 'genre_match', 'primaryTitle', 'tconst'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df.rename(columns={'genres_y': 'genres', 'primaryName': 'directors'}, inplace=True)


# Feature engineering

In [19]:
genres = [
    'action',
    'adventure',
    'animation',
    'children',
    'comedy',
    'crime',
    'documentary',
    'drama',
    'fantasy',
    'film-noir',
    'horror',
    'musical',
    'mystery',
    'romance',
    'sci-Fi',
    'thriller',
    'war',
    'western'
]


def encode_gender(value):
    if value == 'M':
        return 1
    return 0


def transform_movies(merged_df):
    """
    This function will transform the merged_df to a format that can be used by the model
    1. We will encode the genres as binary values
    2. We will drop the title and originalTitle columns since we have the movieId
    3. We will replace the '\\N' values with 0
    4. We will convert the runtimeMinutes and startYear to integers
    5. We will add the average rating for each movie
    6. We will set the movieId as the index
    """
    for genre in genres:
        merged_df[genre] = merged_df['genres'].apply(lambda x: 1 if genre in x else 0)

    merged_df.drop(columns=['genres', 'title', 'originalTitle'], inplace=True)
    merged_df.replace('\\N', 0, inplace=True)
    merged_df['runtimeMinutes'] = merged_df['runtimeMinutes'].astype(int)
    merged_df['startYear'] = merged_df['startYear'].astype(int)
    # Add the average rating for each movie
    average_ratings = ratings_df.groupby('movieId')['rating'].mean().to_dict()
    merged_df['average_rating'] = merged_df['movieId'].apply(lambda x: average_ratings.get(x, 0))
    movie_features = merged_df.copy()
    movie_features.reset_index(inplace=True)
    movie_features.drop(columns=['index'], inplace=True)
    movie_features.set_index('movieId', inplace=True)
    return movie_features

def transform_users(user_features, movie_features):
    """
    This function will transform the user_features to a format that can be used by the model
    1. We add the average rating given by the user
    2. We add the number of ratings given by the user per genre
    3. We encode user gender as binary values
    4. We encode the zipcode as integers
    """
    # For user features we will add the average rating he gave and the number of ratings he gave per genre
    movies_df.drop(columns=['movieId'], inplace=True)
    ratings_df = ratings_df.join(movies_df, on='movieId', lsuffix='_ratings', rsuffix='_movies')
    ratings_df = ratings_df.join(users_df, on='userId', lsuffix='_ratings', rsuffix='_users')
    ratings_df = ratings_df.join(movie_features, on='movieId', lsuffix='_ratings', rsuffix='_features')
    user_features['average_rating'] = ratings_df.groupby('userId')['rating'].mean()
    user_features['number_of_ratings'] = ratings_df.groupby('userId')['rating'].count()
    for genre in genres:
        user_features[f'number_of_{genre}_ratings'] = ratings_df.groupby('userId')[genre].sum()

    user_features.replace(np.nan, 0, inplace=True)
    user_features['gender'] = user_features['gender'].apply(encode_gender)

    zipcodes = user_features['zipcode'].unique()
    zipcodes_dict = {zipcode: i for i, zipcode in enumerate(zipcodes)}
    user_features['zipcode'] = user_features['zipcode'].apply(lambda x: zipcodes_dict[x])
    return user_features

def interaction_matrix(ratings_df):
    """ 
    This function will return the interaction matrix Y from the ratings_df
    """
    ratings_df = ratings_df[ratings_df['movieId'].isin(merged_df['movieId'])]
    Y = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')
    Y.replace(np.nan, 0, inplace=True)
    return Y


movie_features = transform_movies(merged_df)
user_features = transform_users(users_df, movie_features)
Y = interaction_matrix(ratings_df, merged_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[genre] = merged_df['genres'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[genre] = merged_df['genres'].apply(lambda x: 1 if genre in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df[genre] = merged_df['genres'].apply(la

# Model development

In [25]:
def create_id_mappings(ids):
    """Map original IDs to zero-based indices for array access."""
    return {old_id: new_index for new_index, old_id in enumerate(ids)}

def create_dataset(user_features, movie_features, interaction_matrix):
    """ 
    This function will create the dataset for the model
    We will create the following pairs:
    - (user1, user2, movie) with the average rating given by both users
    We will use the interaction_matrix to get the users that rated the same movie
    User1 id can't be equal to user2 id or higher than user2 id, this way we ensure that we don't have duplicate pairs
    We can create every combination of pairs with the users that rated the same movie however this will create a lot of pairs (O(n^2))
    """
    user1_vecs, user2_vecs, movie_vecs, ratings = [], [], [], []

    user_features_np = user_features.to_numpy()
    movie_features_np = movie_features.to_numpy()

    user_id_map = create_id_mappings(user_features.index)
    movie_id_map = create_id_mappings(movie_features.index)

    for user1_id, user_movies in interaction_matrix.iterrows():
        rated_movies = user_movies[user_movies > 0].index
        if rated_movies.empty:
            continue

        user1_idx = user_id_map[user1_id]
        user1_features = user_features_np[user1_idx]

        for movie_id in rated_movies:
            if movie_id not in movie_id_map:
                continue
            movie_idx = movie_id_map[movie_id]
            movie_features = movie_features_np[movie_idx]
            other_users = interaction_matrix.index[interaction_matrix[movie_id] > 0].difference([user1_id])

            for user2_id in other_users:
                if user2_id not in user_id_map:
                    continue
                user2_idx = user_id_map[user2_id]

                # Ensure unique pairs by ordering the user IDs
                if user1_id < user2_id:
                    user2_features = user_features_np[user2_idx]
                    avg_rating = (interaction_matrix.at[user1_id, movie_id] + interaction_matrix.at[user2_id, movie_id]) / 2

                    user1_vecs.append(user1_features)
                    user2_vecs.append(user2_features)
                    movie_vecs.append(movie_features)
                    ratings.append(avg_rating)

    return (np.array(user1_vecs), np.array(user2_vecs), np.array(movie_vecs), np.array(ratings))

user1_vecs, user2_vecs, movie_vecs, ratings = create_dataset(user_features, movie_features, Y.loc[Y.index[:1000]])

In [26]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

users = np.concatenate([user1_vecs, user2_vecs])
scalerUsers = StandardScaler()
scalerUsers.fit(users)
user1_vecs = scalerUsers.transform(user1_vecs)
user2_vecs = scalerUsers.transform(user2_vecs)

scalerMovies = StandardScaler()
scalerMovies.fit(movie_vecs)
movie_vecs = scalerMovies.transform(movie_vecs)

scalerRatings = MinMaxScaler((-1, 1))
scalerRatings.fit(ratings.reshape(-1, 1))
ratings = scalerRatings.transform(ratings.reshape(-1, 1))

In [27]:
from sklearn.model_selection import train_test_split

X = np.concatenate([user1_vecs, user2_vecs, movie_vecs], axis=1)
y = ratings
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
num_outputs = 32
tf.random.set_seed(42)
user_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='relu')
])

combined_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='relu')
])

movie_NN = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='relu')
])

user1_input = tf.keras.layers.Input(shape=(user_features.shape[1],), name='user1_input')
user2_input = tf.keras.layers.Input(shape=(user_features.shape[1],), name='user2_input')
movie_input = tf.keras.layers.Input(shape=(movie_features.shape[1],), name='movie_input')

v_user1 = user_NN(user1_input)
v_user1 = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(v_user1)

v_user2 = user_NN(user2_input)
v_user2 = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(v_user2)

combined_user = tf.keras.layers.concatenate([v_user1, v_user2])
v_combined = combined_NN(combined_user)
v_combined = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(v_combined)

v_movie = movie_NN(movie_input)
v_movie = tf.keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(v_movie)

rating = tf.keras.layers.Dot(axes=1)([v_combined, v_movie])

model = tf.keras.Model(inputs=[user1_input, user2_input, movie_input], outputs=rating)

opt = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_fn = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=opt, loss=loss_fn)

model.summary()

In [29]:
model.fit(
    {'user1_input': X_train[:, :user_features.shape[1]], 'user2_input': X_train[:, user_features.shape[1]:2*user_features.shape[1]], 'movie_input': X_train[:, 2*user_features.shape[1]:]},
    y_train,
    epochs=2,
    batch_size=256,
    validation_split=0.2
)

Epoch 1/2
[1m19243/19243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 4ms/step - loss: 0.1095 - val_loss: 0.0995
Epoch 2/2
[1m19243/19243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 4ms/step - loss: 0.0987 - val_loss: 0.0964


<keras.src.callbacks.history.History at 0x39a66ed50>

In [30]:
# Evaluate the model on the test set
model.evaluate(
    {'user1_input': X_test[:, :user_features.shape[1]], 'user2_input': X_test[:, user_features.shape[1]:2*user_features.shape[1]], 'movie_input': X_test[:, 2*user_features.shape[1]:]},
    y_test
)

[1m48107/48107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 449us/step - loss: 0.0963


0.09652964025735855

### Recommendation Algorithm

In [31]:
def unwatched_movies(user1, user2):
    watched_movies = Y.loc[user1]
    watched_movies = watched_movies[watched_movies > 0].index
    watched_movies = watched_movies.union(Y.loc[user2][Y.loc[user2] > 0].index)
    unwatched = movie_features[~movie_features.index.isin(watched_movies)]
    return unwatched


def recommend_movies(user1, user2, top_n=10):
    unwatched = unwatched_movies(user1, user2)
    user1_vec = user_features.loc[user1].to_numpy()
    user2_vec = user_features.loc[user2].to_numpy()
    user1_vec = scalerUsers.transform(user1_vec.reshape(1, -1))
    user2_vec = scalerUsers.transform(user2_vec.reshape(1, -1))
    user1_vec = user1_vec[0]
    user2_vec = user2_vec[0]
    user1_vec = user1_vec.reshape(1, -1)
    user2_vec = user2_vec.reshape(1, -1)
    user1_vec = np.repeat(user1_vec, unwatched.shape[0], axis=0)
    user2_vec = np.repeat(user2_vec, unwatched.shape[0], axis=0)
    movie_vec = scalerMovies.transform(unwatched.to_numpy())
    rating = model.predict([user1_vec, user2_vec, movie_vec])
    unwatched = movies_df.loc[unwatched.index]
    unwatched['rating'] = scalerRatings.inverse_transform(rating)
    return unwatched.sort_values('rating', ascending=False).head(top_n)


recommend_movies(1, 2)

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 672us/step


Unnamed: 0_level_0,title,genres,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,seven samurai,Action|Drama,4.576725
1172,cinema paradiso,Comedy|Drama|Romance,4.481499
1272,patton,Drama|War,4.4715
3092,chushingura,Drama,4.469363
2920,children of paradise,Drama|Romance,4.454284
928,rebecca,Romance|Thriller,4.425601
446,farewell my concubine,Drama|Romance,4.416856
3233,smashing time,Comedy,4.397852
649,cold fever,Comedy|Drama,4.397476
2936,sullivan's travels,Comedy,4.382164


In [34]:
users_df

Unnamed: 0_level_0,gender,age,occupation,zipcode
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,02460
5,M,25,20,55455
...,...,...,...,...
6036,F,25,15,32603
6037,F,45,1,76006
6038,F,56,1,14706
6039,F,45,0,01060
