In [120]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise import accuracy

# Creation of the dataset

First of all, we need to create our dataset. For this step, we will use two sources of data.

### Loading the MovieLens 1m dataset

In [121]:
df_rating = pd.read_csv("../data/ml-1m/ratings.dat", engine="python",
    encoding="ISO-8859-1",delimiter='::', header=None)
df_rating.columns = ["userId", "movieId", "rating", "timestamp"]
df_rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [122]:
df_items = pd.read_csv("../data/ml-1m/movies.dat", engine="python",
    encoding="ISO-8859-1",delimiter='::', header=None)
df_items.columns = ["movieId", "title", "genres"]
df_items

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


### Loading the IMDb dataset

In [123]:
!wget https://datasets.imdbws.com/name.basics.tsv.gz && gunzip name.basics.tsv.gz

--2024-07-07 19:17:46--  https://datasets.imdbws.com/name.basics.tsv.gz
Résolution de datasets.imdbws.com (datasets.imdbws.com)… 2600:9000:218e:4400:3:3082:af00:93a1, 2600:9000:218e:1200:3:3082:af00:93a1, 2600:9000:218e:5a00:3:3082:af00:93a1, ...
Connexion à datasets.imdbws.com (datasets.imdbws.com)|2600:9000:218e:4400:3:3082:af00:93a1|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 268935739 (256M) [binary/octet-stream]
Sauvegarde en : « name.basics.tsv.gz »


2024-07-07 19:17:51 (55,6 MB/s) — « name.basics.tsv.gz » sauvegardé [268935739/268935739]



In [124]:
df_name = pd.read_csv("name.basics.tsv", sep='\t')
df_name.head(10)

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0053137,tt0027125"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0083922,tt0050976,tt0069467"
5,nm0000006,Ingrid Bergman,1915,1982,"actress,producer,soundtrack","tt0034583,tt0036855,tt0038109,tt0038787"
6,nm0000007,Humphrey Bogart,1899,1957,"actor,producer,miscellaneous","tt0034583,tt0042593,tt0043265,tt0033870"
7,nm0000008,Marlon Brando,1924,2004,"actor,director,writer","tt0078788,tt0068646,tt0047296,tt0070849"
8,nm0000009,Richard Burton,1925,1984,"actor,producer,director","tt0061184,tt0087803,tt0059749,tt0057877"
9,nm0000010,James Cagney,1899,1986,"actor,director,producer","tt0029870,tt0031867,tt0042041,tt0055256"


In [125]:
!wget https://datasets.imdbws.com/title.basics.tsv.gz && gunzip title.basics.tsv.gz

--2024-07-07 19:18:05--  https://datasets.imdbws.com/title.basics.tsv.gz
Résolution de datasets.imdbws.com (datasets.imdbws.com)… 2600:9000:218e:4400:3:3082:af00:93a1, 2600:9000:218e:1200:3:3082:af00:93a1, 2600:9000:218e:5a00:3:3082:af00:93a1, ...
Connexion à datasets.imdbws.com (datasets.imdbws.com)|2600:9000:218e:4400:3:3082:af00:93a1|:443… connecté.
requête HTTP transmise, en attente de la réponse… 200 OK
Taille : 191394068 (183M) [binary/octet-stream]
Sauvegarde en : « title.basics.tsv.gz »


2024-07-07 19:18:09 (48,1 MB/s) — « title.basics.tsv.gz » sauvegardé [191394068/191394068]

title.basics.tsv already exists -- do you wish to overwrite (y or n)? 

In [None]:
df_name = pd.read_csv("title.basics.tsv", sep='\t')
# Renaming originalTitle as title to be able to merge on this column with the movielens dataset
df_name.rename(columns={"originalTitle": "title"}, inplace=True)
df_name.head(10)

  df_name = pd.read_csv("title.basics.tsv", sep='\t')


Unnamed: 0,tconst,titleType,primaryTitle,title,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


### Merging the two datasets

In order to have the movies metadata, we merge the two datasets in one.

In [None]:
df_movies = pd.merge(df_items, df_name, on='title', how='left')
df_movies.dropna(inplace=True)
df_movies

Unnamed: 0,movieId,title,genres_x,tconst,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres_y
1,2,Jumanji (1995),Adventure|Children's|Fantasy,tt11707316,tvEpisode,Jumanji (1995),0,2014,\N,\N,History
2,2,Jumanji (1995),Adventure|Children's|Fantasy,tt15206184,tvEpisode,Jumanji (1995),0,2018,\N,44,Comedy
16,16,Casino (1995),Drama|Thriller,tt13185560,tvEpisode,Casino (1995),0,2019,\N,\N,"Comedy,Talk-Show"
18,18,Four Rooms (1995),Thriller,tt13174672,tvEpisode,Four Rooms (1995),0,2018,\N,\N,"Comedy,Talk-Show"
34,34,Babe (1995),Children's|Comedy|Drama,tt11137694,tvEpisode,Babe (1995),0,2017,\N,9,Comedy
...,...,...,...,...,...,...,...,...,...,...,...
3894,3793,X-Men (2000),Action|Sci-Fi,tt14096872,tvEpisode,X-Men (2000),0,2021,\N,\N,"Action,Documentary,Horror"
3895,3793,X-Men (2000),Action|Sci-Fi,tt9180730,tvEpisode,X-Men (2000),0,2018,\N,\N,Comedy
3944,3843,Sleepaway Camp (1983),Horror,tt6441668,tvEpisode,Sleepaway Camp (1983),0,2016,\N,\N,"Comedy,Horror,Talk-Show"
3978,3877,Supergirl (1984),Action|Adventure|Fantasy,tt15203112,tvEpisode,Supergirl (1984),0,2017,\N,33,Comedy


### Analysis on the dataset

# Feature Engineering

### Interaction Matrix

We create a user-item interaction matrix. A cell represents the rating given by a user to a specific movie. Most cells are empty, as not every user has rated every movie. Which is why it is a sparsed matrix.

In [None]:
interaction_matrix = df_rating.pivot(index="userId", columns="movieId", values="rating").fillna(0)
interaction_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Creation of the model

We implement our recommendation system using SVD (Singular Value Decomposition) using `numpy`.

In [None]:
U, Sigma, Vt = np.linalg.svd(interaction_matrix, full_matrices=False)

Sigma = np.diag(Sigma)

# Predict ratings
predicted_ratings = np.dot(np.dot(U, Sigma), Vt)

# Convert the predicted ratings to a DataFrame
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=interaction_matrix.index, columns=interaction_matrix.columns)

predicted_ratings_df

movieId,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.000000e+00,1.776074e-12,-1.993949e-14,7.363882e-14,-2.109103e-13,-5.973482e-13,-2.078358e-13,-3.887462e-13,-2.324115e-13,-1.254425e-12,...,-3.429177e-14,-5.881146e-15,-2.926533e-15,-1.689208e-14,-1.074375e-14,-2.663299e-13,-1.490669e-13,-2.691960e-14,-1.193740e-14,-9.900627e-14
2,-1.360115e-14,-3.185156e-13,-2.525587e-13,1.073412e-13,-5.551338e-14,-3.212557e-14,-1.404186e-13,1.842786e-13,-9.137062e-14,1.023246e-13,...,-7.081331e-16,-3.484626e-16,-9.243095e-16,1.420061e-15,3.392198e-15,-3.117770e-14,3.417033e-14,2.083105e-15,-6.906896e-15,4.085430e-15
3,-6.626047e-15,-4.055117e-14,-1.947931e-13,-1.521987e-13,-2.063619e-14,-7.935052e-14,3.215179e-14,-4.205775e-14,9.689271e-15,4.513295e-14,...,2.271295e-15,1.913834e-15,4.433845e-16,-1.412817e-16,3.537210e-17,1.706193e-14,1.312383e-14,1.167428e-15,1.213331e-15,7.172834e-15
4,-6.685755e-15,2.561105e-14,-6.827086e-15,8.654960e-14,3.458699e-14,-1.262624e-13,4.150718e-14,9.083443e-14,2.694088e-14,2.309463e-14,...,-2.207422e-15,3.104071e-15,-1.534390e-15,-2.765278e-15,4.111376e-15,-1.079485e-14,-8.085729e-15,-8.439193e-16,2.749882e-15,-9.513430e-15
5,-6.596806e-15,-2.818431e-14,1.129844e-14,-1.426786e-14,4.808726e-14,2.000000e+00,-4.983314e-14,-9.448619e-16,3.868569e-15,-2.053276e-14,...,-1.254286e-17,1.634977e-15,5.802650e-16,-2.882799e-15,-3.101821e-15,-3.592504e-16,-1.016905e-14,-1.172023e-15,-8.410698e-16,-3.560054e-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,1.072631e-14,-9.041703e-14,4.699284e-14,2.000000e+00,1.295185e-14,3.000000e+00,-1.496505e-14,-1.443933e-14,-1.683004e-14,4.630526e-15,...,-6.970254e-15,-2.264248e-15,-8.602467e-16,-7.153871e-15,-4.430589e-15,-4.855962e-15,-5.254735e-15,-3.518236e-16,-6.933151e-15,-8.647125e-15
6037,-1.322273e-15,-3.305378e-14,2.031301e-14,3.051590e-14,1.104670e-14,4.060649e-15,7.276833e-15,4.144756e-15,1.064646e-14,5.699560e-15,...,2.061339e-15,1.105886e-16,-9.451532e-16,-7.510777e-16,-3.744451e-15,6.708948e-15,-6.931982e-16,-2.340874e-15,-3.823574e-16,-5.825079e-15
6038,-1.580513e-16,-5.889407e-15,-3.273963e-15,3.478202e-15,-2.824631e-15,-6.564461e-16,4.596993e-15,1.028244e-15,-7.921723e-16,1.158724e-15,...,-1.482985e-17,1.929880e-16,-9.920450e-17,-2.467475e-16,3.455403e-15,9.795126e-17,1.067890e-15,8.527792e-16,-1.481657e-15,2.141562e-16
6039,-2.692254e-15,-2.611156e-14,-7.197939e-15,2.085796e-14,4.550505e-15,4.759105e-15,2.161733e-15,3.380583e-15,-1.522410e-15,4.494452e-15,...,3.165423e-15,6.704706e-16,-4.911978e-16,2.939001e-16,1.066570e-15,7.798310e-15,2.434586e-15,3.231465e-16,-1.650061e-15,-1.664130e-15


So, we first tried to implement our recommender system using a regular SVD. \
However, the result we got is not really satisfying. As we can see, the missing values were replaced by predicted ones in the reconstructed interaction matrix. Most of them are values around 0, which is a problem. Indeed, we can not affirm a user disliked a movie it has not rated. A regular SVD does not work well on sparse data, which is our case here. \
Now we'll try a variant of a regular SVD. 

## SVD using sckikit-surprise

Funk SVD is available which is implemented in the `scikit-surprise` library. \
Since it is known to be efficient computationally and to work well with sparse matrix (which is what we have for the interaction matrix), we are going to use it to implement our recommendation system. 

In [None]:
user_id1 = 1
user_id2 = 122

class CoupleRecommendation():
    def __init__(self, df_rating, df_movies):
        self.df_rating = df_rating
        self.df_movies = df_movies
        self.all_movies_ids = df_movies["movieId"]
        # Creation of the model
        self.model = SVD(n_epochs=20)
        self.fit = self.model.fit
        self.test = self.model.test

    def get_predictions_for_user(self, user_id):
        # Get list of already seen movies
        seen_movies = self.df_rating[self.df_rating['userId'] == user_id]['movieId'].tolist()
        
        # Predict ratings for all unseen movies
        predictions = [self.model.predict(user_id, movie_id) for movie_id in self.all_movies_ids if movie_id not in seen_movies]
        predictions_df = pd.DataFrame(predictions, columns=['userId', 'movieId', 'actual_rating', 'predicted_rating', 'details'])
        # Return the predictions for the user sorted from higher to lower rating
        return predictions_df[['movieId', 'predicted_rating']].sort_values(by="predicted_rating", ascending=False)

    def get_predictions_for_couple(self, user_id1, user_id2, n):
        # Get predictions for all unseen movies for each user
        user1_predictions = self.get_predictions_for_user(user_id1)
        user2_predictions = self.get_predictions_for_user(user_id2)

        # Merge the two users predictions in one dataframe
        common_ratings = pd.merge(user1_predictions, user2_predictions, on='movieId', suffixes=('_user1', '_user2'))
        # Combining the two users predicted rating into one
        common_ratings['couple_score'] = (common_ratings['predicted_rating_user1'] + common_ratings['predicted_rating_user2']) / 2

        top_ratings = common_ratings.sort_values(by='couple_score', ascending=False).drop_duplicates(subset=['movieId']).head(n)
        recommended_movies = pd.merge(top_ratings, self.df_movies[["movieId", "title"]], on='movieId', how='left').drop_duplicates(subset=['movieId'])
        return recommended_movies
        

### Training of the model
Now that our model is implemented, let's train it.

In [None]:
# Preparing data for surprise library
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_rating[['userId', 'movieId', 'rating']], reader)

# Spliting data for training and evaluation
train, test = train_test_split(data, test_size=0.2)

# Creation of the SVD model
couple_recommendation = CoupleRecommendation(df_rating, df_movies)
couple_recommendation.fit(train)
# Evaluation of the model using the test set
pred = couple_recommendation.test(test)

### Evaluation of the model

In [None]:
accuracy.rmse(pred)
# Cross validation of the model
cross_validate(couple_recommendation.model, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

RMSE: 0.8741
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8742  0.8748  0.8731  0.8722  0.8743  0.8737  0.0009  
MAE (testset)     0.6864  0.6866  0.6846  0.6844  0.6862  0.6856  0.0009  
Fit time          4.68    4.60    4.70    4.68    4.95    4.72    0.12    
Test time         0.52    0.69    0.73    0.52    0.73    0.64    0.10    


{'test_rmse': array([0.87422777, 0.8748137 , 0.8731122 , 0.87224028, 0.87425714]),
 'test_mae': array([0.68635303, 0.68658354, 0.68462293, 0.6844144 , 0.68618251]),
 'fit_time': (4.675575017929077,
  4.600184202194214,
  4.69711709022522,
  4.6811370849609375,
  4.950675964355469),
 'test_time': (0.5216960906982422,
  0.6874277591705322,
  0.7326030731201172,
  0.5175328254699707,
  0.7258949279785156)}

As we can see, the score of the model is pretty good on the test dataset. Now let's use it to recommend movies to a couple of users.

In [None]:
couple_recommendation.get_predictions_for_couple(22, 37, 20)

Unnamed: 0,movieId,predicted_rating_user1,predicted_rating_user2,couple_score,title
0,904,4.335066,4.618186,4.476626,Rear Window (1954)
2,903,4.065834,4.744778,4.405306,Vertigo (1958)
4,1276,4.009949,4.63946,4.324705,Cool Hand Luke (1967)
5,1252,4.205449,4.369003,4.287226,Chinatown (1974)
6,2160,4.556399,3.990336,4.273367,Rosemary's Baby (1968)
8,1203,3.812704,4.693947,4.253325,12 Angry Men (1957)
9,973,4.096952,4.356957,4.226954,Meet John Doe (1941)
10,541,4.050474,4.329996,4.190235,Blade Runner (1982)
12,1028,3.506004,4.867037,4.18652,Mary Poppins (1964)
14,1256,4.302217,4.070533,4.186375,Duck Soup (1933)
