In [29]:
import pandas as pd # work with the datasets
import numpy as np # work with the datasets
from sklearn.metrics.pairwise import cosine_similarity # calculate the cosine similiratry

### Test recommandations system (collaborative filtering : item-based)

**Source**<br>
tutorial: https://medium.com/grabngoinfo/recommendation-system-item-based-collaborative-filtering-f5078504996a<br>
dataset movie: https://www.kaggle.com/datasets/amirmotefaker/movielens-dataset-movies<br>
dataset ratings: https://www.kaggle.com/datasets/amirmotefaker/movielens-dataset-for-recommendation-system

In [7]:
df_movie = pd.read_csv('movies.csv') # importation of the movie dataset
df_ratings = pd.read_csv('ratings.csv') # importation of the ratings dataset

#### Datasets analysis

In [8]:
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
df_movie.shape

(10329, 3)

In [124]:
df_movie.info() #look if there is nan in the dataset and the variable's type.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [9]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [13]:
df_ratings.shape

(105339, 4)

In [125]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [10]:
df_ratings['userId'].nunique() # check the number of user in the datasets

668

In [11]:
df_movie['movieId'].nunique() # check if there is duplicate movie in the datasets.

10329

the number of unique movie is identical to the shape of the dataset.<br>
there is no duplicate movie in the dataset

### Datasets join

**merge df_ratings and df_movie to make one dataset**

In [14]:
df = pd.merge(df_ratings, df_movie, on='movieId', how='inner')

In [15]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama


**filtering movies under 100 ratings**

We filter all the movie under 100 ratings to avoid that the score is not representative of the movie quality 

In [21]:
nb_ratings = pd.pivot_table(df, 
                            values=['rating'],
                            index=['movieId'],
                            aggfunc = 'count'
                           )
nb_ratings.head() # creation of pivot_table to get the number of ratings for each movie

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,232
2,92
3,58
4,11
5,62


In [22]:
nb_ratings.reset_index(inplace = True)

In [23]:
nb_ratings = nb_ratings.loc[nb_ratings['rating'] > 100] # filtering all movies under 100 ratings

In [24]:
nb_ratings.shape

(150, 2)

we have 150 movies above 100 ratings

In [25]:
df_result = df.loc[df['movieId'].isin(nb_ratings['movieId'])] # filtering of the dataset

In [26]:
df_result.shape

(22556, 6)

We get a new datasets of 22556 rows

In [27]:
df_result.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
122,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
123,2,32,3.0,859046895,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
124,4,32,5.0,950323750,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
125,7,32,5.0,1322058768,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
126,8,32,3.0,858610933,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


#### Creation of item based matrix

We make a pivot table with the movie in index, the user in columns and the ratings in values.<br>
This matrix constitute an important step in the elaboration of the recommandations system

In [64]:
matrix = pd.pivot_table(df_result, 
                        values = ['rating'],
                        index = ['title'],
                        columns = ['userId'],
                       )
matrix

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2001: A Space Odyssey (1968),,,,,,,,,,,...,,,,5.0,,,,,,3.0
Ace Ventura: Pet Detective (1994),,,,,1.0,,1.0,,2.0,,...,,,,,,,3.0,,3.5,
Aladdin (1992),,,3.0,,3.5,,,,,,...,3.0,,,,,,,,,3.0
Alien (1979),,,,,,,5.0,,,,...,,,,5.0,,2.0,,4.0,,4.0
Aliens (1986),,,,,,,5.0,,,,...,,,,,,,,,,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
While You Were Sleeping (1995),,,3.0,,,,,,,,...,3.0,,,,,,,,3.0,2.0
Who Framed Roger Rabbit? (1988),,,,5.0,,,,,,,...,,,,,,,,3.0,,2.0
Willy Wonka & the Chocolate Factory (1971),,4.0,,,,5.0,,5.0,,,...,4.5,,3.0,5.0,,,4.0,,,
"Wizard of Oz, The (1939)",,,,5.0,,,,,,,...,,,,5.0,,,,3.0,,5.0


### Normalization

We normalize the data on the mean to improve the model performance.

In [65]:
matrix_norm = matrix.subtract(matrix.mean(axis=1), axis = 0)
matrix_norm.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2001: A Space Odyssey (1968),,,,,,,,,,,...,,,,1.039683,,,,,,-0.960317
Ace Ventura: Pet Detective (1994),,,,,-1.849711,,-1.849711,,-0.849711,,...,,,,,,,0.150289,,0.650289,
Aladdin (1992),,,-0.602094,,-0.102094,,,,,,...,-0.602094,,,,,,,,,-0.602094
Alien (1979),,,,,,,0.935897,,,,...,,,,0.935897,,-2.064103,,-0.064103,,-0.064103
Aliens (1986),,,,,,,0.853503,,,,...,,,,,,,,,,-0.646497


In [66]:
matrix_norm.columns = matrix_norm.columns.droplevel()

In [74]:
matrix_norm.columns

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            659, 660, 661, 662, 663, 664, 665, 666, 667, 668],
           dtype='int64', name='userId', length=653)

#### Creating similiraty matrix with pearson correlation

the correlation matrix is the key to make recommandations to the user based on item similarity

In [68]:
movie_corr = matrix_norm.T.corr()

In [69]:
movie_corr

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)",Waterworld (1995),While You Were Sleeping (1995),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)",X-Men (2000)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),1.000000,0.009197,0.094955,0.161791,0.111129,0.255150,0.326736,-0.116787,0.339217,0.375652,...,0.133435,0.175824,0.364667,0.355418,0.175233,0.473549,0.389881,0.069144,0.360623,0.081901
Ace Ventura: Pet Detective (1994),0.009197,1.000000,0.045501,0.220471,0.017538,0.184113,0.264089,0.092017,0.536903,-0.231039,...,0.414541,-0.065448,0.180731,0.124333,0.122249,0.350943,0.418543,0.289430,-0.047664,0.109188
Aladdin (1992),0.094955,0.045501,1.000000,0.131826,0.256574,0.188945,0.074428,0.293284,0.363807,0.027862,...,0.419170,0.023321,0.355073,0.285138,0.223299,0.285658,0.520339,0.330666,0.238034,0.335238
Alien (1979),0.161791,0.220471,0.131826,1.000000,0.556528,0.008468,0.272213,0.115971,0.326924,0.189564,...,0.493435,0.160803,0.198784,0.255713,0.280980,0.087540,0.430520,0.260268,0.299903,0.144456
Aliens (1986),0.111129,0.017538,0.256574,0.556528,1.000000,0.179657,0.195056,0.200158,0.339430,0.305410,...,0.348262,0.116643,0.225911,0.297400,0.218770,0.278225,0.413528,0.011283,0.238814,0.026598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
While You Were Sleeping (1995),0.473549,0.350943,0.285658,0.087540,0.278225,0.323155,0.031898,0.086136,0.121784,-0.247799,...,0.105667,0.000869,0.338573,0.060506,0.425258,1.000000,0.260691,0.178454,0.235121,0.015246
Who Framed Roger Rabbit? (1988),0.389881,0.418543,0.520339,0.430520,0.413528,0.268721,0.401964,0.174975,0.169209,0.111093,...,0.420610,0.198687,0.291153,0.407854,0.287291,0.260691,1.000000,0.419892,0.134453,0.499656
Willy Wonka & the Chocolate Factory (1971),0.069144,0.289430,0.330666,0.260268,0.011283,0.529030,0.119050,0.159832,0.180712,0.216463,...,0.273207,-0.111587,0.037545,0.202341,-0.001061,0.178454,0.419892,1.000000,0.223773,0.183367
"Wizard of Oz, The (1939)",0.360623,-0.047664,0.238034,0.299903,0.238814,0.193737,0.038805,-0.021181,-0.093679,0.211547,...,-0.103785,0.011591,0.271304,0.154120,0.037598,0.235121,0.134453,0.223773,1.000000,0.035724


#### Creating similarity matrix with cosine

In [127]:
movie_cos = cosine_similarity(matrix_norm.fillna(0))

In [132]:
movie_cos

array([[ 1.        ,  0.00239804,  0.03036161, ...,  0.03045315,
         0.1869451 ,  0.02662157],
       [ 0.00239804,  1.        ,  0.02847017, ...,  0.10816846,
        -0.01071678,  0.03320306],
       [ 0.03036161,  0.02847017,  1.        , ...,  0.13555956,
         0.08855735,  0.09631236],
       ...,
       [ 0.03045315,  0.10816846,  0.13555956, ...,  1.        ,
         0.10176866,  0.04128851],
       [ 0.1869451 , -0.01071678,  0.08855735, ...,  0.10176866,
         1.        ,  0.00957729],
       [ 0.02662157,  0.03320306,  0.09631236, ...,  0.04128851,
         0.00957729,  1.        ]])

In [133]:
movie_cos = pd.DataFrame(movie_cos, columns = matrix_norm.index, index = matrix_norm.index)


In [134]:
movie_cos

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)",Waterworld (1995),While You Were Sleeping (1995),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)",X-Men (2000)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),1.000000,0.002398,0.030362,0.102831,0.068432,0.087204,0.100289,-0.029430,0.100589,0.184160,...,0.046372,0.070273,0.089081,0.142024,0.040875,0.078542,0.144135,0.030453,0.186945,0.026622
Ace Ventura: Pet Detective (1994),0.002398,1.000000,0.028470,0.065664,0.007766,0.036578,0.109186,0.024869,0.215627,-0.060172,...,0.165866,-0.032063,0.070456,0.055699,0.074808,0.149311,0.151619,0.108168,-0.010717,0.033203
Aladdin (1992),0.030362,0.028470,1.000000,0.051684,0.097400,0.030984,0.029856,0.090170,0.116300,0.007502,...,0.157308,0.010936,0.164669,0.146599,0.111028,0.149122,0.211902,0.135560,0.088557,0.096312
Alien (1979),0.102831,0.065664,0.051684,1.000000,0.376007,0.007532,0.123323,0.042557,0.133125,0.092797,...,0.214591,0.076489,0.071533,0.117668,0.081496,0.027983,0.195371,0.107275,0.123400,0.062152
Aliens (1986),0.068432,0.007766,0.097400,0.376007,1.000000,0.046216,0.084222,0.073928,0.105943,0.141798,...,0.121561,0.055561,0.070108,0.118617,0.060372,0.044269,0.170496,0.003122,0.082700,0.012711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
While You Were Sleeping (1995),0.078542,0.149311,0.149122,0.027983,0.044269,0.029526,0.011424,0.016841,0.047902,-0.024365,...,0.055548,-0.000497,0.104829,0.030770,0.187289,1.000000,0.076808,0.059586,0.045520,-0.018212
Who Framed Roger Rabbit? (1988),0.144135,0.151619,0.211902,0.195371,0.170496,0.080758,0.177597,0.045991,0.086300,0.029761,...,0.165048,0.067995,0.103291,0.146621,0.101843,0.076808,1.000000,0.176606,0.058590,0.194721
Willy Wonka & the Chocolate Factory (1971),0.030453,0.108168,0.135560,0.107275,0.003122,0.097012,0.045579,0.040397,0.063032,0.065275,...,0.120076,-0.039693,0.021578,0.062243,0.008998,0.059586,0.176606,1.000000,0.101769,0.041289
"Wizard of Oz, The (1939)",0.186945,-0.010717,0.088557,0.123400,0.082700,0.028151,0.017412,0.013502,-0.014368,0.075778,...,-0.027556,0.000821,0.081940,0.047932,0.010812,0.045520,0.058590,0.101769,1.000000,0.009577


### Example of ratings prediction for 1 movie and 1 user

In [79]:
user = 1
movie = 'Aladdin (1992)'

watched_movie = matrix_norm[user].loc[matrix_norm[user].notna()].sort_values(ascending=False)\
                                        .reset_index()\
                                        .rename(columns={user:'rating'})

watched_movie # get the list of watched movies and the ratings

Unnamed: 0,title,rating
0,Full Metal Jacket (1987),0.873874
1,Fight Club (1999),0.811594
2,Clear and Present Danger (1994),0.809701
3,"Silence of the Lambs, The (1991)",0.805172
4,Casablanca (1942),0.764000
...,...,...
60,Speed (1994),-1.012690
61,"Sixth Sense, The (1999)",-1.130890
62,Forrest Gump (1994),-1.138264
63,Titanic (1997),-1.856209


In [83]:
movie_sim = movie_corr.loc[movie].reset_index()\
        .rename(columns={movie :'similarity_score'}) # define the similarity score of the movie (pearson)

In [84]:
movie_sim

Unnamed: 0,title,similarity_score
0,2001: A Space Odyssey (1968),0.094955
1,Ace Ventura: Pet Detective (1994),0.045501
2,Aladdin (1992),1.000000
3,Alien (1979),0.131826
4,Aliens (1986),0.256574
...,...,...
145,While You Were Sleeping (1995),0.285658
146,Who Framed Roger Rabbit? (1988),0.520339
147,Willy Wonka & the Chocolate Factory (1971),0.330666
148,"Wizard of Oz, The (1939)",0.238034


In [135]:
movie_sim = movie_cos.loc[movie].reset_index()\
        .rename(columns={movie :'similarity_score'}) # define the similarity score of the movie (cosine similarity)

In [136]:
movie_sim

Unnamed: 0,title,similarity_score
0,2001: A Space Odyssey (1968),0.026622
1,Ace Ventura: Pet Detective (1994),0.033203
2,Aladdin (1992),0.096312
3,Alien (1979),0.062152
4,Aliens (1986),0.012711
...,...,...
145,While You Were Sleeping (1995),-0.018212
146,Who Framed Roger Rabbit? (1988),0.194721
147,Willy Wonka & the Chocolate Factory (1971),0.041289
148,"Wizard of Oz, The (1939)",0.009577


In [85]:
# merge on watched_movie and filtering the 5 highest rated movies by the user

n = 5 # number of movies wanted

sim_u_m = pd.merge(watched_movie, 
                   movie_sim, 
                   on = 'title', 
                   how = 'inner'
                  )[:n]

sim_u_m

Unnamed: 0,title,rating,similarity_score
0,Full Metal Jacket (1987),0.873874,0.194297
1,Fight Club (1999),0.811594,0.240949
2,Clear and Present Danger (1994),0.809701,0.374017
3,"Silence of the Lambs, The (1991)",0.805172,0.230297
4,Casablanca (1942),0.764,0.233715


In [87]:
# rating prediction for the movie
predicted_rating = round(np.average(sim_u_m['rating'], 
                            weights=sim_u_m['similarity_score']), 6)

print(f'the predicted rating for {movie} is {predicted_rating}')

the predicted rating for Aladdin (1992) is 0.810644


#### Creation of the function to make the recommandations system

In [128]:
def recommendation_item (user, nb_item, nb_reco, matrix, matrix_norm):
    unwatched_movie = matrix_norm[user].loc[matrix_norm[user].isna()].sort_values(ascending=False)\
                                            .reset_index()\
                                            .rename(columns={user:'pred_rating'})

    watched_movie = matrix_norm[user].loc[matrix_norm[user].notna()].sort_values(ascending=False)\
                                            .reset_index()\
                                            .rename(columns={user:'rating'})

    for movie in unwatched_movie['title']:
        movie_sim = matrix.loc[movie].reset_index()\
            .rename(columns={movie :'similarity_score'})
        movie_pred = pd.merge(watched_movie, 
                       movie_sim, 
                       on = 'title', 
                       how = 'inner'
                      )[:nb_item]


        unwatched_movie.loc[unwatched_movie['title'] == movie, 'pred_rating']=round(
                        np.average(movie_pred['rating'], 
                        weights=movie_pred['similarity_score']), 
                        6
                        )


    reco = unwatched_movie.sort_values('pred_rating', ascending=False)[:nb_reco]
    
    return reco

#### Test of recommandation for an user

In [129]:
user = 8
nb_item = 5
nb_reco = 6
matrix = movie_corr
matrix_norm = matrix_norm

recommendation_item (user, nb_item, nb_reco, matrix, matrix_norm)

Unnamed: 0,title,pred_rating
70,Inception (2010),1.473793
22,Being John Malkovich (1999),1.462027
65,Groundhog Day (1993),1.422758
24,Blade Runner (1982),1.368417
60,"Godfather: Part II, The (1974)",1.340346
...,...,...
63,Goodfellas (1990),1.098361
90,Monty Python and the Holy Grail (1975),1.072021
43,Edward Scissorhands (1990),1.064160
66,Heat (1995),1.047174


In [137]:
user = 8
nb_item = 5
nb_reco = 6
matrix = movie_cos
matrix_norm = matrix_norm

recommendation_item (user, nb_item, nb_reco, matrix, matrix_norm)

Unnamed: 0,title,pred_rating
70,Inception (2010),1.494203
22,Being John Malkovich (1999),1.434460
24,Blade Runner (1982),1.417203
65,Groundhog Day (1993),1.391672
74,Interview with the Vampire: The Vampire Chroni...,1.376385
...,...,...
37,"Dark Knight, The (2008)",1.099992
63,Goodfellas (1990),1.083106
43,Edward Scissorhands (1990),1.080538
66,Heat (1995),1.052652
