In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_similarity

### Testing a recommendation system (user-based collaborative filtering)

**Source**<br>
tutorial: https://medium.com/grabngoinfo/recommendation-system-user-based-collaborative-filtering-a2e76e3e15c4<br>
dataset movie: https://www.kaggle.com/datasets/amirmotefaker/movielens-dataset-movies<br>
dataset ratings: https://www.kaggle.com/datasets/amirmotefaker/movielens-dataset-for-recommendation-system

In [2]:
# Raw MovieLens tables, read from the notebook's working directory.
df_movie = pd.read_csv(filepath_or_buffer='movies.csv')
df_ratings = pd.read_csv(filepath_or_buffer='ratings.csv')

#### Datasets analysis

In [3]:
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df_movie.shape

(10329, 3)

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [6]:
df_ratings.shape

(105339, 4)

In [7]:
df_ratings['userId'].nunique()

668

In [8]:
df_movie['movieId'].nunique()

10329

In [None]:
#merge df_ratings and df_movie

### Joining the datasets

**Merge `df_ratings` and `df_movie` on `movieId` to build a single dataset**

In [None]:
# Attach the title and genres to every rating. `validate='many_to_one'` makes
# pandas raise if a movieId appears more than once in df_movie, which would
# otherwise silently duplicate ratings and inflate the per-movie rating counts.
df = pd.merge(df_ratings, df_movie, on='movieId', how='inner', validate='many_to_one')

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama


**Filtering out movies with 100 ratings or fewer**

We keep only the movies that received more than 100 ratings, because a score based on only a few ratings is not representative of the movie's quality.

In [11]:
# Number of ratings received by each movie (one row per movieId).
nb_ratings = df.groupby('movieId')[['rating']].count()
nb_ratings.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,232
2,92
3,58
4,11
5,62


In [12]:
# Turn the movieId index back into a regular column so it can be used for filtering.
nb_ratings = nb_ratings.reset_index()

In [13]:
# Keep only the movies rated strictly more than 100 times.
nb_ratings = nb_ratings[nb_ratings['rating'].gt(100)]

In [14]:
nb_ratings.shape

(150, 2)

In [15]:
# Ratings restricted to the movies that passed the popularity filter.
df_result = df[df['movieId'].isin(nb_ratings['movieId'])]

In [16]:
df_result.shape

(22556, 6)

In [17]:
df_result.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
122,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
123,2,32,3.0,859046895,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
124,4,32,5.0,950323750,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
125,7,32,5.0,1322058768,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
126,8,32,3.0,858610933,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


#### Creating the user-based matrix

We build a pivot table with the users as index, the movie titles as columns and the ratings as values.<br>
This user–movie matrix is the basis of the recommendation system.

In [18]:
# User x movie rating matrix: one row per user, one column per title,
# NaN where the user has not rated the movie.
matrix = df_result.pivot_table(index=['userId'], columns=['title'], values=['rating'])
matrix

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)",Waterworld (1995),While You Were Sleeping (1995),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)",X-Men (2000)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,4.0,,,,...,,4.0,3.0,4.0,,,,,,
2,,,,,,,,,,,...,,3.0,4.0,,,,,4.0,,
3,,,3.0,,,,,,,,...,,,3.0,5.0,,3.0,,,,
4,,,,,,,,,,,...,,5.0,,,,,5.0,,5.0,
5,,1.0,3.5,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,,,,2.0,,,,,,,...,,,,,,,,,,
665,,3.0,,,,4.5,,,,,...,,3.5,3.0,,,,,4.0,,
666,,,,4.0,,,4.0,,2.0,4.0,...,,,,4.0,,,3.0,,3.0,4.0
667,,3.5,,,,4.5,4.5,,,,...,,4.5,,4.5,,3.0,,,,


### Normalization

We mean-center the ratings: each user's average rating is subtracted from that user's ratings, so that users who rate on different scales become comparable.

In [19]:
# Mean-centre every user's ratings: subtract the row (user) average from each rating.
matrix_norm = matrix.sub(matrix.mean(axis='columns'), axis='index')
matrix_norm.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)",Waterworld (1995),While You Were Sleeping (1995),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)",X-Men (2000)
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,,,,,,,0.169231,,,,...,,0.169231,-0.830769,0.169231,,,,,,
2,,,,,,,,,,,...,,-1.111111,-0.111111,,,,,-0.111111,,
3,,,-0.714286,,,,,,,,...,,,-0.714286,1.285714,,-0.714286,,,,
4,,,,,,,,,,,...,,0.9375,,,,,0.9375,,0.9375,
5,,-1.7,0.8,,,,,,,,...,,,,,,,,,,


In [20]:
# Flatten the two-level ('rating', <title>) column index down to the titles.
# The guard keeps the cell idempotent: on a second run the columns are already
# a single-level Index and calling droplevel on it would raise.
if isinstance(matrix_norm.columns, pd.MultiIndex):
    matrix_norm.columns = matrix_norm.columns.droplevel(0)

In [21]:
matrix_norm.columns

Index(['2001: A Space Odyssey (1968)', 'Ace Ventura: Pet Detective (1994)',
       'Aladdin (1992)', 'Alien (1979)', 'Aliens (1986)',
       'Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)',
       'American Beauty (1999)', 'American History X (1998)',
       'American Pie (1999)', 'Apocalypse Now (1979)',
       ...
       'Truman Show, The (1998)', 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
       'Twister (1996)', 'Usual Suspects, The (1995)', 'Waterworld (1995)',
       'While You Were Sleeping (1995)', 'Who Framed Roger Rabbit? (1988)',
       'Willy Wonka & the Chocolate Factory (1971)',
       'Wizard of Oz, The (1939)', 'X-Men (2000)'],
      dtype='object', name='title', length=150)

#### Creating the similarity matrix with Pearson correlation

The user–user correlation matrix is the key to making recommendations: it tells us which users have tastes similar to the target user's.

In [22]:
# Pearson correlation between users: transposing puts the users in columns,
# which is what `corr` compares pairwise. Despite its name, `movie_corr` is a
# user x user matrix.
# NOTE(review): the many exact +/-1.0 values presumably come from user pairs
# with very few co-rated movies - consider `min_periods`; confirm before changing.
movie_corr = matrix_norm.transpose().corr(method='pearson')

In [23]:
movie_corr

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.306186,0.614559,0.431124,-0.094491,0.471405,0.153463,0.369274,0.195776,-1.0,...,0.099066,0.375769,0.107583,0.349334,0.410803,-0.505322,0.029258,0.018696,0.144082,0.230468
2,-0.306186,1.000000,0.866025,-1.000000,,,-0.555556,0.577350,0.333333,,...,-1.000000,,0.816497,-0.577350,,,-0.174078,,-1.000000,0.388556
3,0.614559,0.866025,1.000000,,0.162938,,-0.228587,1.000000,0.391121,-1.0,...,0.345186,0.585540,1.000000,0.000000,0.252050,-0.491398,0.484123,0.336336,0.392989,0.231076
4,0.431124,-1.000000,,1.000000,,,-0.115186,,-0.250000,,...,-0.662266,,-1.000000,0.408248,,,-0.801784,-0.063610,0.965824,0.291551
5,-0.094491,,0.162938,,1.000000,-0.970725,0.077671,,-0.158114,,...,-0.427828,,,0.960769,,-0.548611,0.104257,-0.944911,-0.052658,-0.113529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,-0.505322,,-0.491398,,-0.548611,,-0.271749,-0.852803,-0.302708,1.0,...,0.171191,,0.301511,-0.647150,-0.090909,1.000000,0.169489,-0.352034,-0.090167,-0.470566
665,0.029258,-0.174078,0.484123,-0.801784,0.104257,-0.166667,0.002218,0.636364,-0.076847,1.0,...,0.307814,0.207514,,-0.288675,0.000000,0.169489,1.000000,-0.329502,0.225991,0.247156
666,0.018696,,0.336336,-0.063610,-0.944911,,0.213668,,0.545545,,...,0.376376,-1.000000,0.000000,-0.171592,-0.456435,-0.352034,-0.329502,1.000000,0.499537,-0.170243
667,0.144082,-1.000000,0.392989,0.965824,-0.052658,1.000000,0.257518,-0.288675,0.329404,,...,0.476747,-0.471405,-0.327327,-0.084215,0.912871,-0.090167,0.225991,0.499537,1.000000,0.340742


#### Creating the similarity matrix with cosine similarity

In [24]:
# User x user cosine similarity. Missing ratings are replaced by 0 first; after
# mean-centring, 0 corresponds to the user's own average rating.
movie_cos = cosine_similarity(matrix_norm.fillna(value=0))

In [25]:
movie_cos

array([[ 1.        , -0.03582951,  0.20601523, ...,  0.02123052,
         0.04260818,  0.12506527],
       [-0.03582951,  1.        ,  0.1523258 , ...,  0.07311028,
        -0.07814087,  0.07738403],
       [ 0.20601523,  0.1523258 ,  1.        , ...,  0.08228767,
         0.10688984,  0.10399275],
       ...,
       [ 0.02123052,  0.07311028,  0.08228767, ...,  1.        ,
         0.14654264, -0.0928286 ],
       [ 0.04260818, -0.07814087,  0.10688984, ...,  0.14654264,
         1.        ,  0.1431604 ],
       [ 0.12506527,  0.07738403,  0.10399275, ..., -0.0928286 ,
         0.1431604 ,  1.        ]])

In [26]:
# Label both axes of the similarity array with the user ids.
movie_cos = pd.DataFrame(data=movie_cos, index=matrix_norm.index, columns=matrix_norm.index)


In [27]:
movie_cos

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,-0.035830,0.206015,0.079337,-0.011293,0.058817,0.074279,0.049303,0.049409,-0.010772,...,0.055534,0.096663,0.026969,0.144731,0.126032,-0.148462,0.016409,0.021231,0.042608,0.125065
2,-0.035830,1.000000,0.152326,-0.148343,0.107229,-0.017177,-0.083182,0.372104,-0.012354,0.000000,...,0.033733,0.000000,0.415301,-0.158274,-0.139983,0.009350,0.024708,0.073110,-0.078141,0.077384
3,0.206015,0.152326,1.000000,-0.012940,0.028421,-0.021129,-0.059749,0.117897,0.252470,-0.114153,...,0.092838,0.152893,0.217485,-0.049378,0.100799,-0.108868,0.123886,0.082288,0.106890,0.103993
4,0.079337,-0.148343,-0.012940,1.000000,-0.144293,-0.029302,-0.042817,-0.068126,-0.005428,0.000000,...,-0.007525,-0.002226,0.028577,0.035839,-0.010484,0.004669,-0.014385,-0.031258,0.307073,0.106771
5,-0.011293,0.107229,0.028421,-0.144293,1.000000,-0.108723,0.025429,0.064838,-0.014903,-0.038633,...,-0.080723,-0.107563,-0.078646,0.081995,-0.019957,-0.079261,0.177944,-0.061092,-0.002459,-0.043177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,-0.148462,0.009350,-0.108868,0.004669,-0.079261,0.234327,-0.069455,-0.115128,-0.093374,0.363611,...,0.057065,0.132760,0.033976,-0.152960,-0.006644,1.000000,0.085020,-0.070976,-0.008447,-0.175570
665,0.016409,0.024708,0.123886,-0.014385,0.177944,-0.036804,0.001241,0.089642,-0.010714,0.061721,...,0.176479,0.056386,0.036244,-0.036913,0.136117,0.085020,1.000000,-0.131240,0.099001,0.100273
666,0.021231,0.073110,0.082288,-0.031258,-0.061092,0.042912,0.079624,0.000000,0.150966,0.000000,...,0.134779,0.026552,0.001787,-0.064813,-0.045730,-0.070976,-0.131240,1.000000,0.146543,-0.092829
667,0.042608,-0.078141,0.106890,0.307073,-0.002459,0.032613,0.148756,-0.042730,0.100180,0.059407,...,0.237887,0.022152,-0.028656,0.011656,0.245504,-0.008447,0.099001,0.146543,1.000000,0.143160


### Example of ratings prediction for 1 user

In [51]:
n = 10
user = 1

# Similarity of `user` to every other user, most similar first, limited to the n best.
user_sim = movie_corr[user]
user_sim = user_sim[user_sim.index != user].sort_values(ascending=False).iloc[:n]

user_sim

userId
533    1.000000
34     1.000000
67     1.000000
154    1.000000
117    1.000000
621    0.981981
526    0.966988
103    0.945343
271    0.943300
26     0.940582
Name: 1, dtype: float64

In [39]:
# Movies already rated by `user`: keep that user's row and drop the columns left empty.
watched_movie = matrix_norm[matrix_norm.index == user].dropna(axis='columns', how='all')
watched_movie

title,American Beauty (1999),Apollo 13 (1995),Back to the Future (1985),Batman (1989),Batman Begins (2005),"Beautiful Mind, A (2001)","Bourne Identity, The (2002)",Braveheart (1995),Casablanca (1942),Clear and Present Danger (1994),...,Star Wars: Episode IV - A New Hope (1977),Star Wars: Episode V - The Empire Strikes Back (1980),Star Wars: Episode VI - Return of the Jedi (1983),Terminator 2: Judgment Day (1991),There's Something About Mary (1998),Titanic (1997),True Lies (1994),Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),"Usual Suspects, The (1995)"
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.169231,-0.830769,-0.830769,-1.330769,0.669231,0.169231,0.169231,0.169231,1.169231,0.669231,...,0.669231,0.669231,0.669231,-0.330769,-2.330769,-2.330769,-0.830769,0.169231,-0.830769,0.169231


In [None]:
# Ratings given by the similar users, restricted to movies that at least one of
# them rated and that `user` has not rated yet.
watched_sim = (
    matrix_norm[matrix_norm.index.isin(user_sim.index)]
    .dropna(axis='columns', how='all')
    .drop(columns=watched_movie.columns, errors='ignore')
)

watched_sim

title,American Pie (1999),Austin Powers: The Spy Who Shagged Me (1999),Being John Malkovich (1999),"Breakfast Club, The (1985)",Broken Arrow (1996),"Bug's Life, A (1998)",Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),E.T. the Extra-Terrestrial (1982),Ferris Bueller's Day Off (1986),"Fifth Element, The (1997)",...,"Hunt for Red October, The (1990)",In the Line of Fire (1993),Kill Bill: Vol. 1 (2003),Memento (2000),"Rock, The (1996)",Total Recall (1990),Toy Story (1995),Trainspotting (1996),Who Framed Roger Rabbit? (1988),Willy Wonka & the Chocolate Factory (1971)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26,,-0.416667,1.083333,,,,,,,-1.416667,...,0.083333,-0.416667,,,,,,,0.583333,
34,,,,,,,0.4,,,,...,,,,,,,,,,
67,,-0.625,-0.125,-0.625,,,0.875,,,0.875,...,,,,,,,,,,
103,,,,,,,,0.125,,,...,,,,,,-0.875,0.125,,,
117,,,,,-1.0,,,,,,...,,,,,0.0,,0.0,,,-1.0
154,,,,,,,,,,,...,,,,,,,,,,
271,,,,,-1.0,,,,,,...,,,,,0.0,,-1.0,-1.0,,1.0
526,0.4,0.4,,,-0.6,-0.6,,,0.4,,...,,,,,,,,,,
533,,,,,,,,,0.0,,...,,,,,,,,,,
621,,,,,,,,,,,...,,,0.5,-0.5,,,,,,


### Creating the user-based recommendation function

In [55]:
# Target user for the demo; `user_sim` holds the similarity matrix used by default.
user, user_sim = 7, movie_corr



def recommandation_user(user, user_sim, n_neighbors = 10, n_items = 10, ratings = None):
    """Recommend movies `user` has not rated, based on the most similar users.

    Parameters
    ----------
    user : label
        Id of the target user; must be a column label of `user_sim`.
    user_sim : pandas.DataFrame
        Square user x user similarity matrix (e.g. `movie_corr` or `movie_cos`),
        indexed by user id on both axes.
    n_neighbors : int, default 10
        Number of most similar users whose ratings are used.
    n_items : int, default 10
        Maximum number of recommendations returned.
    ratings : pandas.DataFrame, optional
        Mean-centred user x movie rating matrix with NaN for unrated movies.
        Defaults to the notebook-level `matrix_norm`.

    Returns
    -------
    pandas.DataFrame
        Columns `movie` and `movie_score`, sorted by decreasing score.
        `movie_score` is the mean of (similarity * centred rating) over the
        neighbours that rated the movie.
    """
    if ratings is None:
        # Backward-compatible default: the matrix built earlier in the notebook.
        ratings = matrix_norm

    # Similarity of `user` to every other user, most similar first.
    # NaN similarities are dropped: sort_values puts them last, so they could
    # still be picked when fewer than `n_neighbors` valid neighbours exist and
    # would then turn every score they touch into NaN.
    neighbours = user_sim[user]
    neighbours = (
        neighbours[neighbours.index != user]
        .dropna()
        .sort_values(ascending = False)
        .iloc[:n_neighbors]
    )

    # Movies the target user has already rated.
    watched_movie = ratings.loc[ratings.index == user].dropna(axis = 1, how = 'all')

    # Neighbours' ratings for movies at least one of them rated and that the
    # target user has not rated yet.
    watched_sim = ratings.loc[ratings.index.isin(neighbours.index)]\
                    .dropna(axis = 1, how = 'all')\
                    .drop(watched_movie.columns, axis = 1, errors = 'ignore')

    # Weight each neighbour's ratings by their similarity (aligned on user id),
    # then average per movie; NaN (unrated) entries are skipped by `mean`.
    movie_score = watched_sim.mul(neighbours, axis = 0).mean(axis = 0)

    item_score = movie_score.rename_axis('movie').reset_index(name = 'movie_score')

    ranked_item = item_score.sort_values(by = 'movie_score',
                                         ascending = False).iloc[:n_items]

    return ranked_item


### Testing the user-based recommendation function

In [56]:
recommandation_user (user,user_sim = movie_corr)

Unnamed: 0,movie,movie_score
21,Shakespeare in Love (1998),1.456088
10,Fargo (1996),0.947502
13,Gladiator (2000),0.916667
30,Willy Wonka & the Chocolate Factory (1971),0.857143
18,"Monsters, Inc. (2001)",0.747882
19,One Flew Over the Cuckoo's Nest (1975),0.747882
29,Who Framed Roger Rabbit? (1988),0.747882
23,There's Something About Mary (1998),0.666667
11,"Fugitive, The (1993)",0.666667
27,True Lies (1994),0.606703


In [57]:
recommandation_user (user,user_sim = movie_cos)

Unnamed: 0,movie,movie_score
31,Ghostbusters (a.k.a. Ghost Busters) (1984),0.327151
69,Willy Wonka & the Chocolate Factory (1971),0.30767
62,Trainspotting (1996),0.261489
2,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",0.203483
15,Casablanca (1942),0.200964
33,Good Will Hunting (1997),0.184685
53,"Shawshank Redemption, The (1994)",0.17679
28,"Fugitive, The (1993)",0.158604
43,"Lion King, The (1994)",0.156165
65,"Usual Suspects, The (1995)",0.137625
