In [1]:
import pandas as pd # work with the datasets
import numpy as np # work with the datasets
from sklearn.metrics.pairwise import cosine_similarity # calculate the cosine similiratry
from sklearn.neighbors import NearestNeighbors

### Testing a recommendation system (KNN)

**Source**<br>
tutorial 1: https://github.com/jisilvia/kNN_Recommender_System/blob/main/kNN_Recommender_System.ipynb<br>
tutorial 2: https://machinelearninggeek.com/book-recommender-system-using-knn/  <br>
dataset movie: https://www.kaggle.com/datasets/amirmotefaker/movielens-dataset-movies<br>
dataset ratings: https://www.kaggle.com/datasets/amirmotefaker/movielens-dataset-for-recommendation-system

In [2]:
# Load the movie catalogue and the user ratings; both CSV files are read
# from the current working directory.
df_movie = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

#### Datasets analysis

In [3]:
# Preview the first rows of the movie table.
df_movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# (rows, columns) of the movie table.
df_movie.shape

(10329, 3)

In [5]:
# Check the non-null count (missing values) and the dtype of every column.
df_movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [6]:
# Preview the first rows of the ratings table.
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [7]:
# (rows, columns) of the ratings table.
df_ratings.shape

(105339, 4)

In [8]:
# Check the non-null count (missing values) and the dtype of every column.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [9]:
# Number of distinct users in the ratings table.
df_ratings['userId'].nunique()

668

In [10]:
# Number of distinct movie ids; if it equals the row count of df_movie,
# no movie id is duplicated.
df_movie['movieId'].nunique()

10329

The number of unique movies is identical to the number of rows in the dataset.<br>
There are no duplicate movies in the dataset.

### Datasets join

**merge df_ratings and df_movie to make one dataset**

In [11]:
# Attach title and genres to every rating; the inner join keeps only the
# ratings whose movieId exists in the movie table.
df = df_ratings.merge(df_movie, how='inner', on='movieId')

In [12]:
# Preview the merged ratings + movie table.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,9,16,4.0,842686699,Casino (1995),Crime|Drama
2,12,16,1.5,1144396284,Casino (1995),Crime|Drama
3,24,16,4.0,963468757,Casino (1995),Crime|Drama
4,29,16,3.0,836820223,Casino (1995),Crime|Drama


**Filtering out movies with 100 ratings or fewer**

We keep only the movies with more than 100 ratings, so that a movie's score is based on enough ratings to be representative of its quality.

In [13]:
# Number of ratings received by each movie, indexed by movieId.
nb_ratings = df.groupby('movieId')[['rating']].count()
nb_ratings.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,232
2,92
3,58
4,11
5,62


In [14]:
# Turn the movieId index back into a regular column.
nb_ratings = nb_ratings.reset_index()

In [15]:
# Keep only the movies that received more than 100 ratings.
nb_ratings = nb_ratings[nb_ratings['rating'].gt(100)]

In [16]:
# The row count is the number of movies that passed the rating-count filter.
nb_ratings.shape

(150, 2)

We have 150 movies with more than 100 ratings.

In [17]:
# Restrict the merged ratings to the movies kept in nb_ratings.
df_result = df[df['movieId'].isin(nb_ratings['movieId'])]

In [18]:
# (rows, columns) of the filtered ratings table.
df_result.shape

(22556, 6)

We get a new dataset of 22556 rows.

In [19]:
# Preview the filtered ratings table.
df_result.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
122,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
123,2,32,3.0,859046895,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
124,4,32,5.0,950323750,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
125,7,32,5.0,1322058768,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
126,8,32,3.0,858610933,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


#### Creation of user movie matrix

We make a pivot table with the movies as index, the users as columns and the ratings as values.<br>
This matrix is an important step in building the recommendation system.

In [20]:
# Movie x user rating matrix: one row per title, one column per userId.
# Cells are left as NaN where a user has not rated a movie (no fill value).
matrix = df_result.pivot_table(
    index=['title'],
    columns=['userId'],
    values=['rating'],
)
matrix

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2001: A Space Odyssey (1968),,,,,,,,,,,...,,,,5.0,,,,,,3.0
Ace Ventura: Pet Detective (1994),,,,,1.0,,1.0,,2.0,,...,,,,,,,3.0,,3.5,
Aladdin (1992),,,3.0,,3.5,,,,,,...,3.0,,,,,,,,,3.0
Alien (1979),,,,,,,5.0,,,,...,,,,5.0,,2.0,,4.0,,4.0
Aliens (1986),,,,,,,5.0,,,,...,,,,,,,,,,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
While You Were Sleeping (1995),,,3.0,,,,,,,,...,3.0,,,,,,,,3.0,2.0
Who Framed Roger Rabbit? (1988),,,,5.0,,,,,,,...,,,,,,,,3.0,,2.0
Willy Wonka & the Chocolate Factory (1971),,4.0,,,,5.0,,5.0,,,...,4.5,,3.0,5.0,,,4.0,,,
"Wizard of Oz, The (1939)",,,,5.0,,,,,,,...,,,,5.0,,,,3.0,,5.0


### Normalization

We center each movie's ratings on that movie's mean rating (row-wise mean subtraction) to improve the model's performance.

In [21]:
# Mean-center every row: subtract each movie's mean rating from its ratings.
movie_means = matrix.mean(axis='columns')
matrix_norm = matrix.sub(movie_means, axis='index')
matrix_norm.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2001: A Space Odyssey (1968),,,,,,,,,,,...,,,,1.039683,,,,,,-0.960317
Ace Ventura: Pet Detective (1994),,,,,-1.849711,,-1.849711,,-0.849711,,...,,,,,,,0.150289,,0.650289,
Aladdin (1992),,,-0.602094,,-0.102094,,,,,,...,-0.602094,,,,,,,,,-0.602094
Alien (1979),,,,,,,0.935897,,,,...,,,,0.935897,,-2.064103,,-0.064103,,-0.064103
Aliens (1986),,,,,,,0.853503,,,,...,,,,,,,,,,-0.646497


In [23]:
# Drop the outer 'rating' level that pivot_table(values=['rating']) put on the
# columns, so that a user's column can be selected directly by user id.
# The guard makes the cell safe to re-run: droplevel() raises a ValueError
# when the columns only have a single level left.
if matrix_norm.columns.nlevels > 1:
    matrix_norm.columns = matrix_norm.columns.droplevel()

In [24]:
# Replace missing (unrated) cells with 0 so the matrix can be fed to the k-NN model.
matrix_norm = matrix_norm.fillna(0)

#### Creating KNN model for the recommender system

In [25]:
# Brute-force nearest-neighbour search between movie rows using cosine distance.
model_nn = NearestNeighbors(
    n_neighbors=7,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)

model_nn.fit(matrix_norm)



NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=7)

### Example of ratings prediction for 1 user

In [26]:
user = 1

# NaN cells of matrix_norm were replaced by 0 earlier, so notna() is always
# True and cannot identify rated movies any more. A non-zero centered rating
# is what marks a movie as rated, mirroring the `== 0` test used for the
# unwatched movies. This cell needs the column level of matrix_norm to have
# been dropped first, otherwise matrix_norm[user] raises a KeyError.
watched_movie = (
    matrix_norm.loc[matrix_norm[user].ne(0), user]
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={user: 'rating'})
)

watched_movie[:10]

KeyError: 1

In [27]:
movie = 'Full Metal Jacket (1987)'

# Query the fitted model with this movie's row (as a 1 x n_users array).
query_row = matrix_norm.loc[[movie]].to_numpy()
distances, indices = model_nn.kneighbors(query_row)

In [28]:
# Distances returned by kneighbors for the queried movie.
distances

array([[0.        , 0.70749585, 0.76161408, 0.77934181, 0.77951983,
        0.77985229, 0.78371593]])

In [56]:
# Positional row indices (into matrix_norm) of the neighbours.
indices

array([[ 56,  42, 116,  28,  52,  40,   3]], dtype=int64)

In [79]:
# Map a positional row index back to its movie title.
matrix_norm.index[28]

'Breakfast Club, The (1985)'

In [89]:
# Pair every neighbour's title with its distance, skipping position 0
# (the first neighbour returned for the query).
recommand = [
    {'title': matrix_norm.index[idx], 'distance': dist}
    for idx, dist in zip(indices.flatten()[1:], distances.flatten()[1:])
]

result = pd.DataFrame(recommand)

In [90]:
# Neighbours of the queried movie with their distances.
result

Unnamed: 0,movie,distance
0,Dr. Strangelove or: How I Learned to Stop Worr...,0.707496
1,"Shining, The (1980)",0.761614
2,"Breakfast Club, The (1985)",0.779342
3,"Fish Called Wanda, A (1988)",0.77952
4,Die Hard (1988),0.779852
5,Alien (1979),0.783716


In [124]:
# Titles whose value for this user is 0 in matrix_norm, as a one-column frame.
unwatched_movie = matrix_norm.loc[matrix_norm[user].eq(0), user].reset_index()[['title']]

In [147]:
# Titles of the movies treated as unwatched for this user.
unwatched_movie['title']

0                    2001: A Space Odyssey (1968)
1               Ace Ventura: Pet Detective (1994)
2                                  Aladdin (1992)
3                                    Alien (1979)
4                                   Aliens (1986)
                          ...                    
139                While You Were Sleeping (1995)
140               Who Framed Roger Rabbit? (1988)
141    Willy Wonka & the Chocolate Factory (1971)
142                      Wizard of Oz, The (1939)
143                                  X-Men (2000)
Name: title, Length: 144, dtype: object

#### Creation of the KNN recommendation function

In [148]:
# User id used for the demo (it is assigned again in the test cell further down).
user = 12

def recommendation_item(user, matrix_norm, n_seeds=10, n_neighbors=7, n_recommendations=5):
    """Recommend unwatched movies for ``user`` with item-based k-NN.

    The user's highest-rated movies are used as seeds. For every seed the
    nearest movies under cosine distance are looked up, and the closest ones
    that the user has not rated yet are returned.

    Parameters
    ----------
    user : hashable
        Column label of the user in ``matrix_norm``.
    matrix_norm : pandas.DataFrame
        Movie x user matrix (titles as index, user ids as columns) holding
        mean-centered ratings in which unrated cells were filled with 0.
    n_seeds : int, default 10
        Maximum number of the user's top-rated movies used as seeds.
    n_neighbors : int, default 7
        Neighbours requested per seed. The first neighbour returned for a
        seed is discarded (it is expected to be the seed itself).
    n_recommendations : int, default 5
        Maximum number of rows in the returned table.

    Returns
    -------
    str
        HTML table with the columns ``title`` and ``distance``, sorted by
        increasing distance. The table has no rows when the user has rated
        nothing or no unwatched neighbour was found.

    Raises
    ------
    KeyError
        If ``user`` is not a column of ``matrix_norm``.

    Notes
    -----
    0 encodes "not rated" in ``matrix_norm``, so a rating that is exactly
    equal to the movie's mean cannot be told apart from a missing rating.
    """
    columns = ['title', 'distance']
    user_ratings = matrix_norm[user]

    # NaNs were replaced by 0 upstream, so notna() would flag every movie as
    # watched; a non-zero centered rating is what marks a rated movie.
    unwatched_titles = user_ratings.index[(user_ratings == 0).to_numpy()]
    watched = user_ratings[user_ratings != 0].sort_values(ascending=False)
    seed_titles = watched.index[:n_seeds]

    if len(seed_titles) == 0:
        # Nothing to base a recommendation on: return an empty table.
        return pd.DataFrame(columns=columns).to_html(index=False)

    model_nn = NearestNeighbors(
        metric='cosine',
        algorithm='brute',
        # kneighbors() cannot return more neighbours than there are movies.
        n_neighbors=min(n_neighbors, len(matrix_norm)),
        n_jobs=-1,
    )
    model_nn.fit(matrix_norm)

    # One batched query for all seeds: row i holds the neighbours of seed i.
    distances, indices = model_nn.kneighbors(matrix_norm.loc[seed_titles].to_numpy())

    # Column 0 is the first neighbour of each seed (the seed itself); drop it.
    titles = matrix_norm.index.to_numpy()
    result = pd.DataFrame(
        {
            'title': titles[indices[:, 1:].ravel()],
            'distance': distances[:, 1:].ravel(),
        },
        columns=columns,
    )

    # Keep the smallest distance per title, then only the unwatched titles.
    result = result.sort_values(by='distance').drop_duplicates(subset=['title'])
    reco = result.loc[result['title'].isin(unwatched_titles)]

    return reco.head(n_recommendations).to_html(index=False)



#### Test of the recommendation for a user

In [149]:
user = 12  # user id to generate recommendations for

recommendation_item(user, matrix_norm)

Unnamed: 0,title,distance
48,Clear and Present Danger (1994),0.575911
6,Monty Python and the Holy Grail (1975),0.582794
18,"Mask, The (1994)",0.658699
30,Broken Arrow (1996),0.662349
19,Pretty Woman (1990),0.675286
