In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [3]:
userInput = [{'title':'Toy Story (1995)', 'rating':4},
             {'title':'Mortal Kombat (1995)', 'rating':1},
             {'title':'Postman, The (Postino, Il) (1994)', 'rating':3},
             {'title':'Broken Arrow (1996)', 'rating':4},
             {'title':'Lawnmower Man 2: Beyond Cyberspace (1996)', 'rating':3}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                       title  rating
0                           Toy Story (1995)       4
1                       Mortal Kombat (1995)       1
2          Postman, The (Postino, Il) (1994)       3
3                        Broken Arrow (1996)       4
4  Lawnmower Man 2: Beyond Cyberspace (1996)       3


In [4]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                                      title  rating
0        1                           Toy Story (1995)       4
1       44                       Mortal Kombat (1995)       1
2       58          Postman, The (Postino, Il) (1994)       3
3       66  Lawnmower Man 2: Beyond Cyberspace (1996)       3
4       95                        Broken Arrow (1996)       4


In [5]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
44           46      46         46
58           37      37         37
66            9       9          9
95           84      84         84


In [6]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])


In [7]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(559,        userId  movieId  rating  timestamp
85984     559        1     5.0  865095758
85995     559       58     4.0  865095758
85996     559       66     3.0  865096234
85998     559       95     4.0  865095801), (32,       userId  movieId  rating  timestamp
4929      32        1     3.0  856736119
4942      32       58     5.0  856736227
4946      32       95     3.0  856736119), (44,       userId  movieId  rating  timestamp
6429      44        1     3.0  869251860
6436      44       66     3.0  869252563
6438      44       95     4.0  869251861), (68,        userId  movieId  rating   timestamp
10360      68        1     2.5  1158531426
10377      68       44     3.0  1158534993
10384      68       95     3.5  1158532180), (91,        userId  movieId  rating   timestamp
14121      91        1     4.0  1112713037
14138      91       58     2.0  1112710933
14141      91       95     3.0  1112711260)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [8]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()

    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()


    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.707107     559
1        -1.000000      32
2         0.500000      44
3         0.000000      68
4         0.866025      91


In [9]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
30              1.0      17
79              1.0     514
27              1.0       7
26              1.0       6
60              1.0     304


In [10]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0      17        1     4.5  1305696483
1               1.0      17       44     3.5  1305696245
2               1.0      17       47     4.0  1307262715
3               1.0      17       50     4.5  1305697013
4               1.0      17      110     4.5  1305696470
..              ...     ...      ...     ...         ...
95              1.0      17    60069     3.5  1305696839
96              1.0      17    64839     3.5  1322629112
97              1.0      17    68157     4.0  1305696894
98              1.0      17    68358     4.5  1322629267
99              1.0      17    77455     3.5  1307262449

[100 rows x 5 columns]


In [11]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0      17        1     4.5  1305696483             4.5
1              1.0      17       44     3.5  1305696245             3.5
2              1.0      17       47     4.0  1307262715             4.0
3              1.0      17       50     4.5  1305697013             4.5
4              1.0      17      110     4.5  1305696470             4.5


In [12]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  26.921660          108.699596
2                  16.720820           58.227726
3                   9.202776           29.405552
4                   1.000000            3.000000
5                   5.944911           18.334734


In [13]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     4.037626        1
2                                     3.482349        2
3                                     3.195291        3
4                                     3.000000        4
5                                     3.084106        5
6                                     4.103739        6
7                                     3.181818        7
8                                     3.000000        8
9                                     4.071797        9
10                                    3.576044       10


In [14]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
334                                        5.0      334
3925                                       5.0     3925
3792                                       5.0     3792
3813                                       5.0     3813
8235                                       5.0     8235
...                                        ...      ...
171023                                     NaN   171023
176101                                     NaN   176101
180095                                     NaN   180095
180777                                     NaN   180777
185135                                     NaN   185135

[5866 rows x 2 columns]


In [15]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                  title  \
1           2                         Jumanji (1995)   
2           3                Grumpier Old Men (1995)   
3           4               Waiting to Exhale (1995)   
4           5     Father of the Bride Part II (1995)   
5           6                            Heat (1995)   
...       ...                                    ...   
9706   186587                         Rampage (2018)   
9707   187031  Jurassic World: Fallen Kingdom (2018)   
9709   187593                      Deadpool 2 (2018)   
9710   187595         Solo: A Star Wars Story (2018)   
9716   188797                             Tag (2018)   

                                      genres  
1                 Adventure|Children|Fantasy  
2                             Comedy|Romance  
3                       Comedy|Drama|Romance  
4                                     Comedy  
5                      Action|Crime|Thriller  
...                                      ... 