In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [53]:
movies_df = pd.read_csv('../pythonproject/ml-latest-small/movies.csv')
rating_df = pd.read_csv('../pythonproject/ml-latest-small/ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [54]:
userInput = [{'title':'Breakfast Club, The (1985)', 'rating':5},
             {'title':'Toy Story (1995)', 'rating':1},
             {'title':'Jumanji (1995)', 'rating':1},
             {'title':'Pulp Fiction (1994)', 'rating':5},
             {'title':'Akira (1988)', 'rating':4.5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                        title  rating
0  Breakfast Club, The (1985)     5.0
1            Toy Story (1995)     1.0
2              Jumanji (1995)     1.0
3         Pulp Fiction (1994)     5.0
4                Akira (1988)     4.5


In [55]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                       title  rating
0        1            Toy Story (1995)     1.0
1        2              Jumanji (1995)     1.0
2      296         Pulp Fiction (1994)     5.0
3     1274                Akira (1988)     4.5
4     1968  Breakfast Club, The (1985)     5.0


In [59]:
userSubset = rating_df[rating_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())
print(userSubset['userId'].unique)

         userId  rating  timestamp
movieId                           
1           215     215        215
2           110     110        110
296         307     307        307
1274         39      39         39
1968        113     113        113
<bound method Series.unique of 0          1
16         1
320        4
422        4
516        5
        ... 
99510    609
99534    610
99552    610
99636    610
99664    610
Name: userId, Length: 784, dtype: int64>


In [58]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((91,),        userId  movieId  rating   timestamp
14121      91        1     4.0  1112713037
14122      91        2     3.0  1112713392
14173      91      296     4.5  1112711264
14316      91     1274     5.0  1112713057
14383      91     1968     3.0  1112713409), ((177,),        userId  movieId  rating   timestamp
24900     177        1     5.0  1435533535
24901     177        2     3.5  1435534109
24930     177      296     5.0  1435530409
25069     177     1274     2.0  1435535036
25129     177     1968     3.5  1435534080), ((219,),        userId  movieId  rating   timestamp
31524     219        1     3.5  1194681084
31525     219        2     2.5  1194740185
31554     219      296     4.0  1198522553
31628     219     1274     2.5  1194686351
31680     219     1968     3.0  1194931899), ((274,),        userId  movieId  rating   timestamp
39229     274        1     4.0  1171410158
39230     274        2     3.5  1171934785
39288     274      296     5.0  1171493995
39448     27

In [46]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [63]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF['userId'] = pearsonDF['userId'].apply(lambda x: int(str(x)[1:-2]))
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.351124      91
1        -0.254967     177
2         0.199967     219
3         0.616658     274
4         0.916619     298


In [64]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
49              1.0     160
43              1.0     132
70              1.0     373
82              1.0     489
63              1.0     305


In [65]:
topUsersRating=topUsers.merge(rating_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating  timestamp
0               1.0     160        1     4.0  971115026
1               1.0     160        2     4.0  971619578
2               1.0     160        6     2.0  971115114
3               1.0     160       10     1.0  971196422
4               1.0     160       22     1.0  971437089
..              ...     ...      ...     ...        ...
95              1.0     160      880     3.0  971114725
96              1.0     160      903     4.0  971195858
97              1.0     160      904     4.0  971195858
98              1.0     160      908     3.0  971112529
99              1.0     160      924     5.0  971113925

[100 rows x 5 columns]


In [66]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0     160        1     4.0  971115026             4.0
1              1.0     160        2     4.0  971619578             4.0
2              1.0     160        6     2.0  971115114             2.0
3              1.0     160       10     1.0  971196422             1.0
4              1.0     160       22     1.0  971437089             1.0


In [67]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  33.529084          114.442383
2                  27.842790           87.478362
3                   8.012997           23.082393
4                   0.549730            0.824594
5                   6.961129           19.941031


In [68]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.413227        1
2                                     3.141868        2
3                                     2.880619        3
4                                     1.500000        4
5                                     2.864626        5
6                                     3.879256        6
7                                     3.181050        7
8                                     3.000000        8
9                                     2.075569        9
10                                    3.210736       10


In [69]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
5537                                       5.0     5537
7579                                       5.0     7579
986                                        5.0      986
456                                        5.0      456
3678                                       5.0     3678
...                                        ...      ...
5323                                       0.5     5323
122627                                     0.5   122627
61729                                      0.5    61729
156609                                     0.5   156609
78039                                      0.5    78039

[6137 rows x 2 columns]


In [70]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                               title  \
2           3             Grumpier Old Men (1995)   
3           4            Waiting to Exhale (1995)   
4           5  Father of the Bride Part II (1995)   
5           6                         Heat (1995)   
6           7                      Sabrina (1995)   
...       ...                                 ...   
9709   187593                   Deadpool 2 (2018)   
9710   187595      Solo: A Star Wars Story (2018)   
9711   187717    Won't You Be My Neighbor? (2018)   
9712   188189          Sorry to Bother You (2018)   
9723   189713               BlacKkKlansman (2018)   

                                genres  
2                       Comedy|Romance  
3                 Comedy|Drama|Romance  
4                               Comedy  
5                Action|Crime|Thriller  
6                       Comedy|Romance  
...                                ...  
9709              Action|Comedy|Sci-Fi  
9710  Action|Adventure|Children|Sci