In [1]:
import pandas as pd
from math import sqrt

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [3]:
userInput = [{'title':'Toy Story (1995)', 'rating':2},
             {'title':'Jumanji (1995)', 'rating':1},
             {'title':'Grumpier Old Men (1995)', 'rating':3},
             {'title':'Waiting to Exhale (1995)', 'rating':5},
             {'title':'Pie in the Sky (1996)', 'rating':4.5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                      title  rating
0          Toy Story (1995)     2.0
1            Jumanji (1995)     1.0
2   Grumpier Old Men (1995)     3.0
3  Waiting to Exhale (1995)     5.0
4     Pie in the Sky (1996)     4.5


In [5]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                     title  rating
0        1          Toy Story (1995)     2.0
1        2            Jumanji (1995)     1.0
2        3   Grumpier Old Men (1995)     3.0
3        4  Waiting to Exhale (1995)     5.0
4      129     Pie in the Sky (1996)     4.5


In [6]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
2           110     110        110
3            52      52         52
4             7       7          7
129           1       1          1


In [7]:
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(6,      userId  movieId  rating  timestamp
560       6        2     4.0  845553522
561       6        3     5.0  845554296
562       6        4     3.0  845554349), (19,       userId  movieId  rating  timestamp
2274      19        1     4.0  965705637
2275      19        2     3.0  965704331
2276      19        3     3.0  965707636), (68,        userId  movieId  rating   timestamp
10360      68        1     2.5  1158531426
10361      68        2     2.5  1158532776
10362      68        3     2.0  1158533415), (91,        userId  movieId  rating   timestamp
14121      91        1     4.0  1112713037
14122      91        2     3.0  1112713392
14123      91        3     3.0  1112712323), (169,        userId  movieId  rating   timestamp
24321     169        1     4.5  1059427918
24322     169        2     4.0  1078284713
24323     169        3     5.0  1078284750)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [8]:
pearsonCorrelationDict = {}

for name, group in userSubsetGroup:

    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    nRatings = len(group)

    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    tempRatingList = temp_df['rating'].tolist()
   
    tempGroupList = group['rating'].tolist()
   
    
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [9]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0        -0.500000       6
1         0.000000      19
2        -0.866025      68
3         0.000000      91
4         1.000000     169


In [10]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
70              1.0     434
86              1.0     570
56              1.0     304
55              1.0     298
78              1.0     501


In [11]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     434        1     4.0  1270604402
1               1.0     434        2     2.5  1271039378
2               1.0     434        6     4.0  1270603905
3               1.0     434       10     3.5  1270606683
4               1.0     434       32     3.5  1270604323
..              ...     ...      ...     ...         ...
95              1.0     434     2011     3.5  1270606698
96              1.0     434     2012     3.5  1270606822
97              1.0     434     2028     5.0  1270606647
98              1.0     434     2054     3.5  1271039349
99              1.0     434     2115     4.5  1270606806

[100 rows x 5 columns]


In [12]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     434        1     4.0  1270604402             4.0
1              1.0     434        2     2.5  1271039378             2.5
2              1.0     434        6     4.0  1270603905             4.0
3              1.0     434       10     3.5  1270606683             3.5
4              1.0     434       32     3.5  1270604323             3.5


In [13]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  29.066000          119.208951
2                  29.066000           88.057021
3                   7.066000           29.580293
5                   5.949281           16.398562
6                  11.193352           43.243720


In [14]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     4.101319        1
2                                     3.029554        2
3                                     4.186286        3
5                                     2.756394        5
6                                     3.863340        6
7                                     3.255710        7
8                                     3.000000        8
9                                     3.000000        9
10                                    3.437303       10
11                                    3.735441       11


In [15]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
89904                                      5.0    89904
5416                                       5.0     5416
5034                                       5.0     5034
5666                                       5.0     5666
38304                                      5.0    38304
...                                        ...      ...
184471                                     NaN   184471
185029                                     NaN   185029
185435                                     NaN   185435
187593                                     NaN   187593
188301                                     NaN   188301

[5790 rows x 2 columns]


In [16]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]
print(recommended_movie)

      movieId                               title  \
4           5  Father of the Bride Part II (1995)   
5           6                         Heat (1995)   
6           7                      Sabrina (1995)   
7           8                 Tom and Huck (1995)   
8           9                 Sudden Death (1995)   
...       ...                                 ...   
9702   185135   Sherlock - A Study in Pink (2010)   
9703   185435              Game Over, Man! (2018)   
9709   187593                   Deadpool 2 (2018)   
9710   187595      Solo: A Star Wars Story (2018)   
9713   188301         Ant-Man and the Wasp (2018)   

                                      genres  
4                                     Comedy  
5                      Action|Crime|Thriller  
6                             Comedy|Romance  
7                         Adventure|Children  
8                                     Action  
...                                      ...  
9702                              