In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
file_movie = '/Users/kiananasiri/Desktop/Machine Learning/Recommender System/movies.csv'
file_rating = '/Users/kiananasiri/Desktop/Machine Learning/Recommender System/ratings.csv'

In [3]:
dfm = pd.read_csv(file_movie)
dfr = pd.read_csv(file_rating)

In [4]:
dfm['year'] = dfm.title.str.extract ('(\(\d\d\d\d\))' , expand=False )
dfm['year'] = dfm.title.str.extract ('(\d\d\d\d)' , expand=False )
dfm['title'] = dfm.title.str.replace('(\(\d\d\d\d\))', '' , regex=True)
dfm['title'] = dfm['title'].apply(lambda x: x.strip())

In [5]:
dfm.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


Dropping genres because we wont use it at all, we would try to collabrate similar users or items.

In [6]:
dfm = dfm.drop('genres' , axis = 1)

also dropping timestmap from rating data frame.

In [7]:
dfr = dfr.drop ( 'timestamp' , axis = 1)

### Recommedation system

In [8]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


Adding movieId to user input : 
- filtering out movies using title to only select input user-watched movies
- then merging selected movieIds with movies using merge function 
- dropping year because we wont use it

In [9]:
inputId = dfm [ dfm['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies.drop('year', axis=1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


filtering out raters by who has watched at least one of the movies the user has watched and rated : 
if there was a common movieId in sb's rated list and input list ,the userId will come in teh subset below

In [10]:
userSubset = dfr [ dfr['movieId'].isin( inputMovies['movieId'].tolist() ) ]
userSubset

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
...,...,...,...
99510,609,296,4.0
99534,610,1,5.0
99552,610,296,5.0
99636,610,1274,5.0


making a df that would grouped by userId:

In [11]:
userSubsetGroup = userSubset.groupby(['userId'])

In [12]:
userSubsetGroup.get_group(609)

  userSubsetGroup.get_group(609)


Unnamed: 0,userId,movieId,rating
99497,609,1,3.0
99510,609,296,4.0


sorting also these groups by users who has most in common movies , so we could have richer recommendation system and a prioritized list.
here if the users have most in common movies , the user would appear near to top of the list so we wont need to go through every single user.

In [13]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
userSubsetGroup[:4]

[((91,),
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 ((177,),
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 ((219,),
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0),
 ((274,),
         userId  movieId  rating
  39229     274        1     4.0
  39230     274        2     3.5
  39288     274      296     5.0
  39448     274     1274     4.0
  39549     274     1968     4.0)]

we're going to find out how similar each user is to the input through the **Pearson Correlation Coefficient**. 

for this we wont use all users but only who has most in common movies watched so we consider fisrt 200 people

In [14]:
userSubsetGroup = userSubsetGroup[:200]

Now, we calculate the Pearson Correlation between input user and subset group, and store it in a dictionary, where the key is the user Id and the value is the coefficient.


In [39]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


Converting userIds to int number from tuple to help us merge two df based on userID

In [55]:
new_dic= {}
for i , val in pearsonCorrelationDict.items() :
    new_dic[int(i[0])] = val
pearsonCorrelationDict = new_dic

here we could clearly see how users are alike to input according to Pearson in a pandas dataframe:

In [56]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.438529,91
1,0.0,177
2,0.451243,219
3,0.716115,274
4,0.959271,298


now see most alike users to input: (first 60 alike users)

In [57]:
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:60]
topUsers.head()

Unnamed: 0,similarityIndex,userId
43,1.0,132
100,1.0,5
144,1.0,191
129,1.0,125
130,1.0,130


merging top silimiar users to input with their ratings to movies.

In [60]:
topUsersRating= topUsers.merge ( dfr , left_on= 'userId' , right_on ='userId' , how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,132,1,2.0
1,1.0,132,17,3.0
2,1.0,132,29,2.0
3,1.0,132,32,3.0
4,1.0,132,34,1.5


by multiplying similarity matrix and rates we add a new column named 'weightedRating'

In [67]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,132,1,2.0,2.0
1,1.0,132,17,3.0,3.0
2,1.0,132,29,2.0,2.0
3,1.0,132,32,3.0,3.0
4,1.0,132,34,1.5,1.5


sumsimilarity is sum of ratings of similar users to input grouped by movie and sum_wiehted is whited sum of similarty based on the movies.

In [68]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45.617814,160.113649
2,27.644391,81.400566
3,8.632175,28.868134
5,4.898383,13.796767
6,17.291698,66.854171


In [63]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.509893,1
2,2.94456,2
3,3.344248,3
5,2.816596,5
6,3.866258,6


sort the recommendation score 

In [64]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7579,5.0,7579
931,5.0,931
85,5.0,85
7767,5.0,7767
1150,5.0,1150
3090,5.0,3090
3951,5.0,3951
69529,5.0,69529
158027,5.0,158027
69275,5.0,69275


recommend to 10 movies by name which has been extracted by movieId

In [66]:
dfm.loc[dfm['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
76,85,Angels and Insects,1995
712,931,Spellbound,1945
869,1150,"Return of Martin Guerre, The (Retour de Martin...",1982
2333,3090,Matewan,1987
2947,3951,Two Family House,2000
4969,7579,Pride and Prejudice,1940
5013,7767,"Best of Youth, The (La meglio gioventù)",2003
7051,69275,Dead Snow (Død snø),2009
7065,69529,Home,2009
9284,158027,SORI: Voice from the Heart,2016
