# Collaborative based Recommendation System

## Importing Libraries

In [7]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Processing

In [8]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv('movies.csv')
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv('ratings.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

In [10]:
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [11]:
#Dropping the genres column
movies_df = movies_df.drop('genres', 1)
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [12]:
ratings_df = ratings_df.drop('timestamp', 1)

In [13]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


<h2> Recommender System </h2>
The process for creating a User Based recommendation system is as follows:
<br>- Select a user with the movies the user has watched </br>
<br>- Based on his rating to movies, find the top X neighbours 
<br>- Get the watched movie record of the user for each neighbour.
<br>- Calculate a similarity score using some formula
<br>- Recommend the items with the highest score


In [14]:
userInput = [
            {'title':'Mission: Impossible - Rogue Nation','rating':4.5},
            {'title':'Maze Runner: Scorch Trials', 'rating':4.5},
            {'title':'Mad Max: Fury Road', 'rating':4.0},
            {'title':'The Lightkeepers', 'rating':2},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,4.5,Mission: Impossible - Rogue Nation
1,4.5,Maze Runner: Scorch Trials
2,4.0,Mad Max: Fury Road
3,2.0,The Lightkeepers
4,4.5,Akira


In [15]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,1274,Akira,4.5
1,111781,Mission: Impossible - Rogue Nation,4.5
2,117895,Maze Runner: Scorch Trials,4.5
3,122882,Mad Max: Fury Road,4.0
4,124839,The Lightkeepers,2.0


In [16]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
216,5,122882,4.5
531,13,1274,5.0
2749,22,122882,3.5
4401,39,1274,5.0
5114,46,111781,5.0


In [17]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [18]:
userSubsetGroup.get_group(1130)

Unnamed: 0,userId,movieId,rating
104363,1130,1274,4.5
105156,1130,122882,1.0


In [19]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [20]:
userSubsetGroup[0:3]

[(9101,         userId  movieId  rating
  842540    9101     1274     3.5
  843390    9101   111781     3.0
  843415    9101   117895     3.5
  843426    9101   122882     3.5), (10143,         userId  movieId  rating
  939097   10143     1274     5.0
  939432   10143   111781     4.0
  939450   10143   117895     3.0
  939458   10143   122882     3.0), (24567,          userId  movieId  rating
  2280336   24567     1274     4.5
  2281427   24567   111781     4.5
  2281482   24567   117895     2.5
  2281489   24567   122882     5.0)]

In [21]:
userSubsetGroup = userSubsetGroup[0:100]

In [22]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [23]:
pearsonCorrelationDict.items()

dict_items([(9101, -0.3333333333333333), (10143, 0.5222329678670935), (24567, -0.5261522196019802), (29300, -0.47140452079103173), (39142, -0.6531972647421809), (45232, -0.7333333333333333), (53735, -0.3333333333333333), (59853, -0.7302967433402214), (91951, 0.0), (150198, -0.676481425202546), (165943, -0.40422604172722165), (169491, 0.5922200922639821), (180362, 0.3333333333333333), (182192, -0.5773502691896257), (193091, -0.5555555555555556), (195637, -0.5773502691896258), (203329, -0.2581988897471611), (221046, 0.5222329678670935), (236028, -0.6622661785325219), (244116, -0.5773502691896258), (815, -0.8660254037844448), (2569, -0.8660254037844448), (2787, -0.8660254037844448), (2824, 0.49999999999999734), (3024, 1.0000000000000107), (3266, -0.9449111825230704), (3388, -0.5000000000000036), (3734, 0.6933752452815377), (4208, -0.9999999999999893), (4938, 0.0), (5040, -0.49999999999999734), (6190, -0.39735970711951596), (6439, -0.8660254037844448), (8824, -0.7857142857142891), (9003, 1

In [24]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.333333,9101
1,0.522233,10143
2,-0.526152,24567
3,-0.471405,29300
4,-0.653197,39142


In [25]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
34,1.0,9003
24,1.0,3024
46,1.0,24431
43,1.0,20156
63,0.970725,40145


In [26]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,9003,1,5.0
1,1.0,9003,10,2.0
2,1.0,9003,17,3.0
3,1.0,9003,25,1.0
4,1.0,9003,32,5.0


In [27]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,9003,1,5.0,5.0
1,1.0,9003,10,2.0,2.0
2,1.0,9003,17,3.0,3.0
3,1.0,9003,25,1.0,1.0
4,1.0,9003,32,5.0,5.0


In [28]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.841575,21.652824
2,2.671414,7.518637
3,0.333333,1.666667
4,0.193375,0.580126
5,0.09222,-0.482226


In [29]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.706676,1
2,2.814478,2
3,5.0,3
4,3.0,4
5,-5.229083,5


In [30]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
92192,inf,92192
104419,inf,104419
58404,inf,58404
104272,inf,104272
5051,inf,5051
88950,inf,88950
5598,inf,5598
104144,inf,104144
95963,inf,95963
2380,inf,2380


## Results

In [31]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
2296,2380,Police Academy 3: Back in Training,1986
4956,5051,Italian for Beginners (Italiensk for begyndere),2000
5500,5598,Surf Ninjas,1993
12520,58404,Justice League: The New Frontier,2008
17727,88950,"Conspirator, The",2010
18550,92192,Apollo 18,2011
19353,95963,Dragon Ball Z: Wrath of the Dragon (Doragon bô...,1995
21545,104144,Green Lantern: Emerald Knights,2011
21569,104272,Blackfish,2013
21614,104419,Justice League: Crisis on Two Earths,2010
