# Movie Recommender - Collaborative Filtering (User-User)

- Dataset: https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%205/data/moviedataset.zip


- User input: [{'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}] 
            
- Use the 50 most similar users to the input

- Recommend the top10 movies to input user

In [1]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### Loading Data

In [6]:
movies_df = pd.read_csv(r'movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.shape)
movies_df.head()

(34208, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Preprocessing
- Movie df

In [7]:
# Isolate (year) from title
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)

#Removing () from year column
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)

#Removing  years from  'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')

#strip ending whitespaces if any
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [9]:
#Dropping irrelevant column
movies_df = movies_df.drop('genres', 1)

In [10]:
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


### Preprocessing
- Ratings df

In [12]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [13]:
#Dropping irrelevant column
ratings_df = ratings_df.drop('timestamp', 1)

In [14]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


### Collaborative Filtering
- Selecting user

In [8]:
# User input
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 

# To df
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,5.0,"Breakfast Club, The"
1,3.5,Toy Story
2,2.0,Jumanji
3,5.0,Pulp Fiction
4,4.5,Akira


In [15]:
# Movies Id from 'movies_df' to 'inputMovies'

  #Filtering  by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

  #Merging results (by title)
inputMovies = pd.merge(inputId, inputMovies)

  #Dropping irrelevant info
inputMovies = inputMovies.drop('year', 1)

inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


### Collaborative Filtering
- Other users ratings of selected input movies

In [16]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [28]:
# Group by userId
userSubsetGroup = userSubset.groupby(['userId'])

print(type(userSubsetGroup))
userSubsetGroup.nunique()

<class 'pandas.core.groupby.groupby.DataFrameGroupBy'>


Unnamed: 0_level_0,userId,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,1,1,1
12,1,1,1
13,1,2,2
14,1,1,1
15,1,3,2
17,1,4,3
18,1,1,1
20,1,1,1
23,1,1,1
26,1,1,1


In [32]:
# Testing - checking userId=1230
userSubsetGroup.get_group(1230)

Unnamed: 0,userId,movieId,rating
114338,1230,1,4.0
114353,1230,296,5.0


### Collaborative Filtering
- Sort by users with most watched movies that the input watched

In [35]:
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

# Check first 3
userSubsetGroup[0:3]

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

In [40]:
# Selecting Top100 users from subset group 
userSubsetGroup = userSubsetGroup[0:100]
userSubsetGroup

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0),
 (815,
         userId  movieId  rating
  73747     815        1     4.5
  73748     815        2     3.0
  73922     815      296     5.0
  74362     815     1274     3.0
  74678     815     1968     4.5),
 (1040,
         userId  movieId  rating
  96689    1040        1     3.0
  96690    1040        2     1.5
  96733    1040      296     3.5
  96859    1040     1274     3.0
  96922    1

### Collaborative Filtering
- Pearson Correlation

In [41]:
#Store Pearson Correlation in a dict,  key = user Id, value = coefficient
pearsonCorrelationDict = {}

# Pearson Correlation between each top100 users and input user
for name, group in userSubsetGroup:
    
    #Sorting input and current user by 'movieId'
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    
    #'n' for formula
    nRatings = len(group)
    
    # movieId from input user in users group
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    
    # ratings to list
    tempRatingList = temp_df['rating'].tolist()
    
    # current user reviews to list
    tempGroupList = group['rating'].tolist()
    
    # Calculate Pearson Correlation between users x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If  denominator is != 0, divide. Else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0


In [42]:
pearsonCorrelationDict.items()

dict_items([(75, 0.8272781516947562), (106, 0.5860090386731182), (686, 0.8320502943378437), (815, 0.5765566601970551), (1040, 0.9434563530497265), (1130, 0.2891574659831201), (1502, 0.8770580193070299), (1599, 0.4385290096535153), (1625, 0.716114874039432), (1950, 0.179028718509858), (2065, 0.4385290096535153), (2128, 0.5860090386731196), (2432, 0.1386750490563073), (2791, 0.8770580193070299), (2839, 0.8204126541423674), (2948, -0.11720180773462392), (3025, 0.45124262819713973), (3040, 0.89514359254929), (3186, 0.6784622064861935), (3271, 0.26989594817970664), (3429, 0.0), (3734, -0.15041420939904673), (4099, 0.05860090386731196), (4208, 0.29417420270727607), (4282, -0.4385290096535115), (4292, 0.6564386345361464), (4415, -0.11183835382312353), (4586, -0.9024852563942795), (4725, -0.08006407690254357), (4818, 0.4885967564883424), (5104, 0.7674257668936507), (5165, -0.4385290096535153), (5547, 0.17200522903844556), (6082, -0.04728779924109591), (6207, 0.9615384615384616), (6366, 0.65779

In [65]:
# Pearson Coef. dict to df
pearson_df = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearson_df

Unnamed: 0,0
75,0.827278
106,0.586009
686,0.832050
815,0.576557
1040,0.943456
1130,0.289157
1502,0.877058
1599,0.438529
1625,0.716115
1950,0.179029


In [66]:
# Organizing Pearson df
pearson_df.columns = ['similarityIndex'] 
pearson_df['userId'] = pearson_df.index 
pearson_df.reset_index(inplace=True)
del pearson_df['index']
print(pearson_df.shape)
pearson_df

(100, 2)


Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.832050,686
3,0.576557,815
4,0.943456,1040
5,0.289157,1130
6,0.877058,1502
7,0.438529,1599
8,0.716115,1625
9,0.179029,1950


### Top50 similar users to input
- Order by similarity score

In [68]:
topUsers = pearson_df.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers

Unnamed: 0,similarityIndex,userId
64,0.961678,12325
34,0.961538,6207
55,0.961538,10707
67,0.960769,13053
4,0.943456,1040
59,0.937614,11769
62,0.929294,12120
80,0.903584,15157
70,0.895144,13366
17,0.895144,3040


In [70]:
# Get top50 users most watched movies and ratings to df, from ratings_df
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')

topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


### Recommending
- Weighted Ratings

In [71]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196


In [73]:
# Temporary df - Group by movieId, sum similarity and weighted ratings
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,38.376281,140.800834
2,38.376281,96.656745
3,10.253981,27.254477
4,0.929294,2.787882
5,11.723262,27.151751


### Recommending
- Weighted average recommendation score

In [75]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']

# movieId as index
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.shape)
recommendation_df.head()

(8134, 2)


Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.668955,1
2,2.518658,2
3,2.657941,3
4,3.0,4
5,2.316058,5


# Recommendation - Result
- Top 10 movies recommended by the algorithm to user input

In [76]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
5073,5.0,5073
3329,5.0,3329
2284,5.0,2284
26801,5.0,26801
6776,5.0,6776
6672,5.0,6672
3759,5.0,3759
3769,5.0,3769
3775,5.0,3775
90531,5.0,90531


In [77]:
# Localizing movies names through 'movieId'
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
2200,2284,Bandit Queen,1994
3243,3329,"Year My Voice Broke, The",1987
3669,3759,Fun and Fancy Free,1947
3679,3769,Thunderbolt and Lightfoot,1974
3685,3775,Make Mine Music,1946
4978,5073,"Son's Room, The (Stanza del figlio, La)",2001
6563,6672,War Photographer,2001
6667,6776,Lagaan: Once Upon a Time in India,2001
9064,26801,Dragon Inn (Sun lung moon hak chan),1992
18106,90531,Shame,2011
