<a href="https://colab.research.google.com/github/jchen8000/MachineLearning/blob/master/6%20Recommender%20System/Recommendation_Collaborative_Filtering_Memory_Based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommendation System with Collaborative Filtering

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
from pathlib import Path
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.metrics.pairwise import cosine_similarity

## MovieLens Dataset

In [None]:
# Download the MovieLens dataset
movielens = 'ml-latest-small'
url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
movielens_file = keras.utils.get_file(
    fname = movielens + '.zip', 
    origin = url, 
    cache_dir='./'
)
datasets_path = Path(movielens_file).parents[0]
movielens_dir = datasets_path / movielens
# Extract the data file.
if not movielens_dir.exists():
    with ZipFile(movielens_file, "r") as zip:
        zip.extractall(path=datasets_path)
# Load movies and ratings data
ratings = pd.read_csv(movielens_dir/"ratings.csv")
movies = pd.read_csv(movielens_dir/"movies.csv")

Downloading data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip


In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [None]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [None]:
merged = ratings.merge(movies, how="left", on="movieId")
merged

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [None]:
n_reviews = merged.groupby(["movieId", "title", "genres"]) \
                  .agg({"rating": "mean", "userId": "count"}) \
                  .rename(columns={"userId": "n_reviews"}) \
                  .sort_values("rating", ascending=False)
n_reviews                  

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rating,n_reviews
movieId,title,genres,Unnamed: 3_level_1,Unnamed: 4_level_1
88448,Paper Birds (Pájaros de papel) (2010),Comedy|Drama,5.0,1
100556,"Act of Killing, The (2012)",Documentary,5.0,1
143031,Jump In! (2007),Comedy|Drama|Romance,5.0,1
143511,Human (2015),Documentary,5.0,1
143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,5.0,1
...,...,...,...,...
157172,Wizards of the Lost Kingdom II (1989),Action|Fantasy,0.5,1
85334,Hard Ticket to Hawaii (1987),Action|Comedy,0.5,1
53453,Starcrash (a.k.a. Star Crash) (1978),Action|Adventure|Fantasy|Sci-Fi,0.5,1
8494,"Cincinnati Kid, The (1965)",Drama,0.5,1


In [None]:
top_review = merged.groupby(["movieId", "title", "genres"]) \
                   .agg({"rating": "mean", "userId": "count"}) \
                   .rename(columns={"userId": "n_reviews"})
top_review = top_review[top_review.n_reviews > 10]
top_review.sort_values("rating", ascending=False)[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rating,n_reviews
movieId,title,genres,Unnamed: 3_level_1,Unnamed: 4_level_1
1041,Secrets & Lies (1996),Drama,4.590909,11
3451,Guess Who's Coming to Dinner (1967),Drama,4.545455,11
1178,Paths of Glory (1957),Drama|War,4.541667,12
1104,"Streetcar Named Desire, A (1951)",Drama,4.475,20
2360,"Celebration, The (Festen) (1998)",Drama,4.458333,12
1217,Ran (1985),Drama|War,4.433333,15
318,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
951,His Girl Friday (1940),Comedy|Romance,4.392857,14
922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance,4.333333,27
3468,"Hustler, The (1961)",Drama,4.333333,18


In [None]:
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
n_users, n_movies

(610, 9724)

## Collaborative Filtering

Transform ratings data with the following conditions:

1. a movieID is valid at least 30 users have rated a movie.
2. a userID is valid at least 20 ratings by the user.

In [None]:
ratings = ratings[ratings.groupby("movieId")["movieId"].transform("size") > 30]
ratings = ratings[ratings.groupby("userId")["userId"].transform("size") > 20]
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100742,610,122904,3.0,1493845981
100760,610,134130,3.5,1479543002
100763,610,134853,3.5,1493845106
100780,610,139385,4.5,1493846777


In [None]:
user_movie = merged.pivot_table(index="userId", columns="movieId", values="rating")
user_movie = user_movie.subtract(user_movie.mean(axis=1), axis = 0)
user_movie = user_movie.fillna(0) 
user_movie


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,0.000000,-0.366379,0.0,0.0,-0.366379,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.363636,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,0.000000,0.000000,0.0,0.0,0.000000,-1.157399,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.213904,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,-0.634176,-1.134176,-1.134176,0.0,0.0,0.000000,0.000000,0.0,0.0,0.865824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,-0.270270,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.729730,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Identify similar users with Cosine Similarity

In [None]:
user_similarity = cosine_similarity(user_movie)
user_similarity_df = pd.DataFrame(user_similarity, columns=user_movie.index.values,
                                  index=user_movie.index.values)
user_similarity_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,1.000000,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.006200,0.047013,0.019510,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.000000,0.000000,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.000000,0.003012,...,-0.050551,-0.031581,-0.001688,0.000000,0.000000,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.000000,1.000000,-0.011260,-0.031539,0.004800,0.000000,-0.032471,0.000000,0.000000,...,-0.004904,-0.016117,0.017749,0.000000,-0.001431,-0.037289,-0.007789,-0.013001,0.000000,0.019550
4,0.048419,-0.017164,-0.011260,1.000000,-0.029620,0.013956,0.058091,0.002065,-0.005874,0.051590,...,-0.037687,0.063122,0.027640,-0.013782,0.040037,0.020590,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.029620,1.000000,0.009111,0.010117,-0.012284,0.000000,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.012016,0.006226,-0.037289,0.020590,0.026319,-0.009137,0.028326,0.022277,0.031633,-0.039946,...,0.053683,0.016384,0.098011,0.061078,0.019678,1.000000,0.017927,0.056676,0.038422,0.075464
607,0.055261,-0.020504,-0.007789,0.014628,0.031896,0.045501,0.030981,0.048822,-0.012161,-0.017656,...,0.049059,0.038197,0.049317,0.002355,-0.029381,0.017927,1.000000,0.044514,0.019049,0.021860
608,0.075224,-0.006001,-0.013001,-0.037569,-0.001751,0.021727,0.028414,0.071759,0.032783,-0.052000,...,0.069198,0.051388,0.012801,0.006319,-0.007978,0.056676,0.044514,1.000000,0.050714,0.054454
609,-0.025713,-0.060091,0.000000,-0.017884,0.093829,0.053017,0.008754,0.077180,0.000000,-0.040090,...,0.043465,0.062400,0.015334,0.094038,-0.054722,0.038422,0.019049,0.050714,1.000000,-0.012471


In [None]:
RECOMM_USER = 6
RECOMM_COUNT = 10

# Remove picked user ID from the candidate list
user_similarity_df.drop(index=RECOMM_USER, inplace=True)
user_similarity_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,1.000000,0.001265,0.000553,0.048419,0.021847,-0.045497,-0.006200,0.047013,0.019510,-0.008754,...,0.018127,-0.017172,-0.015221,-0.037059,-0.029121,0.012016,0.055261,0.075224,-0.025713,0.010932
2,0.001265,1.000000,0.000000,-0.017164,0.021796,-0.021051,-0.011114,-0.048085,0.000000,0.003012,...,-0.050551,-0.031581,-0.001688,0.000000,0.000000,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,0.000000,1.000000,-0.011260,-0.031539,0.004800,0.000000,-0.032471,0.000000,0.000000,...,-0.004904,-0.016117,0.017749,0.000000,-0.001431,-0.037289,-0.007789,-0.013001,0.000000,0.019550
4,0.048419,-0.017164,-0.011260,1.000000,-0.029620,0.013956,0.058091,0.002065,-0.005874,0.051590,...,-0.037687,0.063122,0.027640,-0.013782,0.040037,0.020590,0.014628,-0.037569,-0.017884,-0.000995
5,0.021847,0.021796,-0.031539,-0.029620,1.000000,0.009111,0.010117,-0.012284,0.000000,-0.033165,...,0.015964,0.012427,0.027076,0.012461,-0.036272,0.026319,0.031896,-0.001751,0.093829,-0.000278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.012016,0.006226,-0.037289,0.020590,0.026319,-0.009137,0.028326,0.022277,0.031633,-0.039946,...,0.053683,0.016384,0.098011,0.061078,0.019678,1.000000,0.017927,0.056676,0.038422,0.075464
607,0.055261,-0.020504,-0.007789,0.014628,0.031896,0.045501,0.030981,0.048822,-0.012161,-0.017656,...,0.049059,0.038197,0.049317,0.002355,-0.029381,0.017927,1.000000,0.044514,0.019049,0.021860
608,0.075224,-0.006001,-0.013001,-0.037569,-0.001751,0.021727,0.028414,0.071759,0.032783,-0.052000,...,0.069198,0.051388,0.012801,0.006319,-0.007978,0.056676,0.044514,1.000000,0.050714,0.054454
609,-0.025713,-0.060091,0.000000,-0.017884,0.093829,0.053017,0.008754,0.077180,0.000000,-0.040090,...,0.043465,0.062400,0.015334,0.094038,-0.054722,0.038422,0.019049,0.050714,1.000000,-0.012471


In [None]:
similar_users=user_similarity_df[RECOMM_USER] \
                    .sort_values(ascending=False) \
                    .reset_index() \
                    .head(5)
similar_users

Unnamed: 0,index,6
0,181,0.263749
1,126,0.223044
2,584,0.218799
3,411,0.195856
4,179,0.192669


In [None]:
user_watched = ratings[ratings['userId'] == RECOMM_USER][['userId', 'movieId', 'rating']] \
                      .sort_values(by='rating', ascending=False) \
                      .merge(movies[['movieId', 'title']], on='movieId', how='inner')
user_watched.to_csv("user_watched.csv")
user_watched                      

Unnamed: 0,userId,movieId,rating,title
0,6,457,5.0,"Fugitive, The (1993)"
1,6,590,5.0,Dances with Wolves (1990)
2,6,160,5.0,Congo (1995)
3,6,168,5.0,First Knight (1995)
4,6,377,5.0,Speed (1994)
...,...,...,...,...
132,6,21,2.0,Get Shorty (1995)
133,6,466,2.0,Hot Shots! Part Deux (1993)
134,6,762,1.0,Striptease (1996)
135,6,327,1.0,Tank Girl (1995)


In [None]:
similar_users_watched = ratings[ratings['userId'].isin(similar_users.index)][['userId', 'movieId', 'rating']] \
                            .groupby(['movieId'])['rating'].mean() \
                            .reset_index() \
                            .sort_values(by='rating', ascending=False)
similar_users_watched

Unnamed: 0,movieId,rating
228,5060,5.0
160,2502,5.0
81,1197,5.0
80,1196,5.0
162,2542,5.0
...,...,...
197,3175,1.0
225,4308,1.0
226,4641,1.0
124,1704,1.0


In [None]:
recommendations = similar_users_watched[similar_users_watched['movieId'].isin(user_watched['movieId']) == False] \
                        .merge(movies[['movieId', 'title']], on='movieId', how='inner') \
                        .head(RECOMM_COUNT)
recommendations

Unnamed: 0,movieId,rating,title
0,5060,5.0,M*A*S*H (a.k.a. MASH) (1970)
1,2502,5.0,Office Space (1999)
2,1197,5.0,"Princess Bride, The (1987)"
3,1196,5.0,Star Wars: Episode V - The Empire Strikes Back...
4,2542,5.0,"Lock, Stock & Two Smoking Barrels (1998)"
5,1136,5.0,Monty Python and the Holy Grail (1975)
6,2580,5.0,Go (1999)
7,1097,5.0,E.T. the Extra-Terrestrial (1982)
8,2599,5.0,Election (1999)
9,1092,5.0,Basic Instinct (1992)
