In [529]:
# Importing libraries

import pandas as pd
import numpy as np
import operator

from sklearn.model_selection import train_test_split

#### Importing datasets

In [530]:
DF1 = pd.read_csv("movies.csv")

In [531]:
DF1.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [532]:
DF2 = pd.read_csv("ratings.csv")

In [533]:
DF2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [534]:
DF = DF2.merge(DF1, how = "left", on = "movieId")

# Collaborative Filtering (KNN with means, User-based and Item Based)

In [535]:
UCF = DF.drop(["genres","timestamp"],axis=1)

#### Taking a subset of 100,000

In [536]:
UCF = UCF[:100000]

#### Train-test split

In [537]:
train_UCF,test_UCF = train_test_split(UCF,test_size=0.2, random_state = 1)

#### Fitting Collaborative Filtering Model using Surprise library

In [538]:
from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import KNNWithMeans

In [539]:
reader = Reader(rating_scale=(1, 5))

In [540]:
data = Dataset.load_from_df(train_UCF[['userId','movieId','rating']], reader)

#### Grid Search for best parameters

In [541]:
#Defining the parameter grid with k as the neighbourhood size & trying 2 similarity measures KNNwithMeans & 5 folds
param_grid = {"k":list(range(1,100,5)),
              "sim_options":{"name":["cosine","pearson"],
              'user_based': [True,False]}}

#KNNWithMeans by default does user based collaborative filtering and here we are trying to find the best set of parameters
gs = GridSearchCV(KNNWithMeans, 
                  param_grid, 
                  measures=['rmse'], 
                  cv=5, 
                  n_jobs = -1)

#We fit the grid search on data to find out the best score
gs.fit(data)

#Printing the best score
print(gs.best_score['rmse'])

#Printing the best set of parameters
print(gs.best_params['rmse'])

0.912826301222468
{'k': 96, 'sim_options': {'name': 'cosine', 'user_based': True}}


#### Fitting the model on the complete train set

In [550]:
#Defining similarity measure as per the best parameters
sim_options = {'name': 'pearson'}

#Fitting the model on train data
model = KNNWithMeans(k = 96, sim_options = sim_options)

#Build full trainset will essentially fit the knnwithmeans on the complete train set instead of a part of it like we do in cross validation
model.fit(data.build_full_trainset())

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2073dc50f70>

#### RMSE predictions for test set

In [551]:
#id pairs for test set
id_pairs = zip(test_UCF['userId'], test_UCF['movieId'])


In [552]:
#Making predictions for test set using predict method from Surprise

# [model.predict(uid = user, iid = movie) for (user, movie) in id_pairs]

In [553]:
id_pairs = zip(test_UCF['userId'], test_UCF['movieId'])

#Making predictions for test set using predict method from Surprise
y_pred = [model.predict(uid = user, iid = movie)[3] for (user, movie) in id_pairs]

#Actual rating values for test set
y_true = test_UCF['rating']

### RMSE score for CF model

In [554]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_true, y_pred, squared=False)
rmse

0.9094960898640264

### Top k recommended items for user i

In [555]:
id_pairs = zip(test_UCF['userId'], test_UCF['movieId'])

rec_data = pd.DataFrame([model.predict(uid = user, iid = movie) for (user, movie) in id_pairs])

rec_data = rec_data.drop(["details"],axis=1)

rec_data = rec_data.rename({'iid': 'movieId'}, axis=1)

rec_data = rec_data.merge(DF1, how = "left", on = "movieId")

rec_data = rec_data.drop(["r_ui","movieId","genres"],axis=1)

rec_data = rec_data.rename({'est': 'pred_rating'}, axis=1)


In [556]:
# Function that gives k recommended movies for any user i

# Function takes userID and the number of movies to be recommended to the user as the input

def mov_rec(userID,number_of_movies):
    temp_rec_data = rec_data.loc[rec_data['uid'] == userID]
    temp_rec_data = temp_rec_data.sort_values(by='pred_rating',ascending=False)
    print(temp_rec_data[:number_of_movies]["title"])

In [557]:
# Checking top 5 recommended movies for userID 478

mov_rec(300,5)

17509    Star Wars: Episode VI - Return of the Jedi (1983)
1252                             Back to the Future (1985)
3120                                          Shrek (2001)
3667             Indiana Jones and the Last Crusade (1989)
2267                               Dark Knight, The (2008)
Name: title, dtype: object


### Verifying recommendation items

In [558]:
# Top true 15 movies for user i

h = train_UCF.loc[train_UCF['userId'] == 300]

h = h.sort_values(by='rating',ascending=False)

h[:15]

Unnamed: 0,userId,movieId,rating,title
38258,300,2011,5.0,Back to the Future Part II (1989)
38306,300,98809,5.0,"Hobbit: An Unexpected Journey, The (2012)"
38267,300,3793,5.0,X-Men (2000)
38295,300,79132,5.0,Inception (2010)
38244,300,260,5.0,Star Wars: Episode IV - A New Hope (1977)
38299,300,88140,5.0,Captain America: The First Avenger (2011)
38294,300,70286,5.0,District 9 (2009)
38279,300,7153,5.0,"Lord of the Rings: The Return of the King, The..."
38302,300,91500,5.0,The Hunger Games (2012)
38251,300,1196,5.0,Star Wars: Episode V - The Empire Strikes Back...


# Collaborative Filtering (SVD)

In [559]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

In [560]:
reader = Reader(rating_scale=(1, 5))

In [561]:
data = Dataset.load_from_df(train_UCF[['userId','movieId','rating']], reader)

#### Grid Search for best parameters

In [562]:
#Defining the parameter grid for SVD and fixing the random state
param_grid = {'n_factors':list(range(1,100,5)), 'n_epochs': [5, 10, 20], 'random_state': [42]}

#Defining the grid search with the parameter grid and SVD algorithm optimizing for RMSE
gs = GridSearchCV(SVD, 
                  param_grid, 
                  measures=['rmse'], 
                  cv=5, 
                  n_jobs = -1)

#Fitting the mo
gs.fit(data)
 
#Printing the best score
print(gs.best_score['rmse'])

#Printing the best set of parameters
print(gs.best_params['rmse'])

0.8767349586293195
{'n_factors': 1, 'n_epochs': 20, 'random_state': 42}


In [563]:
#Fitting the model on train data with the best parameters
model = SVD(n_factors = 1, n_epochs = 20, random_state = 42)

#Build full trainset will essentially fits the SVD on the complete train set instead of a part of it like we do in cross validation for grid search
model.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2073300c580>

In [564]:
id_pairs = zip(test_UCF['userId'], test_UCF['movieId'])

#Making predictions for test set using predict method from Surprise
y_pred = [model.predict(uid = user, iid = movie)[3] for (user, movie) in id_pairs]

#Actual rating values for test set
y_true = test_UCF['rating']

In [565]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_true, y_pred, squared=False)
rmse

0.8756271509429002

# Content Based Filtering

In [566]:
DF.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [567]:
CBF = DF[:100000]

In [568]:
CBF

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance
...,...,...,...,...,...,...
99995,757,2115,3.0,1184014093,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy
99996,757,2117,3.0,1184015776,1984 (Nineteen Eighty-Four) (1984),Drama|Sci-Fi
99997,757,2118,4.0,1184014221,"Dead Zone, The (1983)",Thriller
99998,757,2124,3.5,1184073900,"Addams Family, The (1991)",Children|Comedy|Fantasy


In [569]:
# code for getting all genre names from the dataframe

# separating genres from the main dataframe
Mx = CBF["genres"]

# create empty list for storing genres
a = []

# loop for adding all genres to list a
for i in range(len(Mx)):
    a.append(Mx[i].split("|"))


# flattens 2D a into 1D a
a = [item for sublist in a for item in sublist]

# genre set 
b = set(a)

# genre list
c = list(b)

In [571]:
c

['Documentary',
 'Adventure',
 'Western',
 'Thriller',
 'Crime',
 'Fantasy',
 'IMAX',
 'Drama',
 'War',
 'Comedy',
 'Animation',
 'Sci-Fi',
 'Mystery',
 'Children',
 'Romance',
 'Film-Noir',
 'Musical',
 '(no genres listed)',
 'Action',
 'Horror']

In [572]:
# matrix with...row as movie names....columns as genres

M1 = pd.DataFrame(columns=c, index=CBF["title"])

In [573]:
M1 = M1.fillna(0)

In [574]:
# genre x movie matrix

M1.head()

Unnamed: 0_level_0,Documentary,Adventure,Western,Thriller,Crime,Fantasy,IMAX,Drama,War,Comedy,Animation,Sci-Fi,Mystery,Children,Romance,Film-Noir,Musical,(no genres listed),Action,Horror
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Pulp Fiction (1994),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Three Colors: Red (Trois couleurs: Rouge) (1994),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Three Colors: Blue (Trois couleurs: Bleu) (1993),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Underground (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Singin' in the Rain (1952),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [575]:
# Genre list

Mx.head()

0    Comedy|Crime|Drama|Thriller
1                          Drama
2                          Drama
3               Comedy|Drama|War
4         Comedy|Musical|Romance
Name: genres, dtype: object

In [576]:
# code for adding 1 to the genre column corresponding to the movie name

for i in range(len(Mx)):
    
    d = Mx[i].split("|")        

    for j in range(len(d)):
        for k in M1.columns:     
            if d[j]==k:
                M1[k].iloc[i] = 1
            else:
                pass

In [577]:
M1.head()

Unnamed: 0_level_0,Documentary,Adventure,Western,Thriller,Crime,Fantasy,IMAX,Drama,War,Comedy,Animation,Sci-Fi,Mystery,Children,Romance,Film-Noir,Musical,(no genres listed),Action,Horror
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Pulp Fiction (1994),0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
Three Colors: Red (Trois couleurs: Rouge) (1994),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Three Colors: Blue (Trois couleurs: Bleu) (1993),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Underground (1995),0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0
Singin' in the Rain (1952),0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0


In [578]:
M2 = M1.reset_index()

M2 = M2.drop("title", axis=1)

In [579]:
CBF2 = pd.merge(CBF, M2, left_index=True, right_index=True)

In [580]:
CBF2.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Documentary,Adventure,Western,Thriller,...,Animation,Sci-Fi,Mystery,Children,Romance,Film-Noir,Musical,(no genres listed),Action,Horror
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [581]:
CBF3 = pd.merge(CBF, M2, left_index=True, right_index=True)

In [582]:
CBF3.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Documentary,Adventure,Western,Thriller,...,Animation,Sci-Fi,Mystery,Children,Romance,Film-Noir,Musical,(no genres listed),Action,Horror
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


In [583]:
# Multiplying ratings with the movie columns

for i in range(len(CBF3)):
    CBF3.iloc[i,6:] = CBF3.iloc[i,6:]*CBF3["rating"][i]


In [584]:
# Creates recommendation dictionary based on content based method

# Takes userID as the input

def recom_dict(userId):
    
    
    
    CBF4 = CBF3[CBF3["userId"]==userId]           # Subsetting datatframe for user i
    e = CBF4.iloc[:,6:].sum()                     # Summing up all columns for user i
    f = np.array(e)/sum(np.array(e))              # Normalizing e, f is the preference score matrix for user i
    
    
    
    
    # List of movies not seen by user i = 134

    m = CBF3[CBF3["userId"]==userId].loc[:,"title"]            # m is the list of movies seen by user i 
    m = np.array(m)
    
    n = CBF3["title"].unique()      # Array of all movies
    n = n.tolist()                  # Converting array to list
    
    
    for i in n:                      # Removing common terms from list n
        if i in m:
            n.remove(i)
 
    n = np.array(n)
    
    
    
    
    # Matrix for movies not watched 

    M3 = CBF2.drop_duplicates(subset='movieId', keep="last")
    M4 = M3[M3["userId"]!=1]
    M5 = M4.iloc[:,4:]
    M5 = M5.drop(['genres'], axis=1)
    
    
    Dict = {}
    
    
    for i in range(len(M5)):
        y = (M5.iloc[i,1:]*f).sum()
        z = M5.iloc[i,:][0]
        Dict.update({z: y})
    
    
    return(Dict)


In [585]:
# Creating recommendation list for user 300

d = recom_dict(300)

sorted_d = sorted(d.items(), key=operator.itemgetter(1),reverse=True)    # sorting the list

In [586]:
# Top 10 recommended movies for user i = 300

sorted_d[:10]

[('Mars Needs Moms (2011)', 0.7197730248799651),
 ('Maximum Ride (2016)', 0.7193365342645134),
 ('G.I. Joe: Retaliation (2013)', 0.6931470973374072),
 ('Spider-Man 3 (2007)', 0.6931470973374072),
 ('Iron Man 2 (2010)', 0.6931470973374072),
 ('Matrix Reloaded, The (2003)', 0.6931470973374072),
 ('Matrix Revolutions, The (2003)', 0.6931470973374072),
 ('Star Wars: Episode VII - The Force Awakens (2015)', 0.6700130947184636),
 ('Man of Steel (2013)', 0.6700130947184636),
 ('Under the Mountain (2009)', 0.6678306416412048)]