In [165]:
import subprocess

In [166]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [167]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [168]:
subprocess.run(["unzip", "/content/drive/MyDrive/Colab Notebooks/ml-20m.zip"])
# subprocess.run(["unzip", "/content/ml-100k.zip"])

CompletedProcess(args=['unzip', '/content/drive/MyDrive/Colab Notebooks/ml-20m.zip'], returncode=1)

In [169]:
df_movies = pd.read_csv("/content/ml-20m/movies.csv")
df_ratings = pd.read_csv("/content/ml-20m/ratings.csv")
df_tags = pd.read_csv("/content/ml-20m/tags.csv")

In [170]:
# Trimming the data from 20m to 100k as 20m is crashing the session.
# Taking first 100k data from 20m dataset instead of using 100k dataset seperately because the column and file names are different in both datasets
df_movies = df_movies.head(100000)
df_ratings = df_ratings.head(100000)
df_tags = df_tags.head(100000)

In [171]:
df_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [172]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [173]:
df_tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [174]:
movies = df_movies.drop('genres', axis=1)
movies.head(5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [175]:
tags = df_tags.drop(columns = ['userId', 'timestamp'])
tags.head(5)

Unnamed: 0,movieId,tag
0,4141,Mark Waters
1,208,dark hero
2,353,dark hero
3,521,noir thriller
4,592,dark hero


In [176]:
# Merging the tags into single row with same movieId
# Some tags contains just integer or float values so using str(value)
merged_tags = df_tags.groupby('movieId')['tag'].agg(lambda x: ' '.join(str(value) for value in x)).reset_index()
merged_tags.head(5)

Unnamed: 0,movieId,tag
0,1,Watched computer animation Disney animated fea...
1,2,time travel adapted from:book board game child...
2,3,old people that is actually funny sequel fever...
3,4,chick flick revenge
4,5,Diane Keaton family sequel Steve Martin weddin...


In [177]:
merged_df = pd.merge(movies, merged_tags, on='movieId', how='outer')
merged_df.tail(5)

Unnamed: 0,movieId,title,tag
27273,131254,Kein Bund für's Leben (2007),
27274,131256,"Feuer, Eis & Dosenbier (2002)",
27275,131258,The Pirates (2014),
27276,131260,Rentun Ruusu (2001),
27277,131262,Innocence (2014),


In [178]:
# We want to filter based on content and those movies that doesn't have any tags cannot be filtered.
# So, removing the movies with NaN tags and resetting the index
unique_df = merged_df.dropna().reset_index()
unique_df.tail(5)

Unnamed: 0,index,movieId,title,tag
8230,26875,129235,Les Invisibles (2012),documentary homosexuality life sexuality
8231,26888,129303,Camp (2013),camp
8232,26955,129530,Slingshot Hip Hop (2008),Documentary Palestine Palestinians
8233,27017,129857,Bright Days Ahead (2013),french romance
8234,27069,130073,Cinderella (2015),Cinderella unoriginal


In [179]:
# grouped_df = merged_df.groupby('movieId').agg({'title': list, 'tag': list}).reset_index()
# grouped_df.head(5)

In [180]:
unique_df.shape

(8235, 4)

In [181]:
# Trimming the unique_df to match the size with user_movie_matrix. Troubleshooting this.
unique_df = unique_df.head(8227)
unique_df.shape

(8227, 4)

TFIDF Vectorizer

In [182]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [183]:
# We need to convert the current dataframe into tfidf matrix for tags column
movie_ids = unique_df['movieId']
movie_names = unique_df['title']
reviews = unique_df['tag']

In [184]:
tfidf = TfidfVectorizer(stop_words='english')

In [185]:
# This will take all unique words except stop words
tfidf_matrix = tfidf.fit_transform(reviews)

In [186]:
# This will create the dataframe of unique words as a column and default index
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())


In [187]:
# This will merge movie id and name with the tfidf matrix
result_df = pd.concat([movie_ids, movie_names, tfidf_df], axis=1)
result_df.head(5)

Unnamed: 0,movieId,title,000,007,01,06,10,100,101,11,...,zuzu,zwick,zylberstein,zzzzzzzzzzzzzz,ãœnel,ãƒâ,åberg,øªùš,ø³ø,übergang
0,1,Toy Story (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [188]:
result_df.shape

(8227, 11502)

TruncatedSVD

In [189]:
from sklearn.decomposition import TruncatedSVD

In [190]:
# n_components to be adjusted. This number will decide how many features (words) you want.
# This will take 50 most important words in the document and create the matrix for that.
n = 50
svd = TruncatedSVD(n_components=n)

# Here we will not put the whole dataframe but just tfidf_df which is a matrix excluding movieId and name
latent_matrix = svd.fit_transform(tfidf_df)

In [191]:
# This matrix will take LF (Latent Features) and truncate the sparce matrix into a smaller matrix
latent_matrix_df = pd.DataFrame(latent_matrix, columns=[f'LF_{i}' for i in range(1, n + 1)])
latent_matrix_df.head(5)

Unnamed: 0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,LF_8,LF_9,LF_10,...,LF_41,LF_42,LF_43,LF_44,LF_45,LF_46,LF_47,LF_48,LF_49,LF_50
0,0.006785,0.017872,0.051876,0.097259,0.013724,-0.041588,-0.016455,-0.003522,-0.127049,-0.004257,...,-0.054571,-0.065085,0.082127,0.118303,0.035347,-0.046067,0.08968,-0.069464,0.054075,-0.001107
1,0.010404,0.024018,0.093192,0.024888,-0.001614,-0.03939,-0.039228,0.009198,-0.079582,-0.026418,...,0.026654,-0.085835,0.013597,0.01907,-0.055728,0.008341,0.043453,0.002531,-0.079833,0.022192
2,0.01144,0.002319,0.009365,0.021348,-0.003582,-0.007676,-0.007001,-0.003978,-0.016965,-0.002402,...,0.037998,0.078875,0.084496,-0.072789,-0.013344,-0.015529,-0.062665,0.0493,-0.024917,0.061774
3,0.001969,0.006982,0.019364,0.036142,4.1e-05,-0.009764,-0.014312,-0.009296,-0.024604,-0.003873,...,0.064218,0.021405,0.013014,0.019541,-0.049284,0.022687,0.020978,0.129799,0.192607,-0.022852
4,0.004221,0.009755,0.024064,0.04531,-0.00177,-0.021981,-0.012586,-0.008773,-0.04099,-0.002622,...,0.070271,-0.025787,0.051331,0.083469,-0.037155,0.046335,0.102905,-0.012339,-0.083068,0.009882


In [192]:
print(latent_matrix_df.shape)

(8227, 50)


In [193]:
result_with_latent_matrix = pd.concat([result_df[['movieId', 'title']], latent_matrix_df], axis=1)
result_with_latent_matrix.head(5)

Unnamed: 0,movieId,title,title.1,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,...,LF_41,LF_42,LF_43,LF_44,LF_45,LF_46,LF_47,LF_48,LF_49,LF_50
0,1,Toy Story (1995),0.0,0.006785,0.017872,0.051876,0.097259,0.013724,-0.041588,-0.016455,...,-0.054571,-0.065085,0.082127,0.118303,0.035347,-0.046067,0.08968,-0.069464,0.054075,-0.001107
1,2,Jumanji (1995),0.0,0.010404,0.024018,0.093192,0.024888,-0.001614,-0.03939,-0.039228,...,0.026654,-0.085835,0.013597,0.01907,-0.055728,0.008341,0.043453,0.002531,-0.079833,0.022192
2,3,Grumpier Old Men (1995),0.0,0.01144,0.002319,0.009365,0.021348,-0.003582,-0.007676,-0.007001,...,0.037998,0.078875,0.084496,-0.072789,-0.013344,-0.015529,-0.062665,0.0493,-0.024917,0.061774
3,4,Waiting to Exhale (1995),0.0,0.001969,0.006982,0.019364,0.036142,4.1e-05,-0.009764,-0.014312,...,0.064218,0.021405,0.013014,0.019541,-0.049284,0.022687,0.020978,0.129799,0.192607,-0.022852
4,5,Father of the Bride Part II (1995),0.0,0.004221,0.009755,0.024064,0.04531,-0.00177,-0.021981,-0.012586,...,0.070271,-0.025787,0.051331,0.083469,-0.037155,0.046335,0.102905,-0.012339,-0.083068,0.009882


Content Based Filter - Cosine Similarity

In [194]:
from sklearn.metrics.pairwise import cosine_similarity

In [195]:
input_movie_name = "Father of the Bride Part II (1995)"

In [196]:
# Create similar matrix based on cosine similarity
similar_matrix = cosine_similarity(latent_matrix)

In [197]:
movie_index = result_df.loc[unique_df['title'] == input_movie_name].index[0]
movie_index

4

In [198]:
similarity_scores = similar_matrix[movie_index]

# Define the number of recommendation we want
num_recommendation = 10

# Get the indices of the movies with highest similarity score

# argsort = sorting the similarity scores in ascending order
# [::-1] = slicing to reverse the array - to get the scores in descending order - highest on top
# [1:num_recommendation = 10+1] = To get first top 10 values from the list
similar_movies_indices = similarity_scores.argsort()[::-1][1:num_recommendation+1]

In [199]:
# Get the names of the movies for the selected indices
# Using unique_df here instead of result_df
similar_movies = unique_df.iloc[similar_movies_indices]['title'].tolist()
similar_movies

['Despicable Me 2 (2013)',
 'Welcome Home, Roscoe Jenkins (2008)',
 'Ordinary People (1980)',
 'My First Mister (2001)',
 'Free Willy 2: The Adventure Home (1995)',
 'Shaggy Dog, The (1959)',
 'Free Willy (1993)',
 'Dan in Real Life (2007)',
 'Weather Man, The (2005)',
 'Croods, The (2013)']

User Matrix - Rating Matrix

In [200]:
from scipy.sparse import csr_matrix

In [201]:
user_movie_matrix = df_ratings.pivot_table(index='movieId', columns='userId', values='rating', aggfunc='mean', fill_value=0)

In [202]:
user_movie_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,693,694,695,696,697,698,699,700,701,702
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0,4,0,0,5,0,4,0,4,...,0.0,4.5,0.0,0,0.0,0,4,4.0,0.0,3.5
2,3.5,0,0,0,3,0,0,0,0,0,...,0.0,1.5,0.0,0,0.0,5,0,0.0,0.0,0.0
3,0.0,4,0,0,0,3,3,5,0,0,...,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0
4,0.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0
5,0.0,0,0,0,0,0,0,0,0,0,...,0.0,2.5,0.0,0,3.5,0,2,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118997,0.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0
119141,0.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0
125916,0.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0
128488,0.0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0,0,0.0,0.0,0.0


TFIDF Vectorizer and TruncatedSVD for User Matrix

In [203]:
# svd.fit_transform requires the column names in string format. Here the column names are userid which is in integer format so converting in string format
user_movie_matrix.columns = user_movie_matrix.columns.astype(str)

# n_components to be adjusted. This number will decide how many features (words) you want.
# This will take 50 most important words in the document and create the matrix for that.
n_2 = 50
svd_2 = TruncatedSVD(n_components=n_2)

# Here we will not put the whole dataframe but just tfidf_df which is a matrix excluding movieId and name
latent_matrix_2 = svd.fit_transform(user_movie_matrix)

In [204]:
# This matrix will take LF (Latent Features) and truncate the sparce matrix into a smaller matrix
latent_matrix_df_2 = pd.DataFrame(latent_matrix_2, columns=[f'LF_{i}' for i in range(1, n_2 + 1)])
latent_matrix_df_2.head(5)

Unnamed: 0,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,LF_8,LF_9,LF_10,...,LF_41,LF_42,LF_43,LF_44,LF_45,LF_46,LF_47,LF_48,LF_49,LF_50
0,40.311089,7.732363,-8.529024,6.559551,0.517817,12.040331,-2.182096,7.117814,2.20548,-2.312526,...,-2.427187,3.050616,6.89408,-5.802812,0.996266,-3.328494,0.475031,2.1741,-2.164839,-5.123118
1,17.82901,10.108804,-6.608957,1.004087,-3.640667,3.87712,0.354045,0.969682,0.389041,-2.541728,...,-0.710149,-1.047781,-3.765931,-4.209561,-1.661278,-1.831852,-2.513422,-1.346269,-2.211087,2.954757
2,9.02462,7.138244,-3.030521,-0.609534,-4.274803,0.541907,-1.011295,1.461047,0.470951,0.176847,...,-0.7823,-2.431726,0.698816,1.601455,-0.156794,1.997781,1.422443,0.302645,-1.798794,1.63326
3,1.834689,3.263111,0.461236,0.58046,-2.546145,-0.556496,2.306687,0.304014,0.935239,-0.024086,...,-0.026981,-0.057078,-0.127269,-0.420521,0.246716,0.134918,-0.263765,0.146863,-0.404648,0.772975
4,8.640309,9.21496,-3.369465,0.400537,-4.340977,0.020053,-1.637576,1.043168,0.832708,-1.119207,...,-0.519509,-0.417728,1.526745,-0.383601,2.07076,1.51745,0.573524,1.303473,-2.038111,-1.705586


In [205]:
result_with_latent_matrix_2 = pd.concat([result_df[['movieId', 'title']], latent_matrix_df_2], axis=1)
result_with_latent_matrix_2.head(5)

Unnamed: 0,movieId,title,title.1,LF_1,LF_2,LF_3,LF_4,LF_5,LF_6,LF_7,...,LF_41,LF_42,LF_43,LF_44,LF_45,LF_46,LF_47,LF_48,LF_49,LF_50
0,1,Toy Story (1995),0.0,40.311089,7.732363,-8.529024,6.559551,0.517817,12.040331,-2.182096,...,-2.427187,3.050616,6.89408,-5.802812,0.996266,-3.328494,0.475031,2.1741,-2.164839,-5.123118
1,2,Jumanji (1995),0.0,17.82901,10.108804,-6.608957,1.004087,-3.640667,3.87712,0.354045,...,-0.710149,-1.047781,-3.765931,-4.209561,-1.661278,-1.831852,-2.513422,-1.346269,-2.211087,2.954757
2,3,Grumpier Old Men (1995),0.0,9.02462,7.138244,-3.030521,-0.609534,-4.274803,0.541907,-1.011295,...,-0.7823,-2.431726,0.698816,1.601455,-0.156794,1.997781,1.422443,0.302645,-1.798794,1.63326
3,4,Waiting to Exhale (1995),0.0,1.834689,3.263111,0.461236,0.58046,-2.546145,-0.556496,2.306687,...,-0.026981,-0.057078,-0.127269,-0.420521,0.246716,0.134918,-0.263765,0.146863,-0.404648,0.772975
4,5,Father of the Bride Part II (1995),0.0,8.640309,9.21496,-3.369465,0.400537,-4.340977,0.020053,-1.637576,...,-0.519509,-0.417728,1.526745,-0.383601,2.07076,1.51745,0.573524,1.303473,-2.038111,-1.705586


In [206]:
# For further filtering, we need t0 merge user_movie_matrix with the movies matrix so the title of the movie can be compared and extracted
user_movie_matrix_titles = pd.merge(movies, user_movie_matrix, on='movieId', how='outer')
user_movie_matrix_titles.head(5)

Unnamed: 0,movieId,title,1,2,3,4,5,6,7,8,...,693,694,695,696,697,698,699,700,701,702
0,1,Toy Story (1995),0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,...,0.0,4.5,0.0,0.0,0.0,0.0,4.0,4.0,0.0,3.5
1,2,Jumanji (1995),3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,1.5,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,0.0,3.5,0.0,2.0,0.0,0.0,0.0


Rating Based Filtering

In [207]:
# Already imported
# from sklearn.metrics.pairwise import cosine_similarity

In [208]:
input_movie_name_2 = "Jumanji (1995)"

In [209]:
# Create similar matrix based on cosine similarity
similar_matrix_1 = cosine_similarity(latent_matrix)
similar_matrix_2 = cosine_similarity(latent_matrix_2)

In [210]:
# Here movie index refers to the movieId which is same for both matrix
movie_index_new = unique_df.loc[unique_df['title'] == input_movie_name].index[0]

In [211]:
# Finding similarity scores for both type of filtering
similarity_scores_1 = similar_matrix_1[movie_index]
similarity_scores_2 = similar_matrix_2[movie_index]
similarity_scores_average = (similarity_scores_1 + similarity_scores_2)/2

In [212]:
# argsort = sorting the similarity scores in ascending order
# [::-1] = slicing to reverse the array - to get the scores in descending order - highest on top
# [1:num_recommendation = 10+1] = To get first top 10 values from the list
similar_movies_indices_1 = similarity_scores_1.argsort()[::-1][1:num_recommendation+1]
similar_movies_indices_2 = similarity_scores_2.argsort()[::-1][1:num_recommendation+1]

similar_movies_indices_average = similarity_scores_average.argsort()[::-1][1:num_recommendation+1]

In [213]:
# Get the names of the movies for the selected indices

similar_movies_1 = unique_df.iloc[similar_movies_indices_1]['title'].tolist()
similar_movies_2 = user_movie_matrix_titles.iloc[similar_movies_indices_2]['title'].tolist()

similar_movies_average = user_movie_matrix_titles.iloc[similar_movies_indices_average]['title'].tolist()

In [214]:
print("\nReview Based Filtering: \n")
print(similar_movies_1)

print("\nRating Based Filtering: \n")
print(similar_movies_2)

print("\nHybrid Filtering: \n")
print(similar_movies_average)


Review Based Filtering: 

['Despicable Me 2 (2013)', 'Welcome Home, Roscoe Jenkins (2008)', 'Ordinary People (1980)', 'My First Mister (2001)', 'Free Willy 2: The Adventure Home (1995)', 'Shaggy Dog, The (1959)', 'Free Willy (1993)', 'Dan in Real Life (2007)', 'Weather Man, The (2005)', 'Croods, The (2013)']

Rating Based Filtering: 

['Grumpier Old Men (1995)', 'Chungking Express (Chung Hing sam lam) (1994)', 'To Live (Huozhe) (1994)', 'Tommy Boy (1995)', 'Fatal Instinct (1993)', 'Cold Fever (Á köldum klaka) (1995)', 'Kids of the Round Table (1995)', 'Three Colors: White (Trzy kolory: Bialy) (1994)', 'Free Willy 2: The Adventure Home (1995)', 'Clear and Present Danger (1994)']

Hybrid Filtering: 

['Mr. Wrong (1996)', 'Simple Twist of Fate, A (1994)', 'Once Upon a Time... When We Were Colored (1995)', 'Devil in a Blue Dress (1995)', 'Super Mario Bros. (1993)', 'Major Payne (1995)', 'South Park: Bigger, Longer and Uncut (1999)', 'Seven (a.k.a. Se7en) (1995)', 'Murder, My Sweet (1944)'

Surprise

In [215]:
!pip install surprise



In [238]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [239]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [240]:
# A Reader object is instantiated with a specified rating scale (in this case, ratings range from 1 to 5).
reader = Reader(rating_scale=(1, 5))

# The Dataset is loaded from the DataFrame df_ratings containing columns 'userId', 'movieId', and 'rating'.
data = Dataset.load_from_df(df_ratings[['userId','movieId','rating']], reader)

In [241]:
# The train_test_split function is used to split the dataset into training and testing sets.
trainset, testset = train_test_split(data, test_size=.25)

# An SVD (Singular Value Decomposition) algorithm is chosen and instantiated.
# The algorithm is trained on the training set
algorithm = SVD()
algorithm.fit(trainset)
predictions = algorithm.test(testset)

In [242]:
# Check the accuracy using Root Mean Square Error
accuracy.rmse(predictions)

RMSE: 0.9024


0.9024367886551654

In [243]:
# A mapping (Mapping_file) is created to map movie titles to movieIds.

Mapping_file = dict(zip(df_movies.title.tolist(), df_movies.movieId.tolist()))

In [244]:
# Example of Collaborative Filtering with Surprise

def pred_user_rating(ui):
    if ui in df_ratings.userId.unique():
        ui_list = df_ratings[df_ratings.userId == ui].movieId.tolist()
        d = {k: v for k,v in Mapping_file.items() if not v in ui_list}
        predictedL = []
        # For a given user (ui), movies that the user has not rated are selected.
        for i, j in d.items():
            # Predictions are made for the selected movies using the trained SVD algorithm.
            predicted = algorithm.predict(ui, j)
            predictedL.append((i, predicted[3]))
        # A DataFrame (pdf) is created from the list of predicted ratings.
        pdf = pd.DataFrame(predictedL, columns = ['title', 'rating'])
        # The predictions are then sorted by predicted rating in descending order.
        pdf.sort_values('rating', ascending=False, inplace=True)
        pdf.set_index('title', inplace=True)
        # The top 10 movie recommendations with their predicted ratings are returned for the user.
        return pdf.head(10)
    else:
        print("User Id does not exist in the list!")
        return None

In [245]:
pred_user_rating(5)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"Dark Knight Rises, The (2012)",5.0
"Godfather, The (1972)",4.934051
One Flew Over the Cuckoo's Nest (1975),4.854176
12 Angry Men (1957),4.854143
"Maltese Falcon, The (1941)",4.849578
V for Vendetta (2006),4.839844
"Dark Knight, The (2008)",4.834767
My Life in Pink (Ma vie en rose) (1997),4.834488
Saving Private Ryan (1998),4.828051
Nausicaä of the Valley of the Wind (Kaze no tani no Naushika) (1984),4.82636


In [246]:
pred_user_rating(99)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"Pianist, The (2002)",4.744962
"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",4.709965
"Godfather: Part II, The (1974)",4.693311
Rear Window (1954),4.675414
"Bourne Ultimatum, The (2007)",4.645969
Band of Brothers (2001),4.616318
Delicatessen (1991),4.61488
North by Northwest (1959),4.613073
Mulholland Drive (2001),4.600767
Charade (1963),4.598265
