**Recommender Systems - User Based Collaborative Filtering Assignment**


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/MyDrive/Recommender Systems/ml-latest-small/ml-latest-small

/content/drive/MyDrive/Recommender Systems/ml-latest-small/ml-latest-small


In [3]:
ls

links.csv  movies.csv  ratings.csv  README.txt  tags.csv


In [4]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import statistics 
%matplotlib inline

In [5]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [6]:
df_ratings.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [7]:
df_ratings.shape[0]

100836

In [8]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
df_movies['year'] = df_movies.title.str.slice(-5,-1)

In [10]:
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [11]:
df_movies.title = df_movies.title.str.replace('\(\d\d\d\d\)',"")

In [12]:
df_movies.head(n=20)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
5,6,Heat,Action|Crime|Thriller,1995
6,7,Sabrina,Comedy|Romance,1995
7,8,Tom and Huck,Adventure|Children,1995
8,9,Sudden Death,Action,1995
9,10,GoldenEye,Action|Adventure|Thriller,1995


In [13]:
#It's time to drop the genre column as we don't need it
df_movies.drop('genres', 1, inplace=True)

In [14]:
df_movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [15]:
# Same goes for timestamp column in rating dataframe
df_ratings.drop('timestamp', 1, inplace=True)

In [16]:
df_ratings.shape

(100836, 3)

In [17]:
df_ratings.loc[df_ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
227,1,3744,4.0
228,1,3793,5.0
229,1,3809,4.0
230,1,4006,4.0


In [18]:
input_user = df_ratings.loc[df_ratings['userId'] == 1]

In [19]:
input_user = pd.merge(input_user ,df_movies[df_movies['movieId'].isin(input_user['movieId'])]).drop('year',1)

In [20]:
input_user

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
227,1,3744,4.0,Shaft
228,1,3793,5.0,X-Men
229,1,3809,4.0,What About Bob?
230,1,4006,4.0,Transformers: The Movie


In [21]:
input_user_movies_watched = input_user['movieId']

In [22]:
# looking for those users who have watched the same movies like our input user

similar_users = df_ratings[df_ratings['movieId'].isin(input_user['movieId']).tolist()]
similar_users = similar_users[similar_users.userId != 1]

In [23]:
similar_users

Unnamed: 0,userId,movieId,rating
233,2,333,4.0
235,2,3578,4.0
262,3,527,0.5
272,3,1275,3.5
275,3,1587,4.5
...,...,...,...
99742,610,3671,5.0
99748,610,3703,4.5
99752,610,3740,4.5
99753,610,3744,3.0


In [24]:
similar_users = similar_users.groupby(['userId'])


In [25]:
similar_users.get_group(610)

Unnamed: 0,userId,movieId,rating
99534,610,1,5.0
99535,610,6,5.0
99538,610,47,5.0
99539,610,50,4.0
99540,610,70,4.0
...,...,...,...
99742,610,3671,5.0
99748,610,3703,4.5
99752,610,3740,4.5
99753,610,3744,3.0


In [26]:
# Most similar user groups sorted
similar_users = sorted(similar_users,  key=lambda x: len(x[1]), reverse=True)

In [27]:
similar_users[0:5]

[(414,        userId  movieId  rating
  62294     414        1     4.0
  62296     414        3     4.0
  62298     414        6     3.0
  62322     414       47     4.0
  62324     414       50     5.0
  ...       ...      ...     ...
  63491     414     3729     3.0
  63493     414     3740     5.0
  63512     414     3793     4.0
  63515     414     3809     3.0
  63851     414     5060     4.0
  
  [200 rows x 3 columns]), (599,        userId  movieId  rating
  92623     599        1     3.0
  92625     599        3     1.5
  92626     599        6     4.5
  92651     599       47     4.0
  92652     599       50     3.5
  ...       ...      ...     ...
  93713     599     3703     5.0
  93720     599     3740     4.0
  93732     599     3793     3.5
  93735     599     3809     2.5
  93780     599     4006     2.5
  
  [180 rows x 3 columns]), (474,        userId  movieId  rating
  73092     474        1     4.0
  73095     474        6     3.0
  73119     474       47     4.0
  7

In [28]:
pearson_coff = {}
for name, group in similar_users:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    input_user = input_user.sort_values(by='movieId')

    total_ratings = len(group)
    # Review scores for the movies that they both have in common
    temp_df = input_user[input_user['movieId'].isin(group['movieId'].tolist())]
    user_in_ques_ratings = temp_df['rating'].tolist()
    # Let's choose the simple variable name i.e. 'x'
    x = user_in_ques_ratings
    x_mean = statistics.mean(x)
    x_minus_x_mean = list((i-x_mean for i in x))
    x_minus_x_mean_squre = [i*i for i in x_minus_x_mean]
    sum_of_x_minus_x_mean_square = sum(x_minus_x_mean_squre)

    #Let's also put the current user group reviews in a list format
    ratings_of_similar_user = group['rating'].tolist()
    # Let's choose the simple variable name i.e. 'y'
    y = ratings_of_similar_user
    y_mean = statistics.mean(y)
    y_minus_y_mean = list((i-y_mean for i in y))
    y_minus_y_mean_square = [i*i for i in y_minus_y_mean]
    sum_of_y_minus_y_mean_square = sum(y_minus_y_mean_square)

    x_minus_x_mean_mul_y_minus_y_mean = (i*j for i,j in zip(x_minus_x_mean, y_minus_y_mean))
    sum_of_x_minus_x_mean_mul_y_minus_y_mean = sum(x_minus_x_mean_mul_y_minus_y_mean)


    denominator = sqrt(sum_of_x_minus_x_mean_square*sum_of_y_minus_y_mean_square)
    
    if denominator != 0:
      pearson_coff[name] = sum_of_x_minus_x_mean_mul_y_minus_y_mean/denominator
    else:
      pearson_coff[name] = 0

In [29]:
pearson_coff.items()

dict_items([(414, 0.41186410352090536), (599, 0.21976809443118978), (474, 0.13462166738326414), (68, 0.028221900216028616), (288, 0.18182772066934832), (274, 0.05681933617507971), (448, 0.3451787982895458), (608, 0.2680699374101865), (182, 0.18182353240556265), (480, 0.2264122983258445), (590, 0.3111824770205233), (380, 0.049271049584399804), (19, 0.32518024337203466), (387, 0.2704146241426443), (217, -0.017673398399180078), (600, 0.25364908813995984), (91, 0.0956622956952898), (307, 0.32096231902537775), (57, 0.3542598962415167), (64, 0.20142767777641482), (469, 0.287975311224966), (603, -0.06150271405737388), (313, 0.058983438040488397), (368, 0.23910305126173534), (483, 0.29789244524828706), (177, 0.11350773061827389), (45, 0.26740222880029524), (555, 0.3208317102277071), (160, 0.32282378045705906), (226, 0.3126532505950559), (477, 0.4053425045684789), (561, 0.2054174462589411), (597, 0.42702914061318026), (202, 0.060703237148243445), (249, 0.309438905126688), (391, 0.18902464114524

In [30]:
len(pearson_coff)

602

In [31]:
df_pearson_coff = pd.DataFrame.from_dict(pearson_coff, orient='index')
df_pearson_coff.columns = ['SimilarityIndex']
df_pearson_coff['UserId'] = df_pearson_coff.index
df_pearson_coff.index = range(len(df_pearson_coff))
df_pearson_coff.head(n=10)

Unnamed: 0,SimilarityIndex,UserId
0,0.411864,414
1,0.219768,599
2,0.134622,474
3,0.028222,68
4,0.181828,288
5,0.056819,274
6,0.345179,448
7,0.26807,608
8,0.181824,182
9,0.226412,480


In [32]:
# show the 10 most similar users to user in question i.e. user with usesrId = 1 (input_user)
most_similar_users = df_pearson_coff.sort_values(by='SimilarityIndex', ascending=False)
most_similar_users.shape

(602, 2)

In [49]:
most_similar_users_ratings=most_similar_users.merge(df_ratings, left_on='UserId', right_on='userId', how='inner').drop('userId', 1)
most_similar_users_ratings

Unnamed: 0,SimilarityIndex,UserId,movieId,rating
0,1.0,146,32,5.0
1,1.0,146,344,2.0
2,1.0,146,410,3.5
3,1.0,146,502,2.5
4,1.0,146,765,3.0
...,...,...,...,...
100310,-1.0,518,3897,4.5
100311,-1.0,518,6323,4.5
100312,-1.0,518,6333,4.5
100313,-1.0,518,6365,4.0


In [34]:
 # Movies not watched by input user 
 not_seen_movies = most_similar_users_ratings[~most_similar_users_ratings['movieId'].isin(input_user['movieId'])].sort_values(by='SimilarityIndex', ascending=False)

In [35]:
not_seen_movies.sort_values(by='movieId', inplace=True)

In [36]:
# input_user[input_user['movieId'] == 345]

list_of_not_seen_movies = not_seen_movies.movieId.to_list()

In [37]:
set_of_not_seen_movies = set(list_of_not_seen_movies)

In [38]:
unique_list_of_not_seen_movies = list(sorted(set_of_not_seen_movies))

In [39]:
unique_list_of_not_seen_movies

[2,
 4,
 5,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 34,
 36,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 48,
 49,
 52,
 53,
 54,
 55,
 57,
 58,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 68,
 69,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 85,
 86,
 87,
 88,
 89,
 92,
 93,
 94,
 95,
 96,
 97,
 99,
 100,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 111,
 112,
 113,
 116,
 117,
 118,
 119,
 121,
 122,
 123,
 125,
 126,
 128,
 129,
 132,
 135,
 137,
 140,
 141,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 152,
 153,
 154,
 155,
 156,
 158,
 159,
 160,
 161,
 162,
 164,
 165,
 166,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 193,
 194,
 195,
 196,
 198,
 199,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 217,
 218,
 219,
 220,
 222,
 224,
 225,
 227,
 2

In [40]:
# mean of ratings of input_user (user in question) with user id 1
mean_of_ratings_of_userId_1 = statistics.mean(input_user['rating'].tolist())
prediction_of_movies = {}

for movie in unique_list_of_not_seen_movies:
  users_watched_movie = df_ratings[df_ratings['movieId'] == int(movie)].sort_values(by='userId')
  ratings_of_users_watched_movie = users_watched_movie['rating'].tolist()
  # ratings_of_users_watched_movie = ratings_of_users_watched_movie[:4]

  mean_of_users_watched_movie = statistics.mean(ratings_of_users_watched_movie)

  # print("Similarity index of users watched movie: ",movie)
  sim_index_of_users_watched_movie = most_similar_users[most_similar_users['UserId'].isin(users_watched_movie['userId'])].sort_values(by='UserId')
  list_of_sim_index = sim_index_of_users_watched_movie['SimilarityIndex'].tolist()

  # list_of_sim_index = list_of_sim_index[:4]

  r_bp_minus_mean_of_rb = [i-mean_of_users_watched_movie for i in ratings_of_users_watched_movie]

  temp_numerator = (i*j for i, j in list(zip(list_of_sim_index, r_bp_minus_mean_of_rb)))

  sum_temp_numerator = sum(temp_numerator)
  numerator_pred = sum_temp_numerator + mean_of_ratings_of_userId_1

  if sum(list_of_sim_index) != 0:
    prediction_of_movie = numerator_pred/sum(list_of_sim_index)
    prediction_of_movies[movie] = prediction_of_movie
  else:
    prediction_of_movies[movie] = 0

In [41]:
prediction_of_movies[32]

0.24556480421655216

In [42]:
prediction_of_movies.items()

dict_items([(2, 0.1852986326836714), (4, 5.427412829755495), (5, 0.8716690742854026), (7, 0.971170407014836), (8, 125.18060209113675), (9, 1.9803263263773587), (10, 0.049759949227382944), (11, 0.7272360164987381), (12, 1.4884822718804533), (13, 2.628029004904121), (14, 3.8001199066640385), (15, 49.39700583672653), (16, 1.037341112577403), (17, 0.7773025334609567), (18, 1.559897152811102), (19, 0.6195262047847285), (20, 3.988020555226509), (21, 0.5090496412428431), (22, 0.5347494732984465), (23, 1.4720624387832597), (24, 1.4295393144100117), (25, 0.7800224940870387), (26, 2.515504674156951), (27, -3.9554672965580098), (28, 3.99406308487236), (29, 1.405003282150983), (30, -50.104829749816844), (31, 1.8823848798051879), (32, 0.24556480421655216), (34, 0.26468231496203026), (36, 0.5655030934879464), (38, 8.009439381446532), (39, 0.19389949526147246), (40, -8.42766100198344), (41, 1.4173022989425537), (42, 2.967953346996653), (43, 4.997152689298197), (44, 0.7367757197836916), (45, 1.5321883

In [43]:
dict_values = prediction_of_movies.values()
list_of_dict_values = list(dict_values)
sorted_values = sorted(list_of_dict_values)

In [53]:
df_recommended_movies = pd.DataFrame.from_dict(prediction_of_movies, orient='index')
df_recommended_movies.columns = ['rating']
df_recommended_movies['movieId'] = df_recommended_movies.index
df_recommended_movies.index = range(len(df_recommended_movies))
df_recommended_movies.head(n=10)

Unnamed: 0,rating,movieId
0,0.185299,2
1,5.427413,4
2,0.871669,5
3,0.97117,7
4,125.180602,8
5,1.980326,9
6,0.04976,10
7,0.727236,11
8,1.488482,12
9,2.628029,13


In [45]:
# df_recommended_movies = df_recommended_movies[(df_recommended_movies['rating'] > 0.0) & (df_recommended_movies['rating'] < 10.0)]

In [50]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

df_recommended_movies['rating'] = MinMaxScaler(feature_range=(0,5)).fit_transform(np.array(df_recommended_movies['rating']).reshape(-1,1))

In [54]:
recommended_movies = df_recommended_movies.sort_values(by='rating', ascending=False)[:20]
movies_ids = recommended_movies['movieId'].tolist()

In [55]:
recommended_movies

Unnamed: 0,rating,movieId
3098,25916.4257,4517
2913,6670.428655,4236
9452,6670.428655,188189
8918,6670.428655,148982
3919,6670.428655,5986
5272,6670.428655,26524
6321,6670.428655,55156
4320,6670.428655,6775
4578,6670.428655,7181
9417,6670.428655,183199


In [48]:
# Top 20 recommended movies

df_movies[df_movies['movieId'].isin(movies_ids)]

Unnamed: 0,movieId,title,year
3149,4236,Keep the River on Your Right: A Modern Canniba...,2000
3334,4517,Lady in White (a.k.a. The Mystery of the Lady ...,1988
4157,5986,Fat City,1972
4202,6064,"Harder They Fall, The",1956
4560,6775,Life and Debt,2001
4640,6935,"Revolution Will Not Be Televised, The (a.k.a. ...",2003
4820,7181,Ship of Fools,1965
5519,26524,"Times of Harvey Milk, The",1984
5584,26838,"Snapper, The",1993
6572,55156,"Unreasonable Man, An",2006
