In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/Recommender Systems/ml-latest-small/ml-latest-small

/content/drive/MyDrive/Recommender Systems/ml-latest-small/ml-latest-small


In [None]:
ls

links.csv  movies.csv  ratings.csv  README.txt  tags.csv


In [None]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import statistics 
%matplotlib inline

In [None]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [None]:
df_ratings.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [None]:
df_ratings.shape[0]

100836

In [None]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
df_movies['year'] = df_movies.title.str.slice(-5,-1)

In [None]:
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [None]:
df_movies.title = df_movies.title.str.replace('\(\d\d\d\d\)',"")

In [None]:
df_movies.head(n=20)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
5,6,Heat,Action|Crime|Thriller,1995
6,7,Sabrina,Comedy|Romance,1995
7,8,Tom and Huck,Adventure|Children,1995
8,9,Sudden Death,Action,1995
9,10,GoldenEye,Action|Adventure|Thriller,1995


In [None]:
#It's time to drop the genre column as we don't need it
df_movies.drop('genres', 1, inplace=True)

In [None]:
df_movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [None]:
# Same goes for timestamp column in rating dataframe
df_ratings.drop('timestamp', 1, inplace=True)

In [None]:
df_ratings.shape

(100836, 3)

## ***GROUP RECOMMENDATIONS***

In [None]:
group_of_users = [1, 2, 3]

In [None]:
df_ratings.loc[df_ratings['userId'].isin(group_of_users)]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
295,3,7899,4.5
296,3,7991,5.0
297,3,26409,4.5
298,3,70946,5.0


In [None]:
df_group = df_ratings.loc[df_ratings['userId'].isin(group_of_users)]

In [None]:
df_group

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
295,3,7899,4.5
296,3,7991,5.0
297,3,26409,4.5
298,3,70946,5.0


In [None]:
df_group = pd.merge(df_group ,df_movies[df_movies['movieId'].isin(df_group['movieId'])]).drop('year',1)

In [None]:
df_group

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story
1,1,3,4.0,Grumpier Old Men
2,1,6,4.0,Heat
3,1,47,5.0,Seven (a.k.a. Se7en)
4,1,50,5.0,"Usual Suspects, The"
...,...,...,...,...
295,3,7899,4.5,Master of the Flying Guillotine (Du bi quan wa...
296,3,7991,5.0,Death Race 2000
297,3,26409,4.5,"Clonus Horror, The"
298,3,70946,5.0,Troll 2


In [None]:
# looking for those users who have watched the same movies like our users in group

similar_users = df_ratings[df_ratings['movieId'].isin(df_group['movieId']).tolist()]
similar_users = similar_users[~similar_users.userId.isin(group_of_users)]

In [None]:
similar_users

Unnamed: 0,userId,movieId,rating
303,4,47,2.0
316,4,235,2.0
318,4,260,5.0
320,4,296,1.0
329,4,441,1.0
...,...,...,...
100542,610,91658,4.0
100596,610,99114,4.5
100657,610,106782,4.5
100673,610,109487,3.5


In [None]:
similar_users = similar_users.groupby(['userId'])

In [None]:
# Most similar user groups sorted
similar_users = sorted(similar_users,  key=lambda x: len(x[1]), reverse=True)

In [None]:
pearson_coff = {}

for user in group_of_users:
  pearson_coff[user] = {}
  similar_users = df_group[df_group['userId'] == user]
  similar_users = df_ratings[df_ratings['movieId'].isin(similar_users['movieId']).tolist()]
  similar_users = similar_users[~similar_users.userId.isin([user])]
  similar_users = similar_users.groupby(['userId'])
  similar_users = sorted(similar_users,  key=lambda x: len(x[1]), reverse=True)
  for name, group in similar_users:
      #Let's start by sorting the input and current user group so the values aren't mixed up later on
      group = group.sort_values(by='movieId')
      input_user = df_group[df_group['userId'] == user].sort_values(by='movieId')

      total_ratings = len(group)
      # Review scores for the movies that they both have in common
      temp_df = input_user[input_user['movieId'].isin(group['movieId'].tolist())]
      user_in_ques_ratings = temp_df['rating'].tolist()
      # Let's choose the simple variable name i.e. 'x'
      x = user_in_ques_ratings
      x_mean = statistics.mean(x)
      x_minus_x_mean = list((i-x_mean for i in x))
      x_minus_x_mean_squre = [i*i for i in x_minus_x_mean]
      sum_of_x_minus_x_mean_square = sum(x_minus_x_mean_squre)

      #Let's also put the current user group reviews in a list format
      ratings_of_similar_user = group['rating'].tolist()
      # Let's choose the simple variable name i.e. 'y'
      y = ratings_of_similar_user
      y_mean = statistics.mean(y)
      y_minus_y_mean = list((i-y_mean for i in y))
      y_minus_y_mean_square = [i*i for i in y_minus_y_mean]
      sum_of_y_minus_y_mean_square = sum(y_minus_y_mean_square)

      x_minus_x_mean_mul_y_minus_y_mean = (i*j for i,j in zip(x_minus_x_mean, y_minus_y_mean))
      sum_of_x_minus_x_mean_mul_y_minus_y_mean = sum(x_minus_x_mean_mul_y_minus_y_mean)


      denominator = sqrt(sum_of_x_minus_x_mean_square*sum_of_y_minus_y_mean_square)
      
      if denominator != 0:
        pearson_coff[user][name] = sum_of_x_minus_x_mean_mul_y_minus_y_mean/denominator
      else:
        pearson_coff[user][name] = 0

In [None]:
pearson_coff.items()

dict_items([(1, {414: 0.41186410352090536, 599: 0.21976809443118978, 474: 0.13462166738326414, 68: 0.028221900216028616, 288: 0.18182772066934832, 274: 0.05681933617507971, 448: 0.3451787982895458, 608: 0.2680699374101865, 182: 0.18182353240556265, 480: 0.2264122983258445, 590: 0.3111824770205233, 380: 0.049271049584399804, 19: 0.32518024337203466, 387: 0.2704146241426443, 217: -0.017673398399180078, 600: 0.25364908813995984, 91: 0.0956622956952898, 307: 0.32096231902537775, 57: 0.3542598962415167, 64: 0.20142767777641482, 469: 0.287975311224966, 603: -0.06150271405737388, 313: 0.058983438040488397, 368: 0.23910305126173534, 483: 0.29789244524828706, 177: 0.11350773061827389, 45: 0.26740222880029524, 555: 0.3208317102277071, 160: 0.32282378045705906, 226: 0.3126532505950559, 477: 0.4053425045684789, 561: 0.2054174462589411, 597: 0.42702914061318026, 202: 0.060703237148243445, 249: 0.309438905126688, 391: 0.18902464114524936, 489: 0.19882703241768437, 219: 0.26422115408093483, 294: 0.04

## ***Prediction***

In [None]:
# Function to get the most similar users for the user in question.

def get_most_similar_users(user):
  df_pearson_coff = pd.DataFrame.from_dict(pearson_coff[user], orient='index')
  df_pearson_coff.columns = ['SimilarityIndex']
  df_pearson_coff['UserId'] = df_pearson_coff.index
  df_pearson_coff.index = range(len(df_pearson_coff))
  most_similar_users = df_pearson_coff.sort_values(by='SimilarityIndex', ascending=False)
  return most_similar_users

In [None]:
# Function to get the list of movies that are not watched by the user in question but the similar users have watched.

def get_list_of_not_seen_movies(user):
  most_similar_users = get_most_similar_users(user)
  most_similar_users = most_similar_users.merge(df_ratings, left_on='UserId', right_on='userId', how='inner').drop('userId', 1)
  not_seen_movies = most_similar_users[~most_similar_users['movieId'].isin(df_group[df_group['userId'] == user]['movieId'])].sort_values(by='SimilarityIndex', ascending=False)
  not_seen_movies.sort_values(by='movieId', inplace=True)
  list_of_not_seen_movies = not_seen_movies.movieId.to_list()
  set_of_not_seen_movies = set(list_of_not_seen_movies)
  unique_list_of_not_seen_movies = list(sorted(set_of_not_seen_movies))
  return unique_list_of_not_seen_movies

In [None]:
# mean of ratings of users in group
list_mean_ratings = []
for user in group_of_users:
  list_mean_ratings.append(statistics.mean(df_group[df_group['userId'] == user]['rating'].tolist()))

In [None]:
# Function to get the similarity index of all the similar users for the user in question.

def get_list_of_sim_index(user, users_watched_movie):
  most_similar_users = get_most_similar_users(user)
  sim_index = most_similar_users[most_similar_users['UserId'].isin(users_watched_movie)].sort_values(by='UserId')
  return sim_index['SimilarityIndex'].to_list()

In [None]:
# This block of code calculates the prediction score for all the movies that are not watched by the user in a group but similar users have watched these movies.

prediction_of_movies = {}

for user in group_of_users:
  prediction_of_movies[user] = {}
  not_seen_movies = get_list_of_not_seen_movies(user)
  for movie in not_seen_movies:
    users_watched_movie = df_ratings[df_ratings['movieId'] == int(movie)].sort_values(by='userId')
    ratings_of_users_watched_movie = users_watched_movie['rating'].tolist()

    mean_of_ratings_users_watched_movie = statistics.mean(ratings_of_users_watched_movie)

    list_of_sim_index = get_list_of_sim_index(user, users_watched_movie['userId'].to_list())

    r_bp_minus_mean_of_rb = [i-mean_of_ratings_users_watched_movie for i in ratings_of_users_watched_movie]

    temp_numerator = (i*j for i, j in list(zip(list_of_sim_index, r_bp_minus_mean_of_rb)))

    sum_temp_numerator = sum(temp_numerator)

    numerator_pred = sum_temp_numerator + list_mean_ratings[user - 1]

    if sum(list_of_sim_index) != 0:
      prediction_of_movie = numerator_pred/sum(list_of_sim_index)
      prediction_of_movies[user][movie] = prediction_of_movie
    else:
      prediction_of_movies[user][movie] = 0

## ***Preprocessing for aggregation methods***

In [None]:
# This preprocessing method will select the id's of movies predicted for all the users or you can say the this method will select the common predicted 
# movies among the users. For simplicity I am taking 100 movies.
def preprocessing(ratings_of_users_in_group_dic):
  common_movies_id = set()
  movie_ids = []
  for user in range(1, len(ratings_of_users_in_group_dic) + 1):
    df_user = pd.DataFrame.from_dict(ratings_of_users_in_group_dic[user], orient='index')
    df_user.columns = ['rating']
    df_user['movieId'] = df_user.index
    df_user.index = range(len(ratings_of_users_in_group_dic[user]))
    df_user = df_user[(df_user['rating'] >= 0) & (df_user['rating'] <= 5)]
    movie_ids.append(df_user['movieId'].to_list())
  
  common_movies_id = set(movie_ids[0])
  for li in movie_ids[1:]:
    common_movies_id = common_movies_id.intersection(set(li))
  return sorted(common_movies_id)

## ***Average Aggregation*** 

In [None]:
# This method will take the movie id (predicted movie id from above preprocessing method) and compute the rating for the whole group.
def avg_aggregation(movie_id):
  ratings_given_by_users = []
  for user in range(1, len(prediction_of_movies)+1):
    ratings_given_by_users.append((prediction_of_movies[user][movie_id]))
  return statistics.mean(ratings_given_by_users)
  

## ***Least Misery Aggregation***

In [None]:
# # This method will take the movie id (predicted movie id from above preprocessing method) and compute the rating for the whole group.
def least_misery(movie_id):
  ratings_given_by_users = []
  for user in range(1, len(prediction_of_movies)+1):
    ratings_given_by_users.append((prediction_of_movies[user][movie_id]))
  return min(ratings_given_by_users)

***show the top-20 recommendations, i.e., the 20 movies with the highest prediction scores that (i) the average method suggests, and (ii) the least misery method suggest***

In [None]:
movie_ids = preprocessing(prediction_of_movies)
avg = []
l_misery = []
for id in movie_ids:
  avg.append(avg_aggregation(id))
  l_misery.append(least_misery(id))
  

***Produce a group of 3 users, and for this group, show the top-20 recommendations, i.e., the 20 movies with the highest prediction scores that (i) the average method suggests***

In [None]:
df_average = pd.DataFrame(list(zip(movie_ids, avg)), columns=["Movie Id", "Rating"])

In [None]:
pd.merge(df_average.sort_values(by='Rating', ascending=False)[:20], df_movies, left_on="Movie Id", right_on="movieId").drop("movieId", 1)

Unnamed: 0,Movie Id,Rating,title,year
0,4158,4.144849,Monkeybone,2001
1,78039,4.074203,Blue Valentine,2010
2,169984,4.01156,Alien: Covenant,2017
3,51709,3.675471,"Host, The (Gwoemul)",2006
4,79057,3.670889,Predators,2010
5,114935,3.609955,Predestination,2014
6,167746,3.608054,The Lego Batman Movie,2017
7,48322,3.597013,Jackass Number Two,2006
8,4855,3.536192,Dirty Harry,1971
9,93363,3.509755,John Carter,2012


***Produce a group of 3 users, and for this group, show the top-20 recommendations, i.e., the 20 movies with the highest prediction scores that the least misery method suggest***

In [None]:
df_least_misery = pd.DataFrame(list(zip(movie_ids, l_misery)), columns=['Movie Id', 'Rating'])
pd.merge(df_least_misery.sort_values(by='Rating', ascending=False)[:20], df_movies, left_on="Movie Id", right_on="movieId").drop("movieId", 1)

Unnamed: 0,Movie Id,Rating,title,year
0,4158,3.657139,Monkeybone,2001
1,169984,3.579269,Alien: Covenant,2017
2,167746,3.394417,The Lego Batman Movie,2017
3,93363,3.311979,John Carter,2012
4,78039,2.921451,Blue Valentine,2010
5,79057,2.864718,Predators,2010
6,149406,2.755928,Kung Fu Panda 3,2016
7,48322,2.721634,Jackass Number Two,2006
8,6006,2.622691,Just Married,2003
9,55995,2.523432,Beowulf,2007


## ***Kendal Tau Distance***

In [None]:
# List of indexes for the movies suggested by the least misery method
least_misery_index = df_least_misery.sort_values(by='Rating', ascending=False).index.to_list()

In [None]:
# List of indexes for the movies suggested by the average method
average_index = df_average.sort_values(by='Rating', ascending=False).index.to_list()

In [None]:
# This block of code calculates the indexes of movies - predicted for each user in group. for this assignment there are three users in group, therefore 
# last line of code of this block generates dynamic variables for three of users and assigns the indexes of movies.
for user in range(1, len(prediction_of_movies)+1):
  df_user = pd.DataFrame.from_dict(prediction_of_movies[user], orient='index')
  df_user.columns = ['rating']
  df_user['movieId'] = df_user.index
  df_user.index = range(len(prediction_of_movies[user]))
  df_user = df_user[(df_user['rating'] >= 0) & (df_user['rating'] <= 5)].sort_values(by='rating', ascending=False)
  df_user = df_user[:99].index.to_list()
  globals()[f'user_{user}_index'] = df_user

In [None]:
#This is the method to calcculate the Kendal Tau Distance between the two ranked lists.

from itertools import combinations

def kendal_tau_distance(group_recom, user_recom):

  total_combinations = combinations(range(0, len(group_recom)), 2)
  
  kendal_distance = 0
  
  for x,y in total_combinations:
    a = group_recom[x] - group_recom[y]
    b = user_recom[x] - user_recom[y]

    #calculating disagreements or discordants
    if (a*b) < 0:
      kendal_distance += 1

  return kendal_distance

### ***Use again the group of 3 users, and for this group, show the top-20 recommendations, i.e., the 20 movies with the highest prediction scores that your method suggests***

In [None]:
average_kendal_distance = kendal_tau_distance(average_index, user_1_index) + kendal_tau_distance(average_index, user_2_index) + kendal_tau_distance(average_index, user_3_index)
least_misery_kendal_distance = kendal_tau_distance(least_misery_index, user_1_index) + kendal_tau_distance(least_misery_index, user_2_index) + kendal_tau_distance(least_misery_index, user_3_index)

if average_kendal_distance < least_misery_kendal_distance:
  print("After calculating the Kendal Tau Distance, it is obvious that that we should select Average Aggregation Method instead of Least Misery Aggregation Method. Therefore, the top 20 movies recommended by the Average Aggregation Method are:")
  print(pd.merge(df_average.sort_values(by='Rating', ascending=False)[:20], df_movies, left_on="Movie Id", right_on="movieId").drop("movieId", 1))
else:
  print("After calculating the Kendal Tau Distance, it is obvious that that we should select Least Misery Aggregation Method instead of Average Aggregation Method. Therefore, the top 20 movie recommended by the Least Misery Aggregation Method are:")
  print(pd.merge(df_least_misery.sort_values(by='Rating', ascending=False)[:20], df_movies, left_on="Movie Id", right_on="movieId").drop("movieId", 1))

After calculating the Kendal Tau Distance, it is obvious that that we should select Least Misery Aggregation Method instead of Average Aggregation Method. Therefore, the top 20 movie recommended by the Least Misery Aggregation Method are:
    Movie Id    Rating                   title  year
0       4158  3.657139             Monkeybone   2001
1     169984  3.579269        Alien: Covenant   2017
2     167746  3.394417  The Lego Batman Movie   2017
3      93363  3.311979            John Carter   2012
4      78039  2.921451         Blue Valentine   2010
5      79057  2.864718              Predators   2010
6     149406  2.755928        Kung Fu Panda 3   2016
7      48322  2.721634     Jackass Number Two   2006
8       6006  2.622691           Just Married   2003
9      55995  2.523432                Beowulf   2007
10    140247  2.522913               The Gift   2015
11     95875  2.520428           Total Recall   2012
12    104241  2.438094             Kick-Ass 2   2013
13    127198  2.401