In [129]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [130]:
pip install tqdm



In [131]:
from tqdm.notebook import tqdm

In [132]:
cd /content/drive/MyDrive/Recommender Systems/ml-latest-small/ml-latest-small

/content/drive/MyDrive/Recommender Systems/ml-latest-small/ml-latest-small


In [133]:
ls

links.csv  movies.csv  ratings.csv  README.txt  tags.csv


In [134]:
import pandas as pd
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
import statistics 
%matplotlib inline

In [135]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [136]:
df_ratings.groupby(['userId']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

Unnamed: 0,userId,counts
413,414,2698
598,599,2478
473,474,2108
447,448,1864
273,274,1346
...,...,...
441,442,20
568,569,20
319,320,20
575,576,20


In [137]:
df_ratings.shape[0]

100836

In [138]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [139]:
df_movies['year'] = df_movies.title.str.slice(-5,-1)

In [140]:
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [141]:
df_movies.title = df_movies.title.str.replace('\(\d\d\d\d\)',"")

In [142]:
df_movies.head(n=20)

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
5,6,Heat,Action|Crime|Thriller,1995
6,7,Sabrina,Comedy|Romance,1995
7,8,Tom and Huck,Adventure|Children,1995
8,9,Sudden Death,Action,1995
9,10,GoldenEye,Action|Adventure|Thriller,1995


In [143]:
#It's time to drop the genre column as we don't need it
df_movies.drop('genres', 1, inplace=True)

In [144]:
df_movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [145]:
# Same goes for timestamp column in rating dataframe
df_ratings.drop('timestamp', 1, inplace=True)

In [146]:
df_ratings.shape

(100836, 3)

In [147]:
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


## ***GROUP RECOMMENDATIONS***

In [148]:
group_of_users = [414, 599, 474]

In [149]:
df_ratings.loc[df_ratings['userId'].isin(group_of_users)]

Unnamed: 0,userId,movieId,rating
62294,414,1,4.0
62295,414,2,3.0
62296,414,3,4.0
62297,414,5,2.0
62298,414,6,3.0
...,...,...,...
95096,599,179817,3.0
95097,599,180031,3.5
95098,599,180297,3.0
95099,599,181315,3.5


In [150]:
df_group = df_ratings.loc[df_ratings['userId'].isin(group_of_users)]

In [151]:
df_group

Unnamed: 0,userId,movieId,rating
62294,414,1,4.0
62295,414,2,3.0
62296,414,3,4.0
62297,414,5,2.0
62298,414,6,3.0
...,...,...,...
95096,599,179817,3.0
95097,599,180031,3.5
95098,599,180297,3.0
95099,599,181315,3.5


In [152]:
df_group = pd.merge(df_group ,df_movies[df_movies['movieId'].isin(df_group['movieId'])]).drop('year',1)

In [153]:
df_group

Unnamed: 0,userId,movieId,rating,title
0,414,1,4.0,Toy Story
1,474,1,4.0,Toy Story
2,599,1,3.0,Toy Story
3,414,2,3.0,Jumanji
4,474,2,3.0,Jumanji
...,...,...,...,...
7279,599,178129,3.5,Adventures in Plymptoons!
7280,599,179053,3.5,2048: Nowhere to Run
7281,599,180297,3.0,The Disaster Artist
7282,599,181315,3.5,Phantom Thread


In [154]:
# looking for those users who have watched the same movies like our users in group

similar_users = df_ratings[df_ratings['movieId'].isin(df_group['movieId']).tolist()]
similar_users = similar_users[~similar_users.userId.isin(group_of_users)]

In [155]:
similar_users

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100829,610,164179,5.0
100830,610,166528,4.0
100832,610,168248,5.0
100833,610,168250,5.0


In [156]:
similar_users = similar_users.groupby(['userId'])

In [157]:
# Most similar user groups sorted
similar_users = sorted(similar_users,  key=lambda x: len(x[1]), reverse=True)

In [158]:
def similar_users_fn(user, df_data):
    similar_users = df_data[df_data['userId'] == user]
    similar_users = df_ratings[df_ratings['movieId'].isin(similar_users['movieId']).tolist()]
    similar_users = similar_users[~similar_users.userId.isin([user])]
    similar_users = similar_users.groupby(['userId'])
    similar_users = sorted(similar_users,  key=lambda x: len(x[1]), reverse=True)
    return similar_users

In [159]:
def pearson_coefficient(user, df_data):
  pearson_coff = {}
  similar_users = similar_users_fn(user, df_data)
  for name, group in similar_users:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    input_user = df_data[df_data['userId'] == user].sort_values(by='movieId')

    total_ratings = len(group)
    # Review scores for the movies that they both have in common
    temp_df = input_user[input_user['movieId'].isin(group['movieId'].tolist())]
    user_in_ques_ratings = temp_df['rating'].tolist()
    # Let's choose the simple variable name i.e. 'x'
    x = user_in_ques_ratings
    x_mean = statistics.mean(x)
    x_minus_x_mean = list((i-x_mean for i in x))
    x_minus_x_mean_squre = [i*i for i in x_minus_x_mean]
    sum_of_x_minus_x_mean_square = sum(x_minus_x_mean_squre)

    #Let's also put the current user group reviews in a list format
    ratings_of_similar_user = group['rating'].tolist()
    # Let's choose the simple variable name i.e. 'y'
    y = ratings_of_similar_user
    y_mean = statistics.mean(y)
    y_minus_y_mean = list((i-y_mean for i in y))
    y_minus_y_mean_square = [i*i for i in y_minus_y_mean]
    sum_of_y_minus_y_mean_square = sum(y_minus_y_mean_square)

    x_minus_x_mean_mul_y_minus_y_mean = (i*j for i,j in zip(x_minus_x_mean, y_minus_y_mean))
    sum_of_x_minus_x_mean_mul_y_minus_y_mean = sum(x_minus_x_mean_mul_y_minus_y_mean)


    denominator = sqrt(sum_of_x_minus_x_mean_square*sum_of_y_minus_y_mean_square)
      
    if denominator != 0:
      pearson_coff[name] = sum_of_x_minus_x_mean_mul_y_minus_y_mean/denominator
    else:
      pearson_coff[name] = 0
  return pearson_coff

## ***Prediction***

In [160]:
# Function to get the most similar users for the user in question.

def get_most_similar_users(user, user_pear_coeff):
  df_pearson_coff = pd.DataFrame.from_dict(user_pear_coeff, orient='index')
  df_pearson_coff.columns = ['SimilarityIndex']
  df_pearson_coff['UserId'] = df_pearson_coff.index
  df_pearson_coff.index = range(len(df_pearson_coff))
  most_similar_users = df_pearson_coff.sort_values(by='SimilarityIndex', ascending=False)
  return most_similar_users

In [161]:
# Function to get the list of movies that are not watched by the user in question but the similar users have watched.

def get_list_of_not_seen_movies(user, user_pear_coeff, df_data):
  most_similar_users = get_most_similar_users(user, user_pear_coeff)
  most_similar_users = most_similar_users.merge(df_ratings, left_on='UserId', right_on='userId', how='inner').drop('userId', 1)
  not_seen_movies = most_similar_users[~most_similar_users['movieId'].isin(df_data[df_data['userId'] == user]['movieId'])].sort_values(by='SimilarityIndex', ascending=False)
  not_seen_movies.sort_values(by='movieId', inplace=True)
  list_of_not_seen_movies = not_seen_movies.movieId.to_list()
  set_of_not_seen_movies = set(list_of_not_seen_movies)
  unique_list_of_not_seen_movies = list(sorted(set_of_not_seen_movies))
  return unique_list_of_not_seen_movies

In [162]:
# mean of ratings of users in group
def list_mean_ratings(user, df_data):
  return (statistics.mean(df_data[df_data['userId'] == user]['rating'].tolist()))


In [163]:
# Function to get the similarity index of all the similar users for the user in question.

def get_list_of_sim_index(user, users_watched_movie, user_pear_coeff):
  most_similar_users = get_most_similar_users(user, user_pear_coeff)
  sim_index = most_similar_users[most_similar_users['UserId'].isin(users_watched_movie)].sort_values(by='UserId')
  return sim_index['SimilarityIndex'].to_list()

In [164]:
# This block of code calculates the prediction score for all the movies that are not watched by the user in a group but similar users have watched these movies.

def prediction_of_movies_for_user(user, user_pear_coeff, df_data):
  prediction_of_movies = {}
  not_seen_movies = get_list_of_not_seen_movies(user, user_pear_coeff, df_data)
  for movie in not_seen_movies:
    users_watched_movie = df_ratings[df_ratings['movieId'] == int(movie)].sort_values(by='userId')
    ratings_of_users_watched_movie = users_watched_movie['rating'].tolist()

    mean_of_ratings_users_watched_movie = statistics.mean(ratings_of_users_watched_movie)

    list_of_sim_index = get_list_of_sim_index(user, users_watched_movie['userId'].to_list(), user_pear_coeff)

    r_bp_minus_mean_of_rb = [i-mean_of_ratings_users_watched_movie for i in ratings_of_users_watched_movie]

    temp_numerator = (i*j for i, j in list(zip(list_of_sim_index, r_bp_minus_mean_of_rb)))

    sum_temp_numerator = sum(temp_numerator)

    numerator_pred = sum_temp_numerator + list_mean_ratings(user, df_data)

    if sum(list_of_sim_index) != 0:
      prediction_of_movie = numerator_pred/sum(list_of_sim_index)
      prediction_of_movies[movie] = prediction_of_movie
    else:
      prediction_of_movies[movie] = 0
  return prediction_of_movies

In [165]:
# This preprocessing method will select the id's of movies predicted for all the users or you can say the this method will select the common predicted 
# movies among the users. For simplicity I am taking 100 movies.
def preprocessing(ratings_of_users_in_group_dic):
  common_movies_id = set()
  movie_ids = []
  for user in ratings_of_users_in_group_dic:
    df_user = pd.DataFrame.from_dict(ratings_of_users_in_group_dic[user], orient='index')
    df_user.columns = ['rating']
    df_user['movieId'] = df_user.index
    df_user.index = range(len(ratings_of_users_in_group_dic[user]))
    movie_ids.append(df_user['movieId'].to_list())
  
  common_movies_id = set(movie_ids[0])
  for li in movie_ids[1:]:
    common_movies_id = common_movies_id.intersection(set(li))
  return sorted(common_movies_id)

## ***Average Aggregation*** 

In [166]:
# This method will take the movie id (predicted movie id from above preprocessing method) and compute the rating for the whole group.
def avg_aggregation(predictions_for_users, movie_id):
  ratings_given_by_users = []
  for user in predictions_for_users:
    ratings_given_by_users.append(predictions_for_users[user][movie_id])
  return statistics.mean(ratings_given_by_users)

## ***Least Misery Aggregation***

In [167]:
# # This method will take the movie id (predicted movie id from above preprocessing method) and compute the rating for the whole group.
def least_misery(predictions_for_users, movie_id):
  ratings_given_by_users = []
  for user in predictions_for_users:
    ratings_given_by_users.append((predictions_for_users[user][movie_id]))
  return min(ratings_given_by_users)

## ***Calculating user satisfaction and group disagreements***

In [168]:
def user_satisfaction_fn(user, Gr_list):
  df = pd.DataFrame.from_dict(user, orient='index')
  df.columns = ['rating']
  df['movieId'] = df.index
  df.index = range(len(user))
  df = df.sort_values(by='rating', ascending=False)
  user_top_ratings = df['rating'][:20].to_list()
  user_top_ratings_movie_ids = df['movieId'].to_list()
  Gr_temp = df[df['movieId'].isin(Gr_list)]
  Gr_temp = Gr_temp['rating'].to_list()
  return (sum(Gr_temp)/sum(user_top_ratings))

## ***Produce a group of 3 users, and for this group, show the top-20 recommendations in 5 different sequences, i.e., the 20 movies with the highest prediction scores in 5 rounds, using the MovieLens 100K rating dataset***

In [169]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

iterations = 5
users_overall_satisfaction = dict()
alpha = 0
remaining_data = df_group

for iter in tqdm(range(1, iterations+1), desc="Processing", unit='keystrokes'):
  # print(f"At iteration {iter} alpha is: ", alpha)
  data_chunk = remaining_data.sample(frac=0.2)
  remaining_data = remaining_data.drop(data_chunk.index)
  predictions_for_users = dict()
  user_satisfaction = []
  user_top_ratings = []
  user_top_ratings_movie_ids = []
  Gr_dict = dict()
  Gr = []
  for user in group_of_users:
    predictions_for_users[user] = {}
    user_pearson_coeff = pearson_coefficient(user, data_chunk)
    predictions_for_users[user] = prediction_of_movies_for_user(user, user_pearson_coeff, data_chunk)
    df = pd.DataFrame.from_dict(predictions_for_users[user], orient='index')
    df.columns = ['rating']
    df['movieId'] = df.index
    df.index = range(len(predictions_for_users[user]))
    df = df.sort_values(by='rating', ascending=False)
    user_top_ratings = df['rating'][:20].to_list()
    user_top_ratings_movie_ids = df['movieId'].to_list()
  Gl = preprocessing(predictions_for_users) # Movie id's for each user with no repetition, it's basically a set!
  if (iter == 1):
    alpha = 0
    for item in Gl:
      Gr_dict[item] = ((1 - alpha) * avg_aggregation(predictions_for_users, item)) + (alpha * least_misery(predictions_for_users, item))
    df_temp = pd.DataFrame.from_dict(Gr_dict, orient='index')
    df_temp.columns = ['rating']
    df_temp['movie id'] = df_temp.index
    df_temp.index = range(len(Gr_dict))
    df_temp = df_temp.sort_values(by='rating', ascending=False)
    Gr = df_temp['movie id'][:20]
    for user in group_of_users:
      user_satisfaction.append(user_satisfaction_fn(predictions_for_users[user], Gr))
    alpha = max(user_satisfaction) - min(user_satisfaction)
  else:
    for item in Gl:
      Gr_dict[item] = ((1 - alpha) * avg_aggregation(predictions_for_users, item)) + (alpha * least_misery(predictions_for_users, item))
    df_temp = pd.DataFrame.from_dict(Gr_dict, orient='index')
    df_temp.columns = ['rating']
    df_temp['movie id'] = df_temp.index
    df_temp.index = range(len(Gr_dict))
    df_temp = df_temp.sort_values(by='rating', ascending=False)
    Gr = df_temp['movie id'][:20]
    for user in group_of_users:
      user_satisfaction.append(user_satisfaction_fn(predictions_for_users[user], Gr))
    alpha = max(user_satisfaction) - min(user_satisfaction)
  print(f"Top 20 recommendations for iteration {iter}\n")
  df_top = pd.DataFrame.from_dict(Gr_dict, orient='index')
  df_top.columns = ['Rating']
  df_top['Movie ID'] = df_top.index
  df_top.index = range(len(Gr_dict))
  df_top['Rating'] = MinMaxScaler(0,5).fit_transform(np.array(df_top['Rating']).reshape(-1,1))
  df_top = df_top.sort_values(by='Rating', ascending=False)[:20].style.hide_index()
  display(df_top)
  print(f"Iteration {iter} completed successfully!\n")

Processing:   0%|          | 0/5 [00:00<?, ?keystrokes/s]

Top 20 recommendations for iteration 1



Rating,Movie ID
4.671059,7982
4.600403,26778
4.578825,3938
4.503297,7225
4.498201,209
4.387486,4862
4.361495,5076
4.337741,3738
4.282002,94018
4.267055,80


Iteration 1 completed successfully!

Top 20 recommendations for iteration 2



Rating,Movie ID
4.700885,27865
4.635211,6022
4.569098,567
4.358051,4428
4.358051,3655
4.276637,5707
4.219956,2962
4.179942,6448
4.153739,3834
4.135863,4146


Iteration 2 completed successfully!

Top 20 recommendations for iteration 3



Rating,Movie ID
4.716891,4442
4.627736,6582
4.610739,2472
4.531541,4152
4.498253,4313
4.498253,3003
4.451755,6185
4.348506,107069
4.304899,808
4.235769,101283


Iteration 3 completed successfully!

Top 20 recommendations for iteration 4



Rating,Movie ID
4.415898,63540
4.411816,96281
4.392878,6143
4.343774,25825
4.277773,42632
4.260276,68159
4.189902,5075
4.188334,44225
4.125528,3370
4.114823,1484


Iteration 4 completed successfully!

Top 20 recommendations for iteration 5



Rating,Movie ID
4.705531,1112
4.554786,1458
4.485574,184791
4.485574,5529
4.344724,69604
4.334555,6798
4.334555,5915
4.272507,178061
4.248458,4256
4.218919,4565


Iteration 5 completed successfully!

