In [1]:
import pandas as pd
import random

### Read the data

In [2]:
movies_df = pd.read_csv('mymovies.csv')
ratings_df = pd.read_csv('myratings.csv')

### Select the data
The recommender system should avoid bias, for example, the recommender system should not recommend movie with just 1 rating which is also a 5-star rating. But should recommend movies with more ratings.
Therefore, we only take into account movies with at least 200 ratings and users who have at least rated 50 movies.

In [3]:
user_threshold = 50
movie_threshold = 200
filtered_users = ratings_df['user'].value_counts()>=user_threshold
filtered_users = filtered_users[filtered_users].index.tolist()

filtered_movies = ratings_df['item'].value_counts()>=movie_threshold
filtered_movies = filtered_movies[filtered_movies].index.tolist()

filtered_df = ratings_df[(ratings_df['user'].isin(filtered_users)) & (ratings_df['item'].isin(filtered_movies))]

In [4]:
display(filtered_df)

Unnamed: 0.1,Unnamed: 0,user,item,rating,timestamp
0,0,1,1,4.0,964982703
3,3,1,47,5.0,964983815
4,4,1,50,5.0,964982931
7,7,1,110,4.0,964982176
15,15,1,260,5.0,964981680
...,...,...,...,...,...
99607,99607,610,1196,5.0,1479544565
99609,99609,610,1198,5.0,1479545833
99684,99684,610,2571,5.0,1479545822
99690,99690,610,2858,3.5,1479545841


### Select a group of n random users
Here we let n = 5, we select 5 random users from the filtered dataset

In [5]:
#Select a random group of user
user_ids = filtered_df['user'].unique()
group_users_ids = random.sample(list(user_ids), 5)

In [6]:
group_users_ids

[387, 384, 460, 76, 151]

### Select rated and unrated movies for the given group
We now can get the rated movies all users in the groups, and from that, we can also get the unrated movies for the whole group of 5 

In [7]:
selected_group_rating = ratings_df.loc[ratings_df['user'].isin(group_users_ids)]
group_rated_movies_ids = selected_group_rating['item'].unique()
group_unrated_movies_ids = set(movies_df['item']) - set(group_rated_movies_ids)
group_rated_movies_df = movies_df.loc[movies_df['item'].isin(group_rated_movies_ids)]
group_unrated_movies_df = movies_df.loc[movies_df['item'].isin(group_unrated_movies_ids)]

In [8]:
group_rated_movies_df

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
0,0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy,In a world where toys are living things who pr...
2,2,3,grumpier old men,1995,Comedy|Romance,The feud between Max (Walter Matthau) and John...
8,8,9,sudden death,1995,Action,Darren McCord (Jean-Claude Van Damme) is a Fre...
9,9,10,goldeneye,1995,Action|Adventure|Thriller,"In 1986, at Arkhangelsk, MI6 agents James Bond..."
10,10,12,dracula: dead and loving it,1995,Comedy|Horror,Solicitor Thomas Renfield travels all the way ...
...,...,...,...,...,...,...
3976,3976,82461,tron: legacy,2010,Action|Adventure|Sci-Fi|IMAX,"In 1989, seven years after the events of the f..."
4009,4009,85414,source code,2011,Action|Drama|Mystery|Sci-Fi|Thriller,U.S. Army pilot Captain Colter Stevens wakes u...
4064,4064,89039,another earth,2011,Drama|Romance|Sci-Fi,"Rhoda Williams (Brit Marling), a brilliant 17-..."
4217,4217,99114,django unchained,2012,Action|Drama|Western,"In 1858 Texas, the Speck brothers, Ace and Dic..."


In [9]:
group_unrated_movies_df

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
1,1,2,jumanji,1995,Adventure|Children|Fantasy,"In 1869, near Brantford, New Hampshire, two br..."
3,3,4,waiting to exhale,1995,Comedy|Drama|Romance,"""Friends are the People who let you be yoursel..."
4,4,5,father of the bride part ii,1995,Comedy,The film begins five years after the events of...
5,5,6,heat,1995,Action|Crime|Thriller,"Neil McCauley, a career criminal, hires Waingr..."
6,6,7,sabrina,1995,Comedy|Romance,Sabrina Fairchild is the young daughter of the...
...,...,...,...,...,...,...
4778,4778,181315,phantom thread,2017,Drama|Romance,"In 1954 London, renowned fashion designer Reyn..."
4779,4779,182823,bright,2017,Action|Crime|Fantasy,"In an alternate present, humans live in uneasy..."
4780,4780,191005,gintama,2017,Action|Adventure|Comedy|Sci-Fi,Yorozuya receives two similar and ultimately c...
4781,4781,193573,love live! the school idol movie,2015,Animation,The movie begins with a scene from the second ...


### Calculate expected ratings for unrated movies
For each users, we need to calculate the expected ratings for the user's unrated movies. To calculate unrated ratings, we first need to train
an algorithm, here, the SVD algorithm from Surprise is used


In [10]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

We perform 5-fold cross validation on the whole ratings dataset to see how well SVD will perform

In [11]:
reader = Reader()
data = Dataset.load_from_df(ratings_df[['user', 'item', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8776  0.8791  0.8701  0.8678  0.8708  0.8731  0.0044  
MAE (testset)     0.6764  0.6764  0.6677  0.6680  0.6679  0.6713  0.0042  
Fit time          9.63    9.58    9.44    9.52    9.58    9.55    0.06    
Test time         0.17    0.16    0.22    0.23    0.16    0.19    0.03    


{'test_rmse': array([0.87758208, 0.87909362, 0.87014683, 0.86776303, 0.87083819]),
 'test_mae': array([0.67641433, 0.67644787, 0.66772578, 0.66804896, 0.66789821]),
 'fit_time': (9.628665208816528,
  9.582191944122314,
  9.44251823425293,
  9.518941164016724,
  9.582505702972412),
 'test_time': (0.1744983196258545,
  0.16363525390625,
  0.21941709518432617,
  0.22539639472961426,
  0.160567045211792)}

Next, We train the SVD model on the dataset

In [12]:
trainset = data.build_full_trainset()
svd = svd.fit(trainset)

In [13]:
def predict(user):
    unrated_movies = list(group_unrated_movies_df['item'].unique())
    pred = pd.DataFrame()
    i = 0
    for item in unrated_movies:
        pred = pred.append({'user':user,'item': item, 'predicted_rating':svd.predict(user, item)[3]}, ignore_index=True)
    return pred
    


In [14]:
users_rating = []
for user in group_users_ids:
    prediction = predict(user)
    prediction = prediction.sort_values('predicted_rating')
    prediction = prediction.merge(movies_df, on= 'item')
    users_rating.append(prediction[['user','item','title','predicted_rating']])

The algorithm will iterate through 5 users, for each user, it will calculate the predicted rating for each unrated movie. Then the algorithm combines the predicted ratings of 5 users into one big dataset, to perform aggregation calculation

In [15]:
final = pd.concat([df for df in users_rating], ignore_index = True)

In [16]:
final

Unnamed: 0,user,item,title,predicted_rating
0,387.0,1556.0,speed 2: cruise control,1.602285
1,387.0,1381.0,grease 2,1.647232
2,387.0,2720.0,inspector gadget,1.724539
3,387.0,312.0,stuart saves his family,1.740207
4,387.0,1499.0,anaconda,1.755109
...,...,...,...,...
20350,151.0,1041.0,secrets & lies,4.510175
20351,151.0,3037.0,little big man,4.518217
20352,151.0,1225.0,amadeus,4.523309
20353,151.0,168252.0,logan,4.594735


### Additive Strategy

In [17]:
additive = final.copy()
additive= additive.groupby(['item','title']).sum()
additive = additive.sort_values(by="predicted_rating", ascending=False).reset_index()
additive

Unnamed: 0,item,title,user,predicted_rating
0,7361.0,eternal sunshine of the spotless mind,1458.0,21.098038
1,38061.0,kiss kiss bang bang,1458.0,20.796788
2,3451.0,guess who's coming to dinner,1458.0,20.648788
3,1217.0,ran,1458.0,20.345213
4,527.0,schindler's list,1458.0,20.270402
...,...,...,...,...
4066,312.0,stuart saves his family,1458.0,10.533393
4067,1882.0,godzilla,1458.0,10.319109
4068,3593.0,battlefield earth,1458.0,10.100776
4069,2720.0,inspector gadget,1458.0,10.057501


### Most Pleasure Strategy

In [18]:
most_pleasure = final.copy()

In [19]:
most_pleasure = final.copy()
most_pleasure= most_pleasure.groupby(['item','title']).max()
most_pleasure = most_pleasure.sort_values(by="predicted_rating", ascending=False).reset_index()
most_pleasure

Unnamed: 0,item,title,user,predicted_rating
0,38061.0,kiss kiss bang bang,460.0,4.744235
1,1225.0,amadeus,460.0,4.694262
2,112552.0,whiplash,460.0,4.650024
3,527.0,schindler's list,460.0,4.645993
4,2028.0,saving private ryan,460.0,4.620220
...,...,...,...,...
4066,1499.0,anaconda,460.0,2.812151
4067,5323.0,jason x,460.0,2.732004
4068,1882.0,godzilla,460.0,2.688639
4069,8666.0,catwoman,460.0,2.681616


### Least Misery Strategy

In [20]:
least_misery = final.copy()
least_misery = final.copy()
least_misery= least_misery.groupby(['item','title']).min()
least_misery = least_misery.sort_values(by="predicted_rating", ascending=False).reset_index()
least_misery

Unnamed: 0,item,title,user,predicted_rating
0,7361.0,eternal sunshine of the spotless mind,76.0,3.874389
1,3451.0,guess who's coming to dinner,76.0,3.807311
2,1246.0,dead poets society,76.0,3.607031
3,1217.0,ran,76.0,3.567273
4,6787.0,all the president's men,76.0,3.555870
...,...,...,...,...
4066,1882.0,godzilla,76.0,1.484548
4067,1556.0,speed 2: cruise control,76.0,1.482402
4068,3593.0,battlefield earth,76.0,1.471684
4069,312.0,stuart saves his family,76.0,1.272409


In [37]:
def fairness():
    titles = []
    for uid in group_users_ids:
        data = final.loc[final['user'] == uid]
        data = data.sort_values(by = 'predicted_rating', ascending = False).reset_index().iloc[0]['title']
        titles.append([uid,data])
    return titles

In [38]:
tt = fairness()
print(tt)

[[387, '12 angry men'], [384, 'eternal sunshine of the spotless mind'], [460, 'amadeus'], [76, "guess who's coming to dinner"], [151, 'kiss kiss bang bang']]


In [48]:
def gen_rec_and_explain():
    most_pleasure = final.copy()
    most_pleasure= most_pleasure.groupby(['item','title']).max()
    most_pleasure = most_pleasure.sort_values(by="predicted_rating", ascending=False).reset_index()
    most_pleasure_movie = most_pleasure.iloc[0:5]['title']
    least_misery = final.copy()
    least_misery= least_misery.groupby(['item','title']).min()
    least_misery = least_misery.sort_values(by="predicted_rating", ascending=False).reset_index()
    least_misery_movie = least_misery.iloc[0:5]['title']
    additive = final.copy()
    additive= additive.groupby(['item','title']).sum()
    additive = additive.sort_values(by="predicted_rating", ascending=False).reset_index()
    additive_movie = additive.iloc[0:5]['title']
    fairnesss = fairness()
    print("#FAIR")
    for uid, title in fairnesss:
        print("The movie {} is the most favorite movie of user {}".format(title, uid))
    print("#ADD: ")
    print("The movies: {} was recommended to you because they have highest additive rating within your group".format(list(additive_movie)))
    print("#LEAST: ")
    print("The movies: {} was recommended to you because they are everyones' preferences ".format(list(least_misery_movie)))
    print("#MOST: ")
    print("The movies: {} was recommended to you because they are the most loved".format(list(most_pleasure_movie)))


In [49]:
gen_rec_and_explain()

#FAIR
The movie 12 angry men is the most favorite movie of user 387
The movie eternal sunshine of the spotless mind is the most favorite movie of user 384
The movie amadeus is the most favorite movie of user 460
The movie guess who's coming to dinner is the most favorite movie of user 76
The movie kiss kiss bang bang is the most favorite movie of user 151
#ADD: 
The movies: ['eternal sunshine of the spotless mind', 'kiss kiss bang bang', "guess who's coming to dinner", 'ran', "schindler's list"] was recommended to you because they have highest additive rating within your group
#LEAST: 
The movies: ['eternal sunshine of the spotless mind', "guess who's coming to dinner", 'dead poets society', 'ran', "all the president's men"] was recommended to you because they are everyones' preferences 
#MOST: 
The movies: ['kiss kiss bang bang', 'amadeus', 'whiplash', "schindler's list", 'saving private ryan'] was recommended to you because they are the most loved


In [None]:
import itertools
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(ratings_df)
group_unseen_df = pd.DataFrame(list(itertools.product(group_users_ids, group_unrated_movies_ids)), columns=['user', 'item'])
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)
group_unseen_df = group_unseen_df.loc[group_unseen_df['predicted_rating'].notnull()]
display(group_unseen_df)

In [None]:
group_unseen_df

In [None]:
group_unseen_df.groupby('item').sum()

In [None]:
additive_df = group_unseen_df.groupby('item').sum()
additive_df = additive_df.join(movies_df['title'], on='item')
additive_df = additive_df.sort_values(by="predicted_rating", ascending=False).reset_index()[['item', 'title', 'predicted_rating']]
display(additive_df.head(10))

In [None]:
additive_df = group_unseen_df.groupby('item').sum()

In [None]:
additive_df

In [None]:
movies_df.loc[movies_df['item'] == 177593]