In [1]:
import pandas as pd
import random

### Read the data

In [2]:
movies_df = pd.read_csv('mymovies.csv')
ratings_df = pd.read_csv('myratings.csv')

### Select the data
The recommender system should avoid bias, for example, the recommender system should not recommend movie with just 1 rating which is also a 5-star rating. But should recommend movies with more ratings.
Therefore, we only take into account movies with at least 200 ratings and users who have at least rated 50 movies.

In [3]:
user_threshold = 50
movie_threshold = 200
filtered_users = ratings_df['user'].value_counts()>=user_threshold
filtered_users = filtered_users[filtered_users].index.tolist()

filtered_movies = ratings_df['item'].value_counts()>=movie_threshold
filtered_movies = filtered_movies[filtered_movies].index.tolist()

filtered_df = ratings_df[(ratings_df['user'].isin(filtered_users)) & (ratings_df['item'].isin(filtered_movies))]

In [4]:
display(filtered_df)

Unnamed: 0.1,Unnamed: 0,user,item,rating,timestamp
0,0,1,1,4.0,964982703
3,3,1,47,5.0,964983815
4,4,1,50,5.0,964982931
7,7,1,110,4.0,964982176
15,15,1,260,5.0,964981680
...,...,...,...,...,...
99607,99607,610,1196,5.0,1479544565
99609,99609,610,1198,5.0,1479545833
99684,99684,610,2571,5.0,1479545822
99690,99690,610,2858,3.5,1479545841


### Select a group of n random users
Here we let n = 5, we select 5 random users from the filtered dataset

In [5]:
#Select a random group of user
user_ids = filtered_df['user'].unique()
group_users_ids = random.sample(list(user_ids), 5)

In [6]:
group_users_ids

[265, 466, 489, 169, 199]

### Select rated and unrated movies for the given group
We now can get the rated movies all users in the groups, and from that, we can also get the unrated movies for the whole group of 5 

In [7]:
selected_group_rating = ratings_df.loc[ratings_df['user'].isin(group_users_ids)]
group_rated_movies_ids = selected_group_rating['item'].unique()
group_unrated_movies_ids = set(movies_df['item']) - set(group_rated_movies_ids)
group_rated_movies_df = movies_df.loc[movies_df['item'].isin(group_rated_movies_ids)]
group_unrated_movies_df = movies_df.loc[movies_df['item'].isin(group_unrated_movies_ids)]

In [8]:
group_rated_movies_df

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
0,0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy,In a world where toys are living things who pr...
1,1,2,jumanji,1995,Adventure|Children|Fantasy,"In 1869, near Brantford, New Hampshire, two br..."
2,2,3,grumpier old men,1995,Comedy|Romance,The feud between Max (Walter Matthau) and John...
4,4,5,father of the bride part ii,1995,Comedy,The film begins five years after the events of...
5,5,6,heat,1995,Action|Crime|Thriller,"Neil McCauley, a career criminal, hires Waingr..."
...,...,...,...,...,...,...
4373,4373,111759,edge of tomorrow,2014,Action|Sci-Fi|IMAX,"In 2015, an alien race called Mimics arrive in..."
4380,4380,112175,how to train your dragon 2,2014,Action|Adventure|Animation,Five years after the Viking village of Berk an...
4390,4390,112556,gone girl,2014,Drama|Thriller,"The day of their fifth wedding anniversary, wr..."
4396,4396,112852,guardians of the galaxy,2014,Action|Adventure|Sci-Fi,"In 1988, following his mother's death, a young..."


In [9]:
group_unrated_movies_df

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
3,3,4,waiting to exhale,1995,Comedy|Drama|Romance,"""Friends are the People who let you be yoursel..."
7,7,8,tom and huck,1995,Adventure|Children,The movie opens with Injun Joe (Eric Schweig) ...
8,8,9,sudden death,1995,Action,Darren McCord (Jean-Claude Van Damme) is a Fre...
10,10,12,dracula: dead and loving it,1995,Comedy|Horror,Solicitor Thomas Renfield travels all the way ...
11,11,13,balto,1995,Adventure|Animation|Children,"In New York City, an elderly woman, her grandd..."
...,...,...,...,...,...,...
4778,4778,181315,phantom thread,2017,Drama|Romance,"In 1954 London, renowned fashion designer Reyn..."
4779,4779,182823,bright,2017,Action|Crime|Fantasy,"In an alternate present, humans live in uneasy..."
4780,4780,191005,gintama,2017,Action|Adventure|Comedy|Sci-Fi,Yorozuya receives two similar and ultimately c...
4781,4781,193573,love live! the school idol movie,2015,Animation,The movie begins with a scene from the second ...


### Calculate expected ratings for unrated movies
For each users, we need to calculate the expected ratings for the user's unrated movies. To calculate unrated ratings, we first need to train
an algorithm, here, the SVD algorithm from Surprise is used


In [10]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

We perform 5-fold cross validation on the whole ratings dataset to see how well SVD will perform

In [11]:
reader = Reader()
data = Dataset.load_from_df(ratings_df[['user', 'item', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8718  0.8803  0.8708  0.8758  0.8715  0.8740  0.0036  
MAE (testset)     0.6723  0.6770  0.6672  0.6725  0.6691  0.6716  0.0034  
Fit time          11.30   11.82   11.45   11.42   11.44   11.49   0.17    
Test time         0.27    0.33    0.30    0.32    0.25    0.29    0.03    


{'test_rmse': array([0.87179957, 0.88031035, 0.8707583 , 0.87577533, 0.87153836]),
 'test_mae': array([0.6722869 , 0.67702008, 0.66718558, 0.672534  , 0.66913573]),
 'fit_time': (11.298502683639526,
  11.816110610961914,
  11.453230142593384,
  11.422445297241211,
  11.436149835586548),
 'test_time': (0.27227091789245605,
  0.3291206359863281,
  0.29821109771728516,
  0.31566762924194336,
  0.2503325939178467)}

Next, We train the SVD model on the dataset

In [12]:
trainset = data.build_full_trainset()
svd = svd.fit(trainset)

In [13]:
def predict(user):
    unrated_movies = list(group_unrated_movies_df['item'].unique())
    pred = pd.DataFrame()
    i = 0
    for item in unrated_movies:
        pred = pred.append({'item': item, 'predicted_rating':svd.predict(user, item)[3]}, ignore_index=True)
    return pred
    


In [14]:
users_rating = []
for user in group_users_ids:
    prediction = predict(user)
    prediction = prediction.sort_values('predicted_rating')
    prediction = prediction.merge(movies_df, on= 'item')
    users_rating.append(prediction[['item','title','predicted_rating']])

The algorithm will iterate through 5 users, for each user, it will calculate the predicted rating for each unrated movie. Then the algorithm combines the predicted ratings of 5 users into one big dataset, to perform aggregation calculation

In [15]:
final = pd.concat([df for df in users_rating], ignore_index = True)

### Additive Strategy

In [16]:
additive = final.copy()
additive= additive.groupby(['item','title']).sum()
additive = additive.sort_values(by="predicted_rating", ascending=False).reset_index()
additive

Unnamed: 0,item,title,predicted_rating
0,1673.0,boogie nights,21.322259
1,1248.0,touch of evil,20.820506
2,951.0,his girl friday,20.767150
3,1041.0,secrets & lies,20.727145
4,3030.0,yojimbo,20.721459
...,...,...,...
4000,546.0,super mario bros.,11.862595
4001,2798.0,problem child,11.631383
4002,2338.0,i still know what you did last summer,11.542072
4003,5323.0,jason x,11.428916


### Most Pleasure Strategy

In [17]:
most_pleasure = final.copy()

In [18]:
most_pleasure = final.copy()
most_pleasure= most_pleasure.groupby(['item','title']).max()
most_pleasure = most_pleasure.sort_values(by="predicted_rating", ascending=False).reset_index()
most_pleasure

Unnamed: 0,item,title,predicted_rating
0,1283.0,high noon,4.942140
1,8132.0,gladiator,4.934007
2,6787.0,all the president's men,4.884430
3,112552.0,whiplash,4.882014
4,72226.0,fantastic mr. fox,4.878527
...,...,...,...
4000,3997.0,dungeons & dragons,3.090563
4001,312.0,stuart saves his family,3.089474
4002,546.0,super mario bros.,3.025822
4003,1556.0,speed 2: cruise control,3.018573


### Least Misery Strategy

In [19]:
least_misery = final.copy()
least_misery = final.copy()
least_misery= least_misery.groupby(['item','title']).min()
least_misery = least_misery.sort_values(by="predicted_rating", ascending=False).reset_index()
least_misery

Unnamed: 0,item,title,predicted_rating
0,1673.0,boogie nights,3.943918
1,951.0,his girl friday,3.924816
2,1248.0,touch of evil,3.868445
3,930.0,notorious,3.804483
4,3030.0,yojimbo,3.801536
...,...,...,...
4000,2338.0,i still know what you did last summer,1.640643
4001,1556.0,speed 2: cruise control,1.621891
4002,2720.0,inspector gadget,1.574434
4003,1831.0,lost in space,1.482880


In [29]:
def gen_rec_and_explain():
    most_pleasure = final.copy()
    most_pleasure= most_pleasure.groupby(['item','title']).max()
    most_pleasure = most_pleasure.sort_values(by="predicted_rating", ascending=False).reset_index()
    most_pleasure_movie = most_pleasure.iloc[0:5]['title']
    least_misery = final.copy()
    least_misery= least_misery.groupby(['item','title']).min()
    least_misery = least_misery.sort_values(by="predicted_rating", ascending=False).reset_index()
    least_misery_movie = least_misery.iloc[0:5]['title']
    additive = final.copy()
    additive= additive.groupby(['item','title']).sum()
    additive = additive.sort_values(by="predicted_rating", ascending=False).reset_index()
    additive_movie = additive.iloc[0:5]['title']
    print("#ADD: The movies: {} was recommended to you because they have highest additive rating within your group".format(additive_movie))
    print("#LEAST: The movies: {} was recommended to you because they are everyones' preferences ".format(least_misery_movie))
    print("#MOST: The movies: {} was recommended to you because they are the most loved".format(most_pleasure_movie))


In [30]:
gen_rec_and_explain()

#ADD: The movies: 0      boogie nights
1      touch of evil
2    his girl friday
3     secrets & lies
4            yojimbo
Name: title, dtype: object was recommended to you because they have highest additive rating within your group
#LEAST: The movies: 0      boogie nights
1    his girl friday
2      touch of evil
3          notorious
4            yojimbo
Name: title, dtype: object was recommended to you because they have highest rating within your group
#MOST: The movies: 0                  high noon
1                  gladiator
2    all the president's men
3                   whiplash
4          fantastic mr. fox
Name: title, dtype: object was recommended to you because they are the most loved


In [22]:
import itertools
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(ratings_df)
group_unseen_df = pd.DataFrame(list(itertools.product(group_users_ids, group_unrated_movies_ids)), columns=['user', 'item'])
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)
group_unseen_df = group_unseen_df.loc[group_unseen_df['predicted_rating'].notnull()]
display(group_unseen_df)

Numba is using threading layer omp - consider TBB
BLAS using multiple threads - can cause oversubscription
found 2 potential runtime problems - see https://boi.st/lkpy-perf


Unnamed: 0,user,item,predicted_rating
0,265,4,2.175135
1,265,8,2.767044
2,265,9,2.803275
4,265,12,2.311921
5,265,13,3.084180
...,...,...,...
20014,199,180095,3.001189
20016,199,81847,3.510751
20018,199,98243,3.051744
20019,199,131013,2.829446


In [23]:
group_unseen_df

Unnamed: 0,user,item,predicted_rating
0,265,4,2.175135
1,265,8,2.767044
2,265,9,2.803275
4,265,12,2.311921
5,265,13,3.084180
...,...,...,...
20014,199,180095,3.001189
20016,199,81847,3.510751
20018,199,98243,3.051744
20019,199,131013,2.829446


In [24]:
group_unseen_df.groupby('item').sum()

Unnamed: 0_level_0,user,predicted_rating
item,Unnamed: 1_level_1,Unnamed: 2_level_1
4,1588,12.633326
8,1588,13.379248
9,1122,10.947870
12,1122,10.570121
13,1588,17.109049
...,...,...
179819,1588,15.719347
180031,1588,19.870230
180095,857,10.886995
180985,1122,13.236003


In [25]:
additive_df = group_unseen_df.groupby('item').sum()
additive_df = additive_df.join(movies_df['title'], on='item')
additive_df = additive_df.sort_values(by="predicted_rating", ascending=False).reset_index()[['item', 'title', 'predicted_rating']]
display(additive_df.head(10))

Unnamed: 0,item,title,predicted_rating
0,3030,sky captain and the world of tomorrow,22.509856
1,8132,,22.218595
2,3201,double dragon,22.1922
3,3925,grown ups,22.172686
4,945,sixteen candles,22.047272
5,905,in search of the castaways,22.026679
6,140174,,21.987155
7,1279,thumbelina,21.987122
8,3549,nancy drew,21.966022
9,101,mallrats,21.829995


In [26]:
additive_df = group_unseen_df.groupby('item').sum()

In [27]:
additive_df

Unnamed: 0_level_0,user,predicted_rating
item,Unnamed: 1_level_1,Unnamed: 2_level_1
4,1588,12.633326
8,1588,13.379248
9,1122,10.947870
12,1122,10.570121
13,1588,17.109049
...,...,...
179819,1588,15.719347
180031,1588,19.870230
180095,857,10.886995
180985,1122,13.236003


In [28]:
movies_df.loc[movies_df['item'] == 177593]

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
4753,4753,177593,"three billboards outside ebbing, missouri",2017,Crime|Drama,"In the town of Ebbing, Missouri, Mildred Hayes..."
