In [1]:
import pandas as pd
import random

### Read the data

In [2]:
movies_df = pd.read_csv('mymovies.csv')
ratings_df = pd.read_csv('myratings.csv')

### Select the data
The recommender system should avoid bias, for example, the recommender system should not recommend movie with just 1 rating which is also a 5-star rating. But should recommend movies with more ratings.
Therefore, we only take into account movies with at least 200 ratings and users who have at least rated 50 movies.

In [3]:
user_threshold = 50
movie_threshold = 200
filtered_users = ratings_df['user'].value_counts()>=user_threshold
filtered_users = filtered_users[filtered_users].index.tolist()

filtered_movies = ratings_df['item'].value_counts()>=movie_threshold
filtered_movies = filtered_movies[filtered_movies].index.tolist()

filtered_df = ratings_df[(ratings_df['user'].isin(filtered_users)) & (ratings_df['item'].isin(filtered_movies))]

In [27]:
display(filtered_df)

Unnamed: 0.1,Unnamed: 0,user,item,rating,timestamp
0,0,1,1,4.0,964982703
3,3,1,47,5.0,964983815
4,4,1,50,5.0,964982931
7,7,1,110,4.0,964982176
15,15,1,260,5.0,964981680
...,...,...,...,...,...
99607,99607,610,1196,5.0,1479544565
99609,99609,610,1198,5.0,1479545833
99684,99684,610,2571,5.0,1479545822
99690,99690,610,2858,3.5,1479545841


### Select a group of n random users
Here we let n = 5, we select 5 random users from the filtered dataset

In [29]:
#Select a random group of user
user_ids = filtered_df['user'].unique()
group_users_ids = random.sample(list(user_ids), 5)

In [30]:
group_users_ids

[582, 534, 585, 16, 301]

### Select rated and unrated movies for the given group
We now can get the rated movies all users in the groups, and from that, we can also get the unrated movies for the whole group of 5 

In [31]:
selected_group_rating = ratings_df.loc[ratings_df['user'].isin(group_users_ids)]
group_rated_movies_ids = selected_group_rating['item'].unique()
group_unrated_movies_ids = set(movies_df['item']) - set(group_rated_movies_ids)
group_rated_movies_df = movies_df.loc[movies_df['item'].isin(group_rated_movies_ids)]
group_unrated_movies_df = movies_df.loc[movies_df['item'].isin(group_unrated_movies_ids)]

In [34]:
group_rated_movies_df

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
0,0,1,toy story,1995,Adventure|Animation|Children|Comedy|Fantasy,In a world where toys are living things who pr...
1,1,2,jumanji,1995,Adventure|Children|Fantasy,"In 1869, near Brantford, New Hampshire, two br..."
9,9,10,goldeneye,1995,Action|Adventure|Thriller,"In 1986, at Arkhangelsk, MI6 agents James Bond..."
14,14,16,casino,1995,Crime|Drama,"In 1973, sports handicapper and Mafia associat..."
15,15,17,sense and sensibility,1995,Drama|Romance,"On his deathbed, Mr. Dashwood tells his son fr..."
...,...,...,...,...,...,...
4549,4549,134853,inside out,2015,Adventure|Animation|Children|Comedy|Drama|Fantasy,Riley Andersen is born in Minnesota. Within he...
4559,4559,135885,absolutely anything,2015,Comedy|Sci-Fi,"Decades after being launched into space, a spa..."
4571,4571,136864,batman v superman: dawn of justice,2016,Action|Adventure|Fantasy|Sci-Fi,Eighteen months after the battle between Super...
4574,4574,138036,the man from u.n.c.l.e.,2015,Action|Adventure|Comedy,"In 1963, at the height of the Cold War, profes..."


In [33]:
group_unrated_movies_df

Unnamed: 0.1,index,Unnamed: 0,item,title,year,genres,plot
0,2,2,3,grumpier old men,1995,Comedy|Romance,The feud between Max (Walter Matthau) and John...
1,3,3,4,waiting to exhale,1995,Comedy|Drama|Romance,"""Friends are the People who let you be yoursel..."
2,4,4,5,father of the bride part ii,1995,Comedy,The film begins five years after the events of...
3,5,5,6,heat,1995,Action|Crime|Thriller,"Neil McCauley, a career criminal, hires Waingr..."
4,6,6,7,sabrina,1995,Comedy|Romance,Sabrina Fairchild is the young daughter of the...
...,...,...,...,...,...,...,...
4301,4778,4778,181315,phantom thread,2017,Drama|Romance,"In 1954 London, renowned fashion designer Reyn..."
4302,4779,4779,182823,bright,2017,Action|Crime|Fantasy,"In an alternate present, humans live in uneasy..."
4303,4780,4780,191005,gintama,2017,Action|Adventure|Comedy|Sci-Fi,Yorozuya receives two similar and ultimately c...
4304,4781,4781,193573,love live! the school idol movie,2015,Animation,The movie begins with a scene from the second ...


### Calculate expected ratings for unrated movies
For each users, we need to calculate the expected ratings for the user's unrated movies. To calculate unrated ratings, we first need to train
an algorithm, here, the SVD algorithm from Surprise is used


In [11]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

We perform 5-fold cross validation on the whole ratings dataset to see how well SVD will perform

In [12]:
reader = Reader()
data = Dataset.load_from_df(ratings_df[['user', 'item', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8652  0.8762  0.8706  0.8825  0.8756  0.8740  0.0058  
MAE (testset)     0.6640  0.6722  0.6708  0.6774  0.6734  0.6716  0.0044  
Fit time          8.63    9.14    9.11    9.18    10.17   9.25    0.50    
Test time         0.14    0.15    0.20    0.22    0.17    0.18    0.03    


{'test_rmse': array([0.86521669, 0.87616438, 0.87058146, 0.88249166, 0.87557486]),
 'test_mae': array([0.6639862 , 0.67223677, 0.67079544, 0.6774121 , 0.67335879]),
 'fit_time': (8.627952098846436,
  9.138189554214478,
  9.109722137451172,
  9.183749675750732,
  10.171123027801514),
 'test_time': (0.14461588859558105,
  0.15462851524353027,
  0.20041298866271973,
  0.22437191009521484,
  0.17258024215698242)}

Next, We train the SVD model on the dataset

In [13]:
trainset = data.build_full_trainset()
svd = svd.fit(trainset)

In [35]:
def predict(user):
    unrated_movies = list(group_unrated_movies_df['item'].unique())
    pred = pd.DataFrame()
    i = 0
    for item in unrated_movies:
        pred = pred.append({'item': item, 'predicted_rating':svd.predict(user, item)[3]}, ignore_index=True)
    return pred
    


In [41]:
users_rating = []
for user in group_users_ids:
    prediction = predict(user)
    prediction = prediction.sort_values('predicted_rating')
    prediction = prediction.merge(movies_df, on= 'item')
    users_rating.append(prediction[['item','title','predicted_rating']])

The algorithm will iterate through 5 users, for each user, it will calculate the predicted rating for each unrated movie. Then the algorithm combines the predicted ratings of 5 users into one big dataset, to perform aggregation calculation

In [42]:
final = pd.concat([df for df in users_rating], ignore_index = True)

### Additive Strategy

In [45]:
additive = final.copy()
additive= additive.groupby(['item','title']).sum()
additive = additive.sort_values(by="predicted_rating", ascending=False).reset_index()
additive

Unnamed: 0,item,title,predicted_rating
0,1204.0,lawrence of arabia,22.488587
1,57669.0,in bruges,21.375239
2,908.0,north by northwest,21.317415
3,1242.0,glory,21.261773
4,951.0,his girl friday,21.098919
...,...,...,...
4301,312.0,stuart saves his family,12.199218
4302,2338.0,i still know what you did last summer,11.931828
4303,1882.0,godzilla,11.784167
4304,3593.0,battlefield earth,11.151362


### Most Pleasure Strategy

In [50]:
most_pleasure = final.copy()

In [51]:
most_pleasure = final.copy()
most_pleasure= most_pleasure.groupby(['item','title']).max()
most_pleasure = most_pleasure.sort_values(by="predicted_rating", ascending=False).reset_index()
most_pleasure

Unnamed: 0,item,title,predicted_rating
0,3451.0,guess who's coming to dinner,4.860318
1,1204.0,lawrence of arabia,4.786027
2,44195.0,thank you for smoking,4.755990
3,88163.0,"crazy, stupid, love.",4.694539
4,57669.0,in bruges,4.692954
...,...,...,...
4301,312.0,stuart saves his family,2.872013
4302,1882.0,godzilla,2.850027
4303,2338.0,i still know what you did last summer,2.835949
4304,3593.0,battlefield earth,2.666678


### Least Misery Strategy

In [52]:
least_misery = final.copy()
least_misery = final.copy()
least_misery= least_misery.groupby(['item','title']).min()
least_misery = least_misery.sort_values(by="predicted_rating", ascending=False).reset_index()
least_misery

Unnamed: 0,item,title,predicted_rating
0,1204.0,lawrence of arabia,4.057376
1,1178.0,paths of glory,4.018679
2,1242.0,glory,3.938417
3,57669.0,in bruges,3.889966
4,8132.0,gladiator,3.882949
...,...,...,...
4301,2053.0,"honey, i blew up the kid",1.809918
4302,2338.0,i still know what you did last summer,1.786143
4303,5323.0,jason x,1.656612
4304,1556.0,speed 2: cruise control,1.607871


In [19]:
import itertools
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser

user_user = UserUser(15, min_nbrs=3)  # Minimum (3) and maximum (15) number of neighbors to consider
recsys = Recommender.adapt(user_user)
recsys.fit(ratings_df)
group_unseen_df = pd.DataFrame(list(itertools.product(group_users_ids, group_unrated_movies_ids)), columns=['user', 'item'])
group_unseen_df['predicted_rating'] = recsys.predict(group_unseen_df)
group_unseen_df = group_unseen_df.loc[group_unseen_df['predicted_rating'].notnull()]
display(group_unseen_df)

Numba is using threading layer omp - consider TBB
BLAS using multiple threads - can cause oversubscription
found 2 potential runtime problems - see https://boi.st/lkpy-perf


Unnamed: 0,user,item,predicted_rating
1,68,4,2.448167
5,68,8,2.636244
6,68,9,2.787677
8,68,122890,3.051126
9,68,12,2.425155
...,...,...,...
18307,484,98243,3.766153
18308,484,131013,3.571561
18318,484,65514,3.927637
18321,484,8183,3.629165


In [20]:
group_unseen_df

Unnamed: 0,user,item,predicted_rating
1,68,4,2.448167
5,68,8,2.636244
6,68,9,2.787677
8,68,122890,3.051126
9,68,12,2.425155
...,...,...,...
18307,484,98243,3.766153
18308,484,131013,3.571561
18318,484,65514,3.927637
18321,484,8183,3.629165


In [21]:
group_unseen_df.groupby('item').sum()

Unnamed: 0_level_0,user,predicted_rating
item,Unnamed: 1_level_1,Unnamed: 2_level_1
4,1794,13.658278
8,1794,15.203022
9,1794,15.165710
12,1794,14.111501
13,1794,17.213366
...,...,...
179819,1794,16.904676
180031,1794,20.369439
180095,961,8.818123
180985,1794,18.325941


In [22]:
additive_df = group_unseen_df.groupby('item').sum()
additive_df = additive_df.join(movies_df['title'], on='item')
additive_df = additive_df.sort_values(by="predicted_rating", ascending=False).reset_index()[['item', 'title', 'predicted_rating']]
display(additive_df.head(10))

Unnamed: 0,item,title,predicted_rating
0,3494,because i said so,24.785304
1,5747,,24.641659
2,3030,sky captain and the world of tomorrow,24.412533
3,177593,,24.336911
4,3451,conversations with other women,24.238937
5,2202,last orders,23.748082
6,51931,,23.655843
7,1178,existenz,23.447371
8,98154,,23.304935
9,1411,any given sunday,23.253212


In [23]:
additive_df = group_unseen_df.groupby('item').sum()

In [24]:
additive_df

Unnamed: 0_level_0,user,predicted_rating
item,Unnamed: 1_level_1,Unnamed: 2_level_1
4,1794,13.658278
8,1794,15.203022
9,1794,15.165710
12,1794,14.111501
13,1794,17.213366
...,...,...
179819,1794,16.904676
180031,1794,20.369439
180095,961,8.818123
180985,1794,18.325941


In [25]:
movies_df.loc[movies_df['item'] == 177593]

Unnamed: 0.1,Unnamed: 0,item,title,year,genres,plot
4753,4753,177593,"three billboards outside ebbing, missouri",2017,Crime|Drama,"In the town of Ebbing, Missouri, Mildred Hayes..."
