In [3]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise.model_selection import cross_validate

import pandas as pd

In [4]:
df = pd.read_csv("data/matrix-factorization.csv")
df.rename(columns={"movies":"users"}, inplace = True)
print(df)
df.set_index('users',inplace=True)

df = df.transpose()
df = df.reset_index()
df = df.rename_axis(None, axis = 1)
df.rename(columns={"index":"users"}, inplace = True)

df = df.melt('users', var_name='Items', value_name='Rating')
df.dropna(subset = ["Rating"], inplace=True)

df

     users  user_0  user_1  user_2  user_3  user_4  user_5  user_6  user_7  \
0  movie_0     4.0     5.0     4.0     3.0     5.0     1.0     NaN     3.0   
1  movie_1     1.0     NaN     1.0     NaN     2.0     4.0     5.0     4.0   
2  movie_2     NaN     1.0     3.0     NaN     NaN     5.0     5.0     NaN   
3  movie_3     NaN     NaN     NaN     2.0     NaN     NaN     4.0     5.0   
4  movie_4     4.0     4.0     NaN     5.0     NaN     3.0     1.0     4.0   
5  movie_5     5.0     4.0     4.0     3.0     4.0     2.0     1.0     NaN   
6  movie_6     2.0     NaN     NaN     2.0     3.0     NaN     NaN     NaN   
7  movie_7     1.0     2.0     1.0     NaN     NaN     4.0     NaN     4.0   
8  movie_8     NaN     NaN     1.0     NaN     2.0     NaN     5.0     4.0   
9  movie_9     5.0     5.0     3.0     3.0     5.0     2.0     NaN     NaN   

   user_8  user_9  
0     2.0     NaN  
1     NaN     4.0  
2     4.0     3.0  
3     NaN     4.0  
4     NaN     NaN  
5     NaN     1.0  
6

Unnamed: 0,users,Items,Rating
0,user_0,movie_0,4.0
1,user_1,movie_0,5.0
2,user_2,movie_0,4.0
3,user_3,movie_0,3.0
4,user_4,movie_0,5.0
...,...,...,...
93,user_3,movie_9,3.0
94,user_4,movie_9,5.0
95,user_5,movie_9,2.0
98,user_8,movie_9,2.0


In [5]:
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(df[['users', 'Items', 'Rating']], reader)

trainSet = data.build_full_trainset()

# Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
# Estimation based on Maximum Likehood Estimation - not very good
algo = NormalPredictor()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5769  1.7567  1.8375  1.7678  2.0084  1.7895  0.1393  
MAE (testset)     1.3068  1.5350  1.4313  1.4626  1.6246  1.4721  0.1061  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.57690455, 1.7567275 , 1.83752731, 1.76783768, 2.00838017]),
 'test_mae': array([1.30682679, 1.53498622, 1.43125411, 1.46264422, 1.62457172]),
 'fit_time': (0.00010538101196289062,
  5.412101745605469e-05,
  5.1975250244140625e-05,
  5.1021575927734375e-05,
  4.696846008300781e-05),
 'test_time': (8.296966552734375e-05,
  3.886222839355469e-05,
  3.504753112792969e-05,
  3.2901763916015625e-05,
  2.9802322387695312e-05)}

In [6]:
anti_testset_user = []
targetUser = 8 #inner_id of the target user
fillValue = trainSet.global_mean
user_item_ratings = trainSet.ur[targetUser]
user_items = [item for (item,_) in (user_item_ratings)]
user_items
ratings = trainSet.all_ratings()

for iid in trainSet.all_items():
    if(iid not in user_items):
        anti_testset_user.append((trainSet.to_raw_uid(targetUser),trainSet.to_raw_iid(iid),fillValue))
        pred = algo.predict(uid=trainSet.to_raw_uid(targetUser), iid=trainSet.to_raw_iid(iid)) 

# print(anti_testset_user)
# algo.fit(trainSet)
predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'], inplace=True, ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
0,user_6,movie_0,3.238095,3.827424,{'was_impossible': False}
1,user_6,movie_6,3.238095,3.464921,{'was_impossible': False}
3,user_6,movie_9,3.238095,1.846498,{'was_impossible': False}
2,user_6,movie_7,3.238095,0.931332,{'was_impossible': False}


In [7]:
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
algo = KNNBasic(sim_options=sim_options)
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5509  1.4628  1.6774  1.9202  1.3299  1.5883  0.2010  
MAE (testset)     1.3963  1.2634  1.3806  1.4750  1.2110  1.3452  0.0953  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.55087545, 1.46284536, 1.67742118, 1.92022037, 1.32989304]),
 'test_mae': array([1.39628554, 1.26337484, 1.38059744, 1.47496438, 1.21100591]),
 'fit_time': (0.0009467601776123047,
  3.600120544433594e-05,
  2.4080276489257812e-05,
  2.193450927734375e-05,
  2.09808349609375e-05),
 'test_time': (0.0001068115234375,
  7.915496826171875e-05,
  7.82012939453125e-05,
  7.510185241699219e-05,
  7.295608520507812e-05)}

In [8]:
predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],inplace=True,ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
1,user_6,movie_6,3.238095,3.739162,"{'actual_k': 4, 'was_impossible': False}"
0,user_6,movie_0,3.238095,3.048757,"{'actual_k': 5, 'was_impossible': False}"
2,user_6,movie_7,3.238095,2.989242,"{'actual_k': 4, 'was_impossible': False}"
3,user_6,movie_9,3.238095,2.832201,"{'actual_k': 5, 'was_impossible': False}"


In [9]:
algo = SVD()
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8178  1.5223  1.4030  1.4236  1.2781  1.4890  0.1819  
MAE (testset)     1.6582  1.3147  1.2585  1.2186  1.1453  1.3190  0.1783  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.8178251 , 1.52233058, 1.4029925 , 1.42363767, 1.27807275]),
 'test_mae': array([1.65820005, 1.314675  , 1.25849479, 1.21859378, 1.14526432]),
 'fit_time': (0.0014650821685791016,
  0.00041103363037109375,
  0.0004658699035644531,
  0.00037598609924316406,
  0.00045990943908691406),
 'test_time': (0.00032901763916015625,
  5.91278076171875e-05,
  6.604194641113281e-05,
  5.4836273193359375e-05,
  6.67572021484375e-05)}

In [10]:
predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],inplace=True,ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
0,user_6,movie_0,3.238095,3.269639,{'was_impossible': False}
1,user_6,movie_6,3.238095,3.135465,{'was_impossible': False}
2,user_6,movie_7,3.238095,3.072868,{'was_impossible': False}
3,user_6,movie_9,3.238095,3.041007,{'was_impossible': False}


In [11]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50,100,150],
              'n_epochs': [35, 40, 60, 100, 200, 500],
              'lr_all':[0.005,0.01,0.1],
              'reg_all':[0.02,0.05,0.1]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=3)
grid_search.fit(data) 

print("Best RMSE: ", grid_search.best_score['rmse'])
print("Best MAE: ", grid_search.best_score['mae'])
print("Best Parameters: ", grid_search.best_params['rmse'])

Best RMSE:  1.0462965633983288
Best MAE:  0.8539039381181922
Best Parameters:  {'n_factors': 150, 'n_epochs': 100, 'lr_all': 0.1, 'reg_all': 0.1}


In [12]:
# We can now use the algorithm that yields the best rmse:
algo = grid_search.best_estimator['rmse']
algo.fit(data.build_full_trainset())

predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'], inplace=True, ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
1,user_6,movie_6,3.238095,4.720452,{'was_impossible': False}
2,user_6,movie_7,3.238095,4.161839,{'was_impossible': False}
3,user_6,movie_9,3.238095,1.334498,{'was_impossible': False}
0,user_6,movie_0,3.238095,1.073497,{'was_impossible': False}
