In [3]:
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise.model_selection import cross_validate

import pandas as pd

In [4]:
df = pd.read_csv("data/matrix-factorization.csv")
df.rename(columns={"movies":"users"}, inplace = True)
print(df)
df.set_index('users',inplace=True)

df = df.transpose()
df = df.reset_index()
df = df.rename_axis(None, axis = 1)
df.rename(columns={"index":"users"}, inplace = True)

df = df.melt('users', var_name='Items', value_name='Rating')
df.dropna(subset = ["Rating"], inplace=True)

df

     users  user_0  user_1  user_2  user_3  user_4  user_5  user_6  user_7  \
0  movie_0     4.0     5.0     4.0     3.0     5.0     1.0     NaN     3.0   
1  movie_1     1.0     NaN     1.0     NaN     2.0     4.0     5.0     4.0   
2  movie_2     NaN     1.0     3.0     NaN     NaN     5.0     5.0     NaN   
3  movie_3     NaN     NaN     NaN     2.0     NaN     NaN     4.0     5.0   
4  movie_4     4.0     4.0     NaN     5.0     NaN     3.0     1.0     4.0   
5  movie_5     5.0     4.0     4.0     3.0     4.0     2.0     1.0     NaN   
6  movie_6     2.0     NaN     NaN     2.0     3.0     NaN     NaN     NaN   
7  movie_7     1.0     2.0     1.0     NaN     NaN     4.0     NaN     4.0   
8  movie_8     NaN     NaN     1.0     NaN     2.0     NaN     5.0     4.0   
9  movie_9     5.0     5.0     3.0     3.0     5.0     2.0     NaN     NaN   

   user_8  user_9  
0     2.0     NaN  
1     NaN     4.0  
2     4.0     3.0  
3     NaN     4.0  
4     NaN     NaN  
5     NaN     1.0  
6

Unnamed: 0,users,Items,Rating
0,user_0,movie_0,4.0
1,user_1,movie_0,5.0
2,user_2,movie_0,4.0
3,user_3,movie_0,3.0
4,user_4,movie_0,5.0
...,...,...,...
93,user_3,movie_9,3.0
94,user_4,movie_9,5.0
95,user_5,movie_9,2.0
98,user_8,movie_9,2.0


In [5]:
reader = Reader(rating_scale=(0, 5))

data = Dataset.load_from_df(df[['users', 'Items', 'Rating']], reader)

trainSet = data.build_full_trainset()

# Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal.
# Estimation based on Maximum Likehood Estimation - not very good
algo = NormalPredictor()

cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.9115  2.3832  1.9146  2.0738  2.2498  2.1066  0.1860  
MAE (testset)     1.5594  2.1304  1.7061  1.6290  1.9724  1.7995  0.2167  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.9114734 , 2.38323264, 1.91463229, 2.07384716, 2.24980506]),
 'test_mae': array([1.55939225, 2.13038737, 1.70613587, 1.6290049 , 1.97237135]),
 'fit_time': (7.224082946777344e-05,
  4.792213439941406e-05,
  3.814697265625e-05,
  3.814697265625e-05,
  3.6716461181640625e-05),
 'test_time': (0.00021886825561523438,
  2.8133392333984375e-05,
  2.7894973754882812e-05,
  2.5033950805664062e-05,
  2.3126602172851562e-05)}

In [6]:
anti_testset_user = []
targetUser = 8 #inner_id of the target user
fillValue = trainSet.global_mean
user_item_ratings = trainSet.ur[targetUser]
user_items = [item for (item,_) in (user_item_ratings)]
user_items
ratings = trainSet.all_ratings()

for iid in trainSet.all_items():
    if(iid not in user_items):
        anti_testset_user.append((trainSet.to_raw_uid(targetUser),trainSet.to_raw_iid(iid),fillValue))
        pred = algo.predict(uid=trainSet.to_raw_uid(targetUser), iid=trainSet.to_raw_iid(iid)) 

# print(anti_testset_user)
# algo.fit(trainSet)
predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'], inplace=True, ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
2,user_6,movie_7,3.238095,5.0,{'was_impossible': False}
0,user_6,movie_0,3.238095,3.059798,{'was_impossible': False}
3,user_6,movie_9,3.238095,2.319805,{'was_impossible': False}
1,user_6,movie_6,3.238095,1.806374,{'was_impossible': False}


In [7]:
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
algo = KNNBasic(sim_options=sim_options)
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3894  1.7839  1.6109  1.2553  1.7003  1.5480  0.1968  
MAE (testset)     1.1222  1.6639  1.4471  1.0415  1.4523  1.3454  0.2304  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.38939475, 1.78389273, 1.6108631 , 1.25531138, 1.70034218]),
 'test_mae': array([1.12215899, 1.66394032, 1.44713106, 1.04148839, 1.45228502]),
 'fit_time': (0.00016689300537109375,
  3.1948089599609375e-05,
  2.4080276489257812e-05,
  2.2172927856445312e-05,
  5.078315734863281e-05),
 'test_time': (0.00011515617370605469,
  7.176399230957031e-05,
  6.890296936035156e-05,
  0.0001308917999267578,
  7.128715515136719e-05)}

In [8]:
predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],inplace=True,ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
2,user_6,movie_7,3.238095,3.649821,"{'actual_k': 5, 'was_impossible': False}"
1,user_6,movie_6,3.238095,3.401207,"{'actual_k': 5, 'was_impossible': False}"
3,user_6,movie_9,3.238095,3.312112,"{'actual_k': 5, 'was_impossible': False}"
0,user_6,movie_0,3.238095,3.259382,"{'actual_k': 5, 'was_impossible': False}"


In [9]:
algo = SVD()
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3209  1.4941  1.7852  1.3585  1.4479  1.4813  0.1640  
MAE (testset)     1.2329  1.2516  1.7312  1.0975  1.2224  1.3071  0.2189  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([1.3209403 , 1.49414378, 1.78522476, 1.35849949, 1.44785852]),
 'test_mae': array([1.23285934, 1.25156375, 1.73124628, 1.0974987 , 1.22243728]),
 'fit_time': (0.0017931461334228516,
  0.0007190704345703125,
  0.00044608116149902344,
  0.00042128562927246094,
  0.0005290508270263672),
 'test_time': (0.0001850128173828125,
  0.00014281272888183594,
  4.100799560546875e-05,
  6.198883056640625e-05,
  4.601478576660156e-05)}

In [10]:
predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],inplace=True,ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
3,user_6,movie_9,3.238095,3.7525,{'was_impossible': False}
0,user_6,movie_0,3.238095,3.685225,{'was_impossible': False}
2,user_6,movie_7,3.238095,3.539427,{'was_impossible': False}
1,user_6,movie_6,3.238095,3.478433,{'was_impossible': False}


In [11]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50,100,150],
              'n_epochs': [35, 40, 60, 100, 200, 500],
              'lr_all':[0.005,0.01,0.1],
              'reg_all':[0.02,0.05,0.1]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse','mae'], cv=3)
grid_search.fit(data) 

print("Best RMSE: ", grid_search.best_score['rmse'])
print("Best MAE: ", grid_search.best_score['mae'])
print("Best Parameters: ", grid_search.best_params['rmse'])

Best RMSE:  1.2872443869761092
Best MAE:  1.0513373677808577
Best Parameters:  {'n_factors': 100, 'n_epochs': 100, 'lr_all': 0.1, 'reg_all': 0.1}


In [12]:
# We can now use the algorithm that yields the best rmse:
algo = grid_search.best_estimator['rmse']
algo.fit(data.build_full_trainset())

predictions = algo.test(anti_testset_user)
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'], inplace=True, ascending = False)
pred

Unnamed: 0,uid,iid,r_ui,est,details
1,user_6,movie_6,3.238095,4.746396,{'was_impossible': False}
2,user_6,movie_7,3.238095,4.159585,{'was_impossible': False}
3,user_6,movie_9,3.238095,1.341454,{'was_impossible': False}
0,user_6,movie_0,3.238095,1.104501,{'was_impossible': False}
