## Collaborative filtering

In [45]:
import pandas as pd
from surprise import Reader, Dataset, KNNBasic, SVD, NMF
from surprise.model_selection import GridSearchCV, cross_validate, KFold
import sys
sys.path.insert(0, '../models')
from CrossVal import get_cross_validation
sys.path.insert(0, '../benchmark')
from evaluateCF import find_best_params


In [46]:
ratings = pd.read_csv('../data/interim/data.csv')
ratings.drop(['timestamp'], axis=1, inplace=True)
movies = pd.read_csv('../data/interim/item.csv')

In [47]:
df_combined = pd.merge(ratings, movies, on = 'movie id')

## Create User-Item Matrix

In [48]:
util_mat = df_combined.pivot_table(index = 'user id', columns = 'movie title', values = 'rating')
util_mat.head(20)

movie title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,
6,,,,4.0,,,,5.0,,,...,,,,4.0,,,,,,
7,,,,4.0,,,5.0,5.0,,4.0,...,,,,5.0,3.0,,3.0,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,4.0,...,,,,,,,,,,
10,,,,5.0,,,,5.0,,4.0,...,,,,,,,,,,


## Model Based Collaborative Filtering

### KNN

In [49]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df( ratings[['user id', 'movie id', 'rating']], reader = reader )

In [50]:
# Compute Mean Squared Distance Similarity
sim_options = {'name' : 'msd'}

algo = KNNBasic(k=20, sim_options=sim_options )
get_cross_validation(algo, data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9897  0.9887  0.9845  0.9876  0.0023  
Fit time          0.20    0.20    0.21    0.20    0.01    
Test time         2.39    2.46    2.56    2.47    0.07    


In [51]:
n_neighbours = [5, 10, 20, 30]
n_epochs = [5, 10, 20]

param_grid = {'n_neighbours' : n_neighbours, 'n_epochs' : n_epochs}


best_score, best_params = find_best_params(data, param_grid, KNNBasic)


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [None]:
# Best RMSE score
print('Best Score :', best_score)

# Combination of parameters that gave the best RMSE score
print('Best Parameters :', best_params)

Best Score : 0.9792436781335241
Best Parameters : {'n_neighbours': 5, 'n_epochs': 5}


### SVD

In [None]:
algo = SVD()
get_cross_validation(algo, data)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9455  0.9435  0.9446  0.9445  0.0008  
Fit time          0.71    0.70    0.72    0.71    0.01    
Test time         0.13    0.12    0.25    0.17    0.06    


In [None]:
n_epochs = [5, 10, 20]
n_factors = [50, 75]
lr_all = [0.5, 0.05]
reg_all = [0.06, 0.04]


param_grid = {'n_epochs' : n_epochs, 'n_factors' : n_factors, 'lr_all' : lr_all, 'reg_all' : reg_all}

best_score, best_params = find_best_params(data, param_grid, SVD)


In [None]:
# Best RMSE score
print('Best Score :', best_score)

# Combination of parameters that gave the best RMSE score
print('Best Parameters :', best_params)

Best Score : 0.926118872598591
Best Parameters : {'n_epochs': 5, 'n_factors': 50, 'lr_all': 0.05, 'reg_all': 0.06}


### NMF

In [None]:
algo = NMF()
get_cross_validation(algo, data)

Evaluating RMSE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9717  0.9680  0.9791  0.9729  0.0046  
Fit time          1.18    1.20    1.23    1.21    0.02    
Test time         0.10    0.10    0.24    0.15    0.07    


In [None]:
n_epochs = [5, 10, 20]
n_factors = [15, 50, 75]


param_grid = {'n_epochs' : n_epochs, 'n_factors' : n_factors}

best_score, best_params = find_best_params(data, param_grid, NMF)

In [None]:
# Best RMSE score
print('Best Score :', best_score)

# Combination of parameters that gave the best RMSE score
print('Best Parameters :', best_params)

Best Score : 0.9768100840013746
Best Parameters : {'n_epochs': 20, 'n_factors': 15}
