# Model training

In [24]:
import pandas as pd
import numpy as np
import pickle
from surprise import SVD, Dataset, Reader, accuracy
from sklearn.metrics.pairwise import cosine_similarity
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV


In [25]:
features = pd.read_pickle('features.pkl')
playcounts = pd.read_pickle('playcounts.pkl')

#### playcounts scaling

In [26]:
# 0.1 sample of data
sample = playcounts.sample(frac=0.01, random_state=42)

In [17]:
# comment this out if you want to use the full dataset
# playcounts = sample

In [27]:
len(playcounts)

9608283

In [28]:
# max and min of playcount if you want to use it instead of scaled playcount
min_pc = playcounts['playcount'].max()
max_pc = playcounts['playcount'].min()
playcounts['playcount_scale_2'] = playcounts['playcount'].apply(lambda x: (x-min_pc)/(max_pc-min_pc))


### SVD cross validation

In [29]:
reader = Reader(rating_scale=(min_pc, max_pc))
data = Dataset.load_from_df(playcounts[['user_id', 'track_id', 'playcount_scale']], reader)


In [30]:
svd_model = SVD()

In [120]:
cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

KeyboardInterrupt: 

In [21]:
trainset = data.build_full_trainset()

In [9]:
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fed2cb4a150>

### Hyperparameter tuning and validation

In [31]:
# parameter grid for SVD
svd_param_grid = {'n_factors': [50, 100, 200],
                    'n_epochs': [5, 10, 20],
                    'lr_all': [0.001, 0.005, 0.01],
                    'reg_all': [0.01, 0.02, 0.05]}

In [32]:
# hyperparameter tuning for SVD
gs_svd = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)

gs_svd.fit(data)

# best RMSE score
print(gs_svd.best_score['rmse'])


In [87]:
# best parameters
print(gs_svd.best_params['rmse'])

{'n_factors': 50, 'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.01}


#### KNNWithZscore

In [22]:
from surprise import KNNWithZScore

In [109]:
# parameter grid for knnwithzscore
knnz_param_grid = {'k': [100, 200, 500]}

In [121]:
# hyperparameter tuning for NN
gs_nn = GridSearchCV(KNNWithZScore, knnz_param_grid, n_jobs=2)

gs_nn.fit(data)

# rmse
print(gs_nn.best_score)

# best parameters
print(gs_nn.best_params)

Computing the msd similarity matrix...


MemoryError: Unable to allocate 1.17 TiB for an array with shape (400861, 400861) and data type float64

In [23]:
knnz = KNNWithZScore(k=500)

knnz.fit(trainset)

Computing the msd similarity matrix...


MemoryError: Unable to allocate 6.69 TiB for an array with shape (959075, 959075) and data type float64