# Model training

In [1]:
import pandas as pd
import numpy as np
import pickle
from surprise import SVD, Dataset, Reader
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans


In [2]:
playcounts = pd.read_pickle('playcounts.pkl')

In [3]:
playcounts.head()

Unnamed: 0,track_id,user_id,playcount
0,16173,690941,1.0
1,1166,690941,1.0
2,22624,690941,1.0
3,2993,690941,1.0
4,1368,690941,1.0


In [25]:
# 0.1 sample of data
sample = playcounts.sample(frac=0.1, random_state=42)

In [26]:
# comment this out if you want to use the full dataset. sample is for gridsearc
# playcounts = sample

In [4]:
len(playcounts)

9680273

#### create surprise sets

In [5]:
reader = Reader()
data = Dataset.load_from_df(playcounts[['user_id', 'track_id', 'playcount']], reader)

### SVD

In [26]:
# parameter grid for SVD
svd_param_grid = {
    'n_factors': [40, 50],  # Number of factors
    'n_epochs': [30, 40, 50],     # Number of epochs
    'lr_all': [0.005, 0.01, 0.15],      # Learning rate
    'reg_all': [0.07, 0.1]        # Regularization term
}

In [27]:
# hyperparameter tuning for SVD
gs_svd = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=3)

gs_svd.fit(data)

In [25]:
# best RMSE score
print(gs_svd.best_score['rmse'])
# best parameters
svd_best_params = gs_svd.best_params['rmse']
print(svd_best_params)

0.15298486386726692
{'n_factors': 50, 'n_epochs': 40, 'lr_all': 0.01, 'reg_all': 0.1}


### KNN gridsearch

In [33]:
knn_param_grid = {
    'k': [9, 19, 25],
    'sim_options': {
        'name': ['pearson_baseline'],
        'min_support': [5, 10, 15],
        'user_based': [False]
    }
}

In [34]:
gs_knn = GridSearchCV(KNNWithMeans, knn_param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=3)

gs_knn.fit(data)

Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing 

In [35]:
# best RMSE score
print(gs_knn.best_score['rmse'])
# best parameters
knn_best_params = gs_knn.best_params['rmse']
print(knn_best_params)

2.5965210760740853
{'k': 9, 'sim_options': {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False}}


### test knn

In [6]:
k = 9 #knn_best_params['k']
sim_options = {'name': 'pearson_baseline', 'min_support': 5, 'user_based': False} #knn_best_params['sim_options']

In [39]:
trainset, testset = train_test_split(data, test_size=0.2)

In [40]:
# train with best parameters
knn = KNNWithMeans(k=k, sim_options=sim_options)

knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f5b048646d0>

In [41]:
predictions = knn.test(testset)

accuracy_rmse = rmse(predictions)

RMSE: 2.5844


### Train the whole set

In [7]:
trainset = data.build_full_trainset()

In [8]:
knn = KNNWithMeans(k=k, sim_options=sim_options)

knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f766c8bc510>

In [12]:
# create mappings from track_id to inner id and vice versa
inner_to_track = {}
track_to_inner = {}
for inner_id in trainset.all_items():
    track_id = trainset.to_raw_iid(inner_id)
    inner_to_track[inner_id] = track_id
    track_to_inner[track_id] = inner_id



### knn innerid-rawid testing

In [19]:
testcase = 2

In [20]:
def get_neighbors(outer, k):
    neighbors = []
    inner = track_to_inner[outer]
    for inner_id in knn.get_neighbors(inner, k):
        neighbors.append(inner_to_track[inner_id])
    return neighbors

In [21]:
get_neighbors(testcase, 10)

[16173, 1166, 22624, 2993, 1368, 29308, 855, 4712, 37, 383]

### Re-order knn similarity matrix to match track_id order

In [9]:
# create knn_similarities matrix
knn_similarities = knn.compute_similarities()

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [11]:
knn_sim_ordered = np.copy(knn_similarities)

In [22]:
# rearrange vectors
for i in range(len(knn_similarities)):
    knn_sim_ordered[inner_to_track[i]] = knn_similarities[i]

In [23]:
# transpode the matrix for further processing
knn_sim_ordered = knn_sim_ordered.transpose()

In [24]:
knn_sim_ordered_2 = np.copy(knn_sim_ordered)

In [25]:
# rearrange vectors of transposed matrix
for i in range(len(knn_similarities)):
    knn_sim_ordered_2[inner_to_track[i]] = knn_sim_ordered[i]

In [26]:
(knn_sim_ordered_2.transpose() == knn_sim_ordered_2).all()

True

In [27]:
# house cleaning
del knn_similarities
del knn_sim_ordered

In [28]:
# convert float64 to float16 to save space
knn_sim_ordered_2 = np.array(knn_sim_ordered_2, dtype=np.float16)

In [30]:
# store model in pickle
pickle.dump(knn_sim_ordered_2, open('knn_matrix.pkl', 'wb'))