In [10]:
from test import load_data_sparse
from SGD import *
from ALS import *
from surprise_models import *
from surprise.model_selection import train_test_split
from helper import create_submission_from_prediction

In [11]:
def calculate_rmse(real_labels, predictions):
    """Compute the root-mean-square error between labels and predictions.

    Both inputs are converted to flat float arrays, so plain Python sequences
    and row/column vectors are accepted in addition to 1-D NumPy arrays.

    Args:
        real_labels: array-like of reference values.
        predictions: array-like holding the same number of elements as
            ``real_labels``, or a single value compared against every label.

    Returns:
        The RMSE, i.e. ``sqrt(mean((real_labels - predictions) ** 2))``.

    Raises:
        ValueError: if ``real_labels`` is empty, or if ``predictions`` holds a
            different number of elements (and is not a single value).
    """
    labels = np.asarray(real_labels, dtype=float).ravel()
    preds = np.asarray(predictions, dtype=float).ravel()

    if labels.size == 0:
        raise ValueError("calculate_rmse needs at least one label")

    # Without flattening and this check, a (n,) vs (n, 1) pair would broadcast
    # to an (n, n) matrix and silently produce a wrong error value.
    if preds.size not in (1, labels.size):
        raise ValueError(
            "real_labels and predictions differ in size: "
            "{} vs {}".format(labels.size, preds.size)
        )

    # Divide by the element count (not len()), so 2-D inputs are averaged
    # over every entry.
    return np.linalg.norm(labels - preds) / np.sqrt(labels.size)

In [12]:
# Input files, relative to the notebook's working directory.
# data_name: training ratings CSV, loaded into sp_ratings / pd_ratings.
data_name = "data/47b05e70-6076-44e8-96da-2530dc2187de_data_train.csv"
# test_name: sample-submission CSV, loaded into pd_pred and later turned into
# the test set the models predict on.
test_name = "data/sampleSubmission.csv"

Load the training set:

In [13]:
# load_data_sparse (from the `test` module) returns a pair. Judging by the
# names, a sparse matrix and a pandas frame -- TODO confirm. The meaning of the
# positional `False` flag is not visible in this notebook.
sp_ratings, pd_ratings = load_data_sparse(data_name, False)

In [14]:
# Load the sample-submission file the same way; only the second return value
# is kept.
_, pd_pred = load_data_sparse(test_name, False)

Convert the ratings into a Surprise dataset:

In [15]:
# Wrap the training ratings for the surprise_* helpers. The result exposes
# build_full_trainset(), so it is presumably a Surprise Dataset -- confirm in
# the module that defines pandas_to_surprise.
spr_data = pandas_to_surprise(pd_ratings)

Build the full training set and the test set (no hold-out split is performed):

In [16]:
# Train on every available rating: nothing is held out for validation.
trainset = spr_data.build_full_trainset()
# The test set is built from the sample-submission entries. The effect of
# pred=True is decided inside pandas_to_surprise and is not visible here.
testset = pandas_to_surprise(pd_pred, pred=True)

Get the test-set labels:

In [17]:
# get_testset_indices returns three values; only the third is kept, as labels.
# NOTE(review): testset is built from the sample-submission file -- confirm
# these labels are real ratings and not placeholder values, because every RMSE
# computed in this notebook is measured against them.
_,_,labels = get_testset_indices(testset)

In [18]:
# Baseline model: given trainset and testset, returns the predictions that are
# scored against `labels` and later blended. The helper comes from one of the
# star imports at the top.
baseline = surprise_baseline(trainset, testset)

Estimating biases using als...


In [19]:
# RMSE of the baseline predictions against `labels` (lower is better).
calculate_rmse(labels, baseline)

0.9952563055945158

In [20]:
# SVD model predictions for testset (helper comes from the star imports).
svd = surprise_SVD(trainset, testset)

In [21]:
# RMSE of the SVD predictions against `labels`.
calculate_rmse(labels, svd)

0.9704055381993139

In [22]:
# SlopeOne model predictions for testset (helper comes from the star imports).
so = surprise_slopeOne(trainset, testset)

In [23]:
# RMSE of the SlopeOne predictions against `labels`.
calculate_rmse(labels, so)

1.0112568469420897

In [24]:
# KNN-with-baseline model predictions for testset. Its log output reports ALS
# bias estimation and a pearson_baseline similarity matrix.
bsknn = surprise_baselineKNN(trainset, testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [25]:
# RMSE of the baseline-KNN predictions against `labels`.
calculate_rmse(labels, bsknn)

1.0276913542358548

In [26]:
# Hyper-parameters shared by the ALS and SGD matrix-factorization runs.
num_features = 40  # K in the lecture notes
lambda_user = 0.1  # user-side lambda; presumably a regularization weight -- confirm in the ALS/SGD modules
lambda_film = 0.1  # film-side lambda; same caveat as lambda_user
stop_criterion = 1e-4  # iteration criterion; the ALS log reports stopping once it is reached

In [27]:
# ALS matrix factorization on trainset, returning predictions for testset.
# The run prints one RMSE line per iteration until stop_criterion is reached.
als = ALS_CV(trainset, testset, num_features, lambda_user, lambda_film, stop_criterion)

learn the matrix factorization using ALS...
RMSE: 1.0894514833867237.
RMSE: 1.0483624776035143.
RMSE: 1.0146416435665808.
RMSE: 0.9882882413815345.
RMSE: 0.9708652527607661.
RMSE: 0.9585903673417175.
RMSE: 0.9495531686209909.
RMSE: 0.9428363249826118.
RMSE: 0.9378038881307342.
RMSE: 0.933958635247875.
RMSE: 0.9309521652759987.
RMSE: 0.9285548931808277.
RMSE: 0.9266144169329613.
RMSE: 0.9250255742919575.
RMSE: 0.9237127269791121.
RMSE: 0.9226196283783649.
RMSE: 0.9217033930824404.
RMSE: 0.9209306967591386.
RMSE: 0.9202752687213152.
RMSE: 0.9197161790641709.
RMSE: 0.9192366302902291.
RMSE: 0.9188230750558981.
RMSE: 0.9184645495338126.
RMSE: 0.9181521550057589.
RMSE: 0.9178786469532956.
RMSE: 0.9176381063467502.
RMSE: 0.9174256762139948.
RMSE: 0.9172373510148559.
RMSE: 0.9170698088709124.
RMSE: 0.9169202783955801.
RMSE: 0.9167864332214709.
RMSE: 0.9166663085121035.
RMSE: 0.9165582347999264.
RMSE: 0.9164607854133959.
Iteration stopped, as iteration criterion 0.0001 was reached. RMSE = 0.91

In [28]:
# RMSE of the ALS predictions against `labels`.
calculate_rmse(labels, als)

0.9132832048615761

In [29]:
# SGD matrix factorization with the same hyper-parameters as the ALS run,
# returning predictions for testset.
sgd = matrix_factorization_SGD_CV(trainset, testset, num_features, lambda_user, lambda_film, stop_criterion)

learn the matrix factorization using SGD...
RMSE: 1.0417471959338522.
RMSE: 1.0181669879272277.
RMSE: 1.0106836766524772.
RMSE: 1.0065829549483438.
RMSE: 1.0050246730356185.
RMSE: 1.0044737054247757.
RMSE: 1.0041436893510656.
RMSE: 1.0039953750886863.
RMSE: 1.0039697606770512.


In [30]:
# RMSE of the SGD predictions against `labels`.
calculate_rmse(labels, sgd)

0.9686918626282042

In [33]:
def surprise_SVDpp(trainset, finalset, n_factors=40, n_epochs=20,
                   lr_all=0.001, random_state=None):
    """Fit an SVD++ model on ``trainset`` and predict the entries of ``finalset``.

    NOTE(review): the notebook also runs ``from surprise_models import *``. If
    that module already exports ``surprise_SVDpp``, this definition shadows
    it -- confirm which one is intended.

    Args:
        trainset: training set handed to ``algo.fit``.
        finalset: test set handed to ``algo.test``.
        n_factors: forwarded to ``spr.SVDpp``.
        n_epochs: forwarded to ``spr.SVDpp``.
        lr_all: forwarded to ``spr.SVDpp``.
        random_state: optional seed forwarded to ``spr.SVDpp`` so the fit can
            be reproduced. When ``None`` the argument is not passed at all,
            which keeps the original call unchanged.

    Returns:
        Whatever ``spr_estimate_to_vect`` builds from the predictions
        (presumably a vector of estimated ratings -- confirm in the module
        that defines it).
    """
    model_kwargs = {
        'n_factors': n_factors,
        'n_epochs': n_epochs,
        'lr_all': lr_all,
    }
    # Only forward the seed when the caller asked for one.
    if random_state is not None:
        model_kwargs['random_state'] = random_state

    algo = spr.SVDpp(**model_kwargs)
    algo.fit(trainset)
    predictions_final = algo.test(finalset)

    return spr_estimate_to_vect(predictions_final)

In [34]:
# SVD++ predictions for testset, using the surprise_SVDpp defined in this notebook.
svdpp = surprise_SVDpp(trainset, testset)

In [35]:
# RMSE of the SVD++ predictions against `labels`.
calculate_rmse(labels, svdpp)

0.955194744531699

In [36]:
# Blend weights, one per model, in the order the predictions are combined below.
coeffs = np.array([
    -0.33202676,  # baseline
    -0.24487176,  # svd
    -0.03659188,  # so
    0.29858978,   # bsknn
    0.98595659,   # als
    0.05355454,   # sgd
    0.30458821,   # svdpp
])

# Weighted sum of the seven prediction vectors, accumulated left to right.
blend_inputs = (baseline, svd, so, bsknn, als, sgd, svdpp)
result = blend_inputs[0] * coeffs[0]
for blend_idx in range(1, len(blend_inputs)):
    result = result + blend_inputs[blend_idx] * coeffs[blend_idx]

In [37]:
# Write the blended predictions to a submission file via the `helper` module.
# NOTE(review): the file name says "6 models", but `result` blends seven
# prediction vectors -- consider renaming the output.
create_submission_from_prediction(result, '6modelsblend.csv')