## Preliminary Model

In [73]:
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset

from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from surprise import SVD, KNNBasic, NMF, KNNWithMeans, KNNWithZScore

import sys
sys.path.append("./../")
from src.utils import percentileMetric

In [74]:
df = pd.read_csv("./../data/training.csv").drop(columns="timestamp")

In [75]:
df.columns

Index(['user', 'movie', 'rating'], dtype='object')

In [76]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user", "movie", "rating"]], reader)

In [77]:
# cross_validate(model_alg, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

In [78]:
trainset, testset = train_test_split(data, test_size=0.2)

### Matrix factorization: SVD

In [79]:
model_alg = SVD()
model_alg.fit(trainset)
pred_alg = model_alg.test(testset)
accuracy.rmse(pred_alg)

RMSE: 0.8807


0.8807349831894097

In [80]:
pd.DataFrame(pred_alg).head()

Unnamed: 0,uid,iid,r_ui,est,details
0,4083,3431,1.0,1.992204,{'was_impossible': False}
1,2100,608,4.0,3.716298,{'was_impossible': False}
2,1120,3698,1.0,3.483819,{'was_impossible': False}
3,1895,1729,3.0,3.646702,{'was_impossible': False}
4,5915,16,5.0,4.3218,{'was_impossible': False}


In [81]:
percentileMetric(pd.DataFrame(pred_alg, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.38264882325796

### KNNBasic

In [82]:
model_knnb = KNNBasic()
model_knnb.fit(trainset)
pred_knnb = model_knnb.test(testset)
accuracy.rmse(pred_knnb)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9285


0.928458983731706

In [83]:
pd.DataFrame(pred_knnb, columns=["user", "movie", "actualrating", "predictedrating", "info"]).head()

Unnamed: 0,user,movie,actualrating,predictedrating,info
0,4083,3431,1.0,2.513383,"{'actual_k': 40, 'was_impossible': False}"
1,2100,608,4.0,4.385161,"{'actual_k': 40, 'was_impossible': False}"
2,1120,3698,1.0,3.231681,"{'actual_k': 40, 'was_impossible': False}"
3,1895,1729,3.0,3.757955,"{'actual_k': 40, 'was_impossible': False}"
4,5915,16,5.0,3.727946,"{'actual_k': 40, 'was_impossible': False}"


In [84]:
percentileMetric(pd.DataFrame(pred_knnb, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.329967577582214

### NMF

In [85]:
model_nmf = NMF()
model_nmf.fit(trainset)
pred_nmf = model_nmf.test(testset)
accuracy.rmse(pred_nmf)

RMSE: 0.9195


0.9195303661621935

In [86]:
percentileMetric(pd.DataFrame(pred_nmf, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.263055194201821

### Other KNN models

In [87]:
def run_model(model, train_data, test_data):
    model_ = model()
    model_.fit(train_data)
    pred = model_.test(test_data)
    acc = accuracy.rmse(pred)
    return model_, pred, acc

In [88]:
model_knnm, pred_knnm, acc_knnm = run_model(KNNWithMeans, trainset, testset)
acc_knnm

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9334


0.9334435820221055

In [89]:
percentileMetric(pd.DataFrame(pred_knnm, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.272701767511456

In [90]:
model_knnz, pred_knnz, acc_knnz = run_model(KNNWithZScore, trainset, testset)
acc_knnz

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9344


0.9344145942568899

In [91]:
percentileMetric(pd.DataFrame(pred_knnz, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.280137559252719

## Tune SVD

In [105]:
nfactors_lst = [25, 50, 100]
n_epochs = [10, 15, 20]
zscore_lst = []
rmse_lst = []
for n_factor, n_epoch in zip(nfactors_lst, n_epochs):
    model_svd = SVD(n_factors=n_factor, n_epochs=n_epoch)
    model_svd.fit(trainset)
    pred_svd = model_svd.test(testset)
    zscore_lst.append(
        percentileMetric(
            pd.DataFrame(pred_svd, 
            columns=["user", "movie", "actualrating", "predictedrating", "info"])))
    rmse_lst.append(accuracy.rmse(pred_svd))

RMSE: 0.9041
RMSE: 0.8867
RMSE: 0.8815


In [106]:
zscore_lst

[4.316184649610679, 4.355403278081304, 4.376994742182456]

In [107]:
rmse_lst

[0.9041319968496241, 0.8867187685605266, 0.8815419538936002]