## Preliminary Model

In [73]:
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset

from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from surprise import SVD, KNNBasic, NMF, KNNWithMeans, KNNWithZScore

import sys
sys.path.append("./../")
from src.utils import percentileMetric

In [74]:
df = pd.read_csv("./../data/training.csv").drop(columns="timestamp")

In [75]:
df.columns

Index(['user', 'movie', 'rating'], dtype='object')

In [76]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["user", "movie", "rating"]], reader)

In [77]:
# cross_validate(model_alg, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

In [78]:
trainset, testset = train_test_split(data, test_size=0.2)

### Matrix factorization: SVD

In [79]:
model_alg = SVD()
model_alg.fit(trainset)
pred_alg = model_alg.test(testset)
accuracy.rmse(pred_alg)

RMSE: 0.8807


0.8807349831894097

In [80]:
pd.DataFrame(pred_alg).head()

Unnamed: 0,uid,iid,r_ui,est,details
0,4083,3431,1.0,1.992204,{'was_impossible': False}
1,2100,608,4.0,3.716298,{'was_impossible': False}
2,1120,3698,1.0,3.483819,{'was_impossible': False}
3,1895,1729,3.0,3.646702,{'was_impossible': False}
4,5915,16,5.0,4.3218,{'was_impossible': False}


In [81]:
percentileMetric(pd.DataFrame(pred_alg, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.38264882325796

### KNNBasic

In [82]:
model_knnb = KNNBasic()
model_knnb.fit(trainset)
pred_knnb = model_knnb.test(testset)
accuracy.rmse(pred_knnb)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9285


0.928458983731706

In [83]:
pd.DataFrame(pred_knnb, columns=["user", "movie", "actualrating", "predictedrating", "info"]).head()

Unnamed: 0,user,movie,actualrating,predictedrating,info
0,4083,3431,1.0,2.513383,"{'actual_k': 40, 'was_impossible': False}"
1,2100,608,4.0,4.385161,"{'actual_k': 40, 'was_impossible': False}"
2,1120,3698,1.0,3.231681,"{'actual_k': 40, 'was_impossible': False}"
3,1895,1729,3.0,3.757955,"{'actual_k': 40, 'was_impossible': False}"
4,5915,16,5.0,3.727946,"{'actual_k': 40, 'was_impossible': False}"


In [84]:
percentileMetric(pd.DataFrame(pred_knnb, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.329967577582214

### NMF

In [85]:
model_nmf = NMF()
model_nmf.fit(trainset)
pred_nmf = model_nmf.test(testset)
accuracy.rmse(pred_nmf)

RMSE: 0.9195


0.9195303661621935

In [86]:
percentileMetric(pd.DataFrame(pred_nmf, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.263055194201821

### Other KNN models

In [87]:
def run_model(model, train_data, test_data):
    model_ = model()
    model_.fit(train_data)
    pred = model_.test(test_data)
    acc = accuracy.rmse(pred)
    return model_, pred, acc

In [88]:
model_knnm, pred_knnm, acc_knnm = run_model(KNNWithMeans, trainset, testset)
acc_knnm

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9334


0.9334435820221055

In [89]:
percentileMetric(pd.DataFrame(pred_knnm, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.272701767511456

In [90]:
model_knnz, pred_knnz, acc_knnz = run_model(KNNWithZScore, trainset, testset)
acc_knnz

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9344


0.9344145942568899

In [91]:
percentileMetric(pd.DataFrame(pred_knnz, columns=["user", "movie", "actualrating", "predictedrating", "info"]))

4.280137559252719

## Tune SVD

In [111]:
nfactors_lst = [90, 95, 100, 105, 110]
n_epochs = [18, 19, 20, 21, 22]
zscore_lst = []
rmse_lst = []
for n_factor, n_epoch in zip(nfactors_lst, n_epochs):
    model_svd = SVD(n_factors=n_factor, n_epochs=n_epoch)
    model_svd.fit(trainset)
    pred_svd = model_svd.test(testset)
    zscore_lst.append(
        percentileMetric(
            pd.DataFrame(pred_svd, 
            columns=["user", "movie", "actualrating", "predictedrating", "info"])))
    rmse_lst.append(accuracy.rmse(pred_svd))

RMSE: 0.8819
RMSE: 0.8819
RMSE: 0.8826
RMSE: 0.8833
RMSE: 0.8848


In [112]:
zscore_lst

[4.3722587211992225,
 4.37159227428149,
 4.373511767420397,
 4.372155411655874,
 4.369479022591055]

In [113]:
rmse_lst

[0.8819050260193491,
 0.8818916015823619,
 0.8825506739682687,
 0.8832556394192941,
 0.8847626286696013]

## Output

In [129]:
pd.read_csv("./../data/requests.csv").head()

Unnamed: 0,user,movie
0,4958,1924
1,4958,3264
2,4958,2634
3,4958,1407
4,4958,2399


In [179]:
data_final = pd.read_csv("./../data/requests.csv", names=["user", "movie"])

In [180]:
# reader_final = Reader(rating_scale=(1, 5))
# data_final = Dataset.load_from_df(df[["user", "movie", "rating"]], reader_final)

In [181]:
model_svd = SVD(n_factors=100, n_epochs=20)
model_svd.fit(trainset)
pred = model_svd.test(testset)

In [182]:
# pred = model_svd.test(data_final)

In [183]:
rows = []
for idx, row in enumerate(data_final.iterrows()):
    if idx == 0:
        continue
#     print(row[1]["user"])
#     print(row[1]["movie"])
    
#     if row[0] == 5:
#         break
    rows.append(model_svd.predict(row[1]["user"], row[1]["movie"]))

# model_svd.predict(4958, 1924)

In [193]:
rows[:1]

[Prediction(uid='4958', iid='1924', r_ui=None, est=3.5911546875, details={'was_impossible': False})]

In [192]:
pd.DataFrame(rows).drop(columns=["r_ui", "details"]).rename(columns={"uid": "user", "iid": "movie", "est": "rating"})
#.to_csv("./../data/submission-svd.csv")

Unnamed: 0,user,movie,rating
0,4958,1924,3.591155
1,4958,3264,3.591155
2,4958,2634,3.591155
3,4958,1407,3.591155
4,4958,2399,3.591155
...,...,...,...
200204,1875,3793,3.591155
200205,1875,2160,3.591155
200206,1875,1035,3.591155
200207,1875,1580,3.591155


In [196]:
full_testset = pd.read_csv("./../data/requests.csv")
full_testset['predicted_rating'] = 0
full_testset

Unnamed: 0,user,movie,predicted_rating
0,4958,1924,0
1,4958,3264,0
2,4958,2634,0
3,4958,1407,0
4,4958,2399,0
...,...,...,...
200204,1875,3793,0
200205,1875,2160,0
200206,1875,1035,0
200207,1875,1580,0


In [198]:
for idx in full_testset.index:
    full_testset.loc[idx, 'predicted_rating'] = model_svd.predict(str(full_testset['user'][idx]), str(full_testset['movie'][idx]))

In [199]:
full_testset

Unnamed: 0,user,movie,predicted_rating
0,4958,1924,"(4958, 1924, None, 3.5911546875, {'was_impossi..."
1,4958,3264,"(4958, 3264, None, 3.5911546875, {'was_impossi..."
2,4958,2634,"(4958, 2634, None, 3.5911546875, {'was_impossi..."
3,4958,1407,"(4958, 1407, None, 3.5911546875, {'was_impossi..."
4,4958,2399,"(4958, 2399, None, 3.5911546875, {'was_impossi..."
...,...,...,...
200204,1875,3793,"(1875, 3793, None, 3.5911546875, {'was_impossi..."
200205,1875,2160,"(1875, 2160, None, 3.5911546875, {'was_impossi..."
200206,1875,1035,"(1875, 1035, None, 3.5911546875, {'was_impossi..."
200207,1875,1580,"(1875, 1580, None, 3.5911546875, {'was_impossi..."


In [None]:
# zscore_lst.append(
#     percentileMetric(
#         pd.DataFrame(pred_svd, 
#         columns=["user", "movie", "actualrating", "predictedrating", "info"])))
# rmse_lst.append(accuracy.rmse(pred_svd))