In [17]:
import os
import numpy as np
import pandas as pd
import AbstractBaseCollabFilterSGD
from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise import SVD

In [18]:
import matplotlib
import matplotlib.pyplot as plt

plt.style.use('seaborn-v0_8') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', font_scale=1.0, style='whitegrid')

In [19]:
train_data_tuple, valid_data_tuple, test_data_tuple, total_n_users, total_n_items = load_train_valid_test_datasets()

data_path = 'data_movie_lens_100k/'

ratings_df = pd.read_csv(os.path.join(data_path, 'ratings_all_development_set.csv'))
users_df = pd.read_csv(os.path.join(data_path, 'user_info.csv'))
movies_df = pd.read_csv(os.path.join(data_path, 'movie_info.csv'))
masked_test_df = pd.read_csv(os.path.join(data_path, 'ratings_masked_leaderboard_set.csv'))

train_df, val_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [20]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)

In [21]:
param_grid = {
    'n_factors': [2, 20, 50, 100, 200, 1000],
    'lr_all': [0.005, 0.01, 0.1],
    'reg_all': [0.02, 0.05, 0.1, 1]
}

gs = GridSearchCV(SVD, param_grid, measures=['mae'], cv=5, n_jobs=-1, joblib_verbose=2)
gs.fit(data)
best_model = gs.best_estimator['mae']

print("Best MAE score: ", gs.best_score['mae'])
print("Best parameters: ", gs.best_params['mae'])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
python(14757) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14758) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14759) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14760) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14761) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14762) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14763) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(14764) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   20.1s


Best MAE score:  0.7402192186186831
Best parameters:  {'n_factors': 200, 'lr_all': 0.01, 'reg_all': 0.1}


[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.0min finished


In [22]:
trainset = data.build_full_trainset()
best_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17b6f5910>

In [24]:
# Should submit final predictions as: predicted_ratings_leaderboard.txt

predicted_ratings_leaderboard = []
for user_id, item_id in zip(masked_test_df['user_id'], masked_test_df['item_id']):
    pred = best_model.predict(user_id, item_id)
    predicted_ratings_leaderboard.append(pred.est)

# Make sure its in range, should we maybe round these values tho?
predicted_ratings_leaderboard = predictions = np.clip(predicted_ratings_leaderboard, 1.0, 5.0)

np.savetxt('predicted_ratings_leaderboard.txt', predicted_ratings_leaderboard)