In [6]:
import os
import numpy as np
import pandas as pd
import AbstractBaseCollabFilterSGD
from CollabFilterOneVectorPerItem import CollabFilterOneVectorPerItem
from train_valid_test_loader import load_train_valid_test_datasets
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV
from surprise import KNNBaseline
from surprise import SVDpp
import surprise


In [7]:
import matplotlib
import matplotlib.pyplot as plt

# Start from matplotlib's bundled 'seaborn-v0_8' style sheet.
plt.style.use('seaborn-v0_8') # pretty matplotlib plots

import seaborn as sns
# Then apply seaborn's own theme: 'notebook' context, 'whitegrid' style, font scale 1.0.
sns.set('notebook', font_scale=1.0, style='whitegrid')

In [8]:
# Splits and sizes from the project loader (their contents are defined by that module).
(
    train_data_tuple,
    valid_data_tuple,
    test_data_tuple,
    total_n_users,
    total_n_items,
) = load_train_valid_test_datasets()

data_path = 'data_movie_lens_100k/'

# Read the four CSV files that live under data_path, in a single pass.
ratings_df, users_df, movies_df, masked_test_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        'ratings_all_development_set.csv',
        'user_info.csv',
        'movie_info.csv',
        'ratings_masked_leaderboard_set.csv',
    )
)

# Seeded 80/20 split of the development ratings.
train_df, val_df = train_test_split(ratings_df, random_state=42, test_size=0.2)

# Surprise dataset on a 1-5 rating scale.
# NOTE(review): it is built from train_df only, so anything fitted on `data`
# never sees the 20% held out in val_df -- confirm that is intended for the
# leaderboard models.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)

In [9]:

# Grid search over KNNBaseline neighbourhood sizes, scored by MAE.
# The similarity measure is fixed to pearson_baseline through the grid itself.
param_grid = {
    'k': [30, 40, 80, 150, 300],
    'min_k': [1, 10, 20, 30],
    'sim_options': {
        'name': ['pearson_baseline'],
    },
}

# Seeded 5-fold splitter: every candidate is scored on the same folds and the
# selected parameters are reproducible across kernel restarts.
gs = GridSearchCV(
    KNNBaseline,
    param_grid,
    measures=['mae'],
    cv=surprise.model_selection.KFold(n_splits=5, random_state=42, shuffle=True),
    n_jobs=-1,
)
gs.fit(data)

# Estimator configured with the best parameters; it still has to be fitted
# (a later cell calls best_model.fit) before predict() can be used.
best_model = gs.best_estimator['mae']

print("Best MAE score: ", gs.best_score['mae'])
print("Best parameters: ", gs.best_params['mae'])

Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline si

In [10]:
# trainset = data.build_full_trainset()
# model = surprise.prediction_algorithms.matrix_factorization.SVDpp(n_factors=50, lr_all=.01, reg_all=.1, n_epochs=150)

# model.fit(trainset)             

# predictions = model.test(val_df.values.tolist())
# mae = np.mean([abs(pred[2] - pred[3]) for pred in predictions])
# print("MAE on validation set: ", mae)

In [12]:
# Fit the three ensemble members on every rating contained in `data`.
# NOTE(review): `data` is built from train_df (the 80% split), so "full" here
# only means that no cross-validation fold is held out.
trainset = data.build_full_trainset()

# random_state pins the random initialisation / SGD ordering of the factor
# models so that re-running the notebook reproduces the same predictions.
model = surprise.prediction_algorithms.matrix_factorization.SVDpp(
    n_factors=50, lr_all=.01, reg_all=.1, n_epochs=150, random_state=42
)
SVD_model = surprise.prediction_algorithms.matrix_factorization.SVD(
    n_factors=200, lr_all=.005, reg_all=.1, n_epochs=100, random_state=42
)

model.fit(trainset)
SVD_model.fit(trainset)
# best_model is the KNNBaseline estimator selected by the grid search.
best_model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x17cd2c2d0>

In [13]:
# KNNBaseline estimate for every (user, item) pair of the masked leaderboard set.
predicted_ratings_KNN = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(masked_test_df['user_id'], masked_test_df['item_id'])
]

In [14]:
# SVD estimate for every (user, item) pair of the masked leaderboard set.
predicted_ratings_SVD = [
    SVD_model.predict(uid, iid).est
    for uid, iid in zip(masked_test_df['user_id'], masked_test_df['item_id'])
]

In [15]:
# SVD++ estimate (`model`) for every (user, item) pair of the masked leaderboard set.
predicted_ratings_SVDpp = [
    model.predict(uid, iid).est
    for uid, iid in zip(masked_test_df['user_id'], masked_test_df['item_id'])
]

In [16]:
# Unweighted ensemble: element-wise mean of the three models' estimates.
average_ratings = np.asarray(
    [predicted_ratings_SVDpp, predicted_ratings_KNN, predicted_ratings_SVD]
).mean(axis=0)

In [17]:
# Quick look at the averaged ensemble predictions.
print(average_ratings)

[4.14655587 4.08004245 3.84054461 ... 4.23504719 3.23769429 2.87047652]


In [18]:
# Write the ensemble predictions to the leaderboard submission file.
# NOTE(review): the KNN-only cell at the end of the notebook writes to this same path.
np.savetxt('predicted_ratings_leaderboard.txt',average_ratings)

In [None]:
# trainset = data.build_full_trainset()
# best_model.fit(trainset)

# predicted_ratings_KNN = []

# for user_id, item_id in zip(masked_test_df['user_id'], masked_test_df['item_id']):
#     pred = best_model.predict(user_id, item_id)
#     predicted_ratings_KNN.append(pred.est)

# predicted_ratings_SVD = []

# for user_id, item_id in zip(masked_test_df['user_id'], masked_test_df['item_id']):
#     pred = model.predict(user_id, item_id)
#     predicted_ratings_SVD.append(pred.est)

# average_ratings = np.mean([predicted_ratings_SVD, predicted_ratings_KNN], axis=0)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Rebuild the trainset from every rating in `data` and (re)fit the grid-searched KNN model on it.
trainset = data.build_full_trainset()
best_model.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x30d9d8250>

In [None]:
# Should submit final predictions as: predicted_ratings_leaderboard.txt
#
# KNN-only submission: one estimate from best_model per masked (user, item) pair.
# best_model must already be fitted; calling predict() on the unfitted estimator
# is what produced "'KNNBaseline' object has no attribute 'trainset'".
# NOTE(review): this overwrites the file written by the ensemble cell, so
# whichever of the two cells runs last decides what gets submitted.
predicted_ratings_leaderboard = np.clip(
    [
        best_model.predict(user_id, item_id).est
        for user_id, item_id in zip(masked_test_df['user_id'], masked_test_df['item_id'])
    ],
    # Keep every estimate inside the 1-5 rating scale; values are left unrounded.
    1.0,
    5.0,
)

np.savetxt('predicted_ratings_leaderboard.txt', predicted_ratings_leaderboard)

AttributeError: 'KNNBaseline' object has no attribute 'trainset'