In [1]:
import gzip
import math
import random
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.optimize
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from surprise import SVD, Reader, Dataset, accuracy
from surprise.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" filter also hides warnings that may point at
# real data problems; consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

In [3]:
# Load the training ratings. Each row is one entry of a user's anime list; the
# columns used later are username, anime_id and my_score.
# The CSV carries a saved index column named 'Unnamed: 0', which is discarded.
ratings_df = pd.read_csv('./dataset/ratings_training.csv')
ratings_df = ratings_df.drop('Unnamed: 0', axis=1)
ratings_df.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,tamama_kuroyuki,14817,1,0000-00-00,0000-00-00,7,2,,0,2015-09-08 05:17:10,
1,quazzaar,2449,1,0000-00-00,0000-00-00,8,2,0.0,0,2017-01-26 20:02:53,
2,abhay_shinigami,25013,24,0000-00-00,0000-00-00,9,2,0.0,0,2016-07-03 05:20:28,
3,xever,1566,1,0000-00-00,0000-00-00,7,2,0.0,0,2009-12-29 16:52:36,
4,Equintus,5030,12,0000-00-00,0000-00-00,0,2,0.0,0,2014-09-20 13:30:37,


In [4]:
# Keep only the three columns the recommender needs and rename them to the
# (user, item, rating) layout that is later handed to Dataset.load_from_df.
# Bracket selection raises a KeyError naming any missing column, whereas
# DataFrame.get() would return None and only fail later on .rename().
dataset = ratings_df[['username', 'anime_id', 'my_score']].rename(
    columns={'username': 'user', 'anime_id': 'item', 'my_score': 'rating'}
)
dataset

Unnamed: 0,user,item,rating
0,tamama_kuroyuki,14817,7
1,quazzaar,2449,8
2,abhay_shinigami,25013,9
3,xever,1566,7
4,Equintus,5030,0
...,...,...,...
199995,Nara-Sama,4437,9
199996,rxkxt,164,10
199997,Synizta,10162,8
199998,SlyVengeance,31798,9


In [5]:
# Load the held-out ratings used to score the baselines; the saved index
# column 'Unnamed: 0' is discarded, as for the training file.
testing = pd.read_csv('./dataset/ratings_testing.csv')
testing = testing.drop('Unnamed: 0', axis=1)
testing

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,dan_sagara,24,26,0000-00-00,0000-00-00,8,2,0.0,0,2009-03-28 04:12:56,
1,expecional,7593,12,0000-00-00,2010-07-11,9,2,0.0,0,2010-07-11 17:51:12,"em comparação aos OVA, o anime é desenho de cr..."
2,One_PIece,12113,1,2013-07-07,2013-07-07,8,2,,0,2013-07-20 04:01:32,
3,Aseru,30831,10,0000-00-00,0000-00-00,10,2,0.0,0,2018-02-18 19:52:45,
4,Pauli25,27417,1,0000-00-00,0000-00-00,5,2,0.0,0,2015-04-07 14:54:27,
...,...,...,...,...,...,...,...,...,...,...,...
99995,vergil1111,855,26,0000-00-00,0000-00-00,0,2,,0,2009-11-05 16:19:54,
99996,akumaxkr,846,26,0000-00-00,0000-00-00,8,2,0.0,0,2009-08-18 17:58:28,
99997,Rikki,21507,12,0000-00-00,0000-00-00,3,2,0.0,0,2014-10-30 22:37:40,
99998,SerShiro,1699,24,0000-00-00,0000-00-00,9,2,0.0,0,2011-12-31 17:03:21,


In [186]:
# Pull out the user / anime / score columns of the training and test frames.
# Bracket indexing is used instead of DataFrame.get() so that a misspelled or
# missing column fails here with a KeyError rather than yielding None.
users = ratings_df['username']
animes = ratings_df['anime_id']
scores = ratings_df['my_score']
testing_users = testing['username']
testing_animes = testing['anime_id']
testing_scores = testing['my_score']

In [187]:
# Held-out (user, anime, score) triples and the bare scores, in file order.
# The column arrays are zipped once instead of doing three `.iloc[i]` lookups
# per row in a Python loop; element types are the same as with `.iloc`.
testingTrueScores = list(testing_scores.to_numpy())
testingSet = list(zip(testing_users.to_numpy(), testing_animes.to_numpy(), testingTrueScores))

len(testingSet)

100000

In [178]:
# Training (user, anime, score) triples and the bare scores, in file order.
# The column arrays are zipped once instead of doing three `.iloc[i]` lookups
# per row in a Python loop; element types are the same as with `.iloc`.
trueScores = list(scores.to_numpy())
allRatings = list(zip(users.to_numpy(), animes.to_numpy(), trueScores))
len(allRatings)

200000

In [179]:
# Group the training scores by user and by anime, leaving out NaN scores.
# NOTE(review): rows with a score of 0 are present in the data and are kept
# here; if 0 encodes "not rated" in this dataset they should be excluded too —
# confirm against the dataset description.
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
for user_name, anime_id, score in allRatings:
    if not np.isnan(score):
        ratingsPerUser[user_name].append(score)
        ratingsPerItem[anime_id].append(score)

In [179]:
# Spot-check the first training triple: (username, anime_id, score).
allRatings[0]

('tamama_kuroyuki', 14817, 7)

In [180]:
# Spot-check the training scores collected for one user.
# NOTE(review): ratingsPerUser is a defaultdict(list), so indexing it with a
# name that is absent would insert an empty list for that name.
ratingsPerUser['tamama_kuroyuki']

[7, 7, 7, 8, 8, 6, 7, 7, 4, 7, 10, 7, 7, 8, 8, 6, 8, 8]

In [175]:
# Baseline 1: predict a uniformly random integer score in [0, 10] per test row.
# A dedicated, seeded generator makes the reported MSE reproducible across
# kernel restarts (the unseeded global RNG gave a different number every run).
baseline_rng = np.random.default_rng(42)
random_pred = baseline_rng.integers(0, 11, size=len(testingTrueScores))  # upper bound is exclusive
mean_squared_error(testingTrueScores, random_pred)

21.70265

In [192]:
# Mean of every score in the training set (one entry per training row, i.e. not
# an average of per-user averages). Used as the fallback prediction for users
# that have no training scores.
global_avg = np.mean(trueScores)
global_avg

6.713695

In [196]:
# Fallback prediction for anime that have no training scores.
# NOTE(review): this literal is not derived anywhere in the visible notebook —
# presumably an average of per-anime scores taken from another source. Confirm
# its origin, or compute it from the training data so the notebook reproduces.
anime_avg = 6.848998200359928

In [193]:
# Baseline 2: predict each test user's mean training score, falling back to the
# global training mean for users with no training scores.
# Each user's mean is computed once, and lookups go through dict.get() so that
# unseen test users are not inserted into the ratingsPerUser defaultdict as
# empty lists (the `if user_scores` guard also skips any such empty entries
# left behind by earlier lookups).
user_means = {
    user_name: np.mean(user_scores)
    for user_name, user_scores in ratingsPerUser.items()
    if user_scores
}
avg_preds = [user_means.get(user_name, global_avg) for user_name, _item, _score in testingSet]
mean_squared_error(testingTrueScores, avg_preds)

6.313792202574026

In [197]:
# Baseline 3: predict each test anime's mean training score, falling back to
# anime_avg for anime with no training scores.
# Each anime's mean is computed once, and lookups go through dict.get() so that
# unseen anime are not inserted into the ratingsPerItem defaultdict as empty
# lists (the `if item_scores` guard also skips any such empty entries left
# behind by earlier lookups).
item_means = {
    anime_id: np.mean(item_scores)
    for anime_id, item_scores in ratingsPerItem.items()
    if item_scores
}
avg_anime_preds = [item_means.get(anime_id, anime_avg) for _user, anime_id, _score in testingSet]
mean_squared_error(testingTrueScores, avg_anime_preds)

8.322801402456909

In [6]:
# Wrap the (user, item, rating) training frame for Surprise. The Reader declares
# a 0-10 rating scale, and the full dataset is turned into a trainset, so no
# rows are held out from fitting at this step.
reader = Reader(rating_scale=(0, 10))
train_data = Dataset.load_from_df(dataset, reader=reader)
trainset = train_data.build_full_trainset()

In [7]:
# Read the held-out ratings and reshape them to the same (user, item, rating)
# layout as the training frame.
# Bracket selection raises a KeyError naming any missing column, whereas
# DataFrame.get() would return None and only fail later on .rename().
test_ratings = pd.read_csv('./dataset/ratings_testing.csv').drop(columns=['Unnamed: 0'])
test_df = test_ratings[['username', 'anime_id', 'my_score']].rename(
    columns={'username': 'user', 'anime_id': 'item', 'my_score': 'rating'}
)
test_df

Unnamed: 0,user,item,rating
0,dan_sagara,24,8
1,expecional,7593,9
2,One_PIece,12113,8
3,Aseru,30831,10
4,Pauli25,27417,5
...,...,...,...
99995,vergil1111,855,0
99996,akumaxkr,846,8
99997,Rikki,21507,3
99998,SerShiro,1699,9


In [8]:
# Wrap the test frame with the same 0-10 Reader, then round-trip it through a
# full trainset to obtain the object later passed to model.test().
# NOTE(review): presumably build_testset() yields (user, item, rating) triples
# keyed by the original ids — confirm against the Surprise documentation.
test_data = Dataset.load_from_df(test_df, reader=reader)
testset = test_data.build_full_trainset().build_testset()  # build as a test set

In [126]:
# Grid-search SVD's regularisation strength (reg_all) with a single latent
# factor, scored by MSE under 20-fold cross-validation.
# SVD defaults, for reference: reg_all=0.02, lr_all=0.005, n_factors=100, n_epochs=20.
# Alternative ranges left by the author (not active):
#   reg_all:   np.linspace(0, 0.5, num=25)
#   lr_all:    np.linspace(0.002, 0.008, num=5), np.linspace(0.007, 0.008, num=10),
#              np.linspace(0.008, 0.015, num=15)
#   n_factors: [10, 30, 50, 70, 90]
#   n_epochs:  [20, 25, 30, 35, 40]
# Reference point noted by the author: best MSE on HW 3 -- 1.4562096523896382
reg_candidates = np.linspace(0.05, 0.19, num=8)
param_grid = {'n_factors': [1], 'reg_all': reg_candidates}
gs = GridSearchCV(SVD, param_grid, measures=["mse"], cv=20)

gs.fit(train_data)

print(gs.best_score["mse"])   # best (lowest) cross-validated MSE
print(gs.best_params["mse"])  # parameter combination that achieved it

6.386235800955032
{'n_factors': 1, 'reg_all': 0.07}


In [137]:
# Randomised search over a wider SVD hyper-parameter space: 5 sampled
# combinations, each scored by RMSE under 3-fold cross-validation. The search's
# random_state fixes which combinations are sampled.
# (RandomizedSearchCV is imported with the other dependencies at the top of the
# notebook rather than inside this cell.)
param_grid = {
    'n_factors': [1, 3, 10],
    'n_epochs': [10, 20, 30],
    'lr_all': [0.002, 0.005, 0.007, 0.01],
    'reg_all': [0.02, 0.05, 0.1]
}

random_search = RandomizedSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_iter=5, random_state=42)
random_search.fit(train_data)

best_params = random_search.best_params['rmse']
print("Best parameters:", best_params)

Best parameters: {'n_factors': 3, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}


In [132]:
# set params to optimal values
# NOTE(review): none of these three names is read by the model-fitting cell in
# this notebook, which passes literals instead (reg_all=0.15 there versus
# lamb = 0.09 here) — reconcile the two before relying on either.
lamb = 0.09
opt_lr = 0.015
opt_factors = 1

In [21]:
# Final model: SVD with one latent factor and hand-picked regularisation and
# learning rate, passed as literals. SVD's initialisation is random, so
# random_state is fixed to make the reported test error reproducible.
model = SVD(reg_all=0.15, n_factors=1, lr_all=0.015, n_epochs=20, random_state=42)

# fit model using training data
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1481d75f0>

In [22]:
# Score every entry of the held-out testset with the fitted model; the result
# is what the accuracy.rmse / accuracy.mse calls below consume.
predictions = model.test(testset)

In [23]:
# RMSE of the SVD predictions on the held-out set (verbose=True also prints it).
accuracy.rmse(predictions, verbose=True)

RMSE: 2.4193


2.4193267720778335

In [24]:
# MSE of the SVD predictions on the held-out set — the figure to compare with
# the baseline MSEs computed via mean_squared_error earlier in the notebook.
accuracy.mse(predictions, verbose=True)

MSE: 5.8531


5.853142030092549