# Imports and Installs

In [1]:
# Doesn't come default with our anaconda installations
# http://surpriselib.com/

!pip install surprise



In [2]:
from datetime import datetime

def now():
    return str(datetime.now())

In [3]:
print(now())

import pandas as pd
import numpy as np

from surprise import KNNBaseline
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

print(now())

2018-04-20 21:18:40.177000
2018-04-20 21:18:40.572000


# Loading In Data

In [5]:
# Only using 1mil lines bc my computer cant handle more than that
# Ran this one already

print(now())
infile = open('rating.csv', 'r')
outfile = open('ratingSubset.csv', 'w')

#for i in range(1000000):
#    outfile.write(infile.readline())
print(now())

infile.close()
outfile.close()

2018-04-20 21:18:52.652000
2018-04-20 21:18:52.656000


Here I route the data through Pandas so I can adjust the "-1" ratings. -1, in the dataset, represents "User has seen this item but declined to rate it". I choose to interpret this as "User mildly likes item"--even if they outright hated it, they had enough interest in the item to 1) watch it, and 2) mark the item as watched (MyAnimeList does not automatically flag items you've seen, it's not Netflix)

Thus I changed all -1s to 5s.

In [6]:
print(now())

ratingSubset = pd.read_csv('rating.csv')
ratingSubset = ratingSubset[ratingSubset.user_id <= 6000]
print(now())

2018-04-20 21:18:55.910000
2018-04-20 21:18:58.301000


In [7]:
ratingSubset["rating"] = ratingSubset["rating"].replace(to_replace = -1, value = 5)

In [8]:
ratingSubset.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,5
1,1,24,5
2,1,79,5
3,1,226,5
4,1,241,5


In [9]:
# Reader class: http://surprise.readthedocs.io/en/stable/reader.html#surprise.reader.Reader
# Using custom datasets: http://surprise.readthedocs.io/en/stable/getting_started.html#load-custom

# Directly from file:
# reader = Reader(line_format='user item rating', sep=',', rating_scale=(1,10), skip_lines=1)
# ratingSubset = Dataset.load_from_file("ratingSubset.csv", reader=reader)

# From Pandas DataFrame:
reader = Reader(rating_scale=(1, 10))

ratingSubset = Dataset.load_from_df(ratingSubset[['user_id', 'anime_id', 'rating']], reader)


# Training and Sample Predictions

Largely following this guide: http://surprise.readthedocs.io/en/stable/getting_started.html

## Training

In [23]:
# http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD
sim_options = {'name': 'pearson_baseline',
               }
bsl_options = {'method': 'sgd',
                'learning_rate': .005,'n_epochs': 50
               }
#bsl_options = {'method': 'als',
#               'n_epochs': 50,'reg_u': 20,
#               'reg_i': 25
#               }
algo = KNNBaseline(200,1,sim_options=sim_options,bsl_options=bsl_options)

In [24]:
trainingRatingSubset, testRatingSubset = train_test_split(ratingSubset, test_size=.2)

In [25]:
print(now())
algo.fit(trainingRatingSubset)
print(now())

2018-04-20 21:27:46.484000
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
2018-04-20 21:29:08.190000


## Generating Predictions

This is done through the predict() method (explained on the getting started page)

We can specify any user_id and anime_id we want (as long as we don't specify ratings on nonexistent anime or nonexistent users).

For end-user-demo-purposes or whatever, we would need to:
* Collect the user's data
* Add the user's data as new rows of the dataset (imagine adding new rows to rating.csv). 
* From there we could simply run the same process (go through pandas DF, then the surprise dataset type), now able to treat them as an "existing" user.

In [26]:
# We'll use the known line 373,11771,8

pred = algo.predict(373, 11711, verbose=True)
#r_ui is an optional parameter displaying the ground truth value

user: 373        item: 11711      r_ui = None   est = 7.16   {u'was_impossible': False}


This prediction isn't too far off, which is nice. 

In [27]:
# Some other sample explorations
user373File = open('ratingUser373.txt','r')
print("Ground Truth\tPrediction")

for line in user373File:
    tokens = line.strip().split(',')
    groundTruth = tokens[2]
    pred = algo.predict(int(tokens[0]), int(tokens[1]))
    print(str(groundTruth)+"\t\t"+str(pred.est))

Ground Truth	Prediction
8		7.322367054772657
-1		7.092108785595243
7		7.231108766464553
4		6.278871759388984
10		8.949432037327508
9		8.000618344436974
8		7.5027814503179835
9		7.93872621015756
-1		6.8060882133947
8		7.743934960231693
-1		6.361153549759088
-1		6.0765856679910915
6		6.631459693222434
-1		6.227468459864687
-1		6.8245540779840415
10		8.602001388685672
10		9.199434222978134
9		8.593639379766234
-1		7.594088426554322
4		5.952338649413228
-1		6.482859844511696
-1		6.056521079764605
-1		5.808158875713377
-1		5.733652613384064
-1		5.623319443872953
-1		5.574227526550976
-1		7.100156151800932
7		7.64716535265201
-1		6.510451837394258
9		8.677834518550359
-1		6.364813142953612
7		8.342507617433549
9		7.913071977901588
7		7.329952209630577
-1		6.900859799542011
10		9.02424935479901
10		9.090988271699993
6		6.990564274289637
10		8.846070492287813
-1		5.641554264419958
-1		7.0212609763383735
-1		5.636547474506153
-1		5.843453432457027
-1		8.889832088936902
-1		6.685223536015804
10	

~~...but all of the predicted ratings are the same.~~

~~I believe this to be a problem with the SVD approach itself or with the data, not necesasrily with how I coded it--something about how it just uses the default mean and the user factors aren't there or something. I remember this being a recurring issue in previous implementations though.~~

EDIT: See this github page here: https://github.com/NicolasHug/Surprise/issues/82

Turns out that, for DataFrames specifically, the user_id and item_id should be *ints*. In all other cases (and all the documentation?) predict() uses strings.

## Evaluating Predictions

`Surprise` has RMSE, MAE, and FCP (Fraction of Concordant Pairs) in its accuracy toolbox. It is possible to calculate precision and recall with some hacks detailed here: http://surprise.readthedocs.io/en/stable/FAQ.html#how-to-compute-precision-k-and-recall-k

In [28]:
# Get the test predictions
print(now())
predictions = algo.test(testRatingSubset)
print(now())

2018-04-20 21:29:23.378000
2018-04-20 21:31:06.264000


In [29]:
accuracy.rmse(predictions)

RMSE: 1.2062


1.2062479131392823