In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import cross_validate


In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [3]:
# Load the ratings export and drop the columns that are not used in this notebook.
data = (
    pd.read_csv('data/final_df.csv')
    .drop(columns=['Unnamed: 0', 'r_year', 'r_fiscQ', 'm_decade', 'm_avg_rating', 'm_quality'])
)
# Cast the review date column to datetime64[ns] so it can be sorted chronologically.
data['r_date'] = data['r_date'].astype('datetime64[ns]')
data.info()

Unnamed: 0,mid,cust_id,rating,r_date
0,3113,510180,3.0,1999-12-12
1,3113,739622,4.0,2001-07-15
2,3113,200684,1.0,2001-10-15
3,3113,2559624,3.0,2002-08-27
4,3113,1557262,3.0,2003-11-06


In [10]:
# Keep just the movie id, customer id, rating and review date columns.
data = data.loc[:, ['mid', 'cust_id', 'rating', 'r_date']]
data.head()

Unnamed: 0,mid,cust_id,rating,r_date
0,3113,510180,3.0,1999-12-12
19,11242,830363,3.0,1999-12-21
133,1642,872963,4.0,1999-12-30
178,14574,882798,3.0,1999-12-30
59,16438,882798,4.0,1999-12-30


# Train/Test Split: out of time approach

In [11]:
# Target number of rows for the hold-out test set: 20% of all ratings.
testsize = round(0.2 * len(data))
testsize

20000

In [12]:
# Number of ratings given by each customer, most active customers first.
ratings_per_customer = data.groupby('cust_id')['rating'].count()
ratings_per_customer.sort_values(ascending=False)

cust_id
305344     23
2118461    20
2439493    16
387418     15
1639792    14
           ..
990585      1
990639      1
990697      1
990773      1
2649378     1
Name: rating, Length: 77372, dtype: int64

This approach would work better with more data and with fewer users who have rated only once.

In [13]:
# Order rows by customer and, within each customer, by review date.
# The per-customer split that follows treats a customer's last row as their most
# recent review; sorting on cust_id alone leaves the order inside each customer
# unspecified, so r_date is included as the secondary key.
data = data.sort_values(by=['cust_id', 'r_date'])
data = data[['cust_id', 'mid', 'rating', 'r_date']]
data

Unnamed: 0,cust_id,mid,rating,r_date
61077,6,10730,5.0,2004-09-15
86435,7,4465,2.0,2005-05-23
69300,134,3638,5.0,2004-11-28
25159,134,16912,5.0,2005-04-19
6587,195,11209,4.0,2005-05-16
...,...,...,...,...
83453,2649202,16265,4.0,2005-08-03
35733,2649257,17441,4.0,2005-10-08
21953,2649296,798,4.0,2002-02-19
9243,2649336,2862,4.0,2004-11-22


In [14]:
#splitting data into 3 groups based on how many reviews they've given
# Per-customer hold-out, assigned in a single vectorised pass:
#   more than 2 reviews -> latest review is test (3), second latest is val (2), rest train (1)
#   exactly 2 reviews   -> latest review is test (3), the other one is train (1)
#   exactly 1 review    -> train (1)
# Reviews are ranked by r_date inside each customer here, so the result does not
# depend on whatever order the rows of `data` currently happen to be in.
chronological = data.sort_values(by=['cust_id', 'r_date'])
per_customer = chronological.groupby('cust_id')
reviews_from_end = per_customer.cumcount(ascending=False)   # 0 = latest review
reviews_from_start = per_customer.cumcount()                # 0 = earliest review
review_count = reviews_from_start + reviews_from_end + 1    # reviews by that customer

# Float codes keep the same dtype the column had when it was filled label by label.
split = pd.Series(1.0, index=chronological.index)
split[(review_count >= 2) & (reviews_from_end == 0)] = 3.0
split[(review_count > 2) & (reviews_from_end == 1)] = 2.0

# Column assignment aligns on index labels, so the row order of `data` is untouched.
data['split'] = split

test = data[(data['split'] == 3)]
val = data[(data['split'] == 2)]
train = data[(data['split'] == 1 )]

In [20]:
# Size of the test split so far: the rows labelled split == 3.
test.shape

(16706, 5)

In [21]:
# Size of the validation split so far: the rows labelled split == 2.
val.shape

(4180, 5)

In [22]:
# Size of the training split before any rows are moved out to top up test and val.
train.shape

(79114, 5)

In [23]:
# Put the training rows in date order so the most recent ones sit at the end.
train = train.sort_values('r_date')
train

Unnamed: 0,cust_id,mid,rating,r_date,split
0,510180,3113,3.0,1999-12-12,1.0
51,2114455,17563,3.0,1999-12-30,1.0
178,882798,14574,3.0,1999-12-30,1.0
133,872963,1642,4.0,1999-12-30,1.0
277,64765,6534,5.0,2000-01-05,1.0
...,...,...,...,...,...
52514,563705,14277,2.0,2005-12-31,1.0
55383,200254,2660,3.0,2005-12-31,1.0
12910,28205,10371,4.0,2005-12-31,1.0
82256,862011,14111,4.0,2005-12-31,1.0


In [24]:
#adding only the end of training set to test
# Top the test set up to `testsize` rows using the most recent training rows
# (train is sorted by r_date, so they are at the end).
# The count is clamped at zero and taken with tail(): a `train[-0:]` slice would
# select every training row if test already held `testsize` rows.
n_extra_test = max(testsize - len(test), 0)
test2 = train.tail(n_extra_test)
test = pd.concat([test, test2])

#20k
test.shape

(3294, 5)

In [26]:
# Drop from train every row whose index label now appears in test.
moved_to_test = train.index.isin(test.index)
train = train[~moved_to_test]
train.shape

(75820, 5)

In [27]:
# Sanity check: look at the test rows in date order.
test = test.sort_values('r_date')
test

Unnamed: 0,cust_id,mid,rating,r_date,split
19,830363,11242,3.0,1999-12-21,3.0
59,882798,16438,4.0,1999-12-30,3.0
281,764785,9728,4.0,2000-01-06,3.0
437,1356928,8784,3.0,2000-01-07,3.0
678,1438498,8840,4.0,2000-01-07,3.0
...,...,...,...,...,...
30350,307530,3106,3.0,2005-12-31,1.0
82993,2644224,4621,5.0,2005-12-31,1.0
17900,2638089,12500,3.0,2005-12-31,1.0
59843,594259,10596,3.0,2005-12-31,1.0


In [28]:
# Top the validation set up to the same size as the test set with a seeded random
# sample (without replacement) of the remaining training rows. Unlike the test
# top-up, this is a random draw, not the most recent rows.
# `testsize` replaces the literal 20000 so val and test stay the same size if the
# amount of data changes; the count is clamped so it can never go negative.
n_extra_val = max(testsize - len(val), 0)
val1 = train.sample(n=n_extra_val, replace=False, random_state=1)
val = pd.concat([val, val1])

#20k
val.shape

(20000, 5)

In [29]:
# Drop from train the rows that were just sampled into the validation set.
sampled_for_val = train.index.isin(val1.index)
train = train[~sampled_for_val]
train.shape

(60000, 5)

# Baseline using Surprise

In [30]:
# Surprise reader configured for a rating scale running from 1 to 5.
reader = Reader(rating_scale=(1,5))

In [31]:
# Wrap each split as a Surprise Dataset, passing the user, item and rating columns
# in that order.
rating_cols = ['cust_id', 'mid', 'rating']
train_data, val_data, test_data = (
    Dataset.load_from_df(frame[rating_cols], reader)
    for frame in (train, val, test)
)

In [32]:
# Training ratings as a Surprise trainset for fitting.
train_sr = train_data.build_full_trainset()

# The hold-out datasets are first built as full trainsets and then converted to
# testsets, which is the form passed to the algorithms' test() calls below.
val_sr_before, test_sr_before = (
    holdout.build_full_trainset() for holdout in (val_data, test_data)
)
val_sr = val_sr_before.build_testset()
test_sr = test_sr_before.build_testset()

In [61]:
# Bias-only baseline estimated with ALS; 20 epochs because that is the SVD default.
bsl_options = dict(method='als', n_epochs=20)
bias_baseline = BaselineOnly(bsl_options=bsl_options)
bias_baseline.fit(train_sr)

# Score the fitted baseline on the validation ratings.
predictions = bias_baseline.test(val_sr)

Estimating biases using als...


In [62]:
# Show only the first few predictions; displaying the whole list floods the
# notebook output with one record per validation rating.
predictions[:10]

[Prediction(uid=1333, iid=13840, r_ui=4.0, est=3.456016872617161, details={'was_impossible': False}),
 Prediction(uid=2128, iid=16869, r_ui=4.0, est=3.651227496290015, details={'was_impossible': False}),
 Prediction(uid=2225, iid=16604, r_ui=4.0, est=3.867963, details={'was_impossible': False}),
 Prediction(uid=2469, iid=7511, r_ui=3.0, est=3.701378303664416, details={'was_impossible': False}),
 Prediction(uid=4306, iid=7294, r_ui=3.0, est=3.2124221571373894, details={'was_impossible': False}),
 Prediction(uid=4421, iid=16425, r_ui=4.0, est=3.494912285714286, details={'was_impossible': False}),
 Prediction(uid=5569, iid=5612, r_ui=4.0, est=3.31206347416836, details={'was_impossible': False}),
 Prediction(uid=6206, iid=12161, r_ui=5.0, est=3.6808555148312587, details={'was_impossible': False}),
 Prediction(uid=6785, iid=4080, r_ui=4.0, est=3.5565255891089107, details={'was_impossible': False}),
 Prediction(uid=7070, iid=5181, r_ui=3.0, est=3.583818782324653, details={'was_impossible': F

In [63]:
# Mean absolute error of the baseline's validation predictions.
accuracy.mae(predictions)

MAE:  0.8473


0.8473334647775006

In [64]:
# Root mean squared error of the baseline's validation predictions.
accuracy.rmse(predictions, verbose=True)

RMSE: 1.0416


1.0415582301746527

# First Simple Model

In [57]:
# Create an SVD model with a fixed random_state and fit it on the training set.
svd = SVD(random_state=1)
svd.fit(train_sr)

# Predict the ratings in the validation set with the fitted model.
preds = svd.test(val_sr)

In [58]:
# Mean absolute error of the SVD model's validation predictions.
accuracy.mae(preds)

MAE:  0.8436


0.8436178242415615

In [59]:
# Root mean squared error of the SVD model's validation predictions.
accuracy.rmse(preds)

RMSE: 1.0413


1.0412625996204237

# Tuning First Simple Model

This model performed negligibly better than the baseline. Tuning the model:

In [60]:
# 5-fold cross-validation of the SVD model on the training dataset, reporting MAE and RMSE.
# NOTE(review): this scores the model with its current hyperparameters; it does not
# search over them, and the folds are presumably not split by date like the hold-out
# sets above - confirm this is what the "tuning" step intends.
crossval = cross_validate(svd, train_data, measures=['MAE','RMSE'], cv=5, verbose=True)

Evaluating MAE, RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.8429  0.8431  0.8557  0.8443  0.8525  0.8477  0.0053  
RMSE (testset)    1.0385  1.0414  1.0515  1.0431  1.0512  1.0451  0.0053  
Fit time          0.68    0.67    0.67    0.71    0.68    0.68    0.02    
Test time         0.06    0.17    0.07    0.16    0.05    0.10    0.05    


references: 
https://towardsdatascience.com/build-a-recommender-system-yelp-rating-prediction-example-collaborative-filtering-28a6e48a8cc

Surprise tutorial:
https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

