In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler


In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [25]:
# Load the prepared ratings table. The 'Unnamed: 0' column is dropped right
# away (presumably an index that was saved with the CSV -- confirm).
data = pd.read_csv('data/final_df.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.0,2
1,3113,739622,4.0,2001-07-15,2001,3,4,3.0,2
2,3113,200684,1.0,2001-10-15,2001,4,4,3.0,2
3,3113,2559624,3.0,2002-08-27,2002,3,4,3.0,2
4,3113,1557262,3.0,2003-11-06,2003,4,4,3.0,2


In [26]:
# Summarise column dtypes and non-null counts before any type conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   mid           100000 non-null  int64  
 1   cust_id       100000 non-null  int64  
 2   rating        100000 non-null  float64
 3   r_date        100000 non-null  object 
 4   r_year        100000 non-null  int64  
 5   r_fiscQ       100000 non-null  int64  
 6   m_decade      100000 non-null  int64  
 7   m_avg_rating  100000 non-null  float64
 8   m_quality     100000 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 6.9+ MB


In [27]:
# r_date is read from the CSV as plain strings (dtype object); convert it to
# datetime64[ns] so the frame can be ordered chronologically, then re-check
# the dtypes to confirm the conversion took effect.
data = data.astype({'r_date': 'datetime64[ns]'})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   mid           100000 non-null  int64         
 1   cust_id       100000 non-null  int64         
 2   rating        100000 non-null  float64       
 3   r_date        100000 non-null  datetime64[ns]
 4   r_year        100000 non-null  int64         
 5   r_fiscQ       100000 non-null  int64         
 6   m_decade      100000 non-null  int64         
 7   m_avg_rating  100000 non-null  float64       
 8   m_quality     100000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(6)
memory usage: 6.9 MB


In [28]:
# Order the ratings chronologically (oldest first). The original row labels
# are kept, so the index is no longer monotonic after this step.
data = data.sort_values(by='r_date', ascending=True)
data

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.000000,2
19,11242,830363,3.0,1999-12-21,1999,4,3,3.843750,3
133,1642,872963,4.0,1999-12-30,1999,4,4,4.044444,4
178,14574,882798,3.0,1999-12-30,1999,4,4,3.545455,3
59,16438,882798,4.0,1999-12-30,1999,4,4,3.824324,3
...,...,...,...,...,...,...,...,...,...
56222,11443,1472304,4.0,2005-12-31,2005,4,5,4.186441,4
84549,12596,1472871,3.0,2005-12-31,2005,4,2,2.600000,1
29199,5317,1835727,1.0,2005-12-31,2005,4,5,3.403361,2
30200,8376,1466372,4.0,2005-12-31,2005,4,5,3.545455,3


In [29]:
# Rescale the raw ratings into MinMaxScaler's default [0, 1] range and store
# the result in a new 'r_scaled' column next to the original rating.
scaler = MinMaxScaler()
rating_column = data[['rating']]  # 2-D selection: the scaler expects a frame
data['r_scaled'] = scaler.fit_transform(rating_column).ravel()
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality,r_scaled
0,3113,510180,3.0,1999-12-12,1999,4,4,3.0,2,0.5
19,11242,830363,3.0,1999-12-21,1999,4,3,3.84375,3,0.5
133,1642,872963,4.0,1999-12-30,1999,4,4,4.044444,4,0.75
178,14574,882798,3.0,1999-12-30,1999,4,4,3.545455,3,0.5
59,16438,882798,4.0,1999-12-30,1999,4,4,3.824324,3,0.75


In [30]:
# Mean of the scaled ratings -- a quick reference point for judging the
# size of the RMSE/MAE values reported on the same scale later on.
data['r_scaled'].mean()

0.6508525

In [63]:
# Surprise needs to know the rating bounds; the scaled ratings live in [0, 1].
reader = Reader(rating_scale=(0, 1))

# load_from_df reads the columns positionally as (user, item, rating), so the
# selection order below matters.
ratings_frame = data[['cust_id', 'mid', 'r_scaled']]
svd_data = Dataset.load_from_df(ratings_frame, reader)

In [71]:
# Hold out a quarter of the ratings for testing; the fixed seed keeps the
# split identical between runs.
trainset, testset = train_test_split(
    svd_data,
    test_size=0.25,
    random_state=1,
)

In [73]:
# Cross-validate with a separate estimator. cross_validate() fits the
# algorithm it is handed on each fold's training portion, so passing the
# model we intend to keep would overwrite its trainset fit with one that can
# include rows from the held-out testset.
# random_state seeds the model's factor initialisation, matching the seeded
# train/test split above.
cv_results = cross_validate(
    SVD(random_state=1),
    svd_data,
    measures=['RMSE', 'MAE'],
    cv=3,
    verbose=True,
)

# Fit the model that is kept for later use on the training split only, and do
# it last so nothing refits it afterwards.
svd = SVD(random_state=1)
svd.fit(trainset)

# Display the per-fold metrics as the cell output.
cv_results

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.2630  0.2639  0.2622  0.2630  0.0007  
MAE (testset)     0.2125  0.2131  0.2117  0.2124  0.0006  
Fit time          0.84    0.85    0.86    0.85    0.01    
Test time         0.16    0.16    0.16    0.16    0.00    


{'test_rmse': array([0.26300896, 0.26394118, 0.26219772]),
 'test_mae': array([0.21249269, 0.21307264, 0.21171739]),
 'fit_time': (0.840217113494873, 0.8476862907409668, 0.8573000431060791),
 'test_time': (0.15952205657958984, 0.16091394424438477, 0.15687084197998047)}