In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise import SVD
#from surprise.model_selection import cross_validate
#from surprise.model_selection import train_test_split

from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler


In [2]:
#notify me when a long running cell is complete
%load_ext jupyternotify
import time

<IPython.core.display.Javascript object>

In [3]:
data = pd.read_csv('data/final_df.csv')
data = data.drop(['Unnamed: 0'], axis=1)
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.0,2
1,3113,739622,4.0,2001-07-15,2001,3,4,3.0,2
2,3113,200684,1.0,2001-10-15,2001,4,4,3.0,2
3,3113,2559624,3.0,2002-08-27,2002,3,4,3.0,2
4,3113,1557262,3.0,2003-11-06,2003,4,4,3.0,2


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   mid           100000 non-null  int64  
 1   cust_id       100000 non-null  int64  
 2   rating        100000 non-null  float64
 3   r_date        100000 non-null  object 
 4   r_year        100000 non-null  int64  
 5   r_fiscQ       100000 non-null  int64  
 6   m_decade      100000 non-null  int64  
 7   m_avg_rating  100000 non-null  float64
 8   m_quality     100000 non-null  int64  
dtypes: float64(2), int64(6), object(1)
memory usage: 6.9+ MB


In [5]:
data['r_date'] = data['r_date'].astype('datetime64[ns]')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   mid           100000 non-null  int64         
 1   cust_id       100000 non-null  int64         
 2   rating        100000 non-null  float64       
 3   r_date        100000 non-null  datetime64[ns]
 4   r_year        100000 non-null  int64         
 5   r_fiscQ       100000 non-null  int64         
 6   m_decade      100000 non-null  int64         
 7   m_avg_rating  100000 non-null  float64       
 8   m_quality     100000 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(6)
memory usage: 6.9 MB


In [6]:
data = data.sort_values(by=['r_date'])
data

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality
0,3113,510180,3.0,1999-12-12,1999,4,4,3.000000,2
19,11242,830363,3.0,1999-12-21,1999,4,3,3.843750,3
133,1642,872963,4.0,1999-12-30,1999,4,4,4.044444,4
178,14574,882798,3.0,1999-12-30,1999,4,4,3.545455,3
59,16438,882798,4.0,1999-12-30,1999,4,4,3.824324,3
...,...,...,...,...,...,...,...,...,...
56222,11443,1472304,4.0,2005-12-31,2005,4,5,4.186441,4
84549,12596,1472871,3.0,2005-12-31,2005,4,2,2.600000,1
29199,5317,1835727,1.0,2005-12-31,2005,4,5,3.403361,2
30200,8376,1466372,4.0,2005-12-31,2005,4,5,3.545455,3


In [7]:
scaler = MinMaxScaler()
data[['r_scaled']] = scaler.fit_transform(data[['rating']])
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,r_year,r_fiscQ,m_decade,m_avg_rating,m_quality,r_scaled
0,3113,510180,3.0,1999-12-12,1999,4,4,3.0,2,0.5
19,11242,830363,3.0,1999-12-21,1999,4,3,3.84375,3,0.5
133,1642,872963,4.0,1999-12-30,1999,4,4,4.044444,4,0.75
178,14574,882798,3.0,1999-12-30,1999,4,4,3.545455,3,0.5
59,16438,882798,4.0,1999-12-30,1999,4,4,3.824324,3,0.75


In [8]:
data['r_scaled'].mean()

0.6508525

In [9]:
data['rating'].mean()

3.60341

In [10]:
data = data[['mid', 'cust_id', 'rating', 'r_date']]
data.head()

Unnamed: 0,mid,cust_id,rating,r_date
0,3113,510180,3.0,1999-12-12
19,11242,830363,3.0,1999-12-21
133,1642,872963,4.0,1999-12-30
178,14574,882798,3.0,1999-12-30
59,16438,882798,4.0,1999-12-30


# Train/Test Split: out of time approach

In [11]:
testsize = round(len(data) * 0.2)
testsize

20000

In [12]:
data.groupby('cust_id')['rating'].count().sort_values(ascending=False)

cust_id
305344     23
2118461    20
2439493    16
387418     15
1639792    14
           ..
990585      1
990639      1
990697      1
990773      1
2649378     1
Name: rating, Length: 77372, dtype: int64

This would work better with more data and less users who don't only vote once

In [13]:
data = data.sort_values(by=['cust_id'])
data

Unnamed: 0,mid,cust_id,rating,r_date
61077,10730,6,5.0,2004-09-15
86435,4465,7,2.0,2005-05-23
69300,3638,134,5.0,2004-11-28
25159,16912,134,5.0,2005-04-19
6587,11209,195,4.0,2005-05-16
...,...,...,...,...
83453,16265,2649202,4.0,2005-08-03
35733,17441,2649257,4.0,2005-10-08
21953,798,2649296,4.0,2002-02-19
9243,2862,2649336,4.0,2004-11-22


In [18]:
for cust in data['cust_id'].unique():
    i = data.index[data['cust_id'] == cust]
    
    if len(i)> 3:
        data.loc[i[-1], 'split'] = 3
        data.loc[i[-2], 'split'] = 2
        data.loc[i[:-2], 'split'] = 1
    if len(i) == 2:
        data.loc[i[-1], 'split'] = 3
        data.loc[i[0], 'split'] = 1
    else:
        data.loc[i[0], 'split'] = 1

In [19]:
data.head()

Unnamed: 0,mid,cust_id,rating,r_date,split
61077,10730,6,5.0,2004-09-15,1.0
86435,4465,7,2.0,2005-05-23,1.0
69300,3638,134,5.0,2004-11-28,1.0
25159,16912,134,5.0,2005-04-19,3.0
6587,11209,195,4.0,2005-05-16,1.0


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 61077 to 89382
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   mid      100000 non-null  int64         
 1   cust_id  100000 non-null  int64         
 2   rating   100000 non-null  float64       
 3   r_date   100000 non-null  datetime64[ns]
 4   split    100000 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 6.6 MB


In [21]:
data['split'].value_counts()

split
1.0    79114
3.0    16706
2.0     4180
Name: count, dtype: int64

In [37]:
test = data[(data['split'] == 3)]
val = data[(data['split'] == 2)]
train = data[(data['split'] == 1 )]

In [24]:
test.shape

(16706, 5)

In [25]:
val.shape

(4180, 5)

In [27]:
train.shape

(79114, 5)

In [38]:
train = train.sort_values(by=['r_date'])
train

Unnamed: 0,mid,cust_id,rating,r_date,split
0,3113,510180,3.0,1999-12-12,1.0
51,17563,2114455,3.0,1999-12-30,1.0
178,14574,882798,3.0,1999-12-30,1.0
133,1642,872963,4.0,1999-12-30,1.0
277,6534,64765,5.0,2000-01-05,1.0
...,...,...,...,...,...
52514,14277,563705,2.0,2005-12-31,1.0
55383,2660,200254,3.0,2005-12-31,1.0
12910,10371,28205,4.0,2005-12-31,1.0
82256,14111,862011,4.0,2005-12-31,1.0


In [39]:
test2 = train[-(testsize -len(test)):]
test2.shape

(3294, 5)

In [40]:
test = pd.concat([test, test2])
test.shape

(20000, 5)

In [41]:
train = train.loc[~train.index.isin(test.index)]
train.shape

(75820, 5)

In [42]:
test = test.sort_values(by=['r_date'])
test

Unnamed: 0,mid,cust_id,rating,r_date,split
19,11242,830363,3.0,1999-12-21,3.0
59,16438,882798,4.0,1999-12-30,3.0
281,9728,764785,4.0,2000-01-06,3.0
437,8784,1356928,3.0,2000-01-07,3.0
678,8840,1438498,4.0,2000-01-07,3.0
...,...,...,...,...,...
30350,3106,307530,3.0,2005-12-31,1.0
82993,4621,2644224,5.0,2005-12-31,1.0
17900,12500,2638089,3.0,2005-12-31,1.0
59843,10596,594259,3.0,2005-12-31,1.0


In [43]:
val1 = train.sample(n=(20000-len(val)), replace=False)
val1.shape

(15820, 5)

In [44]:
val = pd.concat([val, val1])
val.shape

(20000, 5)

In [48]:
train = train.loc[~train.index.isin(val1.index)]
train.shape

(60000, 5)

# Baseline

In [46]:
reader = Reader(rating_scale=(1,5))

In [49]:
train_data = Dataset.load_from_df(train[['cust_id','mid','rating']], reader)
val_data = Dataset.load_from_df(val[['cust_id','mid','rating']], reader)
test_data = Dataset.load_from_df(test[['cust_id','mid','rating']], reader)

In [50]:
train_sr = train_data.build_full_trainset()

val_sr_before = val_data.build_full_trainset()
val_sr = val_sr_before.build_testset()

test_sr_before = test_data.build_full_trainset()
test_sr = test_sr_before.build_testset()

In [51]:
bsl_options = {'method': 'als', 'n_epochs':3}
bias_baseline = BaselineOnly(bsl_options)
bias_baseline.fit(train_sr)
predictions = bias_baseline.test(val_sr)

Estimating biases using als...


In [52]:
predictions

[Prediction(uid=1333, iid=13840, r_ui=4.0, est=3.6463941427083335, details={'was_impossible': False}),
 Prediction(uid=1333, iid=3079, r_ui=3.0, est=4.172436487201704, details={'was_impossible': False}),
 Prediction(uid=2128, iid=16869, r_ui=4.0, est=3.6688591976281435, details={'was_impossible': False}),
 Prediction(uid=2225, iid=16604, r_ui=4.0, est=3.9792980627597734, details={'was_impossible': False}),
 Prediction(uid=2469, iid=7511, r_ui=3.0, est=3.7067830267978263, details={'was_impossible': False}),
 Prediction(uid=4306, iid=7294, r_ui=3.0, est=3.1733132855198045, details={'was_impossible': False}),
 Prediction(uid=4421, iid=16425, r_ui=4.0, est=3.4638543586343684, details={'was_impossible': False}),
 Prediction(uid=5569, iid=5612, r_ui=4.0, est=3.27543749554952, details={'was_impossible': False}),
 Prediction(uid=5569, iid=4089, r_ui=4.0, est=3.428206988187781, details={'was_impossible': False}),
 Prediction(uid=6206, iid=12161, r_ui=5.0, est=3.7195199854107495, details={'was_i

In [53]:
accuracy.rmse(predictions, verbose=True)

RMSE: 1.0440


1.044021452806938

# First Simple Model

In [56]:
svd = SVD()
svd.fit(train_sr)
preds = svd.test(val_sr)

In [57]:
accuracy.rmse(preds)

RMSE: 1.0432


1.0432024211195623

references: 
https://towardsdatascience.com/build-a-recommender-system-yelp-rating-prediction-example-collaborative-filtering-28a6e48a8cc


