# Final Capstone: Revisiting the Netflix Prize

## Notebook 6: Models

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import stats
from scipy.stats import yeojohnson as yj
from sklearn import metrics
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso
from sklearn.linear_model import Ridge, OrthogonalMatchingPursuit, Lars
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Notebook-wide display settings: seaborn 'darkgrid' theme, up to 100 rows
# shown for pandas objects, and floats rendered with 8 decimal places.
sns.set_style('darkgrid')
pd.options.display.max_rows = 100
pd.options.display.float_format = '{:.8f}'.format

In [2]:
%%time
# import main training and quiz data
base_path = 'C:/Users/jnpol/Documents/DS/Data Science/UL/'
train_features = pd.read_parquet(base_path + 'train_features_final.parquet')
train_target = pd.read_parquet(base_path + 'train_target.parquet')
quiz_features = pd.read_parquet(base_path + 'quiz_features_final.parquet')
quiz_target = pd.read_parquet(base_path + 'quiz_target.parquet')

train_features.info()
print()
train_target.info()
print()
quiz_features.info()
print()
quiz_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 18 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   day_rated        int16  
 3   mov_year         int16  
 4   mov_count        int32  
 5   rated_bycust     int16  
 6   rate_each_day    int32  
 7   mov_day_count    uint16 
 8   cust_day_count   int16  
 9   cust_days_since  int16  
 10  mov_days_since   int16  
 11  cust_avg_rating  float32
 12  mov_day_avg      float32
 13  cust_day_avg     float32
 14  avg_rate_mov_yr  float32
 15  avg_rate_cst_yr  float32
 16  global_mean      float32
 17  cust_pc          float32
dtypes: float32(7), int16(7), int32(3), uint16(1)
memory usage: 5.0 GB

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96304740 entries, 61342875 to 6598294
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rating  int8 
dtypes: int8(1)
memory usage: 826.6 

In [3]:
%%time
# reset indices for target dfs
train_target.reset_index(drop=True, inplace=True)
quiz_target.reset_index(drop=True, inplace=True)
train_target.info()
print()
quiz_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 1 columns):
 #   Column  Dtype
---  ------  -----
 0   rating  int8 
dtypes: int8(1)
memory usage: 91.8 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   rating  1408395 non-null  int8 
dtypes: int8(1)
memory usage: 1.3 MB
Wall time: 49 ms


In [4]:
%%time
# import transformation dfs and the questionable movie feature pcs
train_trans = pd.read_parquet(base_path + 'train_trans.parquet')
quiz_trans = pd.read_parquet(base_path + 'quiz_trans.parquet')
train_mov_pcs = pd.read_parquet(base_path + 'mov_pcs_train.parquet')
train_trans.info()
print()
quiz_trans.info()
print()
train_mov_pcs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 8 columns):
 #   Column       Dtype  
---  ------       -----  
 0   tbit_tran    int16  
 1   tyr_4rt      float64
 2   tmovct_sqrt  float32
 3   trbc_4rt     float64
 4   trbc_3rt     float32
 5   tcdc_4rt     float64
 6   tcdc_log     float32
 7   tarmy_3rt    float32
dtypes: float32(4), float64(3), int16(1)
memory usage: 3.8 GB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   qbit_tran    1408395 non-null  int16  
 1   qyr_4rt      1408395 non-null  float64
 2   qmovct_sqrt  1408395 non-null  float32
 3   qrbc_4rt     1408395 non-null  float64
 4   qrbc_3rt     1408395 non-null  float32
 5   qcdc_4rt     1408395 non-null  float64
 6   qcdc_log     1408395 non-null  float32
 7   qarmy_3rt    1408395 non-null  float32
dtypes: flo

In [5]:
%%time
train_trans.tyr_4rt = train_trans.tyr_4rt.astype(np.float32)
train_trans.trbc_4rt = train_trans.trbc_4rt.astype(np.float32)
train_trans.tcdc_4rt = train_trans.tcdc_4rt.astype(np.float32)
quiz_trans.qyr_4rt = quiz_trans.qyr_4rt.astype(np.float32)
quiz_trans.qrbc_4rt = quiz_trans.qrbc_4rt.astype(np.float32)
quiz_trans.qcdc_4rt = quiz_trans.qcdc_4rt.astype(np.float32)

Wall time: 1.28 s


In [8]:
%%time
train_features.drop(['mov_count', 'rated_bycust', 'rate_each_day',
                     'mov_day_count', 'cust_day_count', 'cust_days_since',
                     'mov_days_since', 'cust_avg_rating', 'mov_day_avg',
                     'cust_day_avg', 'avg_rate_mov_yr', 'avg_rate_cst_yr',
                     'global_mean', 'cust_pc'], 1, inplace=True)
quiz_features.drop(['mov_count', 'rated_bycust', 'rate_each_day',
                    'mov_day_count', 'cust_day_count', 'cust_days_since',
                    'mov_days_since', 'cust_avg_rating', 'mov_day_avg',
                    'cust_day_avg', 'avg_rate_mov_yr', 'avg_rate_cst_yr',
                    'global_mean', 'cust_pc', 'mov_pc'], 1, inplace=True)
train_features.info()
quiz_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 4 columns):
 #   Column     Dtype
---  ------     -----
 0   mov_id     int16
 1   cust_id    int32
 2   day_rated  int16
 3   mov_year   int16
dtypes: int16(3), int32(1)
memory usage: 918.4 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   mov_id     1408395 non-null  int16
 1   cust_id    1408395 non-null  int32
 2   day_rated  1408395 non-null  int16
 3   mov_year   1408395 non-null  int16
dtypes: int16(3), int32(1)
memory usage: 13.4 MB
Wall time: 15 ms


In [9]:
%%time
# training set
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
y_train = train_target.rating.to_numpy()
del train_features, train_target

Wall time: 8.34 s


In [10]:
%%time
# quiz set
scaler = StandardScaler()
X_test = scaler.fit_transform(quiz_features)
y_test = quiz_target.rating.to_numpy()
del quiz_features, quiz_target

Wall time: 126 ms


## Assorted GLMs on Basic Features

In [11]:
%%time
lin = LinearRegression(fit_intercept=True, n_jobs=16)
lr = lin.fit(X_train, y_train)
y_pred = lr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', lr.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = -0.005657653275320085
MAE = 0.9477999095112267
RMSE = 1.1306171534350797
Wall time: 6.27 s


In [12]:
%%time
lars_reg = Lars(random_state=761)
larr = lars_reg.fit(X_train, y_train)
y_pred = larr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', larr.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = -0.0055600658534387115
MAE = 0.9477931331537993
RMSE = 1.130562295457532
Wall time: 4.28 s


In [13]:
%%time
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=4)
om = omp.fit(X_train, y_train)
y_pred = om.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', om.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = -0.0056576532753283
MAE = 0.9477999095114904
RMSE = 1.1306171534350842
Wall time: 4.31 s


In [14]:
%%time
rid = Ridge(max_iter=10, random_state=413)
ridg = rid.fit(X_train, y_train)
y_pred = ridg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', ridg.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = -0.00565765308873778
MAE = 0.9477999095037333
RMSE = 1.1306171533301965
Wall time: 3.39 s


In [17]:
# Stochastic gradient descent regressor (alpha=5, at most 10 epochs, no
# shuffling); verbose=1 prints the per-epoch training log.
sgdr = SGDRegressor(alpha=5, max_iter=10, shuffle=False, verbose=1, random_state=761)
sgd = sgdr.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('R-squared =', sgd.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

-- Epoch 1
Norm: 0.02, NNZs: 4, Bias: 3.601512, T: 96304740, Avg. loss: 0.584810
Total training time: 5.79 seconds.
-- Epoch 2
Norm: 0.02, NNZs: 4, Bias: 3.601994, T: 192609480, Avg. loss: 0.584779
Total training time: 11.55 seconds.
-- Epoch 3
Norm: 0.02, NNZs: 4, Bias: 3.602228, T: 288914220, Avg. loss: 0.584773
Total training time: 17.34 seconds.
-- Epoch 4
Norm: 0.02, NNZs: 4, Bias: 3.602376, T: 385218960, Avg. loss: 0.584770
Total training time: 23.11 seconds.
-- Epoch 5
Norm: 0.02, NNZs: 4, Bias: 3.602482, T: 481523700, Avg. loss: 0.584768
Total training time: 28.88 seconds.
-- Epoch 6
Norm: 0.02, NNZs: 4, Bias: 3.602564, T: 577828440, Avg. loss: 0.584766
Total training time: 34.68 seconds.
Convergence after 6 epochs took 34.68 seconds
R-squared = -0.002966188220161259
MAE = 0.9523534646210825
RMSE = 1.1291031912519867


## With Additional Count Features

In [4]:
%%time
train_features.drop(['cust_avg_rating', 'mov_day_avg', 'cust_day_avg',
                     'avg_rate_mov_yr', 'avg_rate_cst_yr',
                     'global_mean', 'cust_pc'], 1, inplace=True)
quiz_features.drop(['cust_avg_rating', 'mov_day_avg', 'cust_day_avg',
                    'avg_rate_mov_yr', 'avg_rate_cst_yr',
                    'global_mean', 'cust_pc', 'mov_pc'], 1, inplace=True)
train_features.info()
quiz_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 11 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   mov_id           int16 
 1   cust_id          int32 
 2   day_rated        int16 
 3   mov_year         int16 
 4   mov_count        int32 
 5   rated_bycust     int16 
 6   rate_each_day    int32 
 7   mov_day_count    uint16
 8   cust_day_count   int16 
 9   cust_days_since  int16 
 10  mov_days_since   int16 
dtypes: int16(7), int32(3), uint16(1)
memory usage: 2.5 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 11 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   mov_id           1408395 non-null  int16 
 1   cust_id          1408395 non-null  int32 
 2   day_rated        1408395 non-null  int16 
 3   mov_year         1408395 non-null  int16 
 4   mov_count        1408395 non-null  int32 
 5   ra

In [5]:
%%time
# training set
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
y_train = train_target.rating.to_numpy()
del train_features, train_target

Wall time: 24.1 s


In [6]:
%%time
# quiz set
scaler = StandardScaler()
X_test = scaler.fit_transform(quiz_features)
y_test = quiz_target.rating.to_numpy()
del quiz_features, quiz_target

Wall time: 401 ms


### LinearRegression
The Linear Regression model took 27 seconds to process, consumed more memory, and produced a higher RMSE than the models that follow, so it will be discarded.

In [11]:
%%time
lars_reg = Lars(random_state=761)
larr = lars_reg.fit(X_train, y_train)
y_pred = larr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', larr.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.005685461355062982
MAE = 0.9346952600955611
RMSE = 1.1242227861283012
Wall time: 14.5 s


In [12]:
%%time
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10)
om = omp.fit(X_train, y_train)
y_pred = om.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', om.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.005665038814680723
MAE = 0.934685577071549
RMSE = 1.1242343314524736
Wall time: 13 s


In [7]:
%%time
rid = Ridge(max_iter=10, random_state=413)
ridg = rid.fit(X_train, y_train)
y_pred = ridg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', ridg.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.005665241741847349
MAE = 0.9346854823682469
RMSE = 1.124234216733738
Wall time: 10.6 s


In [13]:
# Stochastic gradient descent regressor (alpha=3, at most 10 epochs, no
# shuffling); verbose=1 prints the per-epoch training log.
sgdr = SGDRegressor(alpha=3, max_iter=10, shuffle=False, verbose=1, random_state=761)
sgd = sgdr.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('R-squared =', sgd.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

-- Epoch 1
Norm: 0.05, NNZs: 11, Bias: 3.601280, T: 96304740, Avg. loss: 0.576859
Total training time: 7.37 seconds.
-- Epoch 2
Norm: 0.05, NNZs: 11, Bias: 3.601772, T: 192609480, Avg. loss: 0.576796
Total training time: 14.38 seconds.
-- Epoch 3
Norm: 0.05, NNZs: 11, Bias: 3.602009, T: 288914220, Avg. loss: 0.576782
Total training time: 21.24 seconds.
-- Epoch 4
Norm: 0.05, NNZs: 11, Bias: 3.602160, T: 385218960, Avg. loss: 0.576774
Total training time: 28.12 seconds.
-- Epoch 5
Norm: 0.05, NNZs: 11, Bias: 3.602267, T: 481523700, Avg. loss: 0.576769
Total training time: 35.00 seconds.
-- Epoch 6
Norm: 0.05, NNZs: 11, Bias: 3.602350, T: 577828440, Avg. loss: 0.576765
Total training time: 41.88 seconds.
Convergence after 6 epochs took 41.88 seconds
R-squared = 0.003944034984914024
MAE = 0.9475332309675853
RMSE = 1.125206828242749


## With Additional Means Features, No Counts

In [4]:
%%time
train_features.drop(['mov_count', 'rated_bycust', 'rate_each_day',
                     'mov_day_count', 'cust_day_count',
                     'cust_days_since', 'mov_days_since', 
                     'global_mean', 'cust_pc'], 1, inplace=True)
quiz_features.drop(['mov_count', 'rated_bycust', 'rate_each_day',
                    'mov_day_count', 'cust_day_count',
                    'cust_days_since', 'mov_days_since',
                    'global_mean', 'cust_pc', 'mov_pc'], 1, inplace=True)
train_features.info()
quiz_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 9 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   day_rated        int16  
 3   mov_year         int16  
 4   cust_avg_rating  float32
 5   mov_day_avg      float32
 6   cust_day_avg     float32
 7   avg_rate_mov_yr  float32
 8   avg_rate_cst_yr  float32
dtypes: float32(5), int16(3), int32(1)
memory usage: 2.7 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   mov_id           1408395 non-null  int16  
 1   cust_id          1408395 non-null  int32  
 2   day_rated        1408395 non-null  int16  
 3   mov_year         1408395 non-null  int16  
 4   cust_avg_rating  1408395 non-null  float32
 5   mov_day_avg      1408395 non-null  float32


In [5]:
%%time
# training set
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
y_train = train_target.rating.to_numpy()
del train_features, train_target

Wall time: 20.2 s


In [6]:
%%time
# quiz set
scaler = StandardScaler()
X_test = scaler.fit_transform(quiz_features)
y_test = quiz_target.rating.to_numpy()
del quiz_features, quiz_target

Wall time: 380 ms


In [7]:
%%time
lars_reg = Lars(random_state=761)
larr = lars_reg.fit(X_train, y_train)
y_pred = larr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', larr.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.035447422767870274
MAE = 0.8699911031854564
RMSE = 1.107269765713606
Wall time: 10.1 s


In [11]:
%%time
omp = OrthogonalMatchingPursuit(n_nonzero_coefs=9)
om = omp.fit(X_train, y_train)
y_pred = om.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', om.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.03408664149530427
MAE = 0.8705325083227212
RMSE = 1.1080505530668545
Wall time: 10.3 s


In [9]:
%%time
rid = Ridge(max_iter=10, random_state=413)
ridg = rid.fit(X_train, y_train)
y_pred = ridg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', ridg.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.03408665052436877
MAE = 0.8705325047437593
RMSE = 1.1080505478879947
Wall time: 8.67 s


In [10]:
# Stochastic gradient descent regressor (alpha=3, at most 10 epochs, no
# shuffling); verbose=1 prints the per-epoch training log.
sgdr = SGDRegressor(alpha=3, max_iter=10, shuffle=False, verbose=1, random_state=761)
sgd = sgdr.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('R-squared =', sgd.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

-- Epoch 1
Norm: 0.21, NNZs: 9, Bias: 3.600170, T: 96304740, Avg. loss: 0.403791
Total training time: 7.11 seconds.
-- Epoch 2
Norm: 0.21, NNZs: 9, Bias: 3.600669, T: 192609480, Avg. loss: 0.403747
Total training time: 14.14 seconds.
-- Epoch 3
Norm: 0.21, NNZs: 9, Bias: 3.600927, T: 288914220, Avg. loss: 0.403738
Total training time: 20.81 seconds.
-- Epoch 4
Norm: 0.21, NNZs: 9, Bias: 3.601098, T: 385218960, Avg. loss: 0.403733
Total training time: 27.46 seconds.
-- Epoch 5
Norm: 0.21, NNZs: 9, Bias: 3.601225, T: 481523700, Avg. loss: 0.403729
Total training time: 34.12 seconds.
-- Epoch 6
Norm: 0.21, NNZs: 9, Bias: 3.601326, T: 577828440, Avg. loss: 0.403727
Total training time: 40.80 seconds.
Convergence after 6 epochs took 40.80 seconds
R-squared = 0.13353555081827717
MAE = 0.8620877823296546
RMSE = 1.049459934046641


### Note:
The SGDRegressor showed significantly more improvement with these features than the other linear models.

## With Counts and Means Features

In [4]:
%%time
train_features.drop(['global_mean', 'cust_pc'], 1, inplace=True)
quiz_features.drop(['global_mean', 'cust_pc', 'mov_pc'], 1, inplace=True)
train_features.info()
quiz_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96304740 entries, 0 to 96304739
Data columns (total 16 columns):
 #   Column           Dtype  
---  ------           -----  
 0   mov_id           int16  
 1   cust_id          int32  
 2   day_rated        int16  
 3   mov_year         int16  
 4   mov_count        int32  
 5   rated_bycust     int16  
 6   rate_each_day    int32  
 7   mov_day_count    uint16 
 8   cust_day_count   int16  
 9   cust_days_since  int16  
 10  mov_days_since   int16  
 11  cust_avg_rating  float32
 12  mov_day_avg      float32
 13  cust_day_avg     float32
 14  avg_rate_mov_yr  float32
 15  avg_rate_cst_yr  float32
dtypes: float32(5), int16(7), int32(3), uint16(1)
memory usage: 4.3 GB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408395 entries, 0 to 1408394
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   mov_id           1408395 non-null  int16  
 1   cust_id     

In [5]:
%%time
# training set
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
y_train = train_target.rating.to_numpy()
del train_features, train_target

Wall time: 52.5 s


In [6]:
%%time
# quiz set
scaler = StandardScaler()
X_test = scaler.fit_transform(quiz_features)
y_test = quiz_target.rating.to_numpy()
del quiz_features, quiz_target

Wall time: 749 ms


In [7]:
%%time
lars_reg = Lars(random_state=761)
larr = lars_reg.fit(X_train, y_train)
y_pred = larr.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', larr.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.03609202692092284
MAE = 0.8696534784332124
RMSE = 1.1068997133209384
Wall time: 21.2 s


In [8]:
%%time
rid = Ridge(max_iter=10, random_state=413)
ridg = rid.fit(X_train, y_train)
y_pred = ridg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('R-squared =', ridg.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.03399110038456965
MAE = 0.8704942918997729
RMSE = 1.1081053518549417
Wall time: 17.4 s


In [9]:
# Stochastic gradient descent regressor (alpha=3, at most 10 epochs, no
# shuffling); verbose=1 prints the per-epoch training log.
sgdr = SGDRegressor(alpha=3, max_iter=10, shuffle=False, verbose=1, random_state=761)
sgd = sgdr.fit(X_train, y_train)
y_pred = sgd.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
# sqrt(MSE) replaces `squared=False`, which scikit-learn deprecated in 1.4
# and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print('R-squared =', sgd.score(X_test, y_test))
print('MAE =', mae)
print('RMSE =', rmse)

-- Epoch 1
Norm: 0.21, NNZs: 16, Bias: 3.600075, T: 96304740, Avg. loss: 0.403287
Total training time: 7.93 seconds.
-- Epoch 2
Norm: 0.21, NNZs: 16, Bias: 3.600581, T: 192609480, Avg. loss: 0.403225
Total training time: 15.51 seconds.
-- Epoch 3
Norm: 0.21, NNZs: 16, Bias: 3.600841, T: 288914220, Avg. loss: 0.403211
Total training time: 22.88 seconds.
-- Epoch 4
Norm: 0.21, NNZs: 16, Bias: 3.601012, T: 385218960, Avg. loss: 0.403203
Total training time: 30.25 seconds.
-- Epoch 5
Norm: 0.21, NNZs: 16, Bias: 3.601139, T: 481523700, Avg. loss: 0.403198
Total training time: 37.61 seconds.
-- Epoch 6
Norm: 0.21, NNZs: 16, Bias: 3.601240, T: 577828440, Avg. loss: 0.403193
Total training time: 44.98 seconds.
Convergence after 6 epochs took 44.98 seconds
R-squared = 0.13480611346133642
MAE = 0.8609703852257143
RMSE = 1.0486902003514613


In [None]:
%%time
hgbr = HistGradientBoostingRegressor(
    learning_rate=0.1, max_iter=1300,
    max_leaf_nodes=401, random_state=213, verbose=1)
hgb = hgbr.fit(X_train, y_train)

In [15]:
%%time
print('R-squared =', hgb.score(X_test, y_test))
y_pred = hgb.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print('MAE =', mae)
print('RMSE =', rmse)

R-squared = 0.2233471942381995
MAE = 0.7832421162147075
RMSE = 0.9935825292229005
Wall time: 9.28 s


learning_rate=0.1, max_iter=500, max_leaf_nodes=301, random_state=213

R-squared = 0.22351579447811798

MAE = 0.784489468931582

RMSE = 0.9934746770758374

48 min

mov_id, cust_id, day_rated, mov_year_log, mov_count log, rated_bycust^1.5,
rate_each_day inv outliers adj 191500 134500, mov_day_count log,
cust_day_count^0.8, tr_mov_avg_rating^1.5, te_mov_avg_rating^0.9, cust_avg_rating
R-squared = 0.22382965797933052
MAE = 0.7870443034889095
RMSE = 0.9932738700571773

mov_id, cust_id, mov_year_log, mov_count log, rated_bycust^1.5, rate_each_day inv outliers adj 191500 124500,
mov_day_count log, cust_day_count^0.8, tr_mov_avg_rating^1.5, te_mov_avg_rating^0.9,
R-squared = 0.1264051195046394
MAE = 0.8530490804087013
RMSE = 1.0537692682359234

mov_id, cust_id, mov_year mov_count log, rated_bycust^1.5, rate_each_day inv outliers adj 191500 124500,
mov_day_count log, cust_day_count^0.8
R-squared = 0.013444013633980734
MAE = 0.9293510303091971
RMSE = 1.1198280888011323

On mov_id and cust_id ONLY:
R-squared = -0.004223546413989032
MAE = 0.9528487913115911
RMSE = 1.1298107138487197

WOW! BAD
On mov_id, cust_id, and day_rated ONLY:
R-squared = -0.012808924650737241
MAE = 0.9542767933749646
RMSE = 1.1346299639222615

Drop last 8 Baseline:
R-squared = 0.008494984714336185
MAE = 0.9307416975240954
RMSE = 1.1226333672925477

Baseline:
R-squared = 0.04866193461770418
MAE = 0.8631959599761593
RMSE = 1.0996587285938078

In [None]:
%%time
# training set
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
y_train = train_ratings.rating.to_numpy()
del train_features, train_ratings

In [None]:
%%time
# test set
scaler = StandardScaler()
X_test = scaler.fit_transform(test_features)
y_test = test_ratings.rating.to_numpy()
del test_features, test_ratings