In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error as mse
from sklearn.svm import LinearSVR

import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from tqdm import tqdm
import warnings
warnings.filterwarnings("error")

In [2]:
df = pd.read_csv('eda_cleaned.csv') \
       .merge(pd.read_csv('store_data.csv')[['store_id', 'group']], how = 'right', on = 'store_id')
df.drop(['id', 'urlKey', 'sku', 'store_id', 'price', 'discount'], axis = 1, inplace = True)
df['has_freeship_plus_benefit'] = df['has_freeship_plus_benefit'].replace({True: 1, False: 0})
for i in ['category', 'fulfillment_type', 'inventory_type', 'return_and_exchange_policy', 'type', 'group']:
    df = pd.concat([df, pd.get_dummies(df[i], prefix = i)], axis = 1)
    df.drop([i], axis = 1, inplace = True)
df = df[(df.quantity_sold < df.quantity_sold.quantile(0.99)) &
        (df.day_ago_created < df.day_ago_created.quantile(0.99)) &
        (df.original_price < df.original_price.quantile(0.99))
       ]

df_log = pd.DataFrame(data = np.log((df+1).values), columns = df.columns)

In [3]:
X = df_log[df_log.columns[1:]].values
y = df_log[df_log.columns[:1]].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 46)
X_train.shape, X_test.shape

((11206, 91), (1978, 91))

In [4]:
param_grid = \
{
    'epsilon': [x*1.0 for x in range(0, 11)],
    'C': [x*0.5 for x in range(0, 11)],
    'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    'dual': [True, False]
}
param_list = list(ParameterGrid(param_grid))
param_df = pd.DataFrame(data = param_list)
print(param_df.shape)
param_df.head()

(484, 4)


Unnamed: 0,C,dual,epsilon,loss
0,0.0,True,0.0,epsilon_insensitive
1,0.0,True,0.0,squared_epsilon_insensitive
2,0.0,True,1.0,epsilon_insensitive
3,0.0,True,1.0,squared_epsilon_insensitive
4,0.0,True,2.0,epsilon_insensitive


In [5]:
rmse = []
for params in tqdm(param_list, total = len(param_list)):
    try:
        lsvr = LinearSVR(epsilon = params['epsilon'], 
                         tol = 1e-5,
                         C = params['C'],
                         loss = params['loss'],
                         random_state = 46, 
                         dual = params['dual'], 
                         max_iter = 5000) \
                .fit(X_train, y_train.reshape([y_train.shape[0], ]))
        rmse.append(np.sqrt(mse(lsvr.predict(X_train), y_train)))
    except:
        rmse.append(np.nan)

param_df['rmse'] = rmse

100%|██████████| 484/484 [00:59<00:00,  8.18it/s]


In [6]:
param_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484 entries, 0 to 483
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   C        484 non-null    float64
 1   dual     484 non-null    bool   
 2   epsilon  484 non-null    float64
 3   loss     484 non-null    object 
 4   rmse     273 non-null    float64
dtypes: bool(1), float64(3), object(1)
memory usage: 15.7+ KB


In [7]:
param_df = param_df[~(param_df.rmse.isna())].sort_values(by = ['rmse'], ascending = True)
print('rmse_train_log:', param_df.iloc[0, 4])
print('rmse_train:', np.e**param_df.iloc[0, 4] - 1)
param_df.head(5)

rmse_train_log: 0.5942007132244034
rmse_train: 0.81158239228372


Unnamed: 0,C,dual,epsilon,loss,rmse
375,4.0,False,0.0,squared_epsilon_insensitive,0.594201
463,5.0,False,0.0,squared_epsilon_insensitive,0.594201
419,4.5,False,0.0,squared_epsilon_insensitive,0.594222
331,3.5,False,0.0,squared_epsilon_insensitive,0.594222
287,3.0,False,0.0,squared_epsilon_insensitive,0.594222


In [8]:
lsvr_train_opt = LinearSVR(epsilon = param_df.iloc[0, :]['epsilon'], 
                           tol = 1e-5,
                           C = param_df.iloc[0, :]['C'],
                           loss = param_df.iloc[0, :]['loss'],
                           random_state = 46, 
                           dual = param_df.iloc[0, :]['dual'], 
                           max_iter = 5000) \
        .fit(X_train, y_train.reshape([y_train.shape[0], ]))

rmse_test = np.sqrt(mse(lsvr_train_opt.predict(X_test), y_test))
print('rmse_test_log:', rmse_test)
print('rmse_test:', np.e**rmse_test - 1)

rmse_test_log: 0.5908989153669597
rmse_test: 0.805610777380088


In [9]:
train_result = \
pd.DataFrame({'train_pred_norm': list(lsvr_train_opt.predict(X_train)), 
              'y_train_norm': list(y_train.reshape([y_train.shape[0],])),
              'train_pred': list((np.e**lsvr_train_opt.predict(X_train) - 1).round()),
              'y_train': list(np.e**y_train.reshape([y_train.shape[0],]) - 1)
             }).reset_index()
px.line(train_result, x = "index", y = ["train_pred", 'y_train'],
        title = "Training results").show()

In [10]:
test_result = \
pd.DataFrame({'test_pred_norm': list(lsvr_train_opt.predict(X_test)), 
              'y_test_norm': list(y_test.reshape([y_test.shape[0],])),
              'test_pred': list((np.e**lsvr_train_opt.predict(X_test) - 1).round()),
              'y_test': list(np.e**y_test.reshape([y_test.shape[0],]) - 1)
             }).reset_index()
px.line(test_result, x = "index", y = ["test_pred", 'y_test'],
        title = "Testing results").show()

In [11]:
final = \
pd.DataFrame(
    {
        'y': np.concatenate((train_result.y_train.values, test_result.y_test.values), axis = 0),
        'y_hat': np.concatenate((train_result.train_pred.values, test_result.test_pred.values), axis = 0)
    }
)

final['error'] = (final.y - final.y_hat)
px.histogram(final, x = 'error').show()

In [12]:
round((final[(final.error <= 10) & (final.error >= -10)].shape[0] / final.shape[0])*100, 2)

79.26