In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid

from sklearn.metrics import mean_squared_error as mse

from sklearn.svm import LinearSVR

import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from tqdm import tqdm
import warnings
warnings.filterwarnings("error")

In [2]:
df = pd.read_csv('eda_cleaned.csv') \
    .merge(pd.read_csv('store_data.csv')[['store_id', 'group']], how = 'right', on = 'store_id')
df.head()

Unnamed: 0,id,category,urlKey,sku,quantity_sold,has_freeship_plus_benefit,partner_rewards_amount,tiki_rewards_amount,store_id,day_ago_created,...,fulfillment_type,inventory_type,rating_average,return_and_exchange_policy,review_count,type,extra_gift,extra_amount,n_competitors,group
0,173230770,8060,thiet-bi-luu-tru,4634285820809,34.0,False,0,0.902326,9,447,...,dropship,backorder,5.0,dt7,2,simple,0,0,2,HFO
1,16568705,8060,thiet-bi-luu-tru,2076219699449,275.0,False,0,3.302326,9,1491,...,dropship,backorder,4.8,dt7,25,simple,0,0,0,HFO
2,20627244,8060,thiet-bi-luu-tru,4297955208513,226.0,False,0,0.827907,9,1458,...,dropship,backorder,4.7,dt7,11,simple,0,0,0,HFO
3,590581,8060,thiet-bi-luu-tru,6108721649141,68.0,False,0,0.781395,9,2284,...,dropship,backorder,4.6,dt7,5,simple,0,0,0,HFO
4,590446,8060,thiet-bi-luu-tru,6103102569780,38.0,False,0,2.037209,9,2284,...,dropship,backorder,5.0,dt7,3,simple,0,0,0,HFO


In [3]:
df.columns

Index(['id', 'category', 'urlKey', 'sku', 'quantity_sold',
       'has_freeship_plus_benefit', 'partner_rewards_amount',
       'tiki_rewards_amount', 'store_id', 'day_ago_created', 'original_price',
       'price', 'discount', 'discount_rate', 'fulfillment_type',
       'inventory_type', 'rating_average', 'return_and_exchange_policy',
       'review_count', 'type', 'extra_gift', 'extra_amount', 'n_competitors',
       'group'],
      dtype='object')

In [4]:
df.drop(['id', 'urlKey', 'sku', 'store_id', 'price', 'discount'], axis = 1, inplace = True)
df['has_freeship_plus_benefit'] = df['has_freeship_plus_benefit'].replace({True: 1, False: 0})

In [5]:
# 80, meaning 80 more categorical features
for i in ['category', 'fulfillment_type', 'inventory_type', 'return_and_exchange_policy', 'type', 'group']:
    print(i, len(df[i].unique()))

category 60
fulfillment_type 4
inventory_type 2
return_and_exchange_policy 4
type 2
group 8


In [6]:
for i in ['category', 'fulfillment_type', 'inventory_type', 'return_and_exchange_policy', 'type', 'group']:
    df = pd.concat([df, pd.get_dummies(df[i], prefix = i)], axis = 1)
    df.drop([i], axis = 1, inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13581 entries, 0 to 13580
Data columns (total 92 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   quantity_sold                     13581 non-null  float64
 1   has_freeship_plus_benefit         13581 non-null  int64  
 2   partner_rewards_amount            13581 non-null  int64  
 3   tiki_rewards_amount               13581 non-null  float64
 4   day_ago_created                   13581 non-null  int64  
 5   original_price                    13581 non-null  int64  
 6   discount_rate                     13581 non-null  int64  
 7   rating_average                    13581 non-null  float64
 8   review_count                      13581 non-null  int64  
 9   extra_gift                        13581 non-null  int64  
 10  extra_amount                      13581 non-null  int64  
 11  n_competitors                     13581 non-null  int64  
 12  cate

In [8]:
df.head()

Unnamed: 0,quantity_sold,has_freeship_plus_benefit,partner_rewards_amount,tiki_rewards_amount,day_ago_created,original_price,discount_rate,rating_average,review_count,extra_gift,...,type_configurable,type_simple,group_HFN,group_HFO,group_HMN,group_HMO,group_LFN,group_LFO,group_LMN,group_LMO
0,34.0,0,0,0.902326,447,105000,8,5.0,2,0,...,0,1,0,1,0,0,0,0,0,0
1,275.0,0,0,3.302326,1491,650000,45,4.8,25,0,...,0,1,0,1,0,0,0,0,0,0
2,226.0,0,0,0.827907,1458,89000,0,4.7,11,0,...,0,1,0,1,0,0,0,0,0,0
3,68.0,0,0,0.781395,2284,89000,6,4.6,5,0,...,0,1,0,1,0,0,0,0,0,0
4,38.0,0,0,2.037209,2284,219000,0,5.0,3,0,...,0,1,0,1,0,0,0,0,0,0


In [9]:
df = df[(df.quantity_sold < df.quantity_sold.quantile(0.99)) &
        (df.day_ago_created < df.day_ago_created.quantile(0.99)) &
        (df.original_price < df.original_price.quantile(0.99))
       ]

df_norm = df.copy()
norm_col = ['quantity_sold', 'tiki_rewards_amount', 'day_ago_created', 'original_price', 
            'discount_rate', 'rating_average', 'review_count']
df_norm[norm_col] = StandardScaler().fit_transform(df[norm_col].values)

In [10]:
target_mean = df.quantity_sold.mean()
target_std = df.quantity_sold.std()
print('target_mean:', target_mean)
print('target_std:', target_std)

target_mean: 41.14532766990291
target_std: 117.83358526194722


In [11]:
y = df_norm[df_norm.columns[:1]].values
X = df_norm[df_norm.columns[1:]].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 46)
X_train.shape, X_test.shape

((11206, 91), (1978, 91))

In [12]:
param_grid = \
{
    'epsilon': [x*1.0 for x in range(0, 11)],
    'C': [x*0.5 for x in range(0, 11)],
    'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    'dual': [True, False]
}
param_list = list(ParameterGrid(param_grid))
param_df = pd.DataFrame(data = param_list)
print(param_df.shape)
param_df.head()

(484, 4)


Unnamed: 0,C,dual,epsilon,loss
0,0.0,True,0.0,epsilon_insensitive
1,0.0,True,0.0,squared_epsilon_insensitive
2,0.0,True,1.0,epsilon_insensitive
3,0.0,True,1.0,squared_epsilon_insensitive
4,0.0,True,2.0,epsilon_insensitive


In [13]:
rmse = []
for params in tqdm(param_list, total = len(param_list)):
    try:
        lsvr = LinearSVR(epsilon = params['epsilon'], 
                         tol = 1e-5,
                         C = params['C'],
                         loss = params['loss'],
                         random_state = 46, 
                         dual = params['dual'], 
                         max_iter = 5000) \
                .fit(X_train, y_train.reshape([y_train.shape[0], ]))
        rmse.append(np.sqrt(mse(lsvr.predict(X_train), y_train)))
    except:
        rmse.append(np.nan)

param_df['rmse'] = rmse

100%|██████████| 484/484 [02:27<00:00,  3.29it/s]


In [14]:
param_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 484 entries, 0 to 483
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   C        484 non-null    float64
 1   dual     484 non-null    bool   
 2   epsilon  484 non-null    float64
 3   loss     484 non-null    object 
 4   rmse     227 non-null    float64
dtypes: bool(1), float64(3), object(1)
memory usage: 15.7+ KB


In [15]:
param_df = param_df[~(param_df.rmse.isna())].sort_values(by = ['rmse'], ascending = True)
print('rmse_norm:', param_df.iloc[0, 4])
print('rmse:', (param_df.iloc[0, 4]*target_std)+target_mean)
param_df.head(5)

rmse_norm: 0.4797947330646236
rmse: 97.68126125670645


In [19]:
lsvr_train_opt = LinearSVR(epsilon = param_df.iloc[0, :]['epsilon'], 
                           tol = 1e-5,
                           C = param_df.iloc[0, :]['C'],
                           loss = param_df.iloc[0, :]['loss'],
                           random_state = 46, 
                           dual = param_df.iloc[0, :]['dual'], 
                           max_iter = 5000) \
        .fit(X_train, y_train.reshape([y_train.shape[0], ]))

rmse_on_test = np.sqrt(mse(lsvr_train.predict(X_test), y_test))
print('rmse_test_norm:', rmse_on_test)
print('rmse_test:', (rmse_on_test*target_std)+target_mean)

rmse_test_norm: 0.46642354785023843
rmse_test: 96.1056865636939


In [30]:
df.quantity_sold.describe().round()

count    13184.0
mean        41.0
std        118.0
min          0.0
25%          0.0
50%          3.0
75%         23.0
max       1135.0
Name: quantity_sold, dtype: float64

In [33]:
train_result = \
pd.DataFrame({'train_pred_norm': list(lsvr_train_opt.predict(X_train)), 
              'y_train_norm': list(y_train.reshape([y_train.shape[0],])),
              'train_pred': list((lsvr_train_opt.predict(X_train) * target_std) + target_mean),
              'y_train': list((y_train.reshape([y_train.shape[0],]) * target_std) + target_mean)
             }).reset_index()
px.line(train_result, x = "index", y = ["train_pred", 'y_train'],
        title = "Training results").show()

In [34]:
test_result = \
pd.DataFrame({'test_pred_norm': list(lsvr_train_opt.predict(X_test)), 
              'y_test_norm': list(y_test.reshape([y_test.shape[0],])),
              'test_pred': list((lsvr_train_opt.predict(X_test) * target_std) + target_mean),
              'y_test': list((y_test.reshape([y_test.shape[0],]) * target_std) + target_mean)
             }).reset_index()
px.line(test_result, x = "index", y = ["test_pred", 'y_test'],
        title = "Testing results").show()

In [42]:
param_df.head()

Unnamed: 0,C,dual,epsilon,loss,rmse
419,4.5,False,0.0,squared_epsilon_insensitive,0.479795
463,5.0,False,0.0,squared_epsilon_insensitive,0.479811
331,3.5,False,0.0,squared_epsilon_insensitive,0.479839
243,2.5,False,0.0,squared_epsilon_insensitive,0.47984
199,2.0,False,0.0,squared_epsilon_insensitive,0.47984


In [41]:
lsvr_train_opt.coef_

array([ 5.66788788e-03, -1.41205219e-04,  9.86269183e-02,  2.12802728e-02,
       -1.23598065e-01,  2.90353309e-02,  1.87941530e-02,  8.17975921e-01,
       -1.02293619e-02, -7.56629484e-07,  2.66420085e-03, -1.48211492e-01,
       -2.24074207e-01, -4.80804647e-02, -8.86351889e-02, -1.31268755e-01,
       -2.29434659e-02, -8.80564518e-02,  1.70584811e-01,  4.14782058e-02,
        2.39351288e-01,  6.95558022e-02,  7.84637195e-02,  1.57028789e-01,
       -3.91419616e-02,  3.48787077e-01, -3.11178346e-02,  6.69373974e-02,
        7.73257304e-02, -3.19147949e-02, -1.21008923e-01, -1.52021088e-01,
        1.55836452e-02,  5.29677451e-03,  2.84224041e-02,  1.17763713e-01,
        1.49694003e-01, -4.69116638e-02,  5.48398442e-02, -6.04517088e-02,
        1.27063134e-01, -4.90785019e-02, -8.27953802e-02, -5.46897313e-02,
        2.44492830e-02,  2.64455089e-04, -1.95688785e-02,  2.88518267e-02,
       -2.50219549e-02,  1.91211376e-01,  5.66941524e-03, -1.00592584e-02,
       -9.48732675e-03,  