In [1]:
# import packages
import pandas as pd
# import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
import datetime
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy.interpolate import interpn
from collections import defaultdict

In [2]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
#load data
items = pd.read_csv('ex_items.csv', sep = '|', parse_dates=['dateFirstSell'])
items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,minSalesPrice,maxSalesPrice,meanSalesPrice,minSalesPriceCounts,maxSalesPriceCounts,unitsSold,revenue,relRevenue,dateFirstSell,dateLastSell,soldWeekly,soldMonthly
0,1,0,1,4.38,1,1-1,1-1-1,8.84,3.11,3.11,3.11,690.0,690.0,690.0,2145.9,2.949022e-05,2018-01-23,2018-05-24,False,True
1,2,0,2,3.0,1,1-2,1-2-1,16.92,9.15,9.15,9.15,5.0,5.0,5.0,45.75,6.287235e-07,2018-02-24,2018-05-29,False,False
2,3,0,3,5.0,1,1-3,1-3-1,15.89,9.89,14.04,11.918371,91.0,99.0,190.0,2289.95,3.146984e-05,2018-01-10,2018-05-16,False,False
3,4,0,2,4.44,1,1-2,1-2-1,40.17,13.01,13.01,13.01,53.0,53.0,53.0,689.53,9.475928e-06,2018-01-18,2018-05-30,False,False
4,5,0,2,2.33,1,1-1,1-1-1,17.04,7.48,7.84,7.740377,49.0,128.0,177.0,1370.04,1.88279e-05,2018-01-06,2018-05-30,False,False


In [4]:
items_simple = items[['itemID', 'customerRating', 'manufacturer', 'brand', 'category1', 'category3', 'dateFirstSell', 'soldWeekly', 'soldMonthly']]

In [5]:
items_simple.category1 = pd.Categorical(items_simple.category1)
items_simple['category1'] = items_simple.category1.cat.codes
items_simple.category3 = pd.Categorical(items_simple.category3)
items_simple['category3'] = items_simple.category3.cat.codes
items_simple.head()

Unnamed: 0,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,soldWeekly,soldMonthly
0,1,4.38,1,0,0,0,2018-01-23,False,True
1,2,3.0,2,0,0,1,2018-02-24,False,False
2,3,5.0,3,0,0,2,2018-01-10,False,False
3,4,4.44,2,0,0,1,2018-01-18,False,False
4,5,2.33,2,0,0,0,2018-01-06,False,False


In [6]:
# load data
df_train = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])
df_train['date'] = [d.date() for d in df_train['time']]
df_test = pd.read_csv('data/orders0206_test.csv', sep='|', parse_dates=['time'])
df_test['date'] = [d.date() for d in df_test['time']]
df_items = pd.read_csv('data/items.csv', sep='|')

In [7]:
train = df_train
test = df_test
all_data = train.append(test)

In [8]:
# aggregate sales per day
aggregated = all_data.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
aggregated_sp = all_data.groupby(['itemID','date']).mean()["salesPrice"].to_frame().reset_index().rename(columns={'salesPrice':'avg_salesPrice'})

In [11]:
# # aggregate sales per day
# train_aggregated = train.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
# test_aggregated = test.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
# # aggregate sales per day
# train_aggregated_sp = train.groupby(['itemID','date']).mean()["salesPrice"].to_frame().reset_index().rename(columns={'salesPrice':'avg_salesPrice'})
# test_aggregated_sp = test.groupby(['itemID','date']).mean()["salesPrice"].to_frame().reset_index().rename(columns={'salesPrice':'avg_salesPrice'})

In [9]:
aggregated = pd.merge(aggregated,aggregated_sp,how='left')
aggregated.head()

Unnamed: 0,itemID,date,count,avg_salesPrice
0,1,2018-01-23,1,3.11
1,1,2018-01-25,1,3.11
2,1,2018-01-29,307,3.11
3,1,2018-01-30,3,3.11
4,1,2018-01-31,1,3.11


In [10]:
# (currently unused)
def add_datepart(df, fldname, drop=True):
#     https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1
    """Add additional date information to dataframe"""
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, 
                                     infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 
            'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 
            'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
        
    df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9 
    if drop: df.drop(fldname, axis=1, inplace=True)

In [11]:
import numpy as np
import re
# add additional date information
add_datepart(aggregated, 'date', drop = False)
add_datepart(aggregated, 'date', drop = False)

In [12]:
aggregated["WeekGroup"] = aggregated["Dayofyear"] // 14

In [13]:
aggregated.head()

Unnamed: 0,itemID,date,count,avg_salesPrice,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,WeekGroup
0,1,2018-01-23,1,3.11,2018,1,4,23,1,23,False,False,False,False,False,False,1516665600,1
1,1,2018-01-25,1,3.11,2018,1,4,25,3,25,False,False,False,False,False,False,1516838400,1
2,1,2018-01-29,307,3.11,2018,1,5,29,0,29,False,False,False,False,False,False,1517184000,2
3,1,2018-01-30,3,3.11,2018,1,5,30,1,30,False,False,False,False,False,False,1517270400,2
4,1,2018-01-31,1,3.11,2018,1,5,31,2,31,True,False,False,False,False,False,1517356800,2


In [14]:
from itertools import product
# Create "grid" with columns
index_cols = ['itemID', 'WeekGroup']

grid = []
for block_num in aggregated['WeekGroup'].unique():
    cur_items = aggregated.loc[aggregated['WeekGroup'] == block_num, 'itemID'].unique()
    grid.append(np.array(list(product(*[cur_items, [block_num]])),dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)
grid.head()

Unnamed: 0,itemID,WeekGroup
0,1,1
1,3,1
2,4,1
3,5,1
4,8,1


In [15]:
train_m = aggregated.groupby(['WeekGroup','itemID']).agg({'count': 'sum','avg_salesPrice': np.mean}).reset_index()

# Merging sales numbers with the grid dataframe
train_m = pd.merge(grid,train_m,on=['WeekGroup','itemID'],how='left').fillna(0)

In [16]:
### add 0 sales for missing dates per itemID
for prod in train_m.itemID.unique():
    s = train_m.loc[train_m['itemID'] == prod][["WeekGroup","count","avg_salesPrice"]]
    s = s.set_index("WeekGroup")
    idx = range(s.index.min(), s.index.max()+1)
#     s = s.reindex(idx)
    s = s.reindex(idx, fill_value=0)
    if "df_week" not in globals():
        s["itemID"] = prod
        df_week = s
    else:
        s["itemID"] = prod
        df_week = pd.concat([df_week,s], ignore_index=False)
df_week.head(3)

Unnamed: 0_level_0,count,avg_salesPrice,itemID
WeekGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,3.11,1
2,314,3.11,1
3,35,3.11,1


In [17]:
df_week.reset_index(inplace=True)
df_week[df_week['itemID']==5]

Unnamed: 0,WeekGroup,count,avg_salesPrice,itemID
32,0,2,7.84,5
33,1,1,7.84,5
34,2,0,0.0,5
35,3,1,7.84,5
36,4,0,0.0,5
37,5,0,0.0,5
38,6,0,0.0,5
39,7,127,7.6,5
40,8,4,7.48,5
41,9,39,7.48,5


In [18]:
df_week.sort_values(by=['itemID', 'WeekGroup'], inplace=True)
# df_week['count'].fillna(value=0, inplace=True)
# df_week['avg_salesPrice'].fillna(value=df_week.groupby('itemID')['avg_salesPrice'].transform('mean'), inplace=True)
# df_week['avg_salesPrice'].fillna(value=0, inplace=True)
# df_week['avg_salesPrice'].fillna(method='ffill', inplace=True)

In [19]:
# adding the category id too from the items table.
df_m = pd.merge(df_week,items_simple,on=['itemID'],how='left')

In [20]:
df_m['dateFirstSell'] = [d.month for d in df_m['dateFirstSell']]

In [21]:
df_m[df_m['itemID']==5]

Unnamed: 0,WeekGroup,count,avg_salesPrice,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,soldWeekly,soldMonthly
40,0,2,7.84,5,2.33,2,0,0,0,1.0,False,False
41,1,1,7.84,5,2.33,2,0,0,0,1.0,False,False
42,2,0,0.0,5,2.33,2,0,0,0,1.0,False,False
43,3,1,7.84,5,2.33,2,0,0,0,1.0,False,False
44,4,0,0.0,5,2.33,2,0,0,0,1.0,False,False
45,5,0,0.0,5,2.33,2,0,0,0,1.0,False,False
46,6,0,0.0,5,2.33,2,0,0,0,1.0,False,False
47,7,127,7.6,5,2.33,2,0,0,0,1.0,False,False
48,8,4,7.48,5,2.33,2,0,0,0,1.0,False,False
49,9,39,7.48,5,2.33,2,0,0,0,1.0,False,False


In [22]:
groupcollist = ['brand','category1','category3']
aggregationlist = [('avg_salesPrice', np.mean, 'avg'),('count', np.sum, 'sum'),('count', np.mean, 'avg')]

for type_id in groupcollist:
    for column_id,aggregator,aggtype in aggregationlist:
        # get numbers from sales data and set column names
        mean_df = df_m.groupby([type_id,'WeekGroup']).aggregate(aggregator).reset_index()[[column_id,type_id,'WeekGroup']]
        mean_df.columns = [type_id+'_'+aggtype+'_'+column_id,type_id,'WeekGroup']
        # merge new columns on sales_m data
        df_m = pd.merge(df_m,mean_df,on=['WeekGroup',type_id],how='left')

In [23]:
df_m[df_m['itemID']==10]

Unnamed: 0,WeekGroup,count,avg_salesPrice,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,...,soldMonthly,brand_avg_avg_salesPrice,brand_sum_count,brand_avg_count,category1_avg_avg_salesPrice,category1_sum_count,category1_avg_count,category3_avg_avg_salesPrice,category3_sum_count,category3_avg_count
80,3,358,24.93,10,5.0,4,0,0,0,2.0,...,True,62.69362,50513,37.087372,95.758101,14960,20.662983,115.203651,7993,29.278388
81,4,1,24.17,10,5.0,4,0,0,0,2.0,...,True,58.353217,166794,91.393973,90.045207,17638,20.677608,43.758367,6040,20.133333
82,5,63,24.17,10,5.0,4,0,0,0,2.0,...,True,57.602947,152878,71.072989,78.591586,17310,18.201893,88.783952,6070,18.119403
83,6,1,24.17,10,5.0,4,0,0,0,2.0,...,True,50.13569,94833,41.887367,81.977861,18572,18.553447,88.90719,5635,16.239193
84,7,1,24.17,10,5.0,4,0,0,0,2.0,...,True,50.389888,109424,45.104699,79.563256,12497,12.097773,109.929853,4656,13.042017
85,8,4,24.17,10,5.0,4,0,0,0,2.0,...,True,77.698319,165836,62.911988,111.430454,31784,27.423641,173.449067,14944,35.665871
86,9,0,0.0,10,5.0,4,0,0,0,2.0,...,True,46.756122,129284,49.935883,85.919913,17847,16.268915,52.588411,6725,17.558747
87,10,2,24.17,10,5.0,4,0,0,0,2.0,...,True,58.145531,128045,49.191318,96.831005,17342,17.641913,102.89864,4666,14.269113
88,11,1,24.17,10,5.0,4,0,0,0,2.0,...,True,58.250467,127225,60.210601,96.082644,15942,23.171512,105.455562,6904,29.504274


In [24]:
from sklearn.preprocessing import MinMaxScaler
# scale data between -1 & 1
scaler = MinMaxScaler()
scale_col = ['count', 'brand_sum_count', 'brand_avg_count', 'category1_sum_count', 'category1_avg_count', 'category3_sum_count', 'category3_avg_count']
for c in scale_col:
    scaler.fit(df_m[[c]])
    df_m[c] = pd.DataFrame(scaler.transform(df_m[[c]]))[0]

In [25]:
df_m[df_m['itemID']==10]

Unnamed: 0,WeekGroup,count,avg_salesPrice,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,...,soldMonthly,brand_avg_avg_salesPrice,brand_sum_count,brand_avg_count,category1_avg_avg_salesPrice,category1_sum_count,category1_avg_count,category3_avg_avg_salesPrice,category3_sum_count,category3_avg_count
80,3,0.074135,24.93,10,5.0,4,0,0,0,2.0,...,True,62.69362,0.302847,0.019748,95.758101,0.126926,0.069774,115.203651,0.130262,0.026755
81,4,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,True,58.353217,1.0,0.048666,90.045207,0.154043,0.069836,43.758367,0.098434,0.018398
82,5,0.013046,24.17,10,5.0,4,0,0,0,2.0,...,True,57.602947,0.916568,0.037845,78.591586,0.150722,0.059325,88.783952,0.098923,0.016557
83,6,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,True,50.13569,0.568564,0.022304,81.977861,0.163501,0.060818,88.90719,0.091834,0.014839
84,7,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,True,50.389888,0.656043,0.024017,79.563256,0.101987,0.03341,109.929853,0.075879,0.011918
85,8,0.000828,24.17,10,5.0,4,0,0,0,2.0,...,True,77.698319,0.994256,0.033499,111.430454,0.297282,0.098476,173.449067,0.243542,0.032591
86,9,0.0,0.0,10,5.0,4,0,0,0,2.0,...,True,46.756122,0.775112,0.02659,85.919913,0.15616,0.051119,52.588411,0.109597,0.016045
87,10,0.000414,24.17,10,5.0,4,0,0,0,2.0,...,True,58.145531,0.767683,0.026193,96.831005,0.151046,0.056948,102.89864,0.076042,0.013039
88,11,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,True,58.250467,0.762767,0.032061,96.082644,0.13687,0.080424,105.455562,0.112514,0.026961


In [26]:
lag_variables  = ['count','avg_salesPrice','brand_avg_avg_salesPrice','brand_sum_count','brand_avg_count','category1_avg_avg_salesPrice','category1_sum_count','category1_avg_count','category3_avg_avg_salesPrice','category3_sum_count','category3_avg_count']
lags = [1 ,2 ,3 ,4, 5]
# we will keep the results in thsi dataframe
sales_means = df_m.copy()
for lag in lags:
    sales_new_df = df_m.copy()
    sales_new_df.WeekGroup+=lag
    # subset only the lag variables we want
    sales_new_df = sales_new_df[['WeekGroup','itemID']+lag_variables]
    sales_new_df.columns = ['WeekGroup','itemID']+ [lag_feat+'_lag_'+str(lag) for lag_feat in lag_variables]
    # join with date_block_num,shop_id and item_id
    sales_means = pd.merge(sales_means, sales_new_df,on=['WeekGroup','itemID'] ,how='left')

In [27]:
for feat in sales_means.columns:
    if 'count' in feat:
        sales_means[feat]=sales_means[feat].fillna(0)
    elif 'salesPrice' in feat:
        sales_means[feat]=sales_means[feat].fillna(sales_means[feat].median())

In [28]:
sales_means[sales_means['itemID']==10]

Unnamed: 0,WeekGroup,count,avg_salesPrice,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,...,avg_salesPrice_lag_5,brand_avg_avg_salesPrice_lag_5,brand_sum_count_lag_5,brand_avg_count_lag_5,category1_avg_avg_salesPrice_lag_5,category1_sum_count_lag_5,category1_avg_count_lag_5,category3_avg_avg_salesPrice_lag_5,category3_sum_count_lag_5,category3_avg_count_lag_5
80,3,0.074135,24.93,10,5.0,4,0,0,0,2.0,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
81,4,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
82,5,0.013046,24.17,10,5.0,4,0,0,0,2.0,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
83,6,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
84,7,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
85,8,0.000828,24.17,10,5.0,4,0,0,0,2.0,...,24.93,62.69362,0.302847,0.019748,95.758101,0.126926,0.069774,115.203651,0.130262,0.026755
86,9,0.0,0.0,10,5.0,4,0,0,0,2.0,...,24.17,58.353217,1.0,0.048666,90.045207,0.154043,0.069836,43.758367,0.098434,0.018398
87,10,0.000414,24.17,10,5.0,4,0,0,0,2.0,...,24.17,57.602947,0.916568,0.037845,78.591586,0.150722,0.059325,88.783952,0.098923,0.016557
88,11,0.000207,24.17,10,5.0,4,0,0,0,2.0,...,24.17,50.13569,0.568564,0.022304,81.977861,0.163501,0.060818,88.90719,0.091834,0.014839


In [29]:
cols_to_drop = lag_variables[1:]
for col in cols_to_drop:
    del sales_means[col]
sales_means

Unnamed: 0,WeekGroup,count,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,soldWeekly,...,avg_salesPrice_lag_5,brand_avg_avg_salesPrice_lag_5,brand_sum_count_lag_5,brand_avg_count_lag_5,category1_avg_avg_salesPrice_lag_5,category1_sum_count_lag_5,category1_avg_count_lag_5,category3_avg_avg_salesPrice_lag_5,category3_sum_count_lag_5,category3_avg_count_lag_5
0,1,0.000414,1,4.38,1,0,0,0,1.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
1,2,0.065024,1,4.38,1,0,0,0,1.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
2,3,0.007248,1,4.38,1,0,0,0,1.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
3,4,0.000414,1,4.38,1,0,0,0,1.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
4,5,0.000207,1,4.38,1,0,0,0,1.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46105,11,0.025264,10450,0.00,227,182,7,66,,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
46106,8,0.000207,10459,0.00,253,180,7,66,5.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
46107,7,0.000207,10460,0.00,253,0,7,66,4.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0
46108,8,0.000207,10462,0.00,253,180,7,66,4.0,False,...,11.375,50.13569,0.0,0.0,51.148979,0.0,0.0,39.83112,0.0,0.0


In [30]:
# sales_means = sales_means[sales_means['WeekGroup']>2]

In [31]:
X_train = sales_means[sales_means['WeekGroup']<9]
X_cv =  sales_means[sales_means['WeekGroup']==10]
X_test = sales_means[sales_means['WeekGroup']==11]
Y_train = X_train['count']
Y_cv = X_cv['count']
Y_test = X_test['count']
del X_train['count']
del X_cv['count']
del X_test['count']
del X_train['WeekGroup']
del X_cv['WeekGroup']
del X_test['WeekGroup']

In [32]:
del X_train['itemID']
del X_cv['itemID']

In [68]:
from xgboost import XGBRegressor
model = XGBRegressor(
    max_depth=6,
    n_estimators=1000,
    min_child_weight=300, 
    colsample_bytree=0.8, 
    subsample=0.8, 
    eta=0.3,    
    seed=42)
model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_cv, Y_cv)], 
    verbose=True, 
    early_stopping_rounds = 10)

[0]	validation_0-rmse:0.34397	validation_1-rmse:0.34411
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:0.24193	validation_1-rmse:0.24178
[2]	validation_0-rmse:0.17092	validation_1-rmse:0.17041
[3]	validation_0-rmse:0.12190	validation_1-rmse:0.12123
[4]	validation_0-rmse:0.08832	validation_1-rmse:0.08711
[5]	validation_0-rmse:0.06581	validation_1-rmse:0.06424
[6]	validation_0-rmse:0.05120	validation_1-rmse:0.04976
[7]	validation_0-rmse:0.04223	validation_1-rmse:0.04043
[8]	validation_0-rmse:0.03701	validation_1-rmse:0.03532
[9]	validation_0-rmse:0.03410	validation_1-rmse:0.03258
[10]	validation_0-rmse:0.03256	validation_1-rmse:0.03109
[11]	validation_0-rmse:0.03170	validation_1-rmse:0.03034
[12]	validation_0-rmse:0.03124	validation_1-rmse:0.02999
[13]	validation_0-rmse:0.03098	validation_1-rmse:0.02969
[14]	validation_0-rmse:0.03082	validation_1-rmse:0.02

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=300, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=42, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=42, subsample=0.8,
             tree_method=None, validate_parameters=False, verbosity=None)

In [69]:
import plotly_express as px
columns = X_train.columns
feature_importances = pd.DataFrame({'columns': columns,'importance':model.feature_importances_})
feature_importances = feature_importances.sort_values(by='importance',ascending=False)
px.bar(feature_importances,x='columns',y='importance')

In [70]:
def evaluate_result(y: dict, y_pred: dict):
    monetary_value = 0
    y_pred = defaultdict(int, y_pred)  # return prediction of 0 for items without prediction
    
    for item in set(y_pred).difference(set(y)):
        y[item] = 0  # make sure that all items for which a demand has been predicted are contained in the actual demands
    
    for item, demand in y.items():
        predicted_demand = y_pred[item]
        price = product_prices[item]
        monetary_value += price * min(demand, predicted_demand)
        if predicted_demand > demand:
            monetary_value -= .6 * price * (predicted_demand - demand)
            
    return monetary_value

In [71]:
df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
df_items.head()

Unnamed: 0_level_0,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
itemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,1,4.38,1,1,1,8.84
2,0,2,3.0,1,2,1,16.92
3,0,3,5.0,1,3,1,15.89
4,0,2,4.44,1,2,1,40.17
5,0,2,2.33,1,1,1,17.04


In [72]:
# actual demand
y = df_test.groupby(by='itemID')['order'].sum().to_dict()

# baseline 1 (average demand of previous 14 days)
y_baseline1 = df_train[df_train['time'] >= '2018-05-19'].groupby(by='itemID')['order'].sum().to_dict()

# baseline 2 (average demand of previous half year)
total_orders = df_train.groupby(by='itemID')['order'].sum().to_dict()
total_observed_days = (df_train['time'].dt.normalize().max() - df_train['time'].dt.normalize().min()).days
y_baseline2 = {item: orders / total_observed_days * 14 for item, orders in total_orders.items()}  # 14-day avg. demand

df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
product_prices = df_info['simulationPrice'].to_dict()

In [73]:
# apply to unseen data
y_xgboost_all = dict()
for prod in X_test.itemID.unique():
    train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(model.predict(X_test[X_test.itemID == prod].drop(['itemID'], axis=1))))).rename(columns={0:'predicted_count'})
    train_predict["actual_count"] = test[test.itemID == prod]["order"].sum()
    y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
#     if prod in df_train.itemID.unique():
#         y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
#     else:
#         y_xgboost_all[prod] = 0

In [74]:
# perfect result
print(f'Perfect Result: {evaluate_result(y, y):.2f}')

# baseline 1
print(f'Baseline 1: {evaluate_result(y, y_baseline1):.2f}')

# baseline 2
print(f'Baseline 2: {evaluate_result(y, y_baseline2):.2f}')

# random forest
print(f'XGBoost: {evaluate_result(y, y_xgboost_all):.2f}')
# 920416.65

Perfect Result: 7895975.87
Baseline 1: -3727365.60
Baseline 2: -1672504.21
XGBoost: 920416.65
