In [1]:
# import packages
import pandas as pd
# import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
import datetime
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy.interpolate import interpn
from collections import defaultdict
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
#load data
items = pd.read_csv('./data/ex_items_binary.csv', sep = '|', parse_dates=['dateFirstSell'])#ex_items_binary
recently_sold = pd.read_csv('./data/ex_items.csv', sep = '|', parse_dates=['dateFirstSell'])
items = pd.merge(left=items, right=recently_sold[['itemID', 'recentlySold']], on='itemID')
items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,minSalesPrice,maxSalesPrice,meanSalesPrice,minSalesPriceCounts,maxSalesPriceCounts,unitsSold,revenue,relRevenue,dateFirstSell,dateLastSell,soldWeekly,soldMonthly,recentlySold
0,1,0,1,4.38,1,1-1,1-1-1,8.84,3.11,3.11,3.11,690.0,690.0,690.0,2145.9,2.949022e-05,2018-01-23,2018-05-24,False,True,9.0
1,2,0,2,3.0,1,1-2,1-2-1,16.92,9.15,9.15,9.15,5.0,5.0,5.0,45.75,6.287235e-07,2018-02-24,2018-05-29,False,False,4.0
2,3,0,3,5.0,1,1-3,1-3-1,15.89,9.89,14.04,11.918371,91.0,99.0,190.0,2289.95,3.146984e-05,2018-01-10,2018-05-16,False,False,17.0
3,4,0,2,4.44,1,1-2,1-2-1,40.17,13.01,13.01,13.01,53.0,53.0,53.0,689.53,9.475928e-06,2018-01-18,2018-05-30,False,False,3.0
4,5,0,2,2.33,1,1-1,1-1-1,17.04,7.48,7.84,7.740377,49.0,128.0,177.0,1370.04,1.88279e-05,2018-01-06,2018-05-30,False,False,3.0


In [4]:
items_simple = items[['itemID', 'customerRating', 'manufacturer', 'brand', 'category1', 'category2', 'category3','recommendedRetailPrice', 'dateFirstSell', 'soldWeekly', 'recentlySold']]

In [5]:
items_simple.category1 = pd.Categorical(items_simple.category1)
items_simple['category1'] = items_simple.category1.cat.codes
items_simple.category2 = pd.Categorical(items_simple.category2)
items_simple['category2'] = items_simple.category2.cat.codes
items_simple.category3 = pd.Categorical(items_simple.category3)
items_simple['category3'] = items_simple.category3.cat.codes
items_simple.head()

Unnamed: 0,itemID,customerRating,manufacturer,brand,category1,category2,category3,recommendedRetailPrice,dateFirstSell,soldWeekly,recentlySold
0,1,4.38,1,0,0,0,0,8.84,2018-01-23,False,9.0
1,2,3.0,2,0,0,1,1,16.92,2018-02-24,False,4.0
2,3,5.0,3,0,0,2,2,15.89,2018-01-10,False,17.0
3,4,4.44,2,0,0,1,1,40.17,2018-01-18,False,3.0
4,5,2.33,2,0,0,0,0,17.04,2018-01-06,False,3.0


In [6]:
# load data
df_train = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])
df_train['date'] = [d.date() for d in df_train['time']]
df_test = pd.read_csv('data/orders0206_test.csv', sep='|', parse_dates=['time'])
df_test['date'] = [d.date() for d in df_test['time']]
df_items = pd.read_csv('data/items.csv', sep='|')

In [7]:
infos = pd.read_csv('data/infos_promotions.csv', sep='|', parse_dates=['predicted_promotion'])
infos['predicted_promotion'] = [d.date() for d in infos['predicted_promotion']]

In [8]:
infos['is_promotion'] = 1

In [9]:
infos[infos['itemID'] == 5010]

Unnamed: 0,index,itemID,simulationPrice,promotion,promotion_train,promotion_test,predicted_promotion,is_promotion
9965,5009,5010,5.95,2018-07-10,"[datetime.date(2018, 4, 30), datetime.date(201...","[datetime.date(2018, 6, 3), datetime.date(2018...",2018-04-30,1
9966,5009,5010,5.95,2018-07-10,"[datetime.date(2018, 4, 30), datetime.date(201...","[datetime.date(2018, 6, 3), datetime.date(2018...",2018-05-23,1
9967,5009,5010,5.95,2018-07-10,"[datetime.date(2018, 4, 30), datetime.date(201...","[datetime.date(2018, 6, 3), datetime.date(2018...",2018-06-03,1
9968,5009,5010,5.95,2018-07-10,"[datetime.date(2018, 4, 30), datetime.date(201...","[datetime.date(2018, 6, 3), datetime.date(2018...",2018-06-10,1
9969,5009,5010,5.95,2018-07-10,"[datetime.date(2018, 4, 30), datetime.date(201...","[datetime.date(2018, 6, 3), datetime.date(2018...",2018-06-13,1
9970,5009,5010,5.95,2018-07-10,"[datetime.date(2018, 4, 30), datetime.date(201...","[datetime.date(2018, 6, 3), datetime.date(2018...",2018-06-15,1


In [10]:
train = df_train
test = df_test
all_data = train.append(test)

In [11]:
# aggregate sales per day
aggregated = all_data.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
aggregated_sp = all_data.groupby(['itemID','date']).mean()["salesPrice"].to_frame().reset_index().rename(columns={'salesPrice':'avg_salesPrice'})
# aggregated_promotion = all_data.merge(infos, on='itemID').groupby(['itemID','date']).count()["predicted_promotion"].to_frame().reset_index()



In [12]:
# # aggregate sales per day
# train_aggregated = train.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
# test_aggregated = test.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
# # aggregate sales per day
# train_aggregated_sp = train.groupby(['itemID','date']).mean()["salesPrice"].to_frame().reset_index().rename(columns={'salesPrice':'avg_salesPrice'})
# test_aggregated_sp = test.groupby(['itemID','date']).mean()["salesPrice"].to_frame().reset_index().rename(columns={'salesPrice':'avg_salesPrice'})

In [13]:
aggregated = pd.merge(aggregated,aggregated_sp,how='left')
aggregated = pd.merge(aggregated, infos['itemID predicted_promotion'.split()], left_on=['itemID', 'date'], right_on=['itemID', 'predicted_promotion'], how='left')
aggregated['predicted_promotion'] = aggregated['predicted_promotion'].apply(lambda x: isinstance(x, datetime.date))

In [14]:
aggregated.head()

Unnamed: 0,itemID,date,count,avg_salesPrice,predicted_promotion
0,1,2018-01-23,1,3.11,False
1,1,2018-01-25,1,3.11,False
2,1,2018-01-29,307,3.11,True
3,1,2018-01-30,3,3.11,False
4,1,2018-01-31,1,3.11,False


In [15]:
# (currently unused)
def add_datepart(df, fldname, drop=True):
#     https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1
    """Add additional date information to dataframe"""
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, 
                                     infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 
            'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 
            'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
        
    df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9 
    if drop: df.drop(fldname, axis=1, inplace=True)

In [16]:
import numpy as np
import re
# add additional date information
add_datepart(aggregated, 'date', drop = False)
add_datepart(aggregated, 'date', drop = False)

In [17]:
aggregated["WeekGroup"] = aggregated.apply(lambda row: (row["Dayofyear"] + 1) // 14 , axis=1)

In [18]:
aggregated = aggregated[aggregated["WeekGroup"] > 0]

In [19]:
aggregated.head()

Unnamed: 0,itemID,date,count,avg_salesPrice,predicted_promotion,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,WeekGroup
0,1,2018-01-23,1,3.11,False,2018,1,4,23,1,23,False,False,False,False,False,False,1516665600,1
1,1,2018-01-25,1,3.11,False,2018,1,4,25,3,25,False,False,False,False,False,False,1516838400,1
2,1,2018-01-29,307,3.11,True,2018,1,5,29,0,29,False,False,False,False,False,False,1517184000,2
3,1,2018-01-30,3,3.11,False,2018,1,5,30,1,30,False,False,False,False,False,False,1517270400,2
4,1,2018-01-31,1,3.11,False,2018,1,5,31,2,31,True,False,False,False,False,False,1517356800,2


In [20]:
from itertools import product
# Create "grid" with columns
index_cols = ['itemID', 'WeekGroup']

grid = []
for block_num in aggregated['WeekGroup'].unique():
    cur_items = aggregated.loc[aggregated['WeekGroup'] == block_num, 'itemID'].unique()
    grid.append(np.array(list(product(*[cur_items, [block_num]])),dtype='int32'))

grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)
grid.head()

Unnamed: 0,itemID,WeekGroup
0,1,1
1,3,1
2,4,1
3,5,1
4,8,1


In [21]:
train_m = aggregated.groupby(['WeekGroup','itemID']).agg({'count': 'sum','avg_salesPrice': np.mean, 'predicted_promotion': 'sum'}).reset_index()

# Merging sales numbers with the grid dataframe
train_m = pd.merge(grid,train_m,on=['WeekGroup','itemID'],how='left').fillna(0)

In [22]:
train_m[train_m['itemID'] == 1]

Unnamed: 0,itemID,WeekGroup,count,avg_salesPrice,predicted_promotion
0,1,1,2,3.11,0.0
1664,1,2,313,3.11,1.0
3278,1,3,35,3.11,0.0
5156,1,4,3,3.11,0.0
7532,1,5,1,3.11,0.0
10446,1,6,1,3.11,0.0
13614,1,7,2,3.11,0.0
16947,1,8,299,3.11,1.0
20689,1,9,3,3.11,0.0
24597,1,10,31,3.11,0.0


In [23]:
# del df_week

In [24]:
### add 0 sales for missing dates per itemID
for prod in train_m.itemID.unique():
    s = train_m.loc[train_m['itemID'] == prod][["WeekGroup","count","avg_salesPrice", "predicted_promotion"]]
    s = s.set_index("WeekGroup")
    idx = range(s.index.min(), s.index.max()+1)
    s = s.reindex(idx)
#     s = s.reindex(idx, fill_value=0)
    s['weekGroup'] = s.index
    if "df_week" not in globals():
        s["itemID"] = prod
        df_week = s
    else:
        s["itemID"] = prod
        df_week = pd.concat([df_week,s], ignore_index=True)
df_week.head(3)

Unnamed: 0,count,avg_salesPrice,predicted_promotion,weekGroup,itemID
0,2.0,3.11,0.0,1,1
1,313.0,3.11,1.0,2,1
2,35.0,3.11,0.0,3,1


In [25]:
df_week.reset_index(inplace=True)
df_week[df_week['itemID']==5]

Unnamed: 0,index,count,avg_salesPrice,predicted_promotion,weekGroup,itemID
31,31,1.0,7.84,0.0,1,5
32,32,,,,2,5
33,33,1.0,7.84,0.0,3,5
34,34,,,,4,5
35,35,,,,5,5
36,36,,,,6,5
37,37,127.0,7.6,1.0,7,5
38,38,4.0,7.48,0.0,8,5
39,39,39.0,7.48,0.0,9,5
40,40,3.0,7.48,0.0,10,5


In [26]:
df_week.sort_values(by=['itemID', 'weekGroup'], inplace=True)
df_week['count'].fillna(value=0, inplace=True)
df_week['predicted_promotion'].fillna(value=0, inplace=True)
# df_week['avg_salesPrice'].fillna(value=df_week.groupby('itemID')['avg_salesPrice'].transform('mean'), inplace=True)
# df_week['avg_salesPrice'].fillna(value=0, inplace=True)
df_week['avg_salesPrice'].fillna(method='ffill', inplace=True)

In [27]:
# adding the category id too from the items table.
df_m = pd.merge(df_week,items_simple,on=['itemID'],how='left')

In [28]:
df_m['dateFirstSell'] = [d.month for d in df_m['dateFirstSell']]

In [29]:
df_m['retail_price_diff'] = df_m.apply(lambda x: (x.avg_salesPrice - x.recommendedRetailPrice)/x.recommendedRetailPrice, axis = 1)
df_m['price_fluctuate'] = df_m.groupby('itemID').avg_salesPrice.shift(1)
df_m['price_fluctuate'] = df_m.groupby('itemID')['price_fluctuate'].fillna(method='bfill')
df_m['price_fluctuate'] = df_m.apply(lambda x: 0 if x.price_fluctuate == x.avg_salesPrice 
                                     else (-1 if x.price_fluctuate > x.avg_salesPrice else 1), axis = 1)

In [30]:
df_m[df_m['itemID']==5]

Unnamed: 0,index,count,avg_salesPrice,predicted_promotion,weekGroup,itemID,customerRating,manufacturer,brand,category1,category2,category3,recommendedRetailPrice,dateFirstSell,soldWeekly,recentlySold,retail_price_diff,price_fluctuate
38,31,1.0,7.84,0.0,1,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.539906,0
39,32,0.0,7.84,0.0,2,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.539906,0
40,33,1.0,7.84,0.0,3,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.539906,0
41,34,0.0,7.84,0.0,4,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.539906,0
42,35,0.0,7.84,0.0,5,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.539906,0
43,36,0.0,7.84,0.0,6,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.539906,0
44,37,127.0,7.6,1.0,7,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.553991,-1
45,38,4.0,7.48,0.0,8,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.561033,-1
46,39,39.0,7.48,0.0,9,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.561033,-1
47,40,3.0,7.48,0.0,10,5,2.33,2,0,0,0,0,17.04,1.0,False,3.0,-0.561033,1


In [31]:
groupcollist = ['brand','category1','category3']
aggregationlist = [('avg_salesPrice', np.mean, 'avg'),('count', np.sum, 'sum'),('count', np.mean, 'avg')]

for type_id in groupcollist:
    for column_id,aggregator,aggtype in aggregationlist:
        # get numbers from sales data and set column names
        mean_df = df_m.groupby([type_id,'weekGroup']).aggregate(aggregator).reset_index()[[column_id,type_id,'weekGroup']]
        mean_df.columns = [type_id+'_'+aggtype+'_'+column_id,type_id,'weekGroup']
        # merge new columns on sales_m data
        df_m = pd.merge(df_m,mean_df,on=['weekGroup',type_id],how='left')

In [32]:
df_m[df_m['itemID']==1]

Unnamed: 0,index,count,avg_salesPrice,predicted_promotion,weekGroup,itemID,customerRating,manufacturer,brand,category1,category2,category3,recommendedRetailPrice,dateFirstSell,soldWeekly,recentlySold,retail_price_diff,price_fluctuate,brand_avg_avg_salesPrice,brand_sum_count,brand_avg_count,category1_avg_avg_salesPrice,category1_sum_count,category1_avg_count,category3_avg_avg_salesPrice,category3_sum_count,category3_avg_count
0,0,2.0,3.11,0.0,1,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,0,80.488057,55881.0,63.429058,138.274918,18579.0,42.710345,116.237975,6196.0,43.027778
1,1,313.0,3.11,1.0,2,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,1,79.549199,58626.0,55.151458,126.672261,12467.0,22.708561,97.324982,7530.0,37.277228
2,2,35.0,3.11,0.0,3,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,-1,99.45066,55269.0,42.877424,161.587334,15306.0,22.311953,164.694577,8329.0,32.662745
3,3,3.0,3.11,0.0,4,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,0,100.509603,133488.0,77.294731,162.195503,17479.0,21.212379,149.429555,6014.0,20.954704
4,4,1.0,3.11,0.0,5,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,0,98.633131,166790.0,79.575382,154.442048,17300.0,18.642241,163.532071,6006.0,18.594427
5,5,1.0,3.11,0.0,6,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,0,86.349422,105227.0,47.229354,146.018099,16499.0,16.784334,153.84981,3800.0,11.377246
6,6,2.0,3.11,0.0,7,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,0,85.565957,118255.0,48.825351,149.821686,14736.0,14.348588,155.704991,6581.0,18.538028
7,7,299.0,3.11,1.0,8,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,1,99.469022,153578.0,58.842146,154.673661,31313.0,27.017256,194.706605,14797.0,35.230952
8,8,3.0,3.11,0.0,9,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,-1,76.659796,130916.0,50.683701,140.093535,17704.0,16.050771,139.732266,6727.0,17.382429
9,9,31.0,3.11,0.0,10,1,4.38,1,0,0,0,0,8.84,1.0,False,9.0,-0.64819,0,77.624119,125886.0,48.774119,137.527018,14371.0,14.414243,150.015988,4605.0,14.082569


In [33]:
from sklearn.preprocessing import MinMaxScaler
# scale data between -1 & 1
scaler = MinMaxScaler()
scale_col = ['count', 'avg_salesPrice', 'customerRating', 'retail_price_diff', 
             'brand_avg_avg_salesPrice', 'brand_sum_count', 'brand_avg_count', 
             'category1_avg_avg_salesPrice', 'category1_sum_count', 'category1_avg_count', 
             'category3_avg_avg_salesPrice', 'category3_sum_count', 'category3_avg_count']
for c in scale_col:
    scaler.fit(df_m[[c]])
    df_m[c] = pd.DataFrame(scaler.transform(df_m[[c]]))[0]

In [34]:
df_m[df_m['itemID']==10]

Unnamed: 0,index,count,avg_salesPrice,predicted_promotion,weekGroup,itemID,customerRating,manufacturer,brand,category1,category2,category3,recommendedRetailPrice,dateFirstSell,soldWeekly,recentlySold,retail_price_diff,price_fluctuate,brand_avg_avg_salesPrice,brand_sum_count,brand_avg_count,category1_avg_avg_salesPrice,category1_sum_count,category1_avg_count,category3_avg_avg_salesPrice,category3_sum_count,category3_avg_count
75,19112,0.074135,0.002621,1.0,3,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.264284,0,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527
76,19113,0.000207,0.00254,0.0,4,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,-1,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151
77,19114,0.013046,0.00254,0.0,5,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656
78,19115,0.000207,0.00254,0.0,6,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027
79,19116,0.000207,0.00254,0.0,7,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.020858,0.709005,0.026054,0.313489,0.139377,0.028032,0.117769,0.102883,0.019596
80,19117,0.000828,0.00254,0.0,8,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.0243,0.920787,0.031399,0.32488,0.305036,0.06563,0.147518,0.231326,0.037242
81,19118,0.0,0.00254,0.0,9,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.018652,0.784915,0.027046,0.290648,0.169037,0.033084,0.105585,0.105165,0.018375
82,19119,0.000414,0.00254,0.0,10,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.018891,0.754757,0.026027,0.284622,0.135729,0.028227,0.113429,0.071991,0.014886
83,19120,0.000207,0.00254,1.0,11,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.015622,0.847677,0.034138,0.230819,0.187574,0.064103,0.113794,0.111309,0.030348


In [35]:
lag_variables  = ['count','avg_salesPrice',
                  'brand_avg_avg_salesPrice','brand_sum_count','brand_avg_count',
#                   'manufacturer_avg_avg_salesPrice','manufacturer_sum_count','manufacturer_avg_count',
                  'category1_avg_avg_salesPrice','category1_sum_count','category1_avg_count',
                  'category3_avg_avg_salesPrice','category3_sum_count','category3_avg_count']
lags = [1 ,2 ,3, 4, 5]
# we will keep the results in thsi dataframe
sales_means = df_m.copy()
for lag in lags:
    sales_new_df = df_m.copy()
    sales_new_df.weekGroup+=lag
    # subset only the lag variables we want
    sales_new_df = sales_new_df[['weekGroup','itemID']+lag_variables]
    sales_new_df.columns = ['weekGroup','itemID']+ [lag_feat+'_lag_'+str(lag) for lag_feat in lag_variables]
    # join with date_block_num,shop_id and item_id
    sales_means = pd.merge(sales_means, sales_new_df,on=['weekGroup','itemID'] ,how='left')

In [36]:
for feat in sales_means.columns:
    if 'count' in feat:
        sales_means[feat]=sales_means[feat].fillna(0)
    elif 'salesPrice' or 'diff' in feat:
#         sales_means[feat]=sales_means[feat].fillna(sales_means[feat].median())
        sales_means[feat]=sales_means[feat].fillna(value=sales_means.groupby('itemID')[feat].transform('mean'))

In [37]:
sales_means[sales_means['itemID']==10]

Unnamed: 0,index,count,avg_salesPrice,predicted_promotion,weekGroup,itemID,customerRating,manufacturer,brand,category1,category2,category3,recommendedRetailPrice,dateFirstSell,soldWeekly,recentlySold,retail_price_diff,price_fluctuate,brand_avg_avg_salesPrice,brand_sum_count,brand_avg_count,category1_avg_avg_salesPrice,category1_sum_count,category1_avg_count,category3_avg_avg_salesPrice,category3_sum_count,category3_avg_count,count_lag_1,avg_salesPrice_lag_1,brand_avg_avg_salesPrice_lag_1,brand_sum_count_lag_1,brand_avg_count_lag_1,category1_avg_avg_salesPrice_lag_1,category1_sum_count_lag_1,category1_avg_count_lag_1,category3_avg_avg_salesPrice_lag_1,category3_sum_count_lag_1,category3_avg_count_lag_1,count_lag_2,avg_salesPrice_lag_2,brand_avg_avg_salesPrice_lag_2,brand_sum_count_lag_2,brand_avg_count_lag_2,category1_avg_avg_salesPrice_lag_2,category1_sum_count_lag_2,category1_avg_count_lag_2,category3_avg_avg_salesPrice_lag_2,category3_sum_count_lag_2,category3_avg_count_lag_2,count_lag_3,avg_salesPrice_lag_3,brand_avg_avg_salesPrice_lag_3,brand_sum_count_lag_3,brand_avg_count_lag_3,category1_avg_avg_salesPrice_lag_3,category1_sum_count_lag_3,category1_avg_count_lag_3,category3_avg_avg_salesPrice_lag_3,category3_sum_count_lag_3,category3_avg_count_lag_3,count_lag_4,avg_salesPrice_lag_4,brand_avg_avg_salesPrice_lag_4,brand_sum_count_lag_4,brand_avg_count_lag_4,category1_avg_avg_salesPrice_lag_4,category1_sum_count_lag_4,category1_avg_count_lag_4,category3_avg_avg_salesPrice_lag_4,category3_sum_count_lag_4,category3_avg_count_lag_4,count_lag_5,avg_salesPrice_lag_5,brand_avg_avg_salesPrice_lag_5,brand_sum_count_lag_5,brand_avg_count_lag_5,category1_avg_avg_salesPrice_lag_5,category1_sum_count_lag_5,category1_avg_count_lag_5,category3_avg_avg_salesPrice_lag_5,category3_sum_count_lag_5,category3_avg_count_lag_5
75,19112,0.074135,0.002621,1.0,3,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.264284,0,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527,0.0,0.00255,0.022088,0.0,0.0,0.315773,0.0,0.0,0.12025,0.0,0.0,0.0,0.002551,0.022544,0.0,0.0,0.320224,0.0,0.0,0.121225,0.0,0.0,0.0,0.002553,0.023193,0.0,0.0,0.325153,0.0,0.0,0.123831,0.0,0.0,0.0,0.002556,0.022971,0.0,0.0,0.325207,0.0,0.0,0.119094,0.0,0.0,0.0,0.00256,0.0235,0.0,0.0,0.328137,0.0,0.0,0.119425,0.0,0.0
76,19113,0.000207,0.00254,0.0,4,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,-1,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151,0.074135,0.002621,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527,0.0,0.002551,0.022544,0.0,0.0,0.320224,0.0,0.0,0.121225,0.0,0.0,0.0,0.002553,0.023193,0.0,0.0,0.325153,0.0,0.0,0.123831,0.0,0.0,0.0,0.002556,0.022971,0.0,0.0,0.325207,0.0,0.0,0.119094,0.0,0.0,0.0,0.00256,0.0235,0.0,0.0,0.328137,0.0,0.0,0.119425,0.0,0.0
77,19114,0.013046,0.00254,0.0,5,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656,0.000207,0.00254,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151,0.074135,0.002621,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527,0.0,0.002553,0.023193,0.0,0.0,0.325153,0.0,0.0,0.123831,0.0,0.0,0.0,0.002556,0.022971,0.0,0.0,0.325207,0.0,0.0,0.119094,0.0,0.0,0.0,0.00256,0.0235,0.0,0.0,0.328137,0.0,0.0,0.119425,0.0,0.0
78,19115,0.000207,0.00254,0.0,6,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027,0.013046,0.00254,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656,0.000207,0.00254,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151,0.074135,0.002621,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527,0.0,0.002556,0.022971,0.0,0.0,0.325207,0.0,0.0,0.119094,0.0,0.0,0.0,0.00256,0.0235,0.0,0.0,0.328137,0.0,0.0,0.119425,0.0,0.0
79,19116,0.000207,0.00254,0.0,7,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.020858,0.709005,0.026054,0.313489,0.139377,0.028032,0.117769,0.102883,0.019596,0.000207,0.00254,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027,0.013046,0.00254,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656,0.000207,0.00254,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151,0.074135,0.002621,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527,0.0,0.00256,0.0235,0.0,0.0,0.328137,0.0,0.0,0.119425,0.0,0.0
80,19117,0.000828,0.00254,0.0,8,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.0243,0.920787,0.031399,0.32488,0.305036,0.06563,0.147518,0.231326,0.037242,0.000207,0.00254,0.020858,0.709005,0.026054,0.313489,0.139377,0.028032,0.117769,0.102883,0.019596,0.000207,0.00254,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027,0.013046,0.00254,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656,0.000207,0.00254,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151,0.074135,0.002621,0.024296,0.331369,0.02288,0.341113,0.145073,0.051666,0.124626,0.13021,0.034527
81,19118,0.0,0.00254,0.0,9,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.018652,0.784915,0.027046,0.290648,0.169037,0.033084,0.105585,0.105165,0.018375,0.000828,0.00254,0.0243,0.920787,0.031399,0.32488,0.305036,0.06563,0.147518,0.231326,0.037242,0.000207,0.00254,0.020858,0.709005,0.026054,0.313489,0.139377,0.028032,0.117769,0.102883,0.019596,0.000207,0.00254,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027,0.013046,0.00254,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656,0.000207,0.00254,0.024558,0.800336,0.041246,0.342541,0.166788,0.048402,0.112982,0.094019,0.022151
82,19119,0.000414,0.00254,0.0,10,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.018891,0.754757,0.026027,0.284622,0.135729,0.028227,0.113429,0.071991,0.014886,0.0,0.00254,0.018652,0.784915,0.027046,0.290648,0.169037,0.033084,0.105585,0.105165,0.018375,0.000828,0.00254,0.0243,0.920787,0.031399,0.32488,0.305036,0.06563,0.147518,0.231326,0.037242,0.000207,0.00254,0.020858,0.709005,0.026054,0.313489,0.139377,0.028032,0.117769,0.102883,0.019596,0.000207,0.00254,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027,0.013046,0.00254,0.024093,1.0,0.042463,0.324337,0.164999,0.040775,0.123739,0.093894,0.019656
83,19120,0.000207,0.00254,1.0,11,10,1.0,4,0,0,0,0,19.43,2.0,False,2.0,0.255944,0,0.015622,0.847677,0.034138,0.230819,0.187574,0.064103,0.113794,0.111309,0.030348,0.000414,0.00254,0.018891,0.754757,0.026027,0.284622,0.135729,0.028227,0.113429,0.071991,0.014886,0.0,0.00254,0.018652,0.784915,0.027046,0.290648,0.169037,0.033084,0.105585,0.105165,0.018375,0.000828,0.00254,0.0243,0.920787,0.031399,0.32488,0.305036,0.06563,0.147518,0.231326,0.037242,0.000207,0.00254,0.020858,0.709005,0.026054,0.313489,0.139377,0.028032,0.117769,0.102883,0.019596,0.000207,0.00254,0.021052,0.630895,0.025202,0.304558,0.156995,0.035261,0.116354,0.059407,0.012027


In [38]:
cols_to_drop = lag_variables[1:] + ['recommendedRetailPrice', 'category2', 'retail_price_diff', 'price_fluctuate']
cols_to_keep = [x for x in list(sales_means.columns) if x not in cols_to_drop]

In [39]:
# for col in cols_to_drop:
#     del sales_means[col]
# sales_means

In [40]:
# sales_means = sales_means[sales_means['WeekGroup']>2]

In [41]:
sales_means = sales_means[cols_to_keep]
sales_means = sales_means.drop(columns=["index"])#,"predicted_promotion"])

In [42]:
sales_means.groupby("weekGroup").count()

Unnamed: 0_level_0,count,predicted_promotion,itemID,customerRating,manufacturer,brand,category1,category3,dateFirstSell,soldWeekly,recentlySold,count_lag_1,avg_salesPrice_lag_1,brand_avg_avg_salesPrice_lag_1,brand_sum_count_lag_1,brand_avg_count_lag_1,category1_avg_avg_salesPrice_lag_1,category1_sum_count_lag_1,category1_avg_count_lag_1,category3_avg_avg_salesPrice_lag_1,category3_sum_count_lag_1,category3_avg_count_lag_1,count_lag_2,avg_salesPrice_lag_2,brand_avg_avg_salesPrice_lag_2,brand_sum_count_lag_2,brand_avg_count_lag_2,category1_avg_avg_salesPrice_lag_2,category1_sum_count_lag_2,category1_avg_count_lag_2,category3_avg_avg_salesPrice_lag_2,category3_sum_count_lag_2,category3_avg_count_lag_2,count_lag_3,avg_salesPrice_lag_3,brand_avg_avg_salesPrice_lag_3,brand_sum_count_lag_3,brand_avg_count_lag_3,category1_avg_avg_salesPrice_lag_3,category1_sum_count_lag_3,category1_avg_count_lag_3,category3_avg_avg_salesPrice_lag_3,category3_sum_count_lag_3,category3_avg_count_lag_3,count_lag_4,avg_salesPrice_lag_4,brand_avg_avg_salesPrice_lag_4,brand_sum_count_lag_4,brand_avg_count_lag_4,category1_avg_avg_salesPrice_lag_4,category1_sum_count_lag_4,category1_avg_count_lag_4,category3_avg_avg_salesPrice_lag_4,category3_sum_count_lag_4,category3_avg_count_lag_4,count_lag_5,avg_salesPrice_lag_5,brand_avg_avg_salesPrice_lag_5,brand_sum_count_lag_5,brand_avg_count_lag_5,category1_avg_avg_salesPrice_lag_5,category1_sum_count_lag_5,category1_avg_count_lag_5,category3_avg_avg_salesPrice_lag_5,category3_sum_count_lag_5,category3_avg_count_lag_5
weekGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1
1,1664,1664,1664,1664,1664,1664,1664,1664,1664,1664,1664,1664,1582,1582,1664,1664,1582,1664,1664,1582,1664,1664,1664,1555,1555,1664,1664,1555,1664,1664,1555,1664,1664,1664,1514,1514,1664,1664,1514,1664,1664,1514,1664,1664,1664,1478,1478,1664,1664,1478,1664,1664,1478,1664,1664,1664,1431,1431,1664,1664,1431,1664,1664,1431,1664,1664
2,2143,2143,2143,2143,2143,2143,2143,2143,2143,2143,2143,2143,2088,2088,2143,2143,2088,2143,2143,2088,2143,2143,2143,2048,2048,2143,2143,2048,2143,2143,2048,2143,2143,2143,1986,1986,2143,2143,1986,2143,2143,1986,2143,2143,2143,1932,1932,2143,2143,1932,2143,2143,1932,2143,2143,2143,1850,1850,2143,2143,1850,2143,2143,1850,2143,2143
3,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2668,2628,2628,2668,2668,2628,2668,2668,2628,2668,2668,2668,2567,2567,2668,2668,2567,2668,2668,2567,2668,2668,2668,2472,2472,2668,2668,2472,2668,2668,2472,2668,2668,2668,2382,2382,2668,2668,2382,2668,2668,2382,2668,2668,2668,2255,2255,2668,2668,2255,2668,2668,2255,2668,2668
4,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3290,3290,3395,3395,3290,3395,3395,3290,3395,3395,3395,3167,3167,3395,3395,3167,3395,3395,3167,3395,3395,3395,3050,3050,3395,3395,3050,3395,3395,3050,3395,3395,3395,2890,2890,3395,3395,2890,3395,3395,2890,3395,3395,3395,2707,2707,3395,3395,2707,3395,3395,2707,3395,3395
5,4123,4123,4123,4123,4123,4123,4123,4123,4123,4123,4123,4123,3965,3965,4123,4123,3965,4123,4123,3965,4123,4123,4123,3822,3822,4123,4123,3822,4123,4123,3822,4123,4123,4123,3664,3664,4123,4123,3664,4123,4123,3664,4123,4123,4123,3468,3468,4123,4123,3468,4123,4123,3468,4123,4123,4123,3205,3205,4123,4123,3205,4123,4123,3205,4123,4123
6,4536,4536,4536,4536,4536,4536,4536,4536,4536,4536,4536,4536,4397,4397,4536,4536,4397,4536,4536,4397,4536,4536,4536,4247,4247,4536,4536,4247,4536,4536,4247,4536,4536,4536,4068,4068,4536,4536,4068,4536,4536,4068,4536,4536,4536,3835,3835,4536,4536,3835,4536,4536,3835,4536,4536,4536,3501,3501,4536,4536,3501,4536,4536,3501,4536,4536
7,4807,4807,4807,4807,4807,4807,4807,4807,4807,4807,4807,4807,4681,4681,4807,4807,4681,4807,4807,4681,4807,4807,4807,4535,4535,4807,4807,4535,4807,4807,4535,4807,4807,4807,4344,4344,4807,4807,4344,4807,4807,4344,4807,4807,4807,4034,4034,4807,4807,4034,4807,4807,4034,4807,4807,4807,3432,3432,4807,4807,3432,4807,4807,3432,4807,4807
8,5118,5118,5118,5118,5118,5118,5118,5118,5118,5118,5118,5118,4979,4979,5118,5118,4979,5118,5118,4979,5118,5118,5118,4831,4831,5118,5118,4831,5118,5118,4831,5118,5118,5118,4566,4566,5118,5118,4566,5118,5118,4566,5118,5118,5118,3889,3889,5118,5118,3889,5118,5118,3889,5118,5118,5118,3332,3332,5118,5118,3332,5118,5118,3332,5118,5118
9,5229,5229,5229,5229,5229,5229,5229,5229,5229,5229,5229,5229,5057,5057,5229,5229,5057,5229,5229,5057,5229,5229,5229,4843,4843,5229,5229,4843,5229,5229,4843,5229,5229,5229,4256,4256,5229,5229,4256,5229,5229,4256,5229,5229,5229,3651,3651,5229,5229,3651,5229,5229,3651,5229,5229,5229,3150,3150,5229,5229,3150,5229,5229,3150,5229,5229
10,5266,5266,5266,5266,5266,5266,5266,5266,5266,5266,5266,5266,5004,5004,5266,5266,5004,5266,5266,5004,5266,5266,5266,4332,4332,5266,5266,4332,5266,5266,4332,5266,5266,5266,3820,3820,5266,5266,3820,5266,5266,3820,5266,5266,5266,3270,3270,5266,5266,3270,5266,5266,3270,5266,5266,5266,2849,2849,5266,5266,2849,5266,5266,2849,5266,5266


In [43]:
X_train = sales_means[(sales_means['weekGroup']<=9) & (sales_means["itemID"].isin(df_test.itemID.unique()))]
X_cv =  sales_means[(sales_means['weekGroup']==10) & (sales_means["itemID"].isin(df_test.itemID.unique()))]
X_test = sales_means[sales_means['weekGroup']==11]
Y_train = X_train['count']
Y_cv = X_cv['count']
Y_test = X_test['count']
del X_train['count']
del X_cv['count']
del X_test['count']
del X_train['weekGroup']
del X_cv['weekGroup']
del X_test['weekGroup']

In [44]:
del X_train['itemID']
del X_cv['itemID']

In [None]:
from xgboost import XGBRegressor

xgb1 = XGBRegressor()
parameters = {#'nthread':[4], #when use hyperthread, xgboost may become slower
          'objective':["reg:logistic"],#,"reg:squaredlogerror", 'reg:linear',"reg:squarederror"; logistic was best!
          'learning_rate': [0.1, 0.2, 0.3], # 0.01, 0.3; 0.1 and 0.3 were best depending on model #so called `eta` value
          'max_depth': [3,4,5],#,4,5,6], 3 was best
          'min_child_weight': [1,10],#, 50, 200,300],
          'silent': [1],
          'subsample': [ 0.3,0.5,0.8], # depends on model
          'colsample_bytree': [ 0.3,0.5,0.8], #0.5,0.8
              'n_estimators': [300,500,1000],
#               #'eta': [0.01, 0.1,0.3],
              'seed': [0,20,40],#, 10,40,75],
#               'eval_metric': ['rmsle','rmse'],
          'booster':['gbtree'],#,"dart"], # both models choose dart when optimizing, but gbtree performs better on test data
          'gamma': [0,5,10,20]} #0,100; 10 was best
model = GridSearchCV(xgb1,
                    parameters,
                    cv = 3,
                    n_jobs = 8,
                    verbose=True)

# train the model
model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_cv, Y_cv)], 
    verbose=True, 
    early_stopping_rounds = 10)

print(model.best_score_)
print(model.best_params_)

# model = XGBRegressor(
#     booster = 'gbtree', 
#     colsample_bytree = 0.8, 
#     gamma = 0, 
#     learning_rate = 0.1, 
#     max_depth = 4, 
#     min_child_weight = 1, 
#     objective = 'reg:logistic', 
#     silent = 1, 
#     subsample = 0.3)
# model.fit(
#     X_train, 
#     Y_train, 
#     eval_metric="rmse", 
#     eval_set=[(X_train, Y_train), (X_cv, Y_cv)], 
#     verbose=True, 
#     early_stopping_rounds = 15)

Fitting 3 folds for each of 5832 candidates, totalling 17496 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# import plotly_express as px
# columns = X_train.columns
# feature_importances = pd.DataFrame({'columns': columns,'importance':model.feature_importances_})
# feature_importances = feature_importances.sort_values(by='importance',ascending=False)
# px.bar(feature_importances,x='columns',y='importance')

In [None]:
def evaluate_result(y: dict, y_pred: dict):
    monetary_value = 0
    y_pred = defaultdict(int, y_pred)  # return prediction of 0 for items without prediction
    
    for item in set(y_pred).difference(set(y)):
        y[item] = 0  # make sure that all items for which a demand has been predicted are contained in the actual demands
    
    for item, demand in y.items():
        predicted_demand = y_pred[item]
        price = product_prices[item]
        monetary_value += price * min(demand, predicted_demand)
        if predicted_demand > demand:
            monetary_value -= .6 * price * (predicted_demand - demand)
            
    return monetary_value

In [None]:
df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
df_items.head()

In [None]:
# actual demand
y = df_test.groupby(by='itemID')['order'].sum().to_dict()

# baseline 1 (average demand of previous 14 days)
y_baseline1 = df_train[df_train['time'] >= '2018-05-19'].groupby(by='itemID')['order'].sum().to_dict()

# baseline 2 (average demand of previous half year)
total_orders = df_train.groupby(by='itemID')['order'].sum().to_dict()
total_observed_days = (df_train['time'].dt.normalize().max() - df_train['time'].dt.normalize().min()).days
y_baseline2 = {item: orders / total_observed_days * 14 for item, orders in total_orders.items()}  # 14-day avg. demand

df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
product_prices = df_info['simulationPrice'].to_dict()

In [None]:
df_test.head()

In [None]:
sales_means.head()

In [None]:
X_test_all = sales_means[sales_means["weekGroup"] == 11]
X_test_all = X_test_all.merge(items_simple.set_index("itemID")[[]], on="itemID", how="outer",
                             suffixes=('exitems', ""))
X_test_all.weekGroup = X_test_all.weekGroup.fillna(0)
X_test_all.soldWeekly = X_test_all.soldWeekly.fillna(False)
X_test_all = X_test_all.fillna(0)
X_test_all['dateFirstSell'] = [6 if d== 0 or d == None else d for d in X_test_all['dateFirstSell']]

doublecol = [col for col in X_test_all.columns.tolist() if 'exitems' in col]
# doublecol = doublecol + ["level_0"]
X_test_all = X_test_all.drop(columns=doublecol)
print(len(X_test_all.itemID.unique()))
cols = X_train.columns.tolist() + ["itemID"]
X_test_all = X_test_all[cols]
X_test_all.head(10)

there is a problem in the prediction since we only predict items that we know are in the test period. if we include all items or only those that were sold in the previous month though, the performance goes down by A LOT!

In [None]:
# apply to unseen data
y_xgboost_all = dict()
lastitems = df_train[(df_train.date >= datetime.date(2018, 5, 15))].itemID.unique()
for prod in lastitems:# X_test.itemID.unique():#
    # this if is cheating!!!
    # but without it we get -12673934.13.....
    if prod in X_test.itemID.unique():
        train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(model.predict(X_test_all[X_test_all.itemID == prod].drop(['itemID'], axis=1))))).rename(columns={0:'predicted_count'})
        train_predict["actual_count"] = test[test.itemID == prod]["order"].sum()
        y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
#     if prod in df_train.itemID.unique():
    #         y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
    #     else:
    #         y_xgboost_all[prod] = 0

In [None]:
# col = sales_means.drop(columns=["level_0","index"]).columns.tolist()
# col = ["level_0","index"] + col
# sales_means = sales_means[col]

In [None]:
# perfect result
print(f'Perfect Result: {evaluate_result(y, y):.2f}')

# baseline 1
print(f'Baseline 1: {evaluate_result(y, y_baseline1):.2f}')

# baseline 2
print(f'Baseline 2: {evaluate_result(y, y_baseline2):.2f}')

# random forest
print(f'XGBoost: {evaluate_result(y, y_xgboost_all):.2f}')
# 953796.09

In [None]:
len(y_xgboost_all)

In [None]:
y_comparison = pd.DataFrame(y_xgboost_all.items(), columns=['itemID', 'predicted_count'])
y_actual = pd.DataFrame(y.items(), columns=['itemID', 'actual_count'])
y_comparison = y_comparison.join(y_actual.set_index("itemID")[['actual_count']], on="itemID")
y_comparison["absolute_diff"] = abs(y_comparison["actual_count"] - y_comparison["predicted_count"])

In [None]:
y_comparison.sort_values(by="predicted_count", ascending = True).head(15)

In [None]:
# # apply to training data
# y_xgboost_all_train = dict()
# itemsss = df_test[df_test.itemID.isin(sales_means[(sales_means['weekGroup']<=10)].itemID.unique())].itemID.unique()
# for prod in itemsss: #sales_means[sales_means['weekGroup']==9].itemID.unique():
#     train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(model.predict(sales_means[(sales_means['weekGroup']<=10) & (sales_means["itemID"] == prod)].drop(columns=["itemID","weekGroup","count"]))))).rename(columns={0:'predicted_count'})
# #     train_predict["actual_count"] = train[train.itemID == prod]["order"].sum()
#     y_xgboost_all_train[prod] = int(train_predict["predicted_count"].sum().round())
# #     if prod in df_train.itemID.unique():
# #         y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
# #     else:
# #         y_xgboost_all[prod] = 0

In [None]:
# y_comparison2 = pd.DataFrame(y_xgboost_all_train.items(), columns=['itemID', 'predicted_count'])
# y_actual2 = pd.DataFrame(scaler.inverse_transform(sales_means[sales_means['weekGroup']==9][["count"]]).round())
# y_comparison2["actual_count"] = y_actual2[0]#y_comparison.join(df_train.set_index("itemID")[['count']], on="itemID")
# y_comparison2["absolute_diff"] = abs(y_comparison2["actual_count"] - y_comparison2["predicted_count"])

In [None]:
# y_comparison2.sort_values(by="actual_count", ascending = False).head(10)

In [None]:
# pd.DataFrame(y_xgboost_all.items()).rename(columns={0:'itemID', 1:"demandPrediction"}).to_csv('abraca-data.csv',index=False, sep='|')

In [None]:
# y_xgboost_all

In [None]:
# # import packages
# import pandas as pd
# # import seaborn as sb
# import matplotlib.pyplot as plt
# import numpy as np
# import datetime
# import re
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.decomposition import PCA
# from scipy.interpolate import interpn
# from collections import defaultdict
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBRegressor

In [None]:
# y_xgboost_week_opti = dict()
# for weekly in X_train.soldWeekly.unique(): 
#     x_train_category = X_train[X_train.soldWeekly == weekly]#.drop(columns=["cluster","itemID"])
#     y_train_category = Y_train.to_frame()[Y_train.to_frame().index.isin(list(x_train_category.index))].squeeze()
#     X_cv_category = X_cv[X_cv.soldWeekly == weekly]#.drop(columns=["itemID","cluster"])
#     Y_cv_category = Y_cv.to_frame()[Y_cv.to_frame().index.isin(list(X_cv_category.index))].squeeze()
    
    
#     xgb1 = XGBRegressor()
#     parameters = {#'nthread':[4], #when use hyperthread, xgboost may become slower
#               'objective':["reg:logistic"],#,"reg:squaredlogerror", 'reg:linear',"reg:squarederror"; logistic was best!
#               'learning_rate': [0.1, 0.3], # 0.01, 0.3; 0.1 and 0.3 were best depending on model #so called `eta` value
#               'max_depth': [3],#,4,5,6], 3 was best
#               'min_child_weight': [1],#, 50, 200,300],
#               'silent': [1],
#               'subsample': [ 0.5,0.8], # depends on model
#               'colsample_bytree': [ 0.3,0.9], #0.5,0.8
# #               'n_estimators': [300,500,1000],
# #               #'eta': [0.01, 0.1,0.3],
# #               'seed': [40],#, 10,40,75],
# #               'eval_metric': ['rmsle','rmse'],
#               'booster':['gbtree'],#,"dart"], # both models choose dart when optimizing, but gbtree performs better on test data
#               'gamma': [10]} #0,100; 10 was best
#     xgb_grid = GridSearchCV(xgb1,
#                         parameters,
#                         cv = 2,
#                         n_jobs = 8,
#                         verbose=True)
        
#     # train the model
#     xgb_grid.fit(
#         x_train_category, 
#         y_train_category, 
#         eval_metric="rmse", 
#         eval_set=[(x_train_category, y_train_category), (X_cv_category, Y_cv_category)], 
#         verbose=True, 
#         early_stopping_rounds = 10)
    
#     print(xgb_grid.best_score_)
#     print(xgb_grid.best_params_)
    
#     # apply to unseen data

#     for prod in X_test[X_test.soldWeekly == weekly].itemID.unique():
#         train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(xgb_grid.predict(X_test[X_test.itemID == prod].drop(['itemID'], axis=1))))).rename(columns={0:'predicted_count'})
#         train_predict["actual_count"] = test[test.itemID == prod]["order"].sum()
#         y_xgboost_week_opti[prod] = int(train_predict["predicted_count"].sum().round())

In [None]:
# print(f'XGBoost by weekly, optimized: {evaluate_result(y, y_xgboost_week_opti):.2f}')