In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# EDA
<hr>

## Table infos

In [2]:
infos = pd.read_csv('infos.csv', sep = '|')

In [3]:
infos.head()

Unnamed: 0,itemID,simulationPrice,promotion
0,1,3.43,
1,2,9.15,
2,3,14.04,
3,4,14.1,
4,5,7.48,


In [4]:
infos.shape

(10463, 3)

In [5]:
len(infos) - infos.count()

itemID                0
simulationPrice       0
promotion          8620
dtype: int64

## Table items

In [6]:
items = pd.read_csv('items.csv', sep = '|')

In [7]:
items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,1,0,1,4.38,1,1,1,8.84
1,2,0,2,3.0,1,2,1,16.92
2,3,0,3,5.0,1,3,1,15.89
3,4,0,2,4.44,1,2,1,40.17
4,5,0,2,2.33,1,1,1,17.04


In [8]:
items.shape

(10463, 8)

In [9]:
items.count()

itemID                    10463
brand                     10463
manufacturer              10463
customerRating            10463
category1                 10463
category2                 10463
category3                 10463
recommendedRetailPrice    10463
dtype: int64

In [10]:
items.nunique()

itemID                    10463
brand                       275
manufacturer                253
customerRating               99
category1                     8
category2                    52
category3                     8
recommendedRetailPrice     5106
dtype: int64

## Table orders

In [11]:
orders = pd.read_csv('orders.csv', sep = '|', parse_dates=['time'])

In [12]:
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice
0,2018-01-01 00:01:56,2278968,450,1,17.42
1,2018-01-01 00:01:56,2278968,83,1,5.19
2,2018-01-01 00:07:11,2255797,7851,2,20.47
3,2018-01-01 00:09:24,2278968,450,1,17.42
4,2018-01-01 00:09:24,2278968,83,1,5.19


In [13]:
orders.shape

(2181955, 5)

In [14]:
orders.count()

time          2181955
transactID    2181955
itemID        2181955
order         2181955
salesPrice    2181955
dtype: int64

In [15]:
orders.dtypes

time          datetime64[ns]
transactID             int64
itemID                 int64
order                  int64
salesPrice           float64
dtype: object

In [16]:
orders.time

0         2018-01-01 00:01:56
1         2018-01-01 00:01:56
2         2018-01-01 00:07:11
3         2018-01-01 00:09:24
4         2018-01-01 00:09:24
                  ...        
2181950   2018-06-29 23:54:22
2181951   2018-06-29 23:55:09
2181952   2018-06-29 23:55:09
2181953   2018-06-29 23:55:09
2181954   2018-06-29 23:57:01
Name: time, Length: 2181955, dtype: datetime64[ns]

In [17]:
orders.time.dt.week

0           1
1           1
2           1
3           1
4           1
           ..
2181950    26
2181951    26
2181952    26
2181953    26
2181954    26
Name: time, Length: 2181955, dtype: int64

In [18]:
orders.groupby('itemID')['salesPrice'].nunique().max()

22

# Other things
<hr>

## Evalutation function

In [19]:
# custo 
# np.sum((prediction - np.maximum(prediction - target, 0) * 1.6) * simulatedPrice)

## Submission structure

In [20]:
# submission = items[['itemID']]
# submission['demandPrediction'] = 0 # prediction here
# submission.to_csv('submission.csv', sep = '|', index=False)

# First Model (aggregating by every two weeks before target)

## - Creating the structure

In [173]:
df = orders.copy()

In [174]:
df.tail()

Unnamed: 0,time,transactID,itemID,order,salesPrice
2181950,2018-06-29 23:54:22,2040347,9217,1,23.5
2181951,2018-06-29 23:55:09,2260943,2175,1,11.6
2181952,2018-06-29 23:55:09,2260943,2061,1,1.03
2181953,2018-06-29 23:55:09,2260943,2195,1,18.73
2181954,2018-06-29 23:57:01,2525,8462,1,18.93


In [175]:
df.tail().time.dt.dayofweek

2181950    4
2181951    4
2181952    4
2181953    4
2181954    4
Name: time, dtype: int64

In [176]:
# We want the last dayofweek from training to be 6

In [177]:
(df.tail().time.dt.dayofyear + 2) // 7

2181950    26
2181951    26
2181952    26
2181953    26
2181954    26
Name: time, dtype: int64

In [178]:
(df.head().time.dt.dayofyear + 2) // 7

0    0
1    0
2    0
3    0
4    0
Name: time, dtype: int64

In [179]:
df['week'] = (df.time.dt.dayofyear + 2 + 7) // 14
# + 7 because we want weeks 25 and 26 to be together, week 0 will be discarded

In [180]:
maxx = df.week.max()
minn = df.week.min()
minn, maxx

(0, 13)

In [181]:
n_items = items['itemID'].nunique()
print('total number of items:', n_items)
print('expected number of instances:', n_items * (maxx + 1))

total number of items: 10463
expected number of instances: 146482


In [217]:
mi = pd.MultiIndex.from_product([range(0, maxx + 1), items['itemID']], names=['week', 'itemID'])
data = pd.DataFrame(index = mi)

In [218]:
data = data.join(df.groupby(['week', 'itemID'])[['order']].sum(), how = 'left')

In [219]:
data.fillna(0, inplace = True)

In [221]:
data.groupby('itemID').count().min()

order    14
dtype: int64

## - Creating features

In [243]:
# rolling window example with shift
random_df = pd.DataFrame({'B': [0, 1, 2, 3, 4]})
random_df.shift(1).rolling(2).sum()

Unnamed: 0,B
0,
1,
2,1.0
3,3.0
4,5.0


In [None]:
# I am going to create three features: the mean of the orders of the last [1, 2, 4] weeks for each item 

In [222]:
data.sort_values('week', inplace = True)

In [223]:
shifted = data.groupby('itemID')[['order']].shift(1)
for n in range(3):
    rolled = shifted.groupby('itemID', as_index = False)['order'].rolling(2 ** n).mean()
    data['order_%d' % (2 ** n)] = rolled.reset_index(0, drop = True) # rolling has a weird index behavior...

In [224]:
data.count() # the larger the window, more NaN are expected

order      146482
order_1    136019
order_2    125556
order_4    104630
dtype: int64

In [225]:
data.fillna(-1, inplace=True)

In [241]:
# checking if we got what we wanted
data.xs(1, level = 1, drop_level=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,order,order_1,order_2,order_4
week,itemID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,0.0,-1.0,-1.0,-1.0
1,1,0.0,0.0,-1.0,-1.0
2,1,313.0,0.0,0.0,-1.0
3,1,4.0,313.0,156.5,-1.0
4,1,35.0,4.0,158.5,79.25
5,1,2.0,35.0,19.5,88.0
6,1,0.0,2.0,18.5,88.5
7,1,1.0,0.0,1.0,10.25
8,1,300.0,1.0,0.5,9.5
9,1,2.0,300.0,150.5,75.75


##  - fit, predict

In [282]:
# max expected rmse
from sklearn.metrics import mean_squared_error as mse
pred = data.loc[1:12].groupby('itemID')['order'].mean().sort_index()
target_week = data.loc[13:, 'order'].reset_index(level = 0, drop = True).sort_index()
mse(target_week, pred) ** .5

87.19741453702186

In [227]:
train = data.loc[1:12].reset_index()
test = data.loc[13:].reset_index()

In [228]:
y_train = train.pop('order').values
y_test = test.pop('order').values

X_train = train.values
X_test = test.values

In [233]:
import xgboost as xgb

In [285]:
dtrain = xgb.DMatrix(X_train, y_train, missing = -1)
dtest = xgb.DMatrix(X_test, y_test, missing = -1)
# specify parameters via map
param = {'max_depth':6, 'eta':0.01, 'objective':'reg:squarederror' }
num_round = 200
bst = xgb.train(param, dtrain,
                num_round, early_stopping_rounds = 5,
                evals = [(dtrain, 'train'), (dtest, 'test')])

[0]	train-rmse:105.74801	test-rmse:86.55006
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 5 rounds.
[1]	train-rmse:105.63801	test-rmse:86.44087
[2]	train-rmse:105.53018	test-rmse:86.33538
[3]	train-rmse:105.42452	test-rmse:86.23684
[4]	train-rmse:105.31979	test-rmse:86.14248
[5]	train-rmse:105.21917	test-rmse:86.04932
[6]	train-rmse:105.11799	test-rmse:85.96081
[7]	train-rmse:105.01976	test-rmse:85.87427
[8]	train-rmse:104.92540	test-rmse:85.79022
[9]	train-rmse:104.82925	test-rmse:85.70736
[10]	train-rmse:104.73519	test-rmse:85.62856
[11]	train-rmse:104.64571	test-rmse:85.54427
[12]	train-rmse:104.55712	test-rmse:85.47169
[13]	train-rmse:104.46720	test-rmse:85.39318
[14]	train-rmse:104.37711	test-rmse:85.31720
[15]	train-rmse:104.29012	test-rmse:85.24924
[16]	train-rmse:104.20571	test-rmse:85.17835
[17]	train-rmse:104.12382	test-rmse:85.10540
[18]	train-rmse:104.04105	test-rmse:85.03643
[19]	train-rm