In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# EDA
<hr>

## Table infos

In [6]:
infos = pd.read_csv('dora/datasets/1.0v/infos.csv', sep = '|')

In [7]:
infos.head()

Unnamed: 0,itemID,simulationPrice,promotion
0,1,3.43,
1,2,9.15,
2,3,14.04,
3,4,14.1,
4,5,7.48,


In [8]:
infos.shape

(10463, 3)

In [9]:
len(infos) - infos.count()

itemID                0
simulationPrice       0
promotion          8620
dtype: int64

## Table items

In [12]:
items = pd.read_csv('dora/datasets/1.0v/items.csv', sep = '|')

In [13]:
items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,1,0,1,4.38,1,1,1,8.84
1,2,0,2,3.0,1,2,1,16.92
2,3,0,3,5.0,1,3,1,15.89
3,4,0,2,4.44,1,2,1,40.17
4,5,0,2,2.33,1,1,1,17.04


In [14]:
items.shape

(10463, 8)

In [15]:
items.count()

itemID                    10463
brand                     10463
manufacturer              10463
customerRating            10463
category1                 10463
category2                 10463
category3                 10463
recommendedRetailPrice    10463
dtype: int64

In [16]:
items.nunique()

itemID                    10463
brand                       275
manufacturer                253
customerRating               99
category1                     8
category2                    52
category3                     8
recommendedRetailPrice     5106
dtype: int64

## Table orders

In [18]:
orders = pd.read_csv('dora/datasets/1.0v/orders.csv', sep = '|', parse_dates=['time'])

In [20]:
orders.shape

(2181955, 5)

In [21]:
orders.count()

time          2181955
transactID    2181955
itemID        2181955
order         2181955
salesPrice    2181955
dtype: int64

In [22]:
orders.dtypes

time          datetime64[ns]
transactID             int64
itemID                 int64
order                  int64
salesPrice           float64
dtype: object

In [23]:
orders.time

0         2018-01-01 00:01:56
1         2018-01-01 00:01:56
2         2018-01-01 00:07:11
3         2018-01-01 00:09:24
4         2018-01-01 00:09:24
                  ...        
2181950   2018-06-29 23:54:22
2181951   2018-06-29 23:55:09
2181952   2018-06-29 23:55:09
2181953   2018-06-29 23:55:09
2181954   2018-06-29 23:57:01
Name: time, Length: 2181955, dtype: datetime64[ns]

In [24]:
orders.time.dt.week

0           1
1           1
2           1
3           1
4           1
           ..
2181950    26
2181951    26
2181952    26
2181953    26
2181954    26
Name: time, Length: 2181955, dtype: int64

In [25]:
orders.groupby('itemID')['salesPrice'].nunique().max()

22

# Other things
<hr>

## Evalutation function

In [1]:
# custo 
cost = lambda prediction, target, simulatedPrice : np.sum((prediction - np.maximum(prediction - target, 0) * 1.6) * simulatedPrice)

## Submission structure

In [27]:
submission = items[['itemID']]
submission['demandPrediction'] = 0 # prediction here
submission.to_csv('submission.csv', sep = '|', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['demandPrediction'] = 0 # prediction here


# First Model (aggregating by every two weeks before target)

In [28]:
df = orders.copy()

In [29]:
df.tail()

Unnamed: 0,time,transactID,itemID,order,salesPrice
2181950,2018-06-29 23:54:22,2040347,9217,1,23.5
2181951,2018-06-29 23:55:09,2260943,2175,1,11.6
2181952,2018-06-29 23:55:09,2260943,2061,1,1.03
2181953,2018-06-29 23:55:09,2260943,2195,1,18.73
2181954,2018-06-29 23:57:01,2525,8462,1,18.93


In [30]:
df.tail().time.dt.dayofweek

2181950    4
2181951    4
2181952    4
2181953    4
2181954    4
Name: time, dtype: int64

In [31]:
# We want the last dayofweek from training to be 6

In [32]:
(df.tail().time.dt.dayofyear + 2) // 7

2181950    26
2181951    26
2181952    26
2181953    26
2181954    26
Name: time, dtype: int64

In [33]:
(df.head().time.dt.dayofyear + 2) // 7

0    0
1    0
2    0
3    0
4    0
Name: time, dtype: int64

In [34]:
df['week'] = (df.time.dt.dayofyear + 2 + 7) // 14
# + 7 because we want weeks 25 and 26 to be together, week 0 will be discarded

In [35]:
maxx = df.week.max()
minn = df.week.min()
minn, maxx

(0, 13)

In [36]:
data = pd.DataFrame(index = pd.MultiIndex.from_product([range(0, maxx + 1), items['itemID']], names=['week', 'itemID']))
data['order'] = np.NaN

In [37]:
data.fillna(0, inplace = True)

In [38]:
g = df.groupby(['week', 'itemID'])['order'].sum()

In [39]:
g.sum()

2715559

In [40]:
data['order'] += g

In [41]:
data['order'].sum()

2715559.0

In [42]:
data.dropna(inplace=True) # g might have pairs week itemID not present in our data df
# I didn't really thing too deeply about this... there might be something wrong in the code

In [46]:
train = data.loc[1:12].reset_index()
test = data.loc[13:].reset_index()

In [49]:
train

Unnamed: 0,week,itemID,order
0,1,3,90.0
1,1,4,1.0
2,1,5,2.0
3,1,8,1.0
4,1,9,1.0
...,...,...,...
36320,12,10441,6.0
36321,12,10442,10.0
36322,12,10443,7.0
36323,12,10449,14.0


In [50]:
y_train = train.pop('order').values
y_test = test.pop('order').values

X_train = train.values
X_test = test.values

In [51]:
y_train.sum(), y_test.sum()

(2519023.0, 170426.0)

In [205]:
X_train.shape, X_test.shape

((36325, 2), (3549, 2))

In [206]:
import xgboost as xgb

In [211]:
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
# specify parameters via map
param = {'max_depth':2, 'eta':0.3, 'objective':'reg:squarederror' }
num_round = 100
bst = xgb.train(param, dtrain,
                num_round, early_stopping_rounds = 5,
                evals = [(dtrain, 'train'), (dtest, 'test')])

[0]	train-rmse:188.35466	test-rmse:141.61888
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 5 rounds.
[1]	train-rmse:183.81259	test-rmse:140.00558
[2]	train-rmse:181.37389	test-rmse:138.84898
[3]	train-rmse:180.16801	test-rmse:138.73764
[4]	train-rmse:178.90533	test-rmse:139.61211
[5]	train-rmse:178.52675	test-rmse:139.80324
[6]	train-rmse:178.21046	test-rmse:140.63464
[7]	train-rmse:178.02478	test-rmse:140.56982
[8]	train-rmse:177.87239	test-rmse:140.60701
Stopping. Best iteration:
[3]	train-rmse:180.16801	test-rmse:138.73764

