<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#-Forecasting-Demand-for-Optimized-Inventory-Planning-" data-toc-modified-id="-Forecasting-Demand-for-Optimized-Inventory-Planning--1"><center> Forecasting Demand for Optimized Inventory Planning </center></a></span><ul class="toc-item"><li><span><a href="#Imports" data-toc-modified-id="Imports-1.1">Imports</a></span></li></ul></li><li><span><a href="#Simple-processing" data-toc-modified-id="Simple-processing-2">Simple processing</a></span><ul class="toc-item"><li><span><a href="#Train/test-split" data-toc-modified-id="Train/test-split-2.1">Train/test split</a></span></li></ul></li><li><span><a href="#Simple-xgboost" data-toc-modified-id="Simple-xgboost-3">Simple xgboost</a></span></li></ul></div>

<h1><center> Forecasting Demand for Optimized Inventory Planning </center></h1>


## Imports

In [1]:
# Core data stack; show up to 50 columns when displaying wide frames.
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 50)

# Plotting setup: inline figures, a larger default size, seaborn styling.
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
import seaborn as sns
sns.set()

The next two cells are optional and can be skipped.

In [2]:
%%javascript
// Toggle the visibility of the page element with id "menubar".
$('#menubar').toggle();

<IPython.core.display.Javascript object>

In [3]:
# Optional: load IPython's autoreload extension so edits to imported modules
# are picked up without restarting the kernel. Skip this cell if the
# extension is not installed.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
# Make the modules in ../main (relative to the working directory) importable.
sys.path.append("../main")

# Project helpers for loading, time-processing and merging the data.
from utils import read_data, process_time, merge_data

In [5]:
# Load the three tables through the project helper (data locations are
# defined inside utils.read_data, not in this notebook).
infos, items, orders = read_data()

**Note:** This time processing differs from Tobias's: I consider the last **two** weeks, whereas he considers only the last **one** week.

In [6]:
# The return value is discarded; presumably this adds the time-derived columns
# (days, days_backwards, group_backwards) to `orders` in place, since they
# appear in the preview further down -- TODO confirm against utils.process_time.
process_time(orders)

# Simple processing

In [7]:
# Preview the first rows of the orders table after time processing.
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice,days,days_backwards,group_backwards
0,2018-01-01 00:01:56,2278968,450,1,17.42,1,180,13
1,2018-01-01 00:01:56,2278968,83,1,5.19,1,180,13
2,2018-01-01 00:07:11,2255797,7851,2,20.47,1,180,13
3,2018-01-01 00:09:24,2278968,450,1,17.42,1,180,13
4,2018-01-01 00:09:24,2278968,83,1,5.19,1,180,13


In [8]:
# Aggregate orders per (period, item): total units ordered and mean sales price.
aggs = {"order": "sum", "salesPrice": "mean"}

# Suffix each aggregated column with the function that produced it,
# e.g. "order" -> "order_sum".
renamed_columns = {column: column + "_" + func for column, func in aggs.items()}

df = (
    orders
    .groupby(["group_backwards", "itemID"], as_index=False)
    .agg(aggs)
    .rename(columns=renamed_columns)
)
df.head()

Unnamed: 0,group_backwards,itemID,order_sum,salesPrice_mean
0,1,1,3,3.43
1,1,3,140,14.04
2,1,4,145,14.1
3,1,5,1,7.48
4,1,7,1,34.39


In [9]:
# Combine the aggregated demand with `items` and `infos` through the project
# helper (join logic lives in utils.merge_data).
# NOTE(review): `df` is rebound to the merged result, so this cell is not
# idempotent -- re-run the aggregation cell first before re-running this one.
df = merge_data(df, items, infos)
df.head()

Unnamed: 0,group_backwards,itemID,order_sum,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,simulationPrice,promotion
0,1,1,3,3.43,0,1,4.38,1,1,1,8.84,3.43,
1,3,1,31,3.11,0,1,4.38,1,1,1,8.84,3.43,
2,4,1,3,3.11,0,1,4.38,1,1,1,8.84,3.43,
3,5,1,299,3.11,0,1,4.38,1,1,1,8.84,3.43,
4,6,1,2,3.11,0,1,4.38,1,1,1,8.84,3.43,


In [10]:
# Build the modelling frame: discard columns that are not used as features
# (simulationPrice might be useful later, but is left out for now), then index
# the rows by period and order them by that index.
unused_columns = ["promotion", "itemID", "simulationPrice"]
data = (
    df.drop(columns=unused_columns)
      .set_index("group_backwards")
      .sort_index()
)

## Train/test split

In [11]:
# Hold out the most recent period (group_backwards == 1) as the test set and
# train on every earlier period (index labels >= 2; the index is sorted).
TEST_GROUP = 1

train = data.loc[TEST_GROUP + 1:]
x_train = train.drop(columns="order_sum")
y_train = train["order_sum"]

# Selecting with a list of labels always yields a DataFrame. A scalar label
# would return a Series if the held-out period had exactly one row, and the
# column drop below would then fail.
test = data.loc[[TEST_GROUP]]
x_test = test.drop(columns="order_sum")
y_test = test["order_sum"]

In [12]:
# Total units ordered in the training periods vs. the held-out period.
y_train.sum(), y_test.sum()

(2417048, 298511)

In [13]:
# Sanity check: (rows, feature columns) of each split.
x_train.shape, x_test.shape

((34833, 8), (4682, 8))

# Simple xgboost

In [14]:
import xgboost as xgb

In [22]:
# Wrap both splits in XGBoost's DMatrix container (features plus label).
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

# Booster parameters: squared-error regression, everything else left at the
# library defaults (max_depth / eta could also be set here).
param = {'objective': 'reg:squarederror'}
num_round = 100

# Both splits are evaluated every round; the last entry ('test') is the one
# monitored for early stopping, which halts training after 5 rounds without
# improvement.
# NOTE(review): the held-out period drives early stopping here, so the
# reported test RMSE is an optimistic estimate of generalization error.
watchlist = [(dtrain, 'train'), (dtest, 'test')]
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    early_stopping_rounds=5,
)

[0]	train-rmse:188.74406	test-rmse:171.08737
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 5 rounds.
[1]	train-rmse:182.83247	test-rmse:166.64780
[2]	train-rmse:179.58496	test-rmse:164.29442
[3]	train-rmse:177.32872	test-rmse:163.61066
[4]	train-rmse:174.94792	test-rmse:162.97627
[5]	train-rmse:173.80353	test-rmse:162.70467
[6]	train-rmse:172.61009	test-rmse:162.33612
[7]	train-rmse:171.34149	test-rmse:162.18359
[8]	train-rmse:170.52031	test-rmse:162.17513
[9]	train-rmse:170.05641	test-rmse:162.20081
[10]	train-rmse:169.51828	test-rmse:162.72906
[11]	train-rmse:169.11403	test-rmse:162.63315
[12]	train-rmse:168.61763	test-rmse:162.97955
[13]	train-rmse:168.29451	test-rmse:162.91130
Stopping. Best iteration:
[8]	train-rmse:170.52031	test-rmse:162.17513



In [20]:
# Baseline for the RMSE values above: the standard deviation of the target in
# each split is roughly the RMSE of always predicting that split's mean.
y_train.std(), y_test.std()

(187.5070430244958, 169.67687028219245)