In [1]:
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display

In [2]:
# # change used width of browser window
# from IPython.core.display import display, HTML
# display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
def log_progress(sequence, every=None, size=None, name='Items'):
#https://github.com/kuk/log-progress
    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [4]:
import pandas as pd
# import cufflinks as cf
# import copy
# import sys
from matplotlib import pyplot
# from pandas.plotting import autocorrelation_plot
import datetime
from collections import defaultdict
from pmdarima.arima import auto_arima

In [5]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [6]:
df_train = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])
df_test = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])

df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
product_prices = df_info['simulationPrice'].to_dict()
df_train["time"] = pd.to_datetime(df_train["time"], format='%Y-%m-%d %H:%M:%S')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
# df_train_small = df_train[df_train["itemID"].isin([7798])]

In [7]:
# aggregate sales per day
df_train['date'] = [d.date() for d in df_train['time']]
df_aggregated = df_train.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
df_aggregated.head()

Unnamed: 0,itemID,date,count
0,1,2018-01-23,1
1,1,2018-01-25,1
2,1,2018-01-29,307
3,1,2018-01-30,3
4,1,2018-01-31,1


In [8]:
# add 0 sales for missing dates per itemID
for prod in df_aggregated.itemID.unique():
    s = df_aggregated.loc[df_aggregated['itemID'] == prod][["date","count"]]
    s = s.set_index("date")
    idx = pd.date_range(s.index.min(), 
                        datetime.date(2018, 6, 1))
    s.index = pd.DatetimeIndex(s.index)
    s = s.reindex(idx, fill_value=0)
    if "df_timeseries" not in globals():
        s["itemID"] = prod
        df_timeseries = s
    else:
        s["itemID"] = prod
        df_timeseries = pd.concat([df_timeseries,s], ignore_index=False)
df_timeseries.head(3)

Unnamed: 0,count,itemID
2018-01-23,1,1
2018-01-24,0,1
2018-01-25,1,1


In [9]:
#model with own parameter settings:
#https://machinelearningmastery.com/arima-for-time-series-forecasting-with-python/
#p=4 because there the auto-correlation is significant, d=1 to make it stationary, m=7 daily (number of observations per seasonal cycle)
y_arima = dict()
for item in df_items.index.unique():
    y_arima[item] = 0
for prod in log_progress(df_timeseries.itemID.unique()[:10], every=1):
    if len(df_timeseries[df_timeseries.itemID == prod].drop(columns=["itemID"],axis=1)) > 10:
        stepwise_model = auto_arima(df_timeseries[df_timeseries.itemID == prod].drop(columns=["itemID"],axis=1), 
#                                 start_p=2, start_q=1,
#                                max_p=6, max_q=3, m=7,
#                                start_P=0, seasonal=True,
#                                d=1, D=1, trace=True,
#                                error_action='ignore',  
#                                suppress_warnings=True, 
#                                stepwise=True
                               )
#     print(stepwise_model.aic())
        model_fit_1 = stepwise_model.fit(df_timeseries[df_timeseries.itemID == prod].drop(columns=["itemID"],axis=1))
        future_forecast = stepwise_model.predict(n_periods=14)
        y_arima[prod] = future_forecast.sum().round() if future_forecast.sum().round() > 0 else 0 # since arima could return negative results
# print(model_fit_1.summary())

VBox(children=(HTML(value=''), IntProgress(value=0, max=10)))

In [10]:
df_train = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])
df_test = pd.read_csv('data/orders0206_test.csv', sep='|', parse_dates=['time'])

df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
product_prices = df_info['simulationPrice'].to_dict()

In [11]:
from collections import defaultdict

def evaluate_result(y: dict, y_pred: dict):
    monetary_value = 0
    y_pred = defaultdict(int, y_pred)  # return prediction of 0 for items without prediction
    
    for item in set(y_pred).difference(set(y)):
        y[item] = 0  # make sure that all items for which a demand has been predicted are contained in the actual demands
    
    for item, demand in y.items():
        predicted_demand = y_pred[item]
        price = product_prices[item]
        monetary_value += price * min(demand, predicted_demand)
        if predicted_demand > demand:
            monetary_value -= .6 * price * (predicted_demand - demand)
            
    return monetary_value

In [12]:
# actual demand
y = df_test.groupby(by='itemID')['order'].sum().to_dict()

# baseline 1 (average demand of previous 14 days)
y_baseline1 = df_train[df_train['time'] >= '2018-05-19'].groupby(by='itemID')['order'].sum().to_dict()

# baseline 2 (average demand of previous half year)
total_orders = df_train.groupby(by='itemID')['order'].sum().to_dict()
total_observed_days = (df_train['time'].dt.normalize().max() - df_train['time'].dt.normalize().min()).days
y_baseline2 = {item: orders / total_observed_days * 14 for item, orders in total_orders.items()}  # 14-day avg. demand

In [13]:
# perfect result
print(f'Perfect Result: {evaluate_result(y, y):.2f}')

# baseline 1
print(f'Baseline 1: {evaluate_result(y, y_baseline1):.2f}')

# baseline 2
print(f'Baseline 2: {evaluate_result(y, y_baseline2):.2f}')

# auto arima
print(f'Auto Arima: {evaluate_result(y, y_arima):.2f}')

# mit 6 -414600.38
# -39897.06

Perfect Result: 7895975.87
Baseline 1: -3727365.60
Baseline 2: -1672504.21
Auto Arima: -478.51


In [14]:
df_result_all = pd.DataFrame.from_dict(y_arima, orient='index')
# df_result_all = df_result_all[df_result_all[0] != 0]

In [15]:
for index, row in df_result_all.iterrows():
#     df_result_all.loc[index, "cluster"] = test_aggregated_w[test_aggregated_w.itemID == index]["cluster"].iloc[0]
    df_result_all.loc[index, 'actual'] = y[index]
    df_result_all.loc[index, 'diff'] = abs(y[index] - row[0])
    df_result_all.loc[index, 'diff_perc'] = ((abs(row[0]-y[index])/((row[0]+y[index])/2))*100).round()

In [16]:
bins = [0, 5, 10, 15, 25, 50, 75, 100, 150, 200, 500, 1000, 10000]
display(df_result_all.groupby([ pd.cut(df_result_all['diff'], bins)])[0].count())
display(df_result_all.groupby([ pd.cut(df_result_all['diff_perc'], bins)])[0].count())
# df_result.groupby([ pd.cut(df_result_all['diff'], bins),"cluster"])[0].count()
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_result.groupby("diff_perc").count()[[0]])
# print("200% is there if the prediction is > 0 and the actual is 0")

diff
(0, 5]           2342
(5, 10]           410
(10, 15]          179
(15, 25]          214
(25, 50]          310
(50, 75]          174
(75, 100]         145
(100, 150]        213
(150, 200]        118
(200, 500]        306
(500, 1000]        81
(1000, 10000]      28
Name: 0, dtype: int64

diff_perc
(0, 5]              0
(5, 10]             0
(10, 15]            0
(15, 25]            0
(25, 50]            0
(50, 75]            0
(75, 100]           0
(100, 150]          0
(150, 200]       4520
(200, 500]          0
(500, 1000]         0
(1000, 10000]       0
Name: 0, dtype: int64