In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from helper import DateToOrdinal

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import cross_val_score

import xgboost as xgb

In [2]:
data_folder = "data"
filename = "train.csv"
df = pd.read_csv(os.path.join("..", data_folder, filename))#, parse_dates=True, index_col="date")

In [3]:
def create_lagged_features(df, value, lags):
    for lag in range(1, lags + 1):
        df[f't-{lag}'] = df[value].shift(lag)
    df.dropna(inplace=True)
    return df

In [4]:
df = df[["orders", "date", "warehouse"]]

In [5]:
df

Unnamed: 0,orders,date,warehouse
0,6895.0,2020-12-05,Prague_1
1,6584.0,2020-12-06,Prague_1
2,7030.0,2020-12-07,Prague_1
3,6550.0,2020-12-08,Prague_1
4,6910.0,2020-12-09,Prague_1
...,...,...,...
7335,6733.0,2024-03-10,Budapest_1
7336,6492.0,2024-03-11,Budapest_1
7337,6661.0,2024-03-12,Budapest_1
7338,6843.0,2024-03-13,Budapest_1


In [13]:
df_lagged = create_lagged_features(df, "orders", lags=14)
df_lagged

Unnamed: 0,orders,date,warehouse,t-1,t-2,t-3,t-4,t-5,t-6,t-7,t-8,t-9,t-10,t-11,t-12,t-13,t-14
31,7422.0,28,Prague_1,7466.0,7984.0,6841.0,6553.0,3754.0,6238.0,7618.0,7597.0,7210.0,6702.0,7784.0,4327.0,8968.0,8875.0
32,7669.0,29,Prague_1,7422.0,7466.0,7984.0,6841.0,6553.0,3754.0,6238.0,7618.0,7597.0,7210.0,6702.0,7784.0,4327.0,8968.0
33,8083.0,30,Prague_1,7669.0,7422.0,7466.0,7984.0,6841.0,6553.0,3754.0,6238.0,7618.0,7597.0,7210.0,6702.0,7784.0,4327.0
34,7555.0,31,Prague_1,8083.0,7669.0,7422.0,7466.0,7984.0,6841.0,6553.0,3754.0,6238.0,7618.0,7597.0,7210.0,6702.0,7784.0
35,7435.0,32,Prague_1,7555.0,8083.0,7669.0,7422.0,7466.0,7984.0,6841.0,6553.0,3754.0,6238.0,7618.0,7597.0,7210.0,6702.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7335,6733.0,1184,Budapest_1,7099.0,7488.0,6530.0,6557.0,6575.0,6450.0,6258.0,7103.0,7249.0,6443.0,6340.0,6213.0,6322.0,6363.0
7336,6492.0,1185,Budapest_1,6733.0,7099.0,7488.0,6530.0,6557.0,6575.0,6450.0,6258.0,7103.0,7249.0,6443.0,6340.0,6213.0,6322.0
7337,6661.0,1186,Budapest_1,6492.0,6733.0,7099.0,7488.0,6530.0,6557.0,6575.0,6450.0,6258.0,7103.0,7249.0,6443.0,6340.0,6213.0
7338,6843.0,1187,Budapest_1,6661.0,6492.0,6733.0,7099.0,7488.0,6530.0,6557.0,6575.0,6450.0,6258.0,7103.0,7249.0,6443.0,6340.0


In [14]:
xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', seed=42)
xgb_pipe = make_pipeline(xgb_regressor)

In [15]:
date_encoder = DateToOrdinal()
df_lagged["date"] = date_encoder.fit_transform(df_lagged["date"].copy())

In [16]:
warehouses = df_lagged["warehouse"].unique().tolist()

for warehouse in warehouses:
    warehouse_data = df_lagged[df_lagged["warehouse"] == warehouse]

    warehouse_data_X = warehouse_data.drop(columns=["orders", "warehouse"])
    warehouse_data_y = warehouse_data["orders"]

    final_score = abs(cross_val_score(xgb_pipe, warehouse_data_X, warehouse_data_y, cv=5, scoring="neg_mean_absolute_percentage_error").mean()) * 100

    print(warehouse, final_score)
    print()

Prague_1 6.510305187591789

Brno_1 7.017239459838331

Prague_2 7.109163656772218

Prague_3 7.682908168631883

Munich_1 29.296961560827672

Frankfurt_1 7.213988773178319

Budapest_1 5.857344571616363



### We see that we have much better values with lagged features, expecially with 14 days ! :)

### TODO: 
* Find best lag-size
* Gridsearch best model parameters