In [147]:
# import packages
import pandas as pd
# import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
import datetime
import re
from sklearn.externals import joblib 
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy.interpolate import interpn
from collections import defaultdict

In [148]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
sales_means =pd.read_csv('./data/sales_means.csv', sep='|').drop(columns=["predicted_promotion"])
scaler = joblib.load('scaler.pkl') 
df_test = pd.read_csv('data/orders0206_test.csv', sep='|', parse_dates=['time'])
df_test['date'] = [d.date() for d in df_test['time']]
df_train = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])
df_train['date'] = [d.date() for d in df_train['time']]
bundles = pd.read_csv('./data/bundles.csv', sep=',')

In [None]:
sales_means.head()

In [None]:
sales_means = sales_means.join(bundles.set_index("itemID"), on="itemID", how="left")

In [None]:
sales_means.sample(frac=1).reset_index(drop=True)
X_train = sales_means.tail(len(sales_means)-2000)#[(sales_means['weekGroup']<=9)]# & (sales_means['weekGroup']==10)]
X_cv =  sales_means.head(2000)#[sales_means['weekGroup']==10]
X_test = sales_means[sales_means['weekGroup']==11]
Y_train = X_train['count']
Y_cv = X_cv['count']
Y_test = X_test['count']
del X_train['count']
del X_cv['count']
del X_test['count']
del X_train['weekGroup']
del X_cv['weekGroup']
del X_test['weekGroup']

In [None]:
del X_train['itemID']
del X_cv['itemID']

In [None]:
from xgboost import XGBRegressor
model = XGBRegressor(
    booster = 'gbtree', 
    colsample_bytree = 0.8, 
    gamma = 0, 
    learning_rate = 0.08, 
    max_depth= 3, 
    min_child_weight= 1, 
    n_estimators= 200, 
    objective= 'reg:logistic', 
    seed= 20, 
    silent= 1, 
    subsample= 0.8)
model.fit(
    X_train, 
    Y_train, 
    eval_metric="rmse", 
    eval_set=[(X_train, Y_train), (X_cv, Y_cv)], 
    verbose=True, 
    early_stopping_rounds = 10)

In [None]:
import plotly_express as px
columns = X_train.columns
feature_importances = pd.DataFrame({'columns': columns,'importance':model.feature_importances_})
feature_importances = feature_importances.sort_values(by='importance',ascending=False)
px.bar(feature_importances,x='columns',y='importance')

In [None]:
def evaluate_result(y: dict, y_pred: dict):
    monetary_value = 0
    y_pred = defaultdict(int, y_pred)  # return prediction of 0 for items without prediction
    
    for item in set(y_pred).difference(set(y)):
        y[item] = 0  # make sure that all items for which a demand has been predicted are contained in the actual demands
    
    for item, demand in y.items():
        predicted_demand = y_pred[item]
        price = product_prices[item]
        monetary_value += price * min(demand, predicted_demand)
        if predicted_demand > demand:
            monetary_value -= .6 * price * (predicted_demand - demand)
            
    return monetary_value

In [None]:
df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
df_items.head()

In [None]:
# actual demand
y = df_test.groupby(by='itemID')['order'].sum().to_dict()

# baseline 1 (average demand of previous 14 days)
y_baseline1 = df_train[df_train['time'] >= '2018-05-19'].groupby(by='itemID')['order'].sum().to_dict()

# baseline 2 (average demand of previous half year)
total_orders = df_train.groupby(by='itemID')['order'].sum().to_dict()
total_observed_days = (df_train['time'].dt.normalize().max() - df_train['time'].dt.normalize().min()).days
y_baseline2 = {item: orders / total_observed_days * 14 for item, orders in total_orders.items()}  # 14-day avg. demand

df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
product_prices = df_info['simulationPrice'].to_dict()

In [None]:
# apply to unseen data
y_xgboost_all = dict()
for prod in X_test.itemID.unique():
    train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(model.predict(X_test[X_test.itemID == prod].drop(['itemID'], axis=1))))).rename(columns={0:'predicted_count'})
    # train_predict["actual_count"] = test[test.itemID == prod]["order"].sum()
    y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
#     if prod in df_train.itemID.unique():
#         y_xgboost_all[prod] = int(train_predict["predicted_count"].sum().round())
#     else:
#         y_xgboost_all[prod] = 0

In [None]:
# perfect result
print(f'Perfect Result: {evaluate_result(y, y):.2f}')

# baseline 1
print(f'Baseline 1: {evaluate_result(y, y_baseline1):.2f}')

# baseline 2
print(f'Baseline 2: {evaluate_result(y, y_baseline2):.2f}')

# random forest
print(f'XGBoost: {evaluate_result(y, y_xgboost_all):.2f}')
# 953796.09