In [1]:
# change the working directory to the parent folder (the project root, per the path echoed below)
cd ..

/Users/joeranbosma/stack/Projects/M5Forecast


In [2]:
# basic imports
import os
import numpy as np
import pandas as pd
from tqdm import tqdm as tqdm
import matplotlib.pyplot as plt

# own imports
from flow import load_data, select_dates, sales_to_money, select_final_day, create_submission
from evaluation import Referee, CrossValiDataGenerator
from agent import KDayMeanTimesWeeklyPattern

# Directories (relative to the working directory) exposed as environment variables;
# presumably read by the project's own modules -- verify against flow.py
os.environ.update({
    'DATA_DIR': 'data/',
    'SUB_DIR': 'submissions/',
})

# General settings for figures: display size/font and export options for saved figures
plt.rcParams.update({
    "figure.figsize": (9, 4.5),
    "font.size": 13,  # 12, 16
    "savefig.format": "pdf",
    "savefig.dpi": 400,
    "savefig.transparent": True,
    "savefig.bbox": 'tight',
})

# Baseline for M5Forecast - Accuracy
This notebook calculates the mean of the final $k$ training days and multiplies it by the typical weekly pattern to create a slightly more advanced baseline. 

In [None]:
# Load data
calendar, sales_train_validation, sell_prices = load_data()

# Hold out the 28 days ending at day 1913 as the true sales;
# the training period is the 1913 - 28 days starting at day 1
final_day, horizon = 1913, 28
sales_true = select_dates(
    sales_train_validation, day_end=final_day, num_days=horizon, include_metadata=True
)
sales_train = select_dates(
    sales_train_validation, day_start=1, num_days=final_day - horizon, include_metadata=True
)

In [None]:
# create referee with true sales
# (training sales, prices and calendar are passed along as well; predictions are scored later via ref.evaluate)
ref = Referee(sales_true, sales_train, sell_prices, calendar)

## Determine weekly pattern

In [None]:
# peek at the first rows only, instead of rendering the complete training frame
sales_train.head()

In [None]:
# col_list[j] holds the training day columns of the j-th weekday, col_names[j] its name
col_list, col_names = [], []

for wday in range(1, 8):
    # calendar rows that fall on this day of the week
    same_weekday = calendar[calendar.wday == wday]
    name = same_weekday.weekday.values[0]
    # keep only the day columns that are present in the training set
    train_days = [d for d in same_weekday.d.values if d in sales_train.columns]
    col_list.append(train_days)
    col_names.append(name)
    print("{} is day nr. {} of the week".format(name, wday))

In [None]:
# total sales per weekday: sum over the rows and over all training days of that weekday
num_sales = []
for weekday_cols in col_list:
    num_sales.append(sales_train[weekday_cols].sum(axis=1).sum())

In [None]:
# Bar chart of the summed training sales for each day of the week
x = range(len(col_names))
f, ax = plt.subplots(1, 1, figsize=(12, 5))
ax.bar(x, height=num_sales)
ax.set_xticks(x)
ax.set_xticklabels(col_names)
# label the figure so it can be read on its own
ax.set_xlabel("Day of the week")
ax.set_ylabel("Total sales in training set")
ax.set_title("Sales per day of the week")
plt.show()

In [None]:
# fraction of the total training sales that falls on each day of the week (sums to 1)
portions = np.divide(num_sales, np.sum(num_sales))
portions

## Predict weekly sales as mean of final $k$ days times weekly pattern
Based on the results shown in `k-day-average.ipynb` and semantic preference for multiples of 7, $k$ is set to 28. 

In [None]:
k = 28

# prediction skeleton: same layout as the true sales, with every day column reset to zero
sales_pred = sales_true.copy()
day_cols = sales_pred.filter(regex='d_').columns
sales_pred[day_cols] = 0

# expected sales per week: mean over the last k training days, scaled to seven days
recent_sales = select_dates(sales_train, num_days=k, day_end=select_final_day(sales_train))
weekly_mean = 7 * recent_sales.filter(regex='d_').mean(axis=1)

# give every forecast day the share of the weekly total that belongs to its weekday
for dday in day_cols:
    week_day = calendar.loc[calendar.d == dday, 'wday'].iloc[0]
    # wday runs from 1 to 7, portions is indexed from 0
    sales_pred[dday] = weekly_mean * portions[week_day - 1]

# evaluate predictions
metrics = ref.evaluate(sales_pred)
print(metrics)

**Result**: Incorporating the weekly pattern reduces the WRMSSE significantly, from ~1.1 to ~0.67. 

## Create submission

In [None]:
# write the predictions to a submission file; add_timestamp=False presumably keeps the
# given filename free of a timestamp suffix -- confirm in flow.create_submission
create_submission(sales_pred, filename='weekly_pattern_fold1', add_timestamp=False)

Submit the predictions using:

`kaggle competitions submit -c m5-forecasting-accuracy -f submissions/submission_weekly_pattern.csv -m "Mean final 28 train days times weekly pattern"`

## Create forecast with KDayMeanTimesWeeklyPattern agent
Selection of the training and validation set is handled by the CrossValiDataGenerator. The value for $k$ is again set to 28 for semantic reasons (a multiple of 7). 

In [None]:
k = 28  # passed to the agent in the next cell as the number of days to average over
# Set up generator
cv_generator = CrossValiDataGenerator(sales_train_validation)
# split for fold 1; train_size=-1 presumably selects all available training days -- TODO confirm
train_df, val_df = cv_generator.get_train_val_split(fold=1, train_size=-1)

In [None]:
# Define model, with mean of k days
model = KDayMeanTimesWeeklyPattern(calendar, k=k)

# Determine the weekly pattern based on complete training set
model.fit(train_df)

# Predict next 28 days based on the k days before it and the weekly pattern
sales_pred = model.predict(train_df)
# show the forecast as the cell output
sales_pred

In [None]:
# referee for this split: the validation days act as the true sales, the training days as history
ref = Referee(val_df, train_df, sell_prices, calendar)

In [None]:
# score the agent's forecast; the returned metrics are shown as the cell output
ref.evaluate(sales_pred)

## Cross-validation of performance

In [None]:
k = 28
# generator for the cross-validation folds
cv_generator = CrossValiDataGenerator(sales_train_validation, train_size=k)
# only the training part of fold 10 is kept (used below to fit the weekly pattern)
# NOTE(review): train_size=-1 is passed here although the generator was built with train_size=k;
# presumably this requests all available days before validation set 10 -- confirm
train_df, _ = cv_generator.get_train_val_split(fold=10, train_size=-1)

### Determine weekly pattern with days before validation set nr. 10

In [None]:
# re-fit the agent defined in an earlier cell on this training split
model.fit(train_df)
# show the fitted `portions` attribute as the cell output
model.portions

### Predict and evaluate

In [None]:
metrics_list = []

# The train df should contain at least 28 days, for correct weights during evaluation
train_size = max(28, k)

for fold in tqdm(range(1, 11), desc="CV"):
    train_df, val_df = cv_generator.get_train_val_split(fold=fold, train_size=train_size)

    # forecast the next 28 days from the k days preceding them
    sales_pred = model.predict(train_df)

    # score the forecast against this fold's validation days and keep the result
    ref = Referee(val_df, train_df, sell_prices, calendar, verbose=False)
    metrics = ref.evaluate(sales_pred)
    metrics_list.append(metrics)

In [None]:
def plot_dict(dictionaries, exclude_zero=False, labels=None):
    """Draw one line per dictionary, with its keys on the x-axis and its values on the y-axis.

    Args:
        dictionaries: a single dict, or a sequence of dicts.
        exclude_zero: when True, entries whose key equals 0 are not plotted.
        labels: legend label(s). For a single dict either one bare label or a
            list/tuple holding it; for several dicts a sequence with one label
            per dict. Dictionaries without a matching label are drawn
            unlabelled. Defaults to no labels.

    The legend is only added when at least one line received a label, and the
    figure is shown before returning. Nothing is returned.
    """
    f, ax = plt.subplots(1, 1)

    if labels is None:
        labels = []

    if isinstance(dictionaries, dict):
        dictionaries = [dictionaries]
        # a lone dict may come with a bare label; wrap it so it can be indexed below
        if not isinstance(labels, (list, tuple)):
            labels = [labels]

    any_labelled = False
    for i, dic in enumerate(dictionaries):
        items = [(key, val) for key, val in dic.items() if not exclude_zero or key != 0]
        keys = [key for key, _ in items]
        vals = [val for _, val in items]

        if i < len(labels):
            ax.plot(keys, vals, label=labels[i])
            any_labelled = True
        else:
            # no label was supplied for this dictionary
            ax.plot(keys, vals)

    if any_labelled:
        # anchor the legend just outside the right edge of the axes
        ax.legend(loc=(1.05, 0.))

    plt.show()

# one line per cross-validation fold, labelled "Fold 1" ... "Fold 10"
fold_labels = ['Fold %d' % fold_nr for fold_nr in range(1, 11)]
plot_dict(metrics_list, labels=fold_labels)