In [1]:
# Imports and global plotting configuration for the notebook.
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
# Suppress every warning for the rest of the session. This filter is installed
# before pandas and statsmodels are imported below, so it is already active
# while those imports execute.
warnings.filterwarnings("ignore")
# Apply the 'fivethirtyeight' style first; the rcParams assignments further
# down run afterwards and therefore override the corresponding style values.
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm
import matplotlib
# Font sizes for axis labels and tick labels, and black as the text colour.
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'k'

# Loading Data

In [2]:
# Read the two CSV files (paths are relative to the notebook's working
# directory): first the training file, then the test file.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Evaluation Functions

In [3]:
def smape(y, y_pred):
    """
    Symmetric mean absolute percentage error between ``y`` and ``y_pred``.

    Each term is ``|y_pred - y| / ((|y_pred| + |y|) / 2)``; the terms are summed
    and divided by ``len(y)``. A term whose actual and forecast are both zero
    (a 0/0 division) is counted as zero error instead of NaN.

    Inputs are converted with ``np.asarray`` and compared position by position,
    so plain lists, numpy arrays and pandas Series are all accepted; pandas
    index labels are not used for alignment.

    :param y: true values
    :type y: array-like of numbers
    :param y_pred: forecasted values, same length as ``y``
    :type y_pred: array-like of numbers
    :returns: the SMAPE as a fraction in [0, 2] (not multiplied by 100);
              NaN when the input is empty
    """
    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    n_obs = len(actual)
    if n_obs == 0:
        # Nothing to average over.
        return np.nan

    denominators = (np.abs(predicted) + np.abs(actual)) / 2

    # Only divide where the denominator is non-zero; the remaining positions
    # keep the 0.0 they were initialised with (perfect forecast of a zero).
    errors = np.zeros_like(denominators)
    np.divide(np.abs(predicted - actual), denominators,
              out=errors, where=denominators != 0)

    return np.sum(errors) / n_obs

def compute_avg_smape(df_y, df_y_pred):
    """
    Average the per-column SMAPE of two dataframes.

    Columns are paired by position: column ``i`` of ``df_y`` is scored against
    column ``i`` of ``df_y_pred`` with ``smape``. The number of columns is
    taken from ``df_y_pred``.

    :param df_y: dataframe with the true values
    :param df_y_pred: dataframe with the forecasted values
    :returns: sum of the per-column SMAPE values divided by the column count
    """
    n_columns = df_y_pred.shape[1]

    total_smape = sum(
        smape(y=df_y.iloc[:, col], y_pred=df_y_pred.iloc[:, col])
        for col in range(n_columns)
    )

    return total_smape / n_columns

# Helper Functions

1. Categorisation by week of year
2. Categorisation by day of week
3. Calculate the average sales for every day of week
4. Calculate the average sales for every week of year
5. Adding or subtracting seasonality from the data - need a function or parameter for this action
6. Making changes to the data by adding/removing the seasonality

In [4]:
def categorize_by_week_of_year(df_train):
    """
    Return, for every row, the day of the year of its index floor-divided by 7.

    ``df_train`` must have a datetime-like index exposing ``dayofyear``.
    """
    day_of_year = df_train.index.dayofyear
    return day_of_year // 7


def categorize_by_day_of_week(df_train):
    """
    Return, for every row, the ``dayofweek`` value of its index.

    ``df_train`` must have a datetime-like index exposing ``dayofweek``.
    """
    return df_train.index.dayofweek

In [7]:
def compute_seasonality(df, categorization):
    """
    Compute the mean value of a series for every seasonal category.

    :param df: the values to average, indexed the way ``categorization``
               expects. Despite the parameter name this is a series (the
               caller in this notebook passes ``df_train["sales"]``); the name
               is kept so existing calls keep working.
    :type df: pd.Series
    :param categorization: function that receives a dataframe carrying the
                           series' index and returns one category per row
    :type categorization: pd.DataFrame -> some categorical type, eg. int
    :returns: series named "values", indexed by category ("cat"), holding the
              mean of the input values in each category
    """
    # The original body read an undefined name `series` and discarded the
    # argument; bind the argument explicitly instead of relying on a global.
    series = df

    frame = pd.DataFrame({"values": series})
    frame["cat"] = categorization(frame)
    return frame.groupby(by="cat")["values"].mean()

In [None]:
def train_and_forecast(data, categorization, trainer, forecaster, deseasonize,
                       steps_to_forecast=90, validation_size=365):
    """
    Hold out the tail of ``data``, train on the rest and forecast the hold-out.

    Steps:
      1. split ``data`` into a training part and the last ``validation_size``
         rows (validation part);
      2. if ``deseasonize`` is set, compute the seasonality of the training
         sales and remove it before training;
      3. train with ``trainer`` and forecast with ``forecaster``;
      4. if ``deseasonize`` is set, add the seasonality back to the forecast;
      5. return the forecast next to the true validation values.

    :param data: dataset with a "date" column (parseable by
                 ``pd.DatetimeIndex``) and a "sales" column, ordered by date
    :type data: pd.DataFrame
    :param categorization: Function used to split values into various periods
                           of the season. Only used when ``deseasonize`` is set.
    :type categorization: pd.DataFrame -> some categorical type, eg. int
    :param trainer: Function used to train the model. It receives the training
                    dataframe with a fresh 0..n-1 integer index.
    :type trainer: pd.DataFrame -> model
    :param forecaster: (model, steps) -> prediction
    :param deseasonize: whether to remove the seasonal component before
                        training and add it back to the forecast
    :type deseasonize: bool
    :param steps_to_forecast: number of steps to forecast
    :param validation_size: number of trailing rows of ``data`` held out for
                            validation (default 365, the previous fixed value)
    :returns: a dataframe indexed by the validation dates with:
                                real_values - true values
                                forecast - forecasted values
    :raises ValueError: if ``validation_size`` is not positive
    """
    if validation_size <= 0:
        # data.iloc[:-0] would be empty and data.iloc[-0:] the whole frame.
        raise ValueError("validation_size must be a positive number of rows")

    # prepare training and validation datasets
    df_train = data.iloc[:-validation_size].copy()
    df_validation = data.iloc[-validation_size:].copy()
    df_validation.index = pd.DatetimeIndex(df_validation["date"])
    df_train.index = pd.DatetimeIndex(df_train["date"])

    if deseasonize:
        # NOTE(review): remove_seasonal_component / add_seasonal_component are
        # not defined in the visible part of the notebook; they are expected
        # to be defined elsewhere before this function is called.
        seas = compute_seasonality(df_train["sales"], categorization)
        df_train["sales"] = remove_seasonal_component(df_train["sales"], categorization, seas)

    df_train = df_train.reset_index(drop=True)

    # train
    model = trainer(df_train)

    # forecast
    forecast = pd.Series(forecaster(model, steps_to_forecast))
    forecast.name = "sales"
    # The first forecast step corresponds to the first validation date, so the
    # daily index starts there instead of at a hard-coded calendar date.
    # pd.date_range replaces pd.DatetimeIndex(start=..., periods=...), which
    # newer pandas versions no longer accept.
    forecast.index = pd.date_range(start=df_validation.index[0],
                                   freq="D",
                                   periods=forecast.size)

    if deseasonize:
        forecast = add_seasonal_component(forecast, categorization, seas)

    # The true values define the result's index; the forecast is aligned to it
    # by date on assignment.
    final_forecast = pd.DataFrame()
    final_forecast['real_values'] = df_validation["sales"].iloc[:steps_to_forecast]
    final_forecast['forecast'] = forecast

    return final_forecast