In [25]:
import os
import sys
import itertools
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# Project root = parent of the directory held in sys.path[0].
# abspath() is applied first because sys.path[0] may be '' or a relative path;
# os.path.split('')[0] would otherwise silently produce '' as the root.
rootPath = os.path.split(os.path.abspath(sys.path[0]))[0]
# Guard the append so that re-running this cell does not pile up duplicates.
if rootPath not in sys.path:
    sys.path.append(rootPath)

In [37]:
def load_data(rootPath, filename):
    """Read a CSV file and index it by its ``sales_datetime`` column.

    Parameters
    -----------
    rootPath: directory that ``filename`` is joined onto
    filename: path of the CSV file relative to ``rootPath``

    Returns
    --------
    DataFrame whose index is ``sales_datetime`` parsed with format
    '%Y-%m-%d' as timezone-aware (UTC) timestamps.
    """
    csv_path = os.path.join(rootPath, filename)
    sales_df = pd.read_csv(csv_path)
    parsed_dates = pd.to_datetime(sales_df.sales_datetime, format='%Y-%m-%d', utc=True)
    sales_df['sales_datetime'] = parsed_dates
    return sales_df.set_index('sales_datetime')

def visualize_seasonality(item_daily_data_df):
    """Draw a 2x2 grid of mean ``sales_qty`` aggregated at four calendar levels.

    Panels (row-major): mean per calendar month over the whole series,
    mean per month number, mean per day of week, mean per day of month.

    Parameters
    -----------
    item_daily_data_df: DataFrame with a DatetimeIndex and a ``sales_qty`` column
    """
    dates = item_daily_data_df.index
    mean_qty = {'sales_qty': 'mean'}
    # (panel title, grouping key) in the order the panels are laid out.
    panels = [
        ("Overall trend (averaged monthly)", pd.Grouper(freq='MS')),
        ("Monthly seasonality", dates.month),
        ("Day of week average pattern", dates.day_of_week),
        ("Day of month average pattern", dates.day),
    ]
    # All aggregates are computed before the figure is created.
    averages = [item_daily_data_df.groupby(key).agg(mean_qty) for _, key in panels]

    fig, axes = plt.subplots(2, 2, figsize=(20,10))
    for ax, (title, _), average in zip(axes.ravel(), panels, averages):
        ax.plot(average)
        ax.set_title(title)

def fill_time_series(raw_data_df):
    """Resample to a gap-free daily series, carrying the item price over gaps.

    Every column is summed per calendar day, so days without any rows get 0.
    ``item_price`` is handled separately: days without rows are left missing
    and then filled from the previous known price (or the next one at the
    start of the series).

    Parameters
    -----------
    raw_data_df: DataFrame with a DatetimeIndex (format='%Y-%m-%d') and an
        ``item_price`` column

    Returns
    --------
    DataFrame with one row per day between the first and last input date
    """
    data_df = raw_data_df.resample('D').sum()
    # A plain sum() yields 0 (not NaN) for empty days, which would make the
    # ffill/bfill below a no-op and leave a price of 0 on every gap day.
    # min_count=1 keeps empty days as NaN so they can actually be filled.
    daily_price = raw_data_df['item_price'].resample('D').sum(min_count=1)
    data_df['item_price'] = daily_price.ffill().bfill()
    return data_df
    
    
def add_features_to_raw_data(raw_data_df):
    """Return a copy of the frame extended with calendar features.

    Added columns, in order:
      * ``day_of_week_<n>`` one-hot columns (only for weekdays present in the index)
      * ``month_of_year_<n>`` one-hot columns (only for months present in the index)
      * ``year``: years elapsed since the earliest year in the index
      * ``first_third_of_month`` / ``second_third_of_month`` /
        ``last_third_of_month``: int8 flags for day <= 10, 11..20 and > 20

    Parameters
    -----------
    raw_data_df: DataFrame with a DatetimeIndex; it is not modified

    Returns
    --------
    New DataFrame with the same rows as the input plus the feature columns
    """
    featured_df = raw_data_df.copy()
    dates = featured_df.index
    # Temporary helper columns; dropped again before returning.
    featured_df['day_of_week'] = dates.day_of_week
    featured_df['month_of_year'] = dates.month
    day_of_week_dummies = pd.get_dummies(featured_df['day_of_week'], prefix='day_of_week')
    month_of_year_dummies = pd.get_dummies(featured_df['month_of_year'], prefix='month_of_year')
    # The dummies share the frame's index row for row, so attach them side by
    # side. An index-on-index merge would multiply rows whenever the same
    # date appears more than once in the index.
    featured_df = pd.concat(
        [featured_df, day_of_week_dummies, month_of_year_dummies], axis=1)
    featured_df['year'] = dates.year - dates.year.min()
    day_of_month = dates.day
    featured_df['first_third_of_month'] = (day_of_month <= 10).astype('int8')
    featured_df['second_third_of_month'] = (
        (day_of_month > 10) & (day_of_month <= 20)).astype('int8')
    featured_df['last_third_of_month'] = (day_of_month > 20).astype('int8')
    return featured_df.drop(columns=['day_of_week', 'month_of_year'])

def split_dataset(all_data_df: pd.DataFrame, 
                  validation_split_date: str, 
                  independent_vars: list, 
                  dependent_var: str):
    """Split a time-indexed frame into train/test features and targets.

    Rows strictly before ``validation_split_date`` (interpreted as UTC) form
    the training part; rows on or after it form the test part. The covered
    date range of each part is printed.

    Parameters
    -----------
    all_data_df: DataFrame indexed by timezone-aware timestamps
    validation_split_date: first date of the test part
    independent_vars: feature column names
    dependent_var: target column name

    Returns
    --------
    X_train, X_test, y_train, y_test (independent copies of the selections)
    """
    date_fmt = '%Y-%m-%d'
    split_ts = pd.to_datetime(validation_split_date, utc=True)
    train_rows = all_data_df[all_data_df.index < split_ts]
    test_rows = all_data_df[all_data_df.index >= split_ts]

    X_train = train_rows[independent_vars].copy()
    X_test = test_rows[independent_vars].copy()
    y_train = train_rows[dependent_var].copy()
    y_test = test_rows[dependent_var].copy()

    train_from = X_train.index.min().strftime(date_fmt)
    train_to = X_train.index.max().strftime(date_fmt)
    print(f"Train dataset is from {train_from} to {train_to}")
    test_from = X_test.index.min().strftime(date_fmt)
    test_to = X_test.index.max().strftime(date_fmt)
    print(f"Test dataset is from {test_from} to {test_to}")
    return X_train, X_test, y_train, y_test

def wmape(actual: pd.Series, predicted: pd.Series):
    """Weighted mean absolute percentage error, as a percentage.

    Computed as sum(|actual - predicted|) / sum(|actual|) * 100 and rounded
    to two decimals.
    """
    total_abs_error = np.abs(actual - predicted).sum()
    total_abs_actual = np.abs(actual).sum()
    return round((total_abs_error / total_abs_actual)*100, 2)

def smape(A, F):
    """Symmetric mean absolute percentage error, as a percentage.

    Computed as 100/len(A) * sum(2*|F - A| / (|A| + |F|)) and rounded to two
    decimals.

    Parameters
    -----------
    A: actual values (pandas Series or numpy array)
    F: forecast values, same length as ``A``

    Returns
    --------
    SMAPE score; a point where actual and forecast are both 0 contributes 0.
    """
    abs_error = np.abs(F - A)
    scale = np.abs(A) + np.abs(F)
    # scale is 0 only when A and F are both 0, in which case abs_error is 0
    # too. Dividing by 1 there gives a 0 term instead of a 0/0 NaN that
    # would turn the whole score into NaN for array inputs.
    safe_scale = np.where(scale == 0, 1, scale)
    score = 100/len(A) * np.sum(2 * abs_error / safe_scale)
    return round(score, 2)

def plot_fit_and_residuals(train_sales, 
                           train_predictions, 
                           val_sales, 
                           val_predictions, 
                           features_coefs_df,
                           train_residuals, 
                           test_residuals):
    """Show a 3x2 diagnostic figure for a fitted model.

    Row 1: actual vs. predicted line plots (train on the left, validation on
    the right). Row 2: scatter of residuals (x) against predictions (y) for
    train and validation. Row 3, left: horizontal bar chart of the
    coefficients; the right panel of row 3 is left empty.

    The ``*_sales``, ``*_predictions`` and ``features_coefs_df`` arguments
    must provide a pandas-style ``.plot(kind=..., ax=...)`` method.
    """
    fig, axes = plt.subplots(3, 2, figsize=(30,20))
    (fit_train_ax, fit_val_ax), (resid_train_ax, resid_val_ax), (coef_ax, _unused_ax) = axes
    fig.suptitle('Predictions and coefficients plot')

    # Actuals are drawn first, predictions second, on each fit panel.
    for series in (train_sales, train_predictions):
        series.plot(kind='line', ax=fit_train_ax)
    for series in (val_sales, val_predictions):
        series.plot(kind='line', ax=fit_val_ax)

    resid_train_ax.scatter(train_residuals, train_predictions)
    resid_val_ax.scatter(test_residuals, val_predictions)

    features_coefs_df.plot(kind='barh', ax=coef_ax)
    plt.show()
    
def time_series_cv(raw_data_filled_df, num_train_years):
    """Build year-based (train, validation) splits of positional row indices.

    Rows are grouped by the calendar year of the DatetimeIndex. Each split
    trains on ``num_train_years`` consecutive years and validates on the year
    right after them, sliding forward one year at a time. When there are at
    most ``num_train_years + 1`` years, a single split is returned that
    trains on every year but the last and validates on the last.

    Parameters
    -----------
    raw_data_filled_df: DataFrame with a DatetimeIndex
    num_train_years: number of consecutive years in each training window

    Returns
    --------
    list of (train_positions, validation_positions) tuples of integer lists

    Raises
    --------
    ValueError: if fewer than two distinct years are present
    """
    groups = raw_data_filled_df.reset_index().groupby(raw_data_filled_df.index.year).groups
    # One list of row positions per year, in chronological order.
    rows_by_year = [groups[year].tolist() for year in sorted(groups)]
    n_years = len(rows_by_year)

    if n_years < 2:
        raise ValueError("Not enough groups for validation set.")

    def _flatten(year_chunks):
        return [row for chunk in year_chunks for row in chunk]

    if n_years <= num_train_years+1:
        return [(_flatten(rows_by_year[:-1]), rows_by_year[-1])]

    folds = []
    for first_year in range(n_years - num_train_years):
        val_year = first_year + num_train_years
        folds.append((_flatten(rows_by_year[first_year:val_year]), rows_by_year[val_year]))
    return folds

def transform_and_fit(
    raw_data_df: pd.DataFrame, 
    independent_vars: list, 
    dependent_var: str, 
    validation_split_date: str,
    visualize: bool):
    """ 
    Fit a plain LinearRegression on one item's sales history and score it.

    Pipeline: fill the series to daily frequency (fill_time_series), add
    calendar features (add_features_to_raw_data), split at
    validation_split_date (split_dataset), fit on the training part, predict
    on both parts and report WMAPE for each. Progress and scores are printed.
    
    Parameters
    -----------
    raw_data_df: Time series dataframe; must contain an item_name column
    independent_vars: list of column names to be used as independent variables for scikit-learn estimator
    dependent_var: column name to be used as dependent variable (target) for scikit-learn estimator
    validation_split_date: first date of validation set
    visualize: to plot or not
    
    Returns
    --------
    metrics_dict: metadata and metrics for individual item
        (item name, estimator class name, train/validation date ranges,
        train/validation WMAPE and the coefficient of each independent variable)
    
    """
    date_fmt = '%Y-%m-%d'
    # The first distinct item_name in the frame is used as the item label.
    item_name = raw_data_df.item_name.unique()[0]
    print(f"Transform and fit for {item_name}")

    data_filled = fill_time_series(raw_data_df.copy())
    data_w_feats = add_features_to_raw_data(data_filled)
    X_train, X_val, y_train, y_val = split_dataset(
        data_w_feats, validation_split_date, independent_vars, dependent_var)

    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Predictions are re-wrapped as Series so they share the targets' index.
    y_pred_train = pd.Series(lin_reg.predict(X_train), index=y_train.index)
    train_wmape = wmape(y_train, y_pred_train)
    print(f"Train WMAPE: {train_wmape}")

    y_pred_val = pd.Series(lin_reg.predict(X_val), index=y_val.index)
    val_wmape = wmape(y_val, y_pred_val)
    print(f"Validation WMAPE: {val_wmape}")
    # NOTE: SMAPE reporting is intentionally not computed here.

    features_coefs = {
        name: lin_reg.coef_[position]
        for position, name in enumerate(independent_vars)
    }
    features_coefs_df = pd.Series(features_coefs)
    train_residuals = y_train - y_pred_train
    test_residuals = y_val - y_pred_val

    if visualize:
        plot_fit_and_residuals(
            y_train,
            y_pred_train,
            y_val,
            y_pred_val,
            features_coefs_df,
            train_residuals,
            test_residuals)

    train_dates = X_train.index
    val_dates = X_val.index
    return {
        "item_name": item_name,
        "estimator": lin_reg.__class__.__name__,
        "train_from": train_dates.min().strftime(date_fmt),
        "train_to": train_dates.max().strftime(date_fmt),
        "val_from": val_dates.min().strftime(date_fmt),
        "val_to": val_dates.max().strftime(date_fmt),
        "train_wmape": train_wmape,
        "val_wmape": val_wmape,
        "features_coefs": features_coefs
    }
    
def transform_and_fit_gridsearch(
    raw_data_df: pd.DataFrame, 
    independent_vars: list, 
    dependent_var: str, 
    validation_split_date: str,
    visualize: bool,
    num_train_years: int = 3,
    alphas=None):
    """ 
    Fit a Ridge regression on one item's sales history, choosing alpha by grid search.

    Pipeline: fill the series to daily frequency (fill_time_series), add
    calendar features (add_features_to_raw_data) and split at
    validation_split_date (split_dataset). On the training part, GridSearchCV
    evaluates every alpha over the year-based folds from time_series_cv,
    scored by WMAPE (lower is better). The search object is then used to
    predict on the training and validation parts and WMAPE is reported for
    each. Progress and scores are printed.
    
    Parameters
    -----------
    raw_data_df: Time series dataframe; must contain an item_name column
    independent_vars: list of column names to be used as independent variables for scikit-learn estimator
    dependent_var: column name to be used as dependent variable (target) for scikit-learn estimator
    validation_split_date: first date of validation set
    visualize: to plot or not
    num_train_years: number of consecutive years per cross-validation
        training window (default 3)
    alphas: iterable of Ridge alpha values to search;
        defaults to [0.01, 0.1, 1.0, 10, 100]
    
    Returns
    --------
    metrics_dict: metadata and metrics for individual item
        (item name, class name of the best estimator, train/validation date
        ranges, train/validation WMAPE and the coefficient of each
        independent variable in the best estimator)
    
    """
    if alphas is None:
        alphas = [0.01, 0.1, 1.0, 10, 100]

    item_name = raw_data_df.item_name.unique()[0]
    print(f"Transform and gridsearch fit for {item_name}")
    data_filled = fill_time_series(raw_data_df.copy())
    data_w_feats = add_features_to_raw_data(data_filled)
    X_train, X_val, y_train, y_val = split_dataset(data_w_feats, validation_split_date, independent_vars, dependent_var)

    # Folds are lists of positional row indices into X_train, grouped by year.
    cv_split_idxs = time_series_cv(X_train, num_train_years=num_train_years)
    # WMAPE is an error measure, so the scorer negates it for maximisation.
    wmape_scorer = make_scorer(wmape, greater_is_better=False)
    grid_search_params = {'alpha': list(alphas)}
    lin_reg_gscv = GridSearchCV(
        Ridge(), 
        grid_search_params,
        scoring=wmape_scorer,
        cv=cv_split_idxs).fit(X_train, y_train)
    
    # Predictions are re-wrapped as Series so they share the targets' index.
    y_pred_train = lin_reg_gscv.predict(X_train)
    y_pred_train = pd.Series(y_pred_train, index=y_train.index)
    train_wmape = wmape(y_train, y_pred_train)
    print(f"Train WMAPE: {train_wmape}")
    y_pred_val = lin_reg_gscv.predict(X_val)
    y_pred_val = pd.Series(y_pred_val, index=y_val.index)
    val_wmape = wmape(y_val, y_pred_val)
    print(f"Validation WMAPE: {val_wmape}")
    
    features_coefs = {var:lin_reg_gscv.best_estimator_.coef_[idx] for idx,var in enumerate(independent_vars)}
    features_coefs_df = pd.Series(features_coefs)
    train_residuals = y_train - y_pred_train
    test_residuals = y_val - y_pred_val
    
    if visualize:
        plot_fit_and_residuals(y_train, 
                               y_pred_train, 
                               y_val, 
                               y_pred_val, 
                               features_coefs_df,
                               train_residuals, 
                               test_residuals)
    return {
        "item_name": item_name,
        "estimator": lin_reg_gscv.best_estimator_.__class__.__name__,
        "train_from": X_train.index.min().strftime('%Y-%m-%d'),
        "train_to": X_train.index.max().strftime('%Y-%m-%d'),
        "val_from": X_val.index.min().strftime('%Y-%m-%d'),
        "val_to": X_val.index.max().strftime('%Y-%m-%d'),
        "train_wmape": train_wmape,
        "val_wmape": val_wmape,
        "features_coefs": features_coefs
    }

    
    
    
    
    
    

In [38]:
# Training extract; the path is joined onto rootPath inside load_data.
TRAIN_FILENAME = 'data/interim/train_data_90_perc_value_v1_3.csv'
train_data = load_data(rootPath=rootPath, filename=TRAIN_FILENAME)


In [None]:
# Row count per item (reported as num_days_sales), largest first.
(
    train_data
    .groupby('item_name')
    .agg({'item_name': 'count'})
    .rename(columns={'item_name':'num_days_sales'})
    .sort_values(by='num_days_sales', ascending=False)
)

Unnamed: 0_level_0,item_name,item_price,sales_qty
sales_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-06-14 00:00:00+00:00,Beck`s,16.0,1.0
2016-06-15 00:00:00+00:00,Beck`s,16.0,3.0
2016-06-16 00:00:00+00:00,Beck`s,16.0,2.0
2016-06-17 00:00:00+00:00,Beck`s,16.0,3.0
2016-06-18 00:00:00+00:00,Beck`s,16.0,1.0
...,...,...,...
2016-06-09 00:00:00+00:00,Zlatni_Pan,12.0,3.0
2016-06-10 00:00:00+00:00,Zlatni_Pan,12.0,6.0
2016-06-12 00:00:00+00:00,Zlatni_Pan,12.0,12.0
2016-06-14 00:00:00+00:00,Zlatni_Pan,12.0,1.0


In [17]:
# Scratch check: show the first sys.path entry, which the rootPath logic relies on.
sys.path[0]

'D:\\ML_Python\\Programi\\caffe_bar_sales_analysis\\caffe_bar_sales_analysis\\notebooks'

In [21]:
# Absolute form of sys.path[0]; its parent directory is used as the project root.
curPath = os.path.abspath(sys.path[0])
rootPath = os.path.dirname(curPath)

In [24]:
# Scratch check: parent directory of sys.path[0] (the value assigned to rootPath in the imports cell).
os.path.split(sys.path[0])[0]

'D:\\ML_Python\\Programi\\caffe_bar_sales_analysis\\caffe_bar_sales_analysis'