In [66]:
import baseline_models

In [6]:
from google.colab import files


import numpy as np
import pandas as pd
import datetime
import sklearn
from sklearn import linear_model
from sklearn.metrics import *


import seaborn as sns
import itertools
from sklearn.model_selection import TimeSeriesSplit

from matplotlib import pyplot as plt
from sklearn import preprocessing

In [7]:
def wmape(y_true, y_pred):
  """Weighted mean absolute percentage error (WMAPE), in percent.

  Computed as ``100 * sum(|y_true - y_pred|) / sum(|y_true|)``; the ratio is
  rounded to 6 decimal places before being scaled to percent.

  Parameters
  ----------
  y_true : array-like supporting element-wise subtraction
      Observed values (e.g. ``np.ndarray`` or ``pd.Series``).
  y_pred : array-like supporting element-wise subtraction
      Forecast values, compatible with ``y_true``.

  Returns
  -------
  tuple
      ``(wmape_val, errors)`` where ``errors`` is ``y_true - y_pred``.
      ``wmape_val`` is ``nan`` when the actuals carry no volume at all
      (``sum(|y_true|) == 0``), because the metric is undefined there.
  """
  errors = y_true - y_pred
  total_abs_error = np.sum(np.abs(errors))
  # Weight by absolute volume so negative actuals (e.g. returns) cannot
  # cancel positive ones and flip or inflate the score.
  total_abs_actual = np.sum(np.abs(y_true))

  # Scalar case only: an all-zero denominator would otherwise yield inf/nan
  # together with a RuntimeWarning.
  if np.ndim(total_abs_actual) == 0 and total_abs_actual == 0:
    return (np.nan, errors)

  wmape_val = 100*np.round(total_abs_error/total_abs_actual, 6)
  return (wmape_val, errors)

In [8]:
def extend_to_right_boundary(df, max_time):
  """Pad a single product's weekly series with zeros up to ``max_time``.

  Parameters
  ----------
  df : pd.DataFrame
      Frame with a two-level index; level 0 is dropped and level 1 is used
      as the date axis. Intended to be called once per product group.
  max_time : datetime-like
      Right boundary of the extended weekly range (inclusive when it falls
      on a Monday).

  Returns
  -------
  pd.DataFrame
      The input re-indexed onto a weekly (Monday-anchored) range running
      from the series' earliest date to ``max_time``. Weeks missing from
      ``df`` are filled with ``0.0``; the index is named ``"DateTime"``.
  """
  # Keep only the date level so the frame can be re-indexed on a date range.
  df_tmp = df.reset_index(level = 0, drop = True).sort_index(ascending=True)

  # Earliest date, taken from the date index itself instead of iterating
  # over (product, date) tuples in Python.
  min_left = df_tmp.index.min()

  # 'W-MON' is the canonical pandas alias (also used by get_naive_forecast).
  extended_time = pd.date_range(start = min_left, end = max_time, freq = 'W-MON')
  df_tmp = df_tmp.reindex(extended_time, fill_value = 0.0)
  df_tmp.index.name = "DateTime"

  return df_tmp

In [67]:
def prepare_sales_data(sales_df, categories_df, traffic_df, extend_to_common_horizon = False, extend_only_to_max_date = False, shift_list = [None]):
  """Build the weekly modelling table: sales per product and week, joined
  with categories and traffic, plus calendar and lag features.

  Parameters
  ----------
  sales_df : pd.DataFrame
      Must contain ``product_id``, ``week_starting_date`` (parsed with the
      format ``%Y%m%d``) and ``sales``. The frame is copied, not modified.
  categories_df : pd.DataFrame
      Left-merged on the index (presumably indexed by ``product_id`` --
      verify against the caller).
  traffic_df : pd.DataFrame
      Left-merged on the index (presumably indexed by
      ``(product_id, DateTime)`` -- verify against the caller). Must provide
      a ``traffic`` column when lags are requested.
  extend_to_common_horizon : bool
      ``False``: each product's series is made regular weekly between its own
      first and last week. ``True``: see ``extend_only_to_max_date``.
  extend_only_to_max_date : bool
      Only used when ``extend_to_common_horizon`` is true. ``True``: each
      product is extended from its own first week to the global last week
      (via ``extend_to_right_boundary``). ``False``: every product gets every
      week observed anywhere in the data (cartesian product).
  shift_list : list of int
      Lags, in rows per product, for which ``sales_lag_{i}W`` and
      ``traffic_lag_{i}W`` are created. ``None``, ``[]`` and ``[None]`` (the
      default, never mutated here) disable lag features.

  Returns
  -------
  pd.DataFrame
      Indexed by ``(product_id, DateTime)``. Weeks added by the extension
      have ``sales`` filled with 0. Extra columns: ``Week_numb`` (ISO week),
      ``YW`` (ISO year * 100 + ISO week), ``Month`` and the requested lags.
  """
  sales_df_copy = sales_df.copy()
  # Parse all week labels in one vectorised call instead of row by row.
  sales_df_copy['DateTime'] = pd.to_datetime(sales_df_copy['week_starting_date'].astype(str), format='%Y%m%d')
  # TODO(review): is this aggregation needed, or is the input already one row per product/week?
  weekly_sales = sales_df_copy.groupby(["product_id", "DateTime"], as_index=True).agg({'sales':'sum'})

  if extend_to_common_horizon:
    if extend_only_to_max_date:
      max_time_value = weekly_sales.index.get_level_values(level = 1).max()
      expanded_full = weekly_sales.groupby(level=0).apply(lambda x: extend_to_right_boundary(x, max_time_value))
    else:
      product_ids = weekly_sales.index.get_level_values(level = 0).unique()
      week_starts = weekly_sales.index.get_level_values(level = 1).unique()
      full_index = pd.MultiIndex.from_product([product_ids, week_starts], names=['product_id', 'DateTime'])
      expanded_full = weekly_sales.reindex(full_index, fill_value=0)
  else:
    expanded_full = weekly_sales.groupby(level=0).apply(lambda x: x.reset_index(level=0, drop=True).asfreq("W-MON", fill_value = 0.0))

  full_sales_dataset = expanded_full\
  .merge(categories_df, left_index=True, right_index = True,  how = 'left')\
  .merge(traffic_df, left_index = True,  right_index = True, how = 'left')

  # Calendar features derived from the week start date.
  week_start = full_sales_dataset.index.get_level_values(level = 1)
  iso_calendar = week_start.isocalendar()
  iso_year = iso_calendar["year"].to_numpy(dtype = "int64")
  iso_week = iso_calendar["week"].to_numpy(dtype = "int64")

  full_sales_dataset['Week_numb'] = iso_week
  # The ISO week must be paired with the ISO year: with the calendar year,
  # 2018-12-31 (ISO 2019-W01) would be labelled 201801 and collide with the
  # first week of 2018.
  full_sales_dataset['YW'] = iso_year * 100 + iso_week
  full_sales_dataset['Month'] = np.asarray(week_start.month, dtype = "int64")

  # Lags are computed within each product so values never leak across series.
  if shift_list is None:
    shift_list = []
  for i in [lag for lag in shift_list if lag is not None]:
    shifted = full_sales_dataset.groupby(level = 0)[['sales', 'traffic']].shift(i, fill_value = 0.0)
    full_sales_dataset[f"sales_lag_{i}W"] = shifted['sales']
    full_sales_dataset[f"traffic_lag_{i}W"] = shifted['traffic']

  #full_sales_dataset["sales_to_traffic"] = full_sales_dataset.sales/full_sales_dataset.traffic

  return full_sales_dataset


In [10]:
def get_naive_forecast(ts_series, horizon, time_freq = "W-MON"):
  """Naive forecast: repeat the last observed value ``horizon`` times.

  Parameters
  ----------
  ts_series : pd.Series
      Time series with a datetime-like index; only its last value and last
      timestamp are used.
  horizon : int
      Number of future periods to forecast.
  time_freq : str
      pandas frequency alias giving the step between forecast timestamps
      (default ``"W-MON"``, i.e. weekly on Mondays).

  Returns
  -------
  pd.Series
      ``horizon`` copies of the last observed value, indexed by the
      ``horizon`` timestamps that follow the last observation at
      ``time_freq`` spacing.
  """
  last_value = ts_series.values[-1]
  last_time_obs = ts_series.index[-1]

  # Build the future index from `time_freq` itself, so the argument is
  # honoured rather than silently replaced by a hard-coded weekly step.
  step = pd.tseries.frequencies.to_offset(time_freq)
  prediction_time_vector = pd.date_range(start = last_time_obs + step, periods = horizon, freq = step)

  prediction_vector = np.repeat(a = last_value, repeats = horizon)
  prediction_series = pd.Series(prediction_vector, index = prediction_time_vector)

  return prediction_series

### Baseline models

In [None]:
# # my first custom estimator! :)
# # https://towardsdatascience.com/writing-your-own-scikit-learn-classes-for-beginners-1e4e7d4de203
# # https://scikit-learn.org/stable/developers/develop.html

# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
# from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

# class NaiveForecast(BaseEstimator, TransformerMixin, RegressorMixin):
#   def __init__(self, horizon, time_freq = "W-MON", **model_hyper_parameters):
#     super().__init__()
#     self.horizon = horizon
#     self.time_freq = time_freq

#   def fit(self, X, y = None):
#     ts_time_index = X.index
#     ts_values = X.values

#     self.prediction_ = ts_values[-1]
#     self.last_time_obs_ = ts_time_index[-1]
#     return self
    
#   def predict(self, X, y=None):
#     # make sure that it was fitted
#     check_is_fitted(self)
#     #X = check_array(X) ????
#     horizon_for_loop = self.horizon + 1
#     prediciton_time_vector = pd.Index([self.last_time_obs_  + pd.offsets.DateOffset(weeks=i) for i in range(1,horizon_for_loop,1)],  freq = self.time_freq)

#     prediction_vector = np.repeat(a = self.prediction_, repeats = self.horizon)
#     X = X.copy()
#     X = pd.Series(prediction_vector, index =prediciton_time_vector)
#     return X

In [None]:
# # my first custom estimator! :)
# # https://towardsdatascience.com/writing-your-own-scikit-learn-classes-for-beginners-1e4e7d4de203
# # https://scikit-learn.org/stable/developers/develop.html

# from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
# from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

# class MeanForecast(BaseEstimator, TransformerMixin, RegressorMixin):
#   def __init__(self, horizon, time_freq = "W-MON", **model_hyper_parameters):
#     super().__init__()
#     self.horizon = horizon
#     self.time_freq = time_freq

#   def fit(self, X, y = None):
#     ts_time_index = X.index
#     ts_values = X.values

#     self.prediction_ = np.mean(ts_values)
#     self.last_time_obs_ = ts_time_index[-1]
#     return self
    
#   def predict(self, X, y=None):
#     # make sure that it was fitted
#     check_is_fitted(self)
#     #X = check_array(X) ????
#     horizon_for_loop = self.horizon + 1
#     prediciton_time_vector = pd.Index([self.last_time_obs_  + pd.offsets.DateOffset(weeks=i) for i in range(1,horizon_for_loop,1)],  freq = self.time_freq)

#     prediction_vector = np.repeat(a = self.prediction_, repeats = self.horizon)
#     X = X.copy()
#     X = pd.Series(prediction_vector, index =prediciton_time_vector)
#     return X

In [11]:
sales_data = pd.read_csv("sales.csv",   header = 0, sep = ";")
categories = pd.read_csv("categories.csv",   header = 0, sep = ";")
traffic = pd.read_csv("traffic.csv",   header = 0, sep = ";")

In [12]:
# Parse the YYYYMMDD week labels in one vectorised call (instead of a
# row-wise apply), drop the raw label and index traffic by (product_id, DateTime).
# NOTE: `traffic` and `categories` are rebound, so this cell must not be
# re-run without reloading the CSV files first.
traffic = (
    traffic
    .assign(DateTime = pd.to_datetime(traffic['week_starting_date'].astype(str), format='%Y%m%d'))
    .drop(columns = "week_starting_date")
    .set_index(["product_id", "DateTime"])
)

categories = categories.set_index(["product_id"])

In [13]:
# Modelling table in which every product's series is extended only up to the
# global maximum date, with lag features for 1, 2 and 3 weeks.
# TODO: check the result for nulls -- it looked strange on inspection.
sales_df_combined_right = prepare_sales_data(
    sales_data,
    categories,
    traffic,
    extend_to_common_horizon = True,
    extend_only_to_max_date = True,
    shift_list = [1, 2, 3],
)

In [14]:
sample = sales_df_combined_right.loc[[2658, 1308]]

In [15]:
#!pip install scikit-learn==0.24.0

In [None]:
tscv = TimeSeriesSplit(n_splits=10, test_size = 3)

In [None]:
# score = []
# for tr_index, val_index in tscv.split(sample.loc[2658]):
#   print (tr_index, val_index)
#   X_tr, X_val = sample.loc[2658][["sales"]].iloc[tr_index], sample.loc[2658][["sales"]].iloc[val_index]
#   print(X_tr)
#   print(X_val)
#   forecast = NaiveForecast(X_tr.sales, horizon = 3)
#   print(forecast)
#   print(wmape(X_val.sales, forecast))

In [16]:
all_items = sales_df_combined_right.index.get_level_values(level = 0).unique().tolist()

In [17]:
tmp_dataset = sales_df_combined_right.loc[all_items[27]]

In [18]:
# for i in all_items:
#   print("*******************")
#   print(i)
#   tmp_dataset = sales_df_combined_right.loc[i]
#   #n_splits_tmp = int(np.floor((tmp_dataset.shape[0] - 3)/2))
#   try:
#     tscv = TimeSeriesSplit(n_splits=5, test_size = 3)
#     for tr_index, val_index in tscv.split(tmp_dataset):
#       print (tr_index, val_index)
#       X_tr, X_val = tmp_dataset[["sales"]].iloc[tr_index], tmp_dataset[["sales"]].iloc[val_index]
#       print(X_tr)
#       print(X_val)
#       forecast = get_naive_forecast(X_tr.sales, horizon = 3)
#       print(forecast)
#       print(mean_absolute_percentage_error(X_val.sales, forecast))
#   except Exception as e:
#     print(e)

In [None]:
mean_absolute_percentage_error(X_val.sales, get_naive_forecast(X_tr, horizon = 3))

12.564082594456428

In [None]:
lm = linear_model.LinearRegression()
model = lm.fit(tmp_dataset.Week_numb,tmp_dataset.sales)

ValueError: ignored

In [None]:
tmp_dataset.Week_numb

DateTime
2019-03-18    12
2019-03-25    13
2019-04-01    14
2019-04-08    15
2019-04-15    16
              ..
2020-11-30    49
2020-12-07    50
2020-12-14    51
2020-12-21    52
2020-12-28    53
Name: Week_numb, Length: 94, dtype: int64

## Linear regression attempt

In [None]:
from sklearn import linear_model

In [None]:
lm = linear_model.LinearRegression()
model = lm.fit(tmp_dataset[["Week_numb"]].values,tmp_dataset[["sales"]].values)

#predictions = lm.predict(x)


In [None]:
model

LinearRegression()

In [None]:
model.coef_

array([[1.2044772]])

In [None]:
model.intercept_

array([15.80977716])

In [45]:
naive_model = NaiveForecast(horizon = 3, time_freq="W-MON")

In [40]:
mean_model = MeanForecast(horizon = 3, time_freq="W-MON")

In [46]:
abc = naive_model.fit(tmp_dataset.sales[:-3], tmp_dataset.sales[:-3])

In [47]:
naive_model.predict(tmp_dataset.sales[:-3])

2020-12-14    124.0
2020-12-21    124.0
2020-12-28    124.0
Freq: W-MON, dtype: float64

In [22]:
abc = naive_model.fit(tmp_dataset.sales, tmp_dataset.sales)

In [42]:
mean_ = mean_model.fit(tmp_dataset.sales, tmp_dataset.sales)

In [23]:
tmp_dataset.sales

DateTime
2019-03-18      8.0
2019-03-25      0.0
2019-04-01     28.0
2019-04-08    105.0
2019-04-15     18.0
              ...  
2020-11-30    175.0
2020-12-07    124.0
2020-12-14    189.0
2020-12-21     20.0
2020-12-28      0.0
Name: sales, Length: 94, dtype: float64

In [24]:
abc.prediction_

0.0

In [25]:
naive_model.predict(tmp_dataset.sales)

2021-01-04    0.0
2021-01-11    0.0
2021-01-18    0.0
Freq: W-MON, dtype: float64

In [26]:
naive_model.predict(tmp_dataset.sales[-3:])

2021-01-04    0.0
2021-01-11    0.0
2021-01-18    0.0
Freq: W-MON, dtype: float64

In [27]:
# testing cross validation on my custom naive estimator
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate, cross_val_score
tscv = TimeSeriesSplit(n_splits=5, test_size = 3)

In [28]:
naive_model = NaiveForecast(horizon = 3)

In [43]:
%%time

cv_scores = cross_val_score(mean_model, tmp_dataset.sales, tmp_dataset.sales,  cv=tscv, 
                            scoring='neg_root_mean_squared_error', n_jobs=1)

CPU times: user 14.6 ms, sys: 997 µs, total: 15.6 ms
Wall time: 13.9 ms


In [44]:
cv_scores

array([ -33.12894826,  -57.5522575 ,  -85.90745859, -130.98460246,
        -86.95046548])

In [29]:
%%time

cv_scores = cross_val_score(naive_model, tmp_dataset.sales, tmp_dataset.sales,  cv=tscv, 
                            scoring='neg_root_mean_squared_error', n_jobs=1)

CPU times: user 12.6 ms, sys: 999 µs, total: 13.6 ms
Wall time: 11.7 ms


In [None]:
# multiple scoring functions

%%time

cv_scores = cross_validate(naive_model, tmp_dataset.sales, tmp_dataset.sales,  cv=tscv, 
                            scoring=['neg_root_mean_squared_error', 'neg_mean_squared_error'], n_jobs=1)

CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms
Wall time: 26.3 ms


In [None]:
cv_scores

{'fit_time': array([0.00107598, 0.00076962, 0.0015161 , 0.00083804, 0.00079799]),
 'score_time': array([0.00239992, 0.00697184, 0.00336385, 0.00220847, 0.0022347 ]),
 'test_neg_mean_squared_error': array([ -1032.33333333,   -299.66666667,  -1492.66666667,  -4085.66666667,
        -10139.        ]),
 'test_neg_root_mean_squared_error': array([ -32.1299445 ,  -17.3108829 ,  -38.63504454,  -63.91921985,
        -100.69260152])}

In [31]:
[i for i in tscv.split(tmp_dataset)][0][0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78])

In [32]:
naive_model.fit(tmp_dataset.sales[[i for i in tscv.split(tmp_dataset)][0][0]])

NaiveForecast(horizon=3)

In [33]:
tmp_dataset.sales[[i for i in tscv.split(tmp_dataset)][0][0]]

DateTime
2019-03-18      8.0
2019-03-25      0.0
2019-04-01     28.0
2019-04-08    105.0
2019-04-15     18.0
              ...  
2020-08-17      0.0
2020-08-24     17.0
2020-08-31     53.0
2020-09-07     36.0
2020-09-14     61.0
Name: sales, Length: 79, dtype: float64

In [34]:
tmp_dataset.sales[[i for i in tscv.split(tmp_dataset)][0][1]]

DateTime
2020-09-21    15.0
2020-09-28    52.0
2020-10-05    91.0
Name: sales, dtype: float64

In [35]:
naive_model.predict(tmp_dataset.sales[[i for i in tscv.split(tmp_dataset)][0][0]])

2020-09-21    61.0
2020-09-28    61.0
2020-10-05    61.0
Freq: W-MON, dtype: float64

In [36]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_true = tmp_dataset.sales[[i for i in tscv.split(tmp_dataset)][0][1]]
                         , y_pred = naive_model.predict(tmp_dataset.sales[[i for i in tscv.split(tmp_dataset)][0][0]])
                         , squared=False)

32.129944496269104

In [37]:
naive_model.prediction_

61.0

In [38]:
# tscv = TimeSeriesSplit(n_splits=5, test_size = 3)
#     for tr_index, val_index in tscv.split(tmp_dataset):
#       print (tr_index, val_index)
#       X_tr, X_val = tmp_dataset[["sales"]].iloc[tr_index], tmp_dataset[["sales"]].iloc[val_index]
#       print(X_tr)
#       print(X_val)
#       forecast = get_naive_forecast(X_tr.sales, horizon = 3)
#       print(forecast)
#       print(mean_absolute_percentage_error(X_val.sales, forecast))

IndentationError: ignored

In [None]:
[i for i in tscv.split(tmp_dataset)]

[(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
         51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
         68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78]), array([79, 80, 81])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
         51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
         68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81]),
  array([82, 83, 84])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
         34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 4

In [56]:
data = np.array([40, 20]*100)
ser = pd.Series(data, index = pd.date_range(start = "2020-01-01", periods = 200, freq ='W-Mon'))

In [59]:
naive_test_model = NaiveForecast(horizon = 3, time_freq="W-MON")

In [64]:
mean_squared_error(ser[-3:], naive_test_model.fit(ser[:-3]).predict(ser[-3:]), squared = False)

16.32993161855452

In [65]:
# simple unit tests

from unittest import TestCase