# Forecasting beer sales - arima

In [53]:
import pandas as pd
import numpy as np
import os

import pmdarima as pm

from pmdarima.arima import auto_arima
from pmdarima.datasets import load_lynx
import joblib
import pickle

In [54]:
DATASET_NAME = 'off-trade' # all_cz, off-trade_cz, on-trade_cz

DATA_PATH = f'./data/{DATASET_NAME}_data.csv'

MODEL_PATH = f'./models/{DATASET_NAME}_model.joblib'

RESULT_CSV_PATH = f'./results/{DATASET_NAME}_result.csv'
RESULT_EXCEL_PATH = f'./results/{DATASET_NAME}_result.xlsx'

In [62]:
data = pd.read_csv(DATA_PATH, header=0, decimal=",")

data['Date'] = data[['Year','Week']].apply(lambda x : datetime.datetime.strptime(f'{x[0]}-W{x[1]}-1', "%Y-W%W-%w"), axis=1)
data.set_index("Date", drop=False, inplace=True)

data.head()

Unnamed: 0_level_0,SkuShort,ProductGroup,PrimaryPack,Country,Year,Week,NumberWorkdays,AvgTemp,AvgRain,AvgSun,IsLockdown,PdtHl,PrevWeekPdtHl1,BgtHl,PrevWeekBgtHl1,SalesHl,PrevWeekSalesHl1,PrevWeekSalesHl2,OldPredSalesHl,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2016-01-04,1015,CLEAR BEER,RB,CZ,2016,1,5,-1.164286,0.935714,1.014286,0,,,1637.344974,,1805.0,,,705.4995,2016-01-04
2016-01-04,1016,CLEAR BEER,RB,CZ,2016,1,5,-1.164286,0.935714,1.014286,0,,,293.344724,,159.1128,,,160.0957,2016-01-04
2016-01-04,1022,CLEAR BEER,RB,CZ,2016,1,5,-1.164286,0.935714,1.014286,0,,,59.144115,,72.49,,,74.442,2016-01-04
2016-01-04,1026,CLEAR BEER,RB,CZ,2016,1,5,-1.164286,0.935714,1.014286,0,,,10.669629,,23.8392,,,8.8567,2016-01-04
2016-01-04,1027,CLEAR BEER,RB,CZ,2016,1,5,-1.164286,0.935714,1.014286,0,,,2.109411,,3.96,,,4.9003,2016-01-04


In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 44195 entries, 2016-01-04 to 2021-01-04
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   SkuShort          44195 non-null  int64         
 1   ProductGroup      44195 non-null  object        
 2   PrimaryPack       44195 non-null  object        
 3   Country           44195 non-null  object        
 4   Year              44195 non-null  int64         
 5   Week              44195 non-null  int64         
 6   NumberWorkdays    44195 non-null  int64         
 7   AvgTemp           44195 non-null  float64       
 8   AvgRain           44195 non-null  float64       
 9   AvgSun            44195 non-null  float64       
 10  IsLockdown        44195 non-null  int64         
 11  PdtHl             29061 non-null  float64       
 12  PrevWeekPdtHl1    29071 non-null  float64       
 13  BgtHl             35449 non-null  float64       
 14  PrevW

In [63]:
cols_drop = ['Year', 'SalesHl', 'OldPredSalesHl']
#'SkuShort', 'ProductGroup', 'PrimaryPack', 'Country',

y = pd.DataFrame(data.SalesHl).round(0).astype(int)
X = data.drop(cols_drop, axis=1).fillna(0)

X.IsLockdown = X.IsLockdown.astype('bool')
X.SkuShort = X.SkuShort.astype('category')
X.ProductGroup = X.SkuShort.astype('category')
X.PrimaryPack = X.SkuShort.astype('category')
X.Country = X.Country.astype('category').cat.codes
X['PrevWeekSalesDiff'] = X.PrevWeekSalesHl1-X.PrevWeekSalesHl2

#X = X.drop(['PrevWeekSalesHl1', 'PrevWeekSalesHl2'], axis=1)

# X.PrevWeekSalesHl1 = X.PrevWeekSalesHl1.round(0).astype(int)
# X.PrevWeekSalesHl2 = X.PrevWeekSalesHl2.round(0).astype(int)
# X.BgtHl = X.BgtHl.round(0).astype(int)
# X.PdtHl = X.PdtHl.round(0).astype(int)


y_oldPred = pd.DataFrame(data.OldPredSalesHl).fillna(0).round(0).astype(int)

#y.head()
X.head()

Unnamed: 0_level_0,SkuShort,ProductGroup,PrimaryPack,Country,Week,NumberWorkdays,AvgTemp,AvgRain,AvgSun,IsLockdown,PdtHl,PrevWeekPdtHl1,BgtHl,PrevWeekBgtHl1,PrevWeekSalesHl1,PrevWeekSalesHl2,Date,PrevWeekSalesDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-01-04,1015,1015,1015,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,1637.344974,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1016,1016,1016,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,293.344724,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1022,1022,1022,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,59.144115,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1026,1026,1026,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,10.669629,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1027,1027,1027,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,2.109411,0.0,0.0,0.0,2016-01-04,0.0


In [64]:
from pmdarima import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

X_train.head()


Unnamed: 0_level_0,SkuShort,ProductGroup,PrimaryPack,Country,Week,NumberWorkdays,AvgTemp,AvgRain,AvgSun,IsLockdown,PdtHl,PrevWeekPdtHl1,BgtHl,PrevWeekBgtHl1,PrevWeekSalesHl1,PrevWeekSalesHl2,Date,PrevWeekSalesDiff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-01-04,1015,1015,1015,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,1637.344974,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1016,1016,1016,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,293.344724,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1022,1022,1022,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,59.144115,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1026,1026,1026,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,10.669629,0.0,0.0,0.0,2016-01-04,0.0
2016-01-04,1027,1027,1027,0,1,5,-1.164286,0.935714,1.014286,False,0.0,0.0,2.109411,0.0,0.0,0.0,2016-01-04,0.0


In [66]:

# Fit a simple auto_arima model
arima = pm.auto_arima(X_train, trace=True,
                      suppress_warnings=True, maxiter=10,
                      seasonal=True, m=52)

arima.summary()

# # #############################################################################
# # Plot actual test vs. forecasts:
# x = np.arange(test.shape[0])
# plt.scatter(X_train, test, marker='x')
# plt.plot(X_train, arima.predict(n_periods=test.shape[0]))
# plt.title('Actual test samples vs. forecasts')
# plt.show()

TypeError: float() argument must be a string or a number, not 'Timestamp'