#Libraries and data

In [1]:
#libraries
import numpy as np
import pandas as pd
from greykite.framework.templates.autogen.forecast_config import *
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.common.features.timeseries_features import *
from greykite.common.evaluation import EvaluationMetricEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results
from plotly.offline import iplot

In [5]:
df = pd.read_csv('./files/furniture_decor_data.csv')
df

Unnamed: 0.1,Unnamed: 0,date,price,freight_value,quantity_sold,cci_value,unemr_value,inflation_value
0,0,2017-07-25 10:50:22,75.0,13.08,1,99.34319,12.8,2.711626
1,1,2017-07-28 14:20:19,75.0,20.02,1,99.34319,12.8,2.711626
2,2,2017-07-28 15:56:14,75.0,20.02,1,99.34319,12.8,2.711626
3,3,2017-07-29 13:22:44,75.0,13.08,2,99.34319,12.8,2.711626
4,4,2017-08-01 13:11:47,75.0,13.08,3,99.37289,12.6,2.455909
...,...,...,...,...,...,...,...,...
420,420,2018-07-11 18:26:53,89.9,32.65,1,99.66840,12.3,4.484844
421,421,2018-07-22 13:09:08,89.9,26.93,1,99.66840,12.3,4.484844
422,422,2018-08-06 15:21:10,89.9,17.76,1,99.62103,12.1,4.193016
423,423,2018-08-09 11:07:40,109.9,22.07,1,99.62103,12.1,4.193016


In [6]:
# Drop the 'Unnamed: 0' column
df = df.drop(columns=['Unnamed: 0'])

# Extract the date portion from the 'date' column
df['date'] = pd.to_datetime(df['date']).dt.date

# Convert the 'date' column to 'dd/mm/yyyy' format
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.strftime('%d/%m/%Y')

df.head()

Unnamed: 0,date,price,freight_value,quantity_sold,cci_value,unemr_value,inflation_value
0,25/07/2017,75.0,13.08,1,99.34319,12.8,2.711626
1,28/07/2017,75.0,20.02,1,99.34319,12.8,2.711626
2,28/07/2017,75.0,20.02,1,99.34319,12.8,2.711626
3,29/07/2017,75.0,13.08,2,99.34319,12.8,2.711626
4,01/08/2017,75.0,13.08,3,99.37289,12.6,2.455909


In [10]:
# Convert the 'date' column in the 'train' DataFrame to datetime format
df['date'] = pd.to_datetime(df['date'])


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.



In [11]:
#inspecting df
df.dtypes


date               datetime64[ns]
price                     float64
freight_value             float64
quantity_sold               int64
cci_value                 float64
unemr_value               float64
inflation_value           float64
dtype: object

In [25]:
#Rename variable
df = df.rename(columns = {'quantity_sold': 'y'})
df.head(0)

Unnamed: 0,date,price,freight_value,y,cci_value,unemr_value,inflation_value


In [40]:
df.isnull().sum()

date               0
price              0
freight_value      0
y                  0
cci_value          0
unemr_value        0
inflation_value    0
dtype: int64

#Silverkite Preparations

In [26]:
#Specifying Time Series names
metadata = MetadataParam(time_col = "date",
                         value_col = "y",
                         freq = "D",
                         train_end_date = pd.to_datetime("2018-08-18"))
metadata

MetadataParam(anomaly_info=None, date_format=None, freq='D', time_col='date', train_end_date=Timestamp('2018-08-18 00:00:00'), value_col='y')

In [27]:
#growth terms possibilities
growth = dict(growth_term = ["linear", "quadratic", "sqrt"])
growth

{'growth_term': ['linear', 'quadratic', 'sqrt']}

In [28]:
#seasonalities
seasonality = dict(yearly_seasonality = "auto",
                   quarterly_seasonality = "auto",
                   monthly_seasonality = "auto",
                   weekly_seasonality = "auto",
                   daily_seasonality = "auto")
seasonality

{'yearly_seasonality': 'auto',
 'quarterly_seasonality': 'auto',
 'monthly_seasonality': 'auto',
 'weekly_seasonality': 'auto',
 'daily_seasonality': 'auto'}

In [29]:
#checking which countries are available and their holidays
get_available_holiday_lookup_countries(["Brazil"])
get_available_holidays_across_countries(countries = ["Brazil"],
                                        year_start = 2017,
                                        year_end = 2018)

['All Souls Day',
 'Ash Wednesday',
 'Christmas Day',
 'Corpus Christi Holiday',
 'Easter',
 'Good Friday',
 'Independence Day',
 'Labor Day',
 "New Year's Day",
 "Our Lady of Apparecida's Day",
 'Pentecost',
 'Proclamation of the Republic',
 'Rio Carnival',
 'Tiradentes',
 'Trinity Sunday']

In [30]:
#Changepoints -> reflects the changes in the trend
changepoints = dict(changepoints_dict = dict(method = "auto"))

In [31]:
#Regressors
regressors = dict(regressor_cols = ["price", "freight_value", "cci_value",
                                    "unemr_value","inflation_value"])
regressors

{'regressor_cols': ['price',
  'freight_value',
  'cci_value',
  'unemr_value',
  'inflation_value']}

In [32]:
#Lagged Regressors
lagged_regressors = dict(lagged_regressor_dict = {"price": "auto",
                                                  "freight_value": "auto",
                                                  "cci_value": "auto",
                                                  "unemr_value": "auto",
                                                  "inflation_value": "auto",})

In [33]:
#autogression -> dependent on the forecasting horizon
autoregression = dict(autoreg_dict = "auto")

In [34]:
#Fitting algorithms
custom = dict(fit_algorithm_dict = [dict(fit_algorithm = "linear"),
                                    dict(fit_algorithm = "ridge"),
                                    dict(fit_algorithm = "rf"),
                                    dict(fit_algorithm = "gradient_boosting")])
custom

{'fit_algorithm_dict': [{'fit_algorithm': 'linear'},
  {'fit_algorithm': 'ridge'},
  {'fit_algorithm': 'rf'},
  {'fit_algorithm': 'gradient_boosting'}]}

#Silverkite Model

In [35]:
#Build the model
model_components = ModelComponentsParam(growth = growth,
                                        seasonality = seasonality,
                                        changepoints = changepoints,
                                        regressors = regressors,
                                        lagged_regressors = lagged_regressors,
                                        autoregression = autoregression,
                                        custom = custom)

In [36]:
#Cross-validation
evaluation_period = EvaluationPeriodParam(cv_min_train_periods= df.shape[0] - 180 -31,
                                          cv_expanding_window = True,
                                          cv_max_splits = 50,
                                          cv_periods_between_splits = 16)

In [37]:
#Evaluation metric 
evaluation_metric = EvaluationMetricParam(
    cv_selection_metric = EvaluationMetricEnum.RootMeanSquaredError.name)

In [38]:
#Configuration
config = ForecastConfig(model_template = ModelTemplateEnum.SILVERKITE.name,
                        forecast_horizon = 31,
                        metadata_param = metadata,
                        model_components_param = model_components,
                        evaluation_period_param=evaluation_period,
                        evaluation_metric_param = evaluation_metric)

In [39]:
#Forecasting
forecaster = Forecaster()
result = forecaster.run_forecast_config(df = df,
                                        config = config)


Duplicate timestamps have been removed.


Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.


Duplicate timestamps have been removed.



Fitting 8 folds for each of 12 candidates, totalling 96 fits



Input data has many null values. Missing 61.11% of one input.


Input data has many null values. Missing 61.11% of one input.




The following Fourier series terms are removed due to collinearity:
['cos4_tow_weekly']


Input data has many null values. Missing 12.90% of one input.


Input data has many null values. Missing 12.90% of one input.


4 value(s) in y_true were NA or infinite and are omitted in error calc.


Input data has many null values. Missing 12.90% of one input.


Input data has many null values. Missing 12.90% of one input.


4 value(s) in y_true were NA or infinite and are omitted in error calc.


Input data has many null values. Missing 12.90% of one input.


Input data has many null values. Missing 12.90% of one input.


4 value(s) in y_true were NA or infinite and are omitted in error calc.


Input data has many null values. Missing 12.90% of one input.


Input data has many null values. Missing 12.90% of one input.


4 value(s) in y_true were NA or infinite and are omitted in error calc.


Input data has many null values. Missing 12.90% of one input.


Input data has many null values. Miss

ValueError: ``fut_df`` must be a dataframe of non-zero size.

In [None]:
#visualization
fig = result.backtest.plot()
iplot(fig)

#Parameter Tuning results

In [None]:
#CV results
cv_results = summarize_grid_search_results(
    grid_search = result.grid_search,
    decimals = 1,
    score_func = EvaluationMetricEnum.RootMeanSquaredError.name)

In [None]:
#Set the CV results index
cv_results["params"] = cv_results["params"].astype(str)
cv_results.set_index("params", drop = True, inplace = True)
cv_results

Unnamed: 0_level_0,rank_test_CORR,rank_test_R2,rank_test_MSE,rank_test_RMSE,rank_test_MAE,rank_test_MedAE,rank_test_MAPE,rank_test_MedAPE,rank_test_sMAPE,rank_test_Q80,...,std_test_OutsideTolerance5p,split0_train_OutsideTolerance5p,split1_train_OutsideTolerance5p,split2_train_OutsideTolerance5p,split3_train_OutsideTolerance5p,split4_train_OutsideTolerance5p,split5_train_OutsideTolerance5p,split6_train_OutsideTolerance5p,split7_train_OutsideTolerance5p,std_train_OutsideTolerance5p
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,2,2,2,1,3,1,3,1,8,...,0.1,0.5,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",1,1,1,1,2,1,2,1,2,6,...,0.1,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,3,3,3,3,2,3,2,3,7,...,0.1,0.5,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,6,6,6,6,6,5,6,6,12,...,0.0,0.4,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",4,4,4,4,4,4,4,4,4,10,...,0.1,0.4,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,5,5,5,5,5,6,5,5,11,...,0.0,0.4,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.0
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",10,10,10,10,10,11,10,10,10,5,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",12,11,11,11,12,10,12,11,12,9,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",11,12,12,12,11,12,11,12,11,4,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",8,7,7,7,7,7,7,7,7,1,...,0.1,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.0


In [None]:
#Looking at the best results
cv_results[["rank_test_RMSE", "mean_test_RMSE",
            "param_estimator__fit_algorithm_dict",
            "param_estimator__growth_term"]]

Unnamed: 0_level_0,rank_test_RMSE,mean_test_RMSE,param_estimator__fit_algorithm_dict,param_estimator__growth_term
params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",2,47.8,{'fit_algorithm': 'linear'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",1,47.8,{'fit_algorithm': 'linear'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'linear'})]",3,47.9,{'fit_algorithm': 'linear'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",6,48.9,{'fit_algorithm': 'ridge'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",4,48.5,{'fit_algorithm': 'ridge'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'ridge'})]",5,48.9,{'fit_algorithm': 'ridge'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",10,63.6,{'fit_algorithm': 'rf'},linear
"[('estimator__growth_term', 'quadratic'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",11,64.8,{'fit_algorithm': 'rf'},quadratic
"[('estimator__growth_term', 'sqrt'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'rf'})]",12,65.2,{'fit_algorithm': 'rf'},sqrt
"[('estimator__growth_term', 'linear'), ('estimator__fit_algorithm_dict', {'fit_algorithm': 'gradient_boosting'})]",7,56.5,{'fit_algorithm': 'gradient_boosting'},linear


In [None]:
best_params = cv_results[cv_results.rank_test_RMSE == 1][["mean_test_RMSE",
                                            "param_estimator__fit_algorithm_dict",
                                            "param_estimator__growth_term"]].transpose()

In [None]:
best_params.to_csv("C:/Users/ratho/Desktop/new/learn/time_series_forecasting/Forecasting Product/best_params_silverkite_lr.csv")