In [30]:
import os
import datetime
from dateutil.relativedelta import relativedelta
import math
import pickle

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import mlflow
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from hyperopt.pyll import scope
import optuna
from sqlalchemy import create_engine
import pymysql

### MlFlow setup
* Tracking server: no
* Backend store: local filesystem
* Artifacts store: local filesystem

In [31]:
mlflow.set_tracking_uri("http://127.0.0.1:80")

### MySQL data

In [32]:
connection_string = 'mysql+pymysql://application:passpass@127.0.0.1'
database = 'retail_dataset_kaggle'

In [33]:
def get_db_connection(mysql_con_string, database_name):
    # sqlEngine       = create_engine('mysql+pymysql://application:passpass@127.0.0.1/retail_data', pool_recycle=3600)
    sqlEngine       = create_engine(mysql_con_string + '/' + database_name, pool_recycle=3600)
    dbConnection    = sqlEngine.connect()
    return dbConnection

In [34]:
con =  get_db_connection(connection_string, database)

### Prediction Monthly of sales for all stores and SKUS

### Feature Engineering

In [35]:
features_df  = pd.read_sql("select * from retail_dataset_kaggle.store_date_month_agg", con)

In [36]:
features_df.dtypes

Store                 int64
year_month_first     object
Temperature         float64
Fuel_Price          float64
MarkDown1           float64
MarkDown2           float64
MarkDown3           float64
MarkDown4           float64
MarkDown5           float64
CPI                 float64
Unemployment        float64
IsHoliday             int64
month                 int64
dtype: object

In [37]:
sales_df =  pd.read_sql("select * from retail_dataset_kaggle.sales_monthly_agg", con)

In [38]:
sales_df.dtypes

Store                 int64
Dept                  int64
year_month_first     object
Monthly_Sales       float64
dtype: object

In [39]:
stores_df = pd.read_sql("select * from retail_dataset_kaggle.store", con)

In [40]:
feature_eng = pd.merge(stores_df, sales_df, on="Store")
feature_eng = pd.merge(features_df, feature_eng, on=["Store", "year_month_first"])
feature_eng['year_month_first'] =  pd.to_datetime(feature_eng['year_month_first'])
feature_eng["month"] = feature_eng['year_month_first'].dt.month

In [41]:
feature_eng.head(2)

Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,month,Type,Size,Dept,Monthly_Sales
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,8.106,1,2,A,151315,1,131963.08
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,8.106,1,2,A,151315,2,187509.77


### Created Features

In [42]:
def fourier_terms(value, period, num_terms):
    terms = []
    for i in range(1, num_terms + 1):
        terms.extend([np.sin(2 * np.pi * i * value / period),
                      np.cos(2 * np.pi * i * value / period)])
    return terms

In [43]:
# Define the period of each component (in days)
# For example, we can choose 365 days for yearly seasonality and 30 days for monthly seasonality.
period_monthly = 30
num_terms_monthly = 3

In [44]:
feature_eng['monthly_terms'] = feature_eng['month'].apply(fourier_terms, args=(period_monthly, num_terms_monthly))

feature_eng[['monthly_sin_1', 'monthly_cos_1', 'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3']] = pd.DataFrame(feature_eng['monthly_terms'].to_list())
feature_eng.drop(columns=["monthly_terms"], inplace=True)
feature_eng.reset_index(drop=True, inplace=True)

feature_eng.head(2)


Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


### Model Training

In [45]:
train_end_date = "2011-12"
lag = 2 #months
test_start_Date = (datetime.datetime.strptime(train_end_date, "%Y-%m").date()+ relativedelta(months=2)).strftime("%Y-%m")

In [46]:
train = feature_eng[feature_eng["year_month_first"] <= train_end_date].copy()
train.drop(columns=['year_month_first'], inplace=True)
train.reset_index(drop=True, inplace=True)

In [47]:
train.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


In [48]:
test = feature_eng[feature_eng["year_month_first"] >= test_start_Date].copy()
test.drop(columns=['year_month_first'], inplace=True)
test.reset_index(drop=True, inplace=True)

In [49]:
test.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,A,151315,1,134683.3,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,A,151315,2,198068.89,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


### Define Model

In [50]:
mlflow.set_experiment("my-experiment")
mlflow.xgboost.autolog(log_datasets=False)

2023/07/31 23:20:02 INFO mlflow.tracking.fluent: Experiment with name 'my-experiment' does not exist. Creating a new experiment.


In [51]:
#mlflow.xgboost.autolog()
n_estimators = 10
seed= 123
tree_method = "approx"
enable_categorical = True
objective = 'reg:squarederror'

In [52]:
train.dtypes

Store              int64
Temperature      float64
Fuel_Price       float64
MarkDown1        float64
MarkDown2        float64
MarkDown3        float64
MarkDown4        float64
MarkDown5        float64
CPI              float64
Unemployment     float64
IsHoliday          int64
month              int32
Type              object
Size               int64
Dept               int64
Monthly_Sales    float64
monthly_sin_1    float64
monthly_cos_1    float64
monthly_sin_2    float64
monthly_cos_2    float64
monthly_sin_3    float64
monthly_cos_3    float64
dtype: object

In [53]:
with mlflow.start_run():
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("seed", 123)
    mlflow.log_param("tree_method", "approx")
    mlflow.log_param("enable_categorical", True)
    mlflow.log_param("objective", 'reg:squarederror')

    xgb_r = xgb.XGBRegressor(objective = objective,
                  n_estimators = n_estimators, seed = seed, tree_method=tree_method, enable_categorical=enable_categorical, max_cat_to_onehot=1)
    
    
    # Fitting the model
    cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]
    
    X = train[cols]
    print(X.columns)
    y = train["Monthly_Sales"]
    xgb_r.fit(X, y)
    #save col order
    #save categorical transformer
      
    # Predict the model
    pred = xgb_r.predict(test[cols])

    ## Save model
    filename = "models/xgb_retail.bin"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as file:
        pickle.dump((xgb_r, cols), file)
    # mlflow.xgboost.log_model(xgb_r, artifact_path="model")

  
    
    # RMSE Computation
    mse = mean_squared_error(test["Monthly_Sales"], pred)
    mlflow.log_metric("RMSE", math.sqrt(mse))
    print("RMSE : % f" %(math.sqrt(mse)))

Index(['Store', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday', 'month', 'Size', 'Dept', 'monthly_sin_1', 'monthly_cos_1',
       'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3'],
      dtype='object')




RMSE :  41491.754569


In [54]:
cols

['Store',
 'Temperature',
 'Fuel_Price',
 'MarkDown1',
 'MarkDown2',
 'MarkDown3',
 'MarkDown4',
 'MarkDown5',
 'CPI',
 'Unemployment',
 'IsHoliday',
 'month',
 'Size',
 'Dept',
 'monthly_sin_1',
 'monthly_cos_1',
 'monthly_sin_2',
 'monthly_cos_2',
 'monthly_sin_3',
 'monthly_cos_3']

### Hyperparameter Tuning

In [55]:
mlflow.end_run()

In [56]:
def objective(trial):
    params = {
    'tree_method': 'approx',
    'max_depth': trial.suggest_int('max_depth', 1, 9),
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
    'n_estimators': trial.suggest_int('n_estimators', 50, 500),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10, log=True),
    'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
    'subsample': trial.suggest_float('subsample', 0.01, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0,log=True),
    'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
    'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    with mlflow.start_run():
        mlflow.set_tag("version", "tuned_v2") #add tags to additional groupings
        """Define the objective function"""
    
        # Fit the model
        model = xgb.XGBRegressor(**params)

        cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]
        X = train[cols]
        y = train["Monthly_Sales"]
        model.fit(X, y)
    
        # Make predictions
        y_pred = model.predict(test[cols])
    
        # Evaluate predictions
        mse = mean_squared_error(test["Monthly_Sales"], y_pred)
        rmse = math.sqrt(mse)
        mlflow.log_metric("rmse", math.sqrt(mse))
    return rmse

In [57]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, timeout=600)

[I 2023-07-31 23:20:04,164] A new study created in memory with name: no-name-4ac680df-fb9c-4556-aa7d-108f47f25823
[I 2023-07-31 23:20:06,218] Trial 0 finished with value: 63522.75027567473 and parameters: {'max_depth': 9, 'learning_rate': 0.08362650131550022, 'n_estimators': 186, 'min_child_weight': 10, 'gamma': 0.9323703022889173, 'subsample': 0.5221705573915116, 'colsample_bytree': 0.010409166512049466, 'reg_alpha': 0.0024199444196875966, 'reg_lambda': 1.2410212458437055e-07}. Best is trial 0 with value: 63522.75027567473.
[I 2023-07-31 23:20:07,889] Trial 1 finished with value: 54066.514161821266 and parameters: {'max_depth': 3, 'learning_rate': 0.11869201869548698, 'n_estimators': 136, 'min_child_weight': 1, 'gamma': 0.25219052211033166, 'subsample': 0.9418401731472217, 'colsample_bytree': 0.28565915227121674, 'reg_alpha': 9.299680997334872e-08, 'reg_lambda': 0.0012918001475784081}. Best is trial 1 with value: 54066.514161821266.
[I 2023-07-31 23:20:09,597] Trial 2 finished with va

In [58]:
study.best_trial

FrozenTrial(number=18, state=TrialState.COMPLETE, values=[24675.795781190864], datetime_start=datetime.datetime(2023, 7, 31, 23, 20, 45, 940318), datetime_complete=datetime.datetime(2023, 7, 31, 23, 20, 49, 524045), params={'max_depth': 9, 'learning_rate': 0.05315811033889901, 'n_estimators': 262, 'min_child_weight': 1, 'gamma': 0.8243355569421869, 'subsample': 0.9868564918975349, 'colsample_bytree': 0.7329699294510453, 'reg_alpha': 0.8364821348479742, 'reg_lambda': 2.8303667351892163e-06}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=9, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=1.0, log=True, low=0.01, step=None), 'n_estimators': IntDistribution(high=500, log=False, low=50, step=1), 'min_child_weight': IntDistribution(high=10, log=True, low=1, step=1), 'gamma': FloatDistribution(high=1.0, log=False, low=1e-08, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.01, step=None), 'colsa