In [57]:
import datetime
from dateutil.relativedelta import relativedelta
import math
import pickle

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import mlflow
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from hyperopt.pyll import scope
import optuna
from sqlalchemy import create_engine
import pymysql

### MlFlow setup
* Tracking server: no
* Backend store: local filesystem
* Artifacts store: local filesystem

In [73]:
mlflow.set_tracking_uri("http://127.0.0.1:80")

### MySQL data

In [69]:
connection_string = 'mysql+pymysql://application:passpass@127.0.0.1'
database = 'retail_dataset_kaggle'

In [70]:
def get_db_connection(mysql_con_string, database_name):
    # sqlEngine       = create_engine('mysql+pymysql://application:passpass@127.0.0.1/retail_data', pool_recycle=3600)
    sqlEngine       = create_engine(mysql_con_string + '/' + database_name, pool_recycle=3600)
    dbConnection    = sqlEngine.connect()
    return dbConnection

In [71]:
con =  get_db_connection(connection_string, database)

### Prediction Monthly of sales for all stores and SKUS

### Feature Engineering

In [26]:
features_df  = pd.read_sql("select * from retail_dataset_kaggle.store_date_month_agg", con)

In [27]:
features_df.dtypes

Store                 int64
year_month_first     object
Temperature         float64
Fuel_Price          float64
MarkDown1           float64
MarkDown2           float64
MarkDown3           float64
MarkDown4           float64
MarkDown5           float64
CPI                 float64
Unemployment        float64
IsHoliday             int64
month                 int64
dtype: object

In [28]:
sales_df =  pd.read_sql("select * from retail_dataset_kaggle.sales_monthly_agg", con)

In [29]:
sales_df.dtypes

Store                 int64
Dept                  int64
year_month_first     object
Monthly_Sales       float64
dtype: object

In [30]:
stores_df = pd.read_sql("select * from retail_dataset_kaggle.store", con)

In [37]:
feature_eng = pd.merge(stores_df, sales_df, on="Store")
feature_eng = pd.merge(features_df, feature_eng, on=["Store", "year_month_first"])
feature_eng['year_month_first'] =  pd.to_datetime(feature_eng['year_month_first'])
feature_eng["month"] = feature_eng['year_month_first'].dt.month

In [38]:
feature_eng.head(2)

Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,month,Type,Size,Dept,Monthly_Sales
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,8.106,1,2,A,151315,1,131963.08
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,8.106,1,2,A,151315,2,187509.77


### Created Features

In [39]:
def fourier_terms(value, period, num_terms):
    terms = []
    for i in range(1, num_terms + 1):
        terms.extend([np.sin(2 * np.pi * i * value / period),
                      np.cos(2 * np.pi * i * value / period)])
    return terms

In [40]:
# Define the period of each component (in days)
# For example, we can choose 365 days for yearly seasonality and 30 days for monthly seasonality.
period_monthly = 30
num_terms_monthly = 3

In [41]:
feature_eng['monthly_terms'] = feature_eng['month'].apply(fourier_terms, args=(period_monthly, num_terms_monthly))

feature_eng[['monthly_sin_1', 'monthly_cos_1', 'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3']] = pd.DataFrame(feature_eng['monthly_terms'].to_list())
feature_eng.drop(columns=["monthly_terms"], inplace=True)
feature_eng.reset_index(drop=True, inplace=True)

feature_eng.head(2)


Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


### Model Training

In [42]:
train_end_date = "2011-12"
lag = 2 #months
test_start_Date = (datetime.datetime.strptime(train_end_date, "%Y-%m").date()+ relativedelta(months=2)).strftime("%Y-%m")

In [43]:
train = feature_eng[feature_eng["year_month_first"] <= train_end_date].copy()
train.drop(columns=['year_month_first'], inplace=True)
train.reset_index(drop=True, inplace=True)

In [44]:
train.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


In [46]:
test = feature_eng[feature_eng["year_month_first"] >= test_start_Date].copy()
test.drop(columns=['year_month_first'], inplace=True)
test.reset_index(drop=True, inplace=True)

In [19]:
test.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Size,Dept,Monthly_Sales,month,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,151315,1,134683.3,2,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,151315,2,198068.89,2,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


### Define Model

In [74]:
mlflow.set_experiment("my-experiment")
mlflow.xgboost.autolog(log_datasets=False)

2023/07/31 00:46:02 INFO mlflow.tracking.fluent: Experiment with name 'my-experiment' does not exist. Creating a new experiment.


In [75]:
#mlflow.xgboost.autolog()
n_estimators = 10
seed= 123
tree_method = "approx"
enable_categorical = True
objective = 'reg:squarederror'

In [49]:
train.dtypes

Store              int64
Temperature      float64
Fuel_Price       float64
MarkDown1        float64
MarkDown2        float64
MarkDown3        float64
MarkDown4        float64
MarkDown5        float64
CPI              float64
Unemployment     float64
IsHoliday          int64
month              int32
Type              object
Size               int64
Dept               int64
Monthly_Sales    float64
monthly_sin_1    float64
monthly_cos_1    float64
monthly_sin_2    float64
monthly_cos_2    float64
monthly_sin_3    float64
monthly_cos_3    float64
dtype: object

In [76]:
with mlflow.start_run():
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("seed", 123)
    mlflow.log_param("tree_method", "approx")
    mlflow.log_param("enable_categorical", True)
    mlflow.log_param("objective", 'reg:squarederror')

    xgb_r = xgb.XGBRegressor(objective = objective,
                  n_estimators = n_estimators, seed = seed, tree_method=tree_method, enable_categorical=enable_categorical, max_cat_to_onehot=1)
    
    
    # Fitting the model
    cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]
    
    X = train[cols]
    print(X.columns)
    y = train["Monthly_Sales"]
    xgb_r.fit(X, y)
    #save col order
    #save categorical transformer
      
    # Predict the model
    pred = xgb_r.predict(test[cols])

    ## Save model
    filename = "models/xgb_retail.bin"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as file:
        pickle.dump((xgb_r, cols), file)
    # mlflow.xgboost.log_model(xgb_r, artifact_path="model")

  
    
    # RMSE Computation
    mse = mean_squared_error(test["Monthly_Sales"], pred)
    mlflow.log_metric("RMSE", math.sqrt(mse))
    print("RMSE : % f" %(math.sqrt(mse)))

Index(['Store', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday', 'month', 'Size', 'Dept', 'monthly_sin_1', 'monthly_cos_1',
       'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3'],
      dtype='object')




RMSE :  41491.754569


In [None]:
cols

### Hyperparameter Tuning

In [51]:
mlflow.end_run()

In [66]:
def objective(trial):
    params = {
    'tree_method': 'approx',
    'max_depth': trial.suggest_int('max_depth', 1, 9),
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
    'n_estimators': trial.suggest_int('n_estimators', 50, 500),
    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10, log=True),
    'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
    'subsample': trial.suggest_float('subsample', 0.01, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0,log=True),
    'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
    'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }
    with mlflow.start_run():
        mlflow.set_tag("version", "tuned_v2") #add tags to additional groupings
        """Define the objective function"""
    
        # Fit the model
        model = xgb.XGBRegressor(**params)

        cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]
        X = train[cols]
        y = train["Monthly_Sales"]
        model.fit(X, y)
    
        # Make predictions
        y_pred = model.predict(test[cols])
    
        # Evaluate predictions
        mse = mean_squared_error(test["Monthly_Sales"], y_pred)
        rmse = math.sqrt(mse)
        mlflow.log_metric("rmse", math.sqrt(mse))
    return rmse

In [67]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100, timeout=600)

[I 2023-07-29 23:30:26,404] A new study created in memory with name: no-name-17c75258-470e-47ef-9268-445e6cf1eb27
[I 2023-07-29 23:30:28,488] Trial 0 finished with value: 67119.45237729332 and parameters: {'max_depth': 3, 'learning_rate': 0.05663763471261454, 'n_estimators': 168, 'min_child_weight': 1, 'gamma': 0.9203666963727942, 'subsample': 0.15458400491229496, 'colsample_bytree': 0.18318909823826893, 'reg_alpha': 0.0004297230297505684, 'reg_lambda': 6.335487380799085e-08}. Best is trial 0 with value: 67119.45237729332.
[I 2023-07-29 23:30:31,364] Trial 1 finished with value: 55475.106715756534 and parameters: {'max_depth': 8, 'learning_rate': 0.10694867828045648, 'n_estimators': 423, 'min_child_weight': 1, 'gamma': 0.7221822805868401, 'subsample': 0.7697062176988949, 'colsample_bytree': 0.0795789946139105, 'reg_alpha': 0.004495988157127503, 'reg_lambda': 0.00016010676019869135}. Best is trial 1 with value: 55475.106715756534.
[I 2023-07-29 23:30:33,225] Trial 2 finished with value:

In [62]:
study.best_trial

FrozenTrial(number=0, state=TrialState.COMPLETE, values=[67964.84448418471], datetime_start=datetime.datetime(2023, 7, 29, 23, 20, 42, 825205), datetime_complete=datetime.datetime(2023, 7, 29, 23, 20, 46, 847595), params={'max_depth': 2, 'learning_rate': 0.9279097267781184, 'n_estimators': 308, 'min_child_weight': 6, 'gamma': 0.2055048638668166, 'subsample': 0.2152304345905234, 'colsample_bytree': 0.01857075186710803, 'reg_alpha': 2.967452552671559e-05, 'reg_lambda': 0.0007139556712167809}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=9, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=1.0, log=True, low=0.01, step=None), 'n_estimators': IntDistribution(high=500, log=False, low=50, step=1), 'min_child_weight': IntDistribution(high=10, log=True, low=1, step=1), 'gamma': FloatDistribution(high=1.0, log=False, low=1e-08, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.01, step=None), 'colsa