In [1]:
import os
import datetime
from dateutil.relativedelta import relativedelta
import math
import pickle

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sqlalchemy import create_engine
import pymysql

### MySQL data

In [2]:
connection_string = 'mysql+pymysql://application:passpass@127.0.0.1'
database = 'retail_dataset_kaggle'

In [3]:
def get_db_connection(mysql_con_string, database_name):
    # sqlEngine       = create_engine('mysql+pymysql://application:passpass@127.0.0.1/retail_data', pool_recycle=3600)
    sqlEngine       = create_engine(mysql_con_string + '/' + database_name, pool_recycle=3600)
    dbConnection    = sqlEngine.connect()
    return dbConnection

In [4]:
con =  get_db_connection(connection_string, database)

### Prediction Monthly of sales for all stores and SKUS

### Feature Engineering

In [5]:
features_df  = pd.read_sql("select * from retail_dataset_kaggle.store_date_month_agg", con)

In [6]:
features_df.dtypes

Store                 int64
year_month_first     object
Temperature         float64
Fuel_Price          float64
MarkDown1           float64
MarkDown2           float64
MarkDown3           float64
MarkDown4           float64
MarkDown5           float64
CPI                 float64
Unemployment        float64
IsHoliday             int64
month                 int64
dtype: object

In [7]:
sales_df =  pd.read_sql("select * from retail_dataset_kaggle.sales_monthly_agg", con)

In [8]:
sales_df.dtypes

Store                 int64
Dept                  int64
year_month_first     object
Monthly_Sales       float64
dtype: object

In [9]:
stores_df = pd.read_sql("select * from retail_dataset_kaggle.store", con)

In [10]:
feature_eng = pd.merge(stores_df, sales_df, on="Store")
feature_eng = pd.merge(features_df, feature_eng, on=["Store", "year_month_first"])
feature_eng['year_month_first'] =  pd.to_datetime(feature_eng['year_month_first'])
feature_eng["month"] = feature_eng['year_month_first'].dt.month

In [11]:
feature_eng.head(2)

Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,month,Type,Size,Dept,Monthly_Sales
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,8.106,1,2,A,151315,1,131963.08
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,8.106,1,2,A,151315,2,187509.77


### Created Features

In [12]:
def fourier_terms(value, period, num_terms):
    terms = []
    for i in range(1, num_terms + 1):
        terms.extend([np.sin(2 * np.pi * i * value / period),
                      np.cos(2 * np.pi * i * value / period)])
    return terms

In [13]:
# Define the period of each component (in days)
# For example, we can choose 365 days for yearly seasonality and 30 days for monthly seasonality.
period_monthly = 30
num_terms_monthly = 3

In [14]:
feature_eng['monthly_terms'] = feature_eng['month'].apply(fourier_terms, args=(period_monthly, num_terms_monthly))

feature_eng[['monthly_sin_1', 'monthly_cos_1', 'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3']] = pd.DataFrame(feature_eng['monthly_terms'].to_list())
feature_eng.drop(columns=["monthly_terms"], inplace=True)
feature_eng.reset_index(drop=True, inplace=True)

feature_eng.head(2)


Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


## Artificially create two "different" datasets for training and production (fake) - to explain how evidently works

In [15]:
## We are assuming only Type A for training and only Type B in prod
feature_eng_prod = feature_eng[feature_eng.Type == 'B'].copy()
feature_eng = feature_eng[feature_eng.Type == 'A']

### Model Training

In [16]:
train_end_date = "2011-12"
lag = 2 #months
test_start_Date = (datetime.datetime.strptime(train_end_date, "%Y-%m").date()+ relativedelta(months=2)).strftime("%Y-%m")

In [17]:
train = feature_eng[feature_eng["year_month_first"] <= train_end_date].copy()
train.drop(columns=['year_month_first'], inplace=True)
train.reset_index(drop=True, inplace=True)

In [18]:
train.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


In [19]:
test = feature_eng[feature_eng["year_month_first"] >= test_start_Date].copy()
test.drop(columns=['year_month_first'], inplace=True)
test.reset_index(drop=True, inplace=True)

In [20]:
test.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,A,151315,1,134683.3,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,A,151315,2,198068.89,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


### Run and Create Model

In [21]:
#mlflow.xgboost.autolog()
n_estimators = 10
seed= 123
tree_method = "approx"
enable_categorical = True
objective = 'reg:squarederror'

In [22]:
train.dtypes

Store              int64
Temperature      float64
Fuel_Price       float64
MarkDown1        float64
MarkDown2        float64
MarkDown3        float64
MarkDown4        float64
MarkDown5        float64
CPI              float64
Unemployment     float64
IsHoliday          int64
month              int64
Type              object
Size               int64
Dept               int64
Monthly_Sales    float64
monthly_sin_1    float64
monthly_cos_1    float64
monthly_sin_2    float64
monthly_cos_2    float64
monthly_sin_3    float64
monthly_cos_3    float64
dtype: object

In [23]:

xgb_r = xgb.XGBRegressor(objective = objective,
              n_estimators = n_estimators, seed = seed, tree_method=tree_method, enable_categorical=enable_categorical, max_cat_to_onehot=1)


# Fitting the model
cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]

X = train[cols]
print(X.columns)
y = train["Monthly_Sales"]
xgb_r.fit(X, y)
#save col order
#save categorical transformer
  
# Predict the model
pred = xgb_r.predict(test[cols])


# RMSE Computation
mse = mean_squared_error(test["Monthly_Sales"], pred)
print("RMSE : % f" %(math.sqrt(mse)))

Index(['Store', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday', 'month', 'Size', 'Dept', 'monthly_sin_1', 'monthly_cos_1',
       'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3'],
      dtype='object')
RMSE :  42496.484688


## Model in production (use the fake prod data created)

In [24]:
feature_eng_prod.head(2)

Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
4821,3,2010-02-01,47.5,2.5545,,,,,,214.566768,...,B,37392,1,33112.61,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
4822,3,2010-02-01,47.5,2.5545,,,,,,214.566768,...,B,37392,2,63532.36,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


In [25]:
# Fitting the model
cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]

X_prod = feature_eng_prod[cols]
print(X_prod.columns)
y_prod = feature_eng_prod["Monthly_Sales"]

Index(['Store', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday', 'month', 'Size', 'Dept', 'monthly_sin_1', 'monthly_cos_1',
       'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3'],
      dtype='object')


In [26]:
# Predict the model
pred = xgb_r.predict(X_prod[cols])


# RMSE Computation
mse = mean_squared_error(y_prod, pred)
print("RMSE : % f" %(math.sqrt(mse)))

RMSE :  67923.180951


## Analyze with Evidently

Examples: https://github.com/evidentlyai/evidently/blob/main/examples/sample_notebooks/getting_started_tutorial.ipynb

### Kill this session run the following commands on terminal before restarting jupyter notebook (if you want viz to appear on jupyter notebook env)
jupyter nbextension install --sys-prefix --symlink --overwrite --py evidently

jupyter nbextension enable evidently --py --sys-prefix

In [27]:
import evidently
evidently.__version__

'0.4.1'

In [28]:
from evidently import ColumnMapping

from evidently.report import Report
from evidently.metrics.base_metric import generate_column_metrics
from evidently.metric_preset import DataDriftPreset, TargetDriftPreset, DataQualityPreset, RegressionPreset
from evidently.metrics import *

from evidently.test_suite import TestSuite
from evidently.tests.base_test import generate_column_tests
from evidently.test_preset import DataStabilityTestPreset, NoTargetPerformanceTestPreset, RegressionTestPreset
from evidently.tests import *

In [39]:
reference = X
current = X_prod

In [40]:
report = Report(metrics=[
    DataDriftPreset(), 
])

report.run(reference_data=reference, current_data=current)
report.save_html("datadrift.html")

In [41]:
report.as_dict()

{'metrics': [{'metric': 'DatasetDriftMetric',
   'result': {'drift_share': 0.5,
    'number_of_columns': 20,
    'number_of_drifted_columns': 13,
    'share_of_drifted_columns': 0.65,
    'dataset_drift': True}},
  {'metric': 'DataDriftTable',
   'result': {'number_of_columns': 20,
    'number_of_drifted_columns': 13,
    'share_of_drifted_columns': 0.65,
    'dataset_drift': True,
    'drift_by_columns': {'CPI': {'column_name': 'CPI',
      'column_type': 'num',
      'stattest_name': 'Wasserstein distance (normed)',
      'stattest_threshold': 0.1,
      'drift_score': 0.1941171524911442,
      'drift_detected': True,
      'current': {'small_distribution': {'x': [126.078846775,
         136.1855300775,
         146.29221338,
         156.3988966825,
         166.505579985,
         176.6122632875,
         186.71894659,
         196.8256298925,
         206.932313195,
         217.0389964975,
         227.14567979999998],
        'y': [0.033113819349259145,
         0.02029433521566

In [42]:
report.as_dict()

{'metrics': [{'metric': 'DatasetDriftMetric',
   'result': {'drift_share': 0.5,
    'number_of_columns': 20,
    'number_of_drifted_columns': 13,
    'share_of_drifted_columns': 0.65,
    'dataset_drift': True}},
  {'metric': 'DataDriftTable',
   'result': {'number_of_columns': 20,
    'number_of_drifted_columns': 13,
    'share_of_drifted_columns': 0.65,
    'dataset_drift': True,
    'drift_by_columns': {'CPI': {'column_name': 'CPI',
      'column_type': 'num',
      'stattest_name': 'Wasserstein distance (normed)',
      'stattest_threshold': 0.1,
      'drift_score': 0.1941171524911442,
      'drift_detected': True,
      'current': {'small_distribution': {'x': [126.078846775,
         136.1855300775,
         146.29221338,
         156.3988966825,
         166.505579985,
         176.6122632875,
         186.71894659,
         196.8256298925,
         206.932313195,
         217.0389964975,
         227.14567979999998],
        'y': [0.033113819349259145,
         0.02029433521566

In [43]:
tests = TestSuite(tests=[
    TestNumberOfColumnsWithMissingValues(),
    TestNumberOfRowsWithMissingValues(),
    TestNumberOfConstantColumns(),
    TestNumberOfDuplicatedRows(),
    TestNumberOfDuplicatedColumns(),
    TestColumnsType(),
    TestNumberOfDriftedColumns(),
])

tests.run(reference_data=reference, current_data=current)
tests.save_html("tests.html")


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison

