In [11]:
import os
import datetime
from dateutil.relativedelta import relativedelta
import math
import pickle

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import mlflow
# from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# from hyperopt.pyll import scope
import optuna
from sqlalchemy import create_engine
import pymysql

### https://mlflow.org/docs/latest/tracking.html#where-runs-are-recorded

### MlFlow setup 1
* Tracking server: No
* Backend store: local filesystem
* Artifacts store: local filesystem

In [16]:
mlflow.get_tracking_uri()

'file:///Users/kanchanapadmanabhan/Library/CloudStorage/OneDrive-Personal/Personal-Course/Vector/Model%20Deployment/mlops_course/Week%201/experiment_tracking/mlruns'

### MlFlow setup 2
* Tracking server: No
* Backend store: SQLite
* Artifacts store: local filesystem

In [32]:
mlflow.set_tracking_uri("mysql+pymysql://application:passpass@127.0.0.1/mlflow_backend")

In [33]:
mlflow.get_tracking_uri()

'mysql+pymysql://application:passpass@127.0.0.1/mlflow_backend'

### MlFlow setup 3
* Tracking server: Yes (local or remote)
* Backend store: SQLite
* Artifacts store: local filesystem

In [62]:
mlflow.set_tracking_uri("http://127.0.0.1:80")

### Model Training

In [3]:
feature_eng = pd.read_csv("sample_features.csv")

In [5]:
feature_eng.head()

Unnamed: 0,Store,year_month_first,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
2,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,3,47286.6,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
3,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,4,146792.36,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
4,1,2010-02-01,41.12,2.5545,,,,,,211.236828,...,A,151315,5,112420.35,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


In [6]:
train_end_date = "2011-12"
lag = 2 #months
test_start_Date = (datetime.datetime.strptime(train_end_date, "%Y-%m").date()+ relativedelta(months=2)).strftime("%Y-%m")

In [7]:
train = feature_eng[feature_eng["year_month_first"] <= train_end_date].copy()
train.drop(columns=['year_month_first'], inplace=True)
train.reset_index(drop=True, inplace=True)

In [8]:
train.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,1,131963.08,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,41.12,2.5545,,,,,,211.236828,8.106,...,A,151315,2,187509.77,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


In [9]:
test = feature_eng[feature_eng["year_month_first"] >= test_start_Date].copy()
test.drop(columns=['year_month_first'], inplace=True)
test.reset_index(drop=True, inplace=True)

In [10]:
test.head(2)

Unnamed: 0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,...,Type,Size,Dept,Monthly_Sales,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3
0,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,A,151315,1,134683.3,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017
1,1,52.285,3.4595,16931.265,7281.18,68.7925,12626.245,6015.6725,220.374964,7.348,...,A,151315,2,198068.89,0.406737,0.913545,0.743145,0.669131,0.951057,0.309017


### Define Model

In [34]:
mlflow.end_run()

In [63]:
mlflow.set_experiment("my-experiment_test_mlflow")
mlflow.xgboost.autolog(log_datasets=False)

2023/08/07 22:31:43 INFO mlflow.tracking.fluent: Experiment with name 'my-experiment_test_mlflow' does not exist. Creating a new experiment.


In [64]:
#mlflow.xgboost.autolog()
n_estimators = 10
seed= 123
tree_method = "approx"
enable_categorical = True
objective = 'reg:squarederror'

In [65]:
with mlflow.start_run() as run:
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("seed", 123)
    mlflow.log_param("tree_method", "approx")
    mlflow.log_param("enable_categorical", True)
    mlflow.log_param("objective", 'reg:squarederror')

    xgb_r = xgb.XGBRegressor(objective = objective,
                  n_estimators = n_estimators, seed = seed, tree_method=tree_method, enable_categorical=enable_categorical, max_cat_to_onehot=1)
    
    
    # Fitting the model
    cols = [col for col in train.columns if (col != "Monthly_Sales" and col != "Type")]
    
    X = train[cols]
    print(X.columns)
    y = train["Monthly_Sales"]
    xgb_r.fit(X, y)
    #save col order
    #save categorical transformer
      
    # Predict the model
    pred = xgb_r.predict(test[cols])

    ## Save model
    filename = "models/xgb_retail.bin"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as file:
        pickle.dump((xgb_r, cols), file)
    # mlflow.xgboost.log_model(xgb_r, artifact_path="model")

  
    
    # RMSE Computation
    mse = mean_squared_error(test["Monthly_Sales"], pred)
    mlflow.log_metric("RMSE", math.sqrt(mse))
    print("RMSE : % f" %(math.sqrt(mse)))

Index(['Store', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday', 'month', 'Size', 'Dept', 'monthly_sin_1', 'monthly_cos_1',
       'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3'],
      dtype='object')




RMSE :  41738.028283


In [66]:
mlflow.end_run()

In [67]:
run.info.run_id

'3ea71337c1ce439b9aa77ef1186faff4'

In [68]:
from mlflow.tracking import MlflowClient
from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository

In [69]:
client = MlflowClient(mlflow.get_tracking_uri())

In [70]:
client.tracking_uri

'http://127.0.0.1:80'

In [71]:
client.get_experiment_by_name("my-experiment_test_mlflow")

<Experiment: artifact_location='mlflow-artifacts:/305559933122650085', creation_time=1691461903914, experiment_id='305559933122650085', last_update_time=1691461903914, lifecycle_stage='active', name='my-experiment_test_mlflow', tags={}>

In [72]:
name = "monthly_sales_sql_v2"

In [73]:
client.create_registered_model(name)

<RegisteredModel: aliases={}, creation_timestamp=1691461920793, description='', last_updated_timestamp=1691461920793, latest_versions=[], name='monthly_sales_sql_v2', tags={}>

In [74]:
runs_uri = "runs:/{}/my-experiment_test_mlflow".format(run.info.run_id)
model_src = RunsArtifactRepository.get_underlying_uri(runs_uri)
mv = client.create_model_version(name, model_src, run.info.run_id, description="xgboost model")

2023/08/07 22:32:02 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: monthly_sales_sql_v2, version 1
