### Load the model from Model Registry

In [4]:
import mlflow
import pandas as pd
from datetime import datetime, date
import numpy as np
from sqlalchemy import create_engine
import pymysql

In [5]:
name = 'monthly_sales'
stage = 'None'
start_date = '2010-05-02'
end_date = '2010-11-05'
connection_string = 'mysql+pymysql://application:passpass@127.0.0.1'
database = 'retail_dataset_kaggle'

In [6]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [7]:
def load_model(name, stage):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    return model    

In [8]:
def fourier_terms(value, period=30, num_terms=3):
    terms = []
    for i in range(1, num_terms + 1):
        terms.extend([np.sin(2 * np.pi * i * value / period),
                      np.cos(2 * np.pi * i * value / period)])
    return terms

In [9]:
def get_db_connection(mysql_con_string, database_name):
    # sqlEngine       = create_engine('mysql+pymysql://application:passpass@127.0.0.1/retail_data', pool_recycle=3600)
    sqlEngine       = create_engine(mysql_con_string + '/' + database_name, pool_recycle=3600)
    dbConnection    = sqlEngine.connect()
    return dbConnection

In [10]:
def create_inference_context(con, start_date, end_date, frequency='MS'):
    #required_forecasts = sales_df_month[["Store", "Dept"]].drop_duplicates()
    df = pd.read_sql("select DISTINCT Store, Dept from retail_dataset_kaggle.sales", con)
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    date_list = pd.date_range(start_date, end_date, freq=frequency).to_frame()
    context_df = df.merge(date_list, how='cross')
    context_df.columns = ['Store', 'Dept', 'year_month_first']
    context_df['month'] = context_df['year_month_first'].dt.month
    return context_df

In [11]:
def get_features(context_df, start_date, end_date):
    store_list = ','.join([str(i) for i in list(context_df['Store'].drop_duplicates())])
    stores_df = pd.read_sql('select Store, Size from retail_dataset_kaggle.store where Store in (' + store_list +')', con)
    predict_df = pd.merge(context_df, stores_df, on="Store")
    
    
    feature_eng_future_df = pd.read_sql('''select *
                                        from retail_dataset_kaggle.store_date_month_agg
                                        where year_month_first >= ''' + start_date + ' AND year_month_first <= ' + end_date, con)

    predict_df = pd.merge(predict_df, feature_eng_future_df, on=["Store", "year_month_first", "month"], how="left")
    predict_df.drop(["Temperature", "Fuel_Price", "CPI", "Unemployment"], axis=1, inplace=True)

    

    pull_forward_df = pd.read_sql('''SELECT t1.Store, t1.Fuel_Price, t1.CPI, t1.Unemployment
                          FROM retail_dataset_kaggle.store_date_month_agg t1
                          JOIN (
                                SELECT Store, MAX(year_month_first) AS max_year_month
                                FROM retail_dataset_kaggle.store_date_month_agg
                                GROUP BY Store
                            ) t2 ON t1.Store = t2.Store AND t1.year_month_first = t2.max_year_month''',con)
    
    temp_same_last_year_df = pd.read_sql('''SELECT t1.Store, t1.month, t1.Temperature
                                            FROM retail_dataset_kaggle.store_date_month_agg t1
                                            JOIN (
                                                SELECT Store, month, MAX(year_month_first) AS max_year_month
                                                FROM retail_dataset_kaggle.store_date_month_agg as t2
                                                GROUP BY Store, month
                                            ) t2 ON t1.Store = t2.Store AND t1.month = t2.month AND t1.year_month_first = t2.max_year_month
                                            ''', con)

    final_features_df = pd.merge(predict_df, pull_forward_df, on=["Store"], how="left")
    final_features_df = pd.merge(final_features_df, temp_same_last_year_df, on=["Store", "month"], how="left")

    final_features_df["monthly_terms"] = final_features_df['month'].apply(fourier_terms)
    final_features_df[['monthly_sin_1', 'monthly_cos_1', 'monthly_sin_2', 'monthly_cos_2', 'monthly_sin_3', 'monthly_cos_3']] = pd.DataFrame(final_features_df['monthly_terms'].to_list())
    final_features_df.drop(columns=["monthly_terms"], inplace=True)
    final_features_df.reset_index(drop=True, inplace=True)
    final_features_df['IsHoliday'] = final_features_df['IsHoliday'].fillna(0)
    return final_features_df
        

In [12]:
def getPythonType(val):
    type = None
    if val == 'DataType.long':
        type = 'Int64'
    if val == 'DataType.double':
        type = 'float'
    return type

In [13]:
def type_check_dataset(input_df, curmodel):
    prediction_df = input_df.copy()
    column_order = [name for name in cur_model.metadata.get_input_schema().input_names()]
    for spec in cur_model.metadata.get_input_schema():
        if spec.name in prediction_df.columns:
            type = getPythonType(str(spec.type))
            if type:
                prediction_df[spec.name] = prediction_df[spec.name].astype(type)
    return prediction_df, column_order

In [14]:
def get_prediction(data, model):
    result = model.predict(data)
    return result

In [15]:
cur_model =  load_model(name, stage)
con =  get_db_connection(connection_string, database)
context_df = create_inference_context(con, start_date, end_date)
prediction_df = get_features(context_df, start_date, end_date)
prediction_df, cols = type_check_dataset(prediction_df, cur_model)

In [16]:
results = get_prediction(prediction_df[cols], cur_model)

In [17]:
prediction_df['predicted_monthly_sales'] = results
prediction_df['model'] = cur_model.metadata.run_id
prediction_df['prediction_date'] = date.today()

In [18]:
prediction_df['adjusted_predicted_monthly_sales'] = prediction_df['predicted_monthly_sales'].apply(lambda x: 0 if x < 0 else x)

In [19]:
prediction_df.head(2)

Unnamed: 0,Store,Dept,year_month_first,month,Size,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,...,monthly_sin_1,monthly_cos_1,monthly_sin_2,monthly_cos_2,monthly_sin_3,monthly_cos_3,predicted_monthly_sales,model,prediction_date,adjusted_predicted_monthly_sales
0,1,1,2010-06-01,6,151315,,,,,,...,0.951057,0.309017,0.587785,-0.809017,-0.587785,-0.809017,72919.5,8c040a338c4e48f89992b091ae5f46f5,2023-07-31,72919.5
1,1,1,2010-07-01,7,151315,,,,,,...,0.994522,0.104528,0.207912,-0.978148,-0.951057,-0.309017,77776.125,8c040a338c4e48f89992b091ae5f46f5,2023-07-31,77776.125


In [20]:
prediction_df.dtypes

Store                                        Int64
Dept                                         Int64
year_month_first                    datetime64[ns]
month                                        int32
Size                                         Int64
MarkDown1                                  float64
MarkDown2                                  float64
MarkDown3                                  float64
MarkDown4                                  float64
MarkDown5                                  float64
IsHoliday                                    Int64
Fuel_Price                                 float64
CPI                                        float64
Unemployment                               float64
Temperature                                float64
monthly_sin_1                              float64
monthly_cos_1                              float64
monthly_sin_2                              float64
monthly_cos_2                              float64
monthly_sin_3                  

In [21]:
prediction_df.to_sql('predicted_monthly_sales_new', con, index=False, if_exists='replace')

19986

In [22]:
# con.rollback()

### Metrics

In [23]:
missing_values = len(prediction_df[prediction_df.predicted_monthly_sales.isnull()])

In [24]:
stats = dict(prediction_df.predicted_monthly_sales.describe())

In [25]:
dict(stats)

{'count': 19986.0,
 'mean': 58949.2890625,
 'std': 67108.265625,
 'min': -5026.5234375,
 '25%': 20387.81640625,
 '50%': 33423.71484375,
 '75%': 70949.279296875,
 'max': 616357.625}

In [26]:
number_neg_values = len(prediction_df[prediction_df.predicted_monthly_sales < 0])

In [27]:
number_neg_values

42

In [28]:
run_metrics_df = pd.DataFrame(columns=['start_date', 'end_date', 'update_date', 'missing_values', 'neg_values', 'count', 'mean', 'min', 'std', '25p', '50p', '75p', 'max'])

In [29]:
run_metrics_df.loc[0] = [start_date, end_date, date.today(), missing_values, number_neg_values, stats['count'],
                         stats['mean'], stats['std'], stats['min'], stats['25%'], 
                         stats['50%'], stats['75%'], stats['max']]

In [30]:
run_metrics_df.to_sql('run_metrics', con, index=False, if_exists='append')

1

In [31]:
con.commit()

In [32]:
con.close()