In [24]:
import mlflow
import pandas as pd
from datetime import datetime, date, timedelta
import numpy as np
from sqlalchemy import create_engine
import pymysql
from sklearn.metrics import mean_squared_error
import math

In [25]:
# Lower bound applied to year_month_first when loading forecasts and actuals;
# the same value is stored in the metrics row written at the end of the notebook.
start_date = '2010-05-01'
# Alternative: rolling 7-day window. Note this yields a datetime.date, not a string.
# start_date = date.today() - timedelta(days=7)

In [26]:
def get_db_connection(mysql_con_string, database_name):
    """Create an engine for the given database and return an open connection.

    Args:
        mysql_con_string: Connection URL without the database part and
            without a trailing slash, e.g. ``'mysql+pymysql://user:pw@host'``.
        database_name: Name of the database; appended to the URL after ``/``.

    Returns:
        The object returned by calling ``connect()`` on the new engine.
    """
    database_url = '/'.join((mysql_con_string, database_name))
    # pool_recycle=3600 is passed straight through to create_engine.
    engine = create_engine(database_url, pool_recycle=3600)
    return engine.connect()

In [27]:
# NOTE(review): the user name and password are hard-coded in this URL; consider
# loading them from the environment or a secrets store before sharing the notebook.
con = get_db_connection(
    mysql_con_string='mysql+pymysql://application:passpass@127.0.0.1',
    database_name='retail_dataset_kaggle',
)

In [28]:
# Load forecasts from start_date onward.
# The date is sent as a bound parameter instead of being concatenated into the
# SQL text: concatenation left it unquoted, so the server parsed 2010-05-01 as
# the integer expression 2010 - 5 - 1 rather than a date, and the intended
# cutoff was not applied. Binding also works when start_date is a datetime.date.
# %(name)s is the placeholder style of the pymysql driver used by `con`.
predicted_query = (
    "select Store, Dept, year_month_first, adjusted_predicted_monthly_sales "
    "from retail_dataset_kaggle.predicted_monthly_sales_new "
    "where year_month_first >= %(start_date)s"
)
predicted_df = pd.read_sql(predicted_query, con, params={'start_date': start_date})

In [29]:
# Inspect the loaded column types; year_month_first is one of the merge keys used later.
predicted_df.dtypes

Store                                        int64
Dept                                         int64
year_month_first                    datetime64[ns]
adjusted_predicted_monthly_sales           float64
dtype: object

In [30]:
# Load actual monthly sales from start_date onward.
# The date is sent as a bound parameter instead of being concatenated into the
# SQL text: concatenation left it unquoted, so the server parsed 2010-05-01 as
# the integer expression 2010 - 5 - 1 rather than a date, and the intended
# cutoff was not applied. Binding also works when start_date is a datetime.date.
# %(name)s is the placeholder style of the pymysql driver used by `con`.
actual_query = (
    "select Store, Dept, year_month_first, Monthly_Sales "
    "from retail_dataset_kaggle.sales_monthly_agg "
    "where year_month_first >= %(start_date)s"
)
actual_df = pd.read_sql(actual_query, con, params={'start_date': start_date})

In [31]:
# Cast the month column to datetime64[ns] so it can be joined against the
# forecast frame's year_month_first column.
actual_df = actual_df.astype({'year_month_first': 'datetime64[ns]'})

In [32]:
# Confirm year_month_first is datetime64[ns] after the cast.
actual_df.dtypes

Store                        int64
Dept                         int64
year_month_first    datetime64[ns]
Monthly_Sales              float64
dtype: object

In [33]:
# Outer join keeps store/department/month rows that exist in only one of the
# two sources, so missing forecasts and missing actuals can be counted.
merge_keys = ['Store', 'Dept', 'year_month_first']
merged_dataset = predicted_df.merge(actual_df, how='outer', on=merge_keys)

In [34]:
# Preview the first rows of the joined forecast/actual table.
merged_dataset.head(5)

Unnamed: 0,Store,Dept,year_month_first,adjusted_predicted_monthly_sales,Monthly_Sales
0,1,1,2010-06-01,72919.5,66740.7
1,1,1,2010-07-01,77776.125,81915.01
2,1,1,2010-08-01,75631.492188,64578.81
3,1,1,2010-09-01,75631.492188,71913.27
4,1,1,2010-10-01,87976.335938,134644.53


In [35]:
# Number of joined rows that have no forecast value.
missing_forecasts = int(merged_dataset['adjusted_predicted_monthly_sales'].isna().sum())

In [36]:
# Show how many joined rows lack a forecast.
missing_forecasts

82192

In [37]:
# Number of joined rows that have no actual sales value.
missing_actuals = int(merged_dataset['Monthly_Sales'].isna().sum())

In [38]:
# Number of non-null forecasts that were loaded.
# Series.count() skips NaN; the previous len(series.notnull()) measured the
# length of the boolean mask, which is the total row count regardless of nulls.
total_forecasts = int(predicted_df['adjusted_predicted_monthly_sales'].count())

In [39]:
# Keep only rows where both the actual and the forecast are present, so the
# error metric is computed on comparable pairs.
clean_dataset = merged_dataset.dropna(
    subset=['Monthly_Sales', 'adjusted_predicted_monthly_sales']
)

In [40]:
# Mean squared error of the forecast against actual monthly sales.
mse = mean_squared_error(
    y_true=clean_dataset['Monthly_Sales'],
    y_pred=clean_dataset['adjusted_predicted_monthly_sales'],
)

In [41]:
# Root of the MSE, which puts the error back in the units of Monthly_Sales.
rmse = math.sqrt(mse)

In [42]:
# Show the RMSE that will be recorded in the metrics row.
rmse

43636.844925205296

In [43]:
# Empty one-table layout for the evaluation summary; a single row is filled in next.
metric_columns = [
    'start_date',
    'rmse',
    'missing_forecasts',
    'missing_actuals',
    'total_forecasts',
    'update_date',
]
metrics_df = pd.DataFrame(columns=metric_columns)

In [44]:
# Values are listed in the same order as the metrics_df columns;
# update_date is the day this evaluation was run.
metrics_row = [
    start_date,
    rmse,
    missing_forecasts,
    missing_actuals,
    total_forecasts,
    date.today(),
]
metrics_df.loc[0] = metrics_row

In [45]:
# Append the summary row to the metrics_on_completion table; the DataFrame
# index is not written as a column.
metrics_df.to_sql(
    name='metrics_on_completion',
    con=con,
    if_exists='append',
    index=False,
)

1

In [46]:
# Commit the open transaction on the connection that the to_sql append ran on.
con.commit()

In [47]:
# Release the database connection now that the metrics have been written.
con.close()