# 1. Settings

In [None]:
# Auto-reload for packages
# Mode 2 reloads all imported modules before each cell executes, so edits to the
# project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Import packages
import pandas as pd
from datetime import datetime
import joblib
import ast

# Import viz tools
from matplotlib import pyplot as plt
import seaborn as sns

# Modelling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logger
import logging

In [None]:
# Silence every Python warning for the rest of the session
# NOTE(review): a blanket "ignore" filter also hides deprecation and pandas
# copy/assignment warnings — consider narrowing it by category or module.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Configure logger
# Bring in the project's logger and config reader helpers
from eztools.operations import Logger, ConfigReader
# Build the logger named 'L&L'
# NOTE(review): '/mnt/logs/' is presumably the log directory and is hard-coded — confirm it exists wherever this notebook runs.
logger = Logger('/mnt/logs/', logger_name = 'L&L').get_logger()

In [None]:
# Import packages
from src.etl.get_data import read_csv_data
from src.etl.get_missing_values import get_df_na, get_na_columns, impute_nan, plot_kdensity
from src.etl.get_train_test_set import get_train_test_set
from src.etl.mlops import string_to_list
from src.ml.get_lasso_model_predictions import get_lasso_model_predictions
from src.ml.get_model_accuracy import get_model_accuracy

In [None]:
# Location of the project's config.ini
CONFIG_PATH = '/repos/poc-model-drift/src/config/config.ini'

# Read the file once; `config` is reused by the later sections of the notebook
config_reader = ConfigReader(CONFIG_PATH, config_tuple = False)
config = config_reader.read_config()

# Path of the input data, taken from the [data] section
data_section = config['data']
DATA_PATH = data_section['data_path']

# 2. ML Pipeline

## 2.1 Read data

In [None]:
# Read the data set located at DATA_PATH (path comes from config.ini)
df = read_csv_data(DATA_PATH)

In [None]:
# Print a summary of the data (columns, dtypes, non-null counts)
df.info()

## 2.2 Missing values

### 2.2.1 Calculate missing values

In [None]:
# Get df with na values
# (presumably a per-column missing-value summary — see get_df_na in src.etl.get_missing_values)
df_na = get_df_na(df)
df_na

In [None]:
# Columns with nan values, derived from the missing-value summary above
COLS_TO_IMPUTE = get_na_columns(df_na)
COLS_TO_IMPUTE

### 2.2.2 Impute nan values

In [None]:
# Plot the distribution of the 'pH' column before imputing it
plot_kdensity(df, 'pH')

In [None]:
# Impute nan values in 'pH' using the 'mean' replacement
# NOTE(review): the column is hard-coded although COLS_TO_IMPUTE was computed above —
# confirm whether impute_nan accepts that list so other columns with nan values are covered too.
df = impute_nan(df, cols = 'pH', replacement = 'mean')

## 2.3 Modelling

### 2.3.1 Lasso Logistic regression

In [None]:
# Split data into train and test set; 'wine_colour' is the response and 'white' the positive class
X_train, X_test, y_train, y_test = get_train_test_set(df, response = 'wine_colour', pos_class = 'white')

In [None]:
# Train lasso model on the train set & make the predictions for the test set
y_pred = get_lasso_model_predictions(X_train, X_test, y_train)

# 3. Model evaluation

In [None]:
# Compare the test-set predictions with the true labels and display the result
accuracy = get_model_accuracy(y_test, y_pred)
accuracy

# 4. Model Drift in Production

In [None]:
# Import model_monitor packages
from src.ml.model_monitor import ModelMonitorReports, MonitorReportReader

In [None]:
# Unpack the config file: report location, alert recipients and project name
LATEST_DRIFT_REPORT = config['json_reports_path']['latest_drift_report_json']
DRIFT_EMAIL_RECEIVER = string_to_list(config['settings']['drift_email_receiver'])
PROJECT_NAME = config['settings']['project_name']

# Locations of the pre-trained model and the reference data
# (MODEL_PATH was previously assigned twice from the same key; once is enough)
MODEL_PATH = config['model_monitor']['model_path']
REFERENCE_DATA_PATH = config['model_monitor']['reference_data_path']

# Take today's date to capture the latest report: the configured path is a
# template with a {today} placeholder that is filled with YYYYMMDD
today = datetime.today().strftime('%Y%m%d')
LATEST_DRIFT_REPORT_PATH = LATEST_DRIFT_REPORT.format(today=today)

# Have the column mapping as a global parameter
# (stored in config.ini as a Python literal, hence ast.literal_eval)
column_mapping = ast.literal_eval(config['model_monitor']['column_mapping'])

### Model Performance 

The _"evidently"_ package offers the following model monitoring options: <br><br>
_**1. Data Drift**_ <br>
    Data drift is checking to see if there is any drift between the **independent** variables of the reference & current data. <br><br>
_**2. Target and/or Prediction Drift (a.k.a. Model drift)**_ <br>
Target and/or prediction drift detects whether drift occurs in the **response** (ground truth) and/or the **predictions**. <br><br>
_**3. Performance Monitor**_ <br>
Performance monitor can check for overall **performance** of models for different **metrics** and different segments of data

In [None]:
# As part of the predictions in production, we are expected to use the pre-trained model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# MODEL_PATH must point to a trusted artefact.
lasso_model = joblib.load(MODEL_PATH)

In [None]:
# Create the reference data (features + prediction + target) from the train set.
# The predictions are written to a copy instead of X_train itself: adding the
# column to X_train would make a second run of this cell pass 'prediction' to the
# model as an extra feature, and X_train may be a slice of df.
df_reference = X_train.copy()
df_reference['prediction'] = lasso_model.predict(X_train)
df_reference['prediction'] = df_reference['prediction'].map({1: 'white', 0: 'red'})

# Append the ground truth and translate the 1/0 encoding back to the colour labels
df_reference = pd.concat([df_reference, y_train], axis = 1).reset_index(drop=True)
df_reference['wine_colour'] = df_reference['wine_colour'].map({1: 'white', 0: 'red'})
df_reference.head(3)

In [None]:
# Create the current data (features + prediction + target) from the test set.
# The predictions are written to a copy instead of X_test itself: adding the
# column to X_test would make a second run of this cell pass 'prediction' to the
# model as an extra feature, and X_test may be a slice of df.
df_current = X_test.copy()
df_current['prediction'] = lasso_model.predict(X_test)
df_current['prediction'] = df_current['prediction'].map({1: 'white', 0: 'red'})

# Append the ground truth and translate the 1/0 encoding back to the colour labels
df_current = pd.concat([df_current, y_test], axis = 1).reset_index(drop=True)
df_current['wine_colour'] = df_current['wine_colour'].map({1: 'white', 0: 'red'})
df_current.head(3)

_**Column Mapping**_ <br>
Column Mapping is a dictionary that holds information about the data at hand. More specifically, it is a dictionary that shows:
1. Column name of the _response_
2. Column names of the _numerical_ independent variables
3. Column names of the _categorical_ independent variables <br>
These should already be encoded (e.g. one-hot encoded)

For more information, please refer to the "boston_example.ipynb" in the "/src/notebooks/" location

In [None]:
# Check the literal evaluation: parse the column mapping stored in config.ini
# as a Python literal and display the resulting object
column_mapping = ast.literal_eval(config['model_monitor']['column_mapping'])
column_mapping

In [None]:
# Create the model monitor reports by using the ModelMonitorReports
# Arguments: reference frame (train set), current frame (test set), column mapping
# Note: The ModelMonitorReports could be imported from ezTools
model_monitor_reports = ModelMonitorReports(df_reference, df_current, column_mapping)
model_monitor_reports

In [None]:
# Generate the model performance report for a classification model
# (uses the reference/current frames the monitor was created with)
model_monitor_reports.generate_model_performance_report(report_type = 'classification')

_**Usage of the model performance report**_
1. **Analyse the results of a model test:** <br>
Explore the results of an offline test and contrast the performance of different models in an A/B test or shadow model deployment <br>
(possible integration with MLFlow in the future)
2. **To analyze the model performance on the slices of data** <br>
By manipulating the input data frame, you can explore how the model performs on different data segments (e.g. users from a specific region).
3. **To trigger or decide on the model retraining** <br>
You can use this report to check if your performance is below the threshold to initiate a model update and evaluate if retraining is likely to improve performance.
4. **To debug or improve model performance.** <br>
You can use the Classification Quality table to identify underperforming segments and decide on ways to address them.

For more information about the performance reports, please refer to the following link: <br>
https://docs.evidentlyai.com/reports/classification-performance

#### TO DO

Create the automation that triggers an alert if any of the metrics degrades below a pre-specified threshold.

In [None]:
# Idea: Generate a pd.DataFrame that holds the following information:
# ____________________________________________________________
#
#   Metrics  |    Decay (Ref - Current)   |  Decay Threshold |  Flag
# --------------------------------------------------------------
#  Accuracy   |      2%                    |  3%             |  False
# Sensitivity |      5%                    |  3%             |  True
#    ...      |     ...                    |  ...            |  ...

## 4.1 Production - Day 1

In [None]:
# As part of the predictions in production, we are expected to use the pre-trained model
# (same artefact as loaded in the Model Performance section; re-loaded so this section can start here)
lasso_model = joblib.load(MODEL_PATH)

In [None]:
# Get the reference data (i.e. data used for training)
# A copy is taken so that later changes to df_reference do not touch df
df_reference = df.copy()

# Get the production data (i.e. new data that our model is expected to classify in production)
# Note: Data has been stored by Ioannis M. for demonstrating purposes
# NOTE(review): the path is relative to the working directory, and read_pickle must only be used on trusted files.
df_production_day1 = pd.read_pickle('src/data/assets/df_day1.pickle')
df_production_day1.head(3)

In [None]:
# Make the predictions on the day-1 production data.
# A 'prediction' column left over from an earlier run of this cell is dropped
# first, so the model only ever receives the feature columns and the cell can be re-run.
day1_features = df_production_day1.drop(columns='prediction', errors='ignore')
df_production_day1['prediction'] = lasso_model.predict(day1_features)
df_production_day1['prediction'] = df_production_day1['prediction'].map({1: 'white', 0: 'red'})
df_production_day1.head(3)

In [None]:
# Build the reference features from df without the target, as the target is not
# needed when the ground truth is not available.
# Deriving the frame from df (instead of dropping in place) lets the cell be re-run:
# an in-place drop raises KeyError the second time, and the prediction column added
# below would otherwise be passed to the model as an extra feature.
df_reference = df.drop(columns='wine_colour')

# Make the prediction on the reference (training) data and decode 1/0 to the colour labels
df_reference['prediction'] = lasso_model.predict(df_reference)
df_reference['prediction'] = df_reference['prediction'].map({1: 'white', 0: 'red'})

# Explore how the reference data looks
df_reference.head(3)

In [None]:
# Explore the column mapping dictionary before the target entry is removed
column_mapping

In [None]:
# Update the column mapping: no ground truth is available for the production data
column_mapping['target'] = None

# Re-create the monitor on the day-1 frames. The existing instance was built from
# the train/test frames in the Model Performance section, so updating only its
# column mapping would produce a drift report that never looks at df_production_day1.
model_monitor_reports = ModelMonitorReports(df_reference, df_production_day1, column_mapping)
model_monitor_reports.update_column_mapping(column_mapping)

# Check the updated column mapping
model_monitor_reports.column_mapping

In [None]:
# Generate the data & prediction drift report in production (e.g. day 1)
# The response is categorical; 'poc' is passed as the report name
model_monitor_reports.generate_model_data_drift_report(response_type = 'categorical', report_name = 'poc')

_**Usage of the data and/or model drift reports**_
1. **Support model maintenance** <br>
Decide on when to retrain the model or which features to drop due to drift.
2. **When debugging model decay** <br>
If the model quality has dropped, the dashboard can help explore where the change comes from.
3. **When no ground truth is available** <br>
You can use this report to check if your performance is below the threshold to initiate a model update and evaluate if retraining is likely to improve performance.
4. **To debug or improve model performance.** <br>
If there is no immediate feedback, the dashboard can be used to explore the changes in the model output and the relationship between the features and prediction.

For more information about the data and model reports, please refer to the following links: <br>
1. https://docs.evidentlyai.com/reports/classification-performance
2. https://docs.evidentlyai.com/reports/data-drift

In [None]:
# Instantiate the MonitorReportReader class, which automates the generation
# of the automated alert in case drift is detected.
# It is given the path of today's drift report and the project name.
monitor_report = MonitorReportReader(LATEST_DRIFT_REPORT_PATH, PROJECT_NAME)
monitor_report

In [None]:
# Get the dataframe for data drift that shows if drift is detected
# anywhere in the independent variables; the table is exposed as `df_data_drift`
monitor_report.create_data_drift_table()
monitor_report.df_data_drift

In [None]:
# Generate an e-mail alert to the configured receivers if "data_drift" is detected
monitor_report.send_drift_email_alert(DRIFT_EMAIL_RECEIVER, send_for = 'data_drift')

In [None]:
# Get the dataframe for model drift that shows if drift is detected
# anywhere in the target/response variables; the table is exposed as `df_target_drift`
monitor_report.create_model_drift_table()
monitor_report.df_target_drift

In [None]:
# Generate an e-mail alert to the configured receivers if "model_drift" is detected
monitor_report.send_drift_email_alert(DRIFT_EMAIL_RECEIVER, send_for = 'model_drift')

## Ground Truth - Day 1

In [None]:
# Get the day-1 production data together with its ground truth (the 'wine_colour' column)
# Note: Data has been stored by Ioannis M. for demonstrating purposes
# NOTE(review): the path is relative to the working directory, and read_pickle must only be used on trusted files.
df_truth_day1 = pd.read_pickle('./src/data/assets/df_day1_ground_truth.pickle')
df_truth_day1.head(3)

In [None]:
# Make the predictions on the day-1 ground-truth data.
# The response is dropped for the predict call only; a 'prediction' column left
# over from an earlier run of this cell is dropped as well, so the cell can be re-run.
truth_day1_features = (
    df_truth_day1
    .drop(columns='wine_colour')
    .drop(columns='prediction', errors='ignore')
)
df_truth_day1['prediction'] = lasso_model.predict(truth_day1_features)
df_truth_day1['prediction'] = df_truth_day1['prediction'].map({1: 'white', 0: 'red'})
df_truth_day1.head(3)

In [None]:
# Rebuild the reference data from the full data set, this time keeping the response
df_reference = df.copy()

# The response is left out for the predict call only; it stays in df_reference
reference_features = df_reference.drop(columns='wine_colour')
df_reference['prediction'] = lasso_model.predict(reference_features)

# Decode the 1/0 predictions to the colour labels
df_reference['prediction'] = df_reference['prediction'].map({1: 'white', 0: 'red'})
df_reference.head(3)

In [None]:
# Read the global column_mapping again from the config.
# This restores the 'target' entry that was set to None for the production run,
# since the ground truth is available in this section.
column_mapping = ast.literal_eval(config['model_monitor']['column_mapping'])
column_mapping

In [None]:
# Create the model monitor report and generate the data & model dashboard
# for the ground truth day 1 (reference data vs. day-1 data with ground truth)
model_monitor_reports = ModelMonitorReports(df_reference, df_truth_day1, column_mapping)
model_monitor_reports.generate_model_data_drift_report(response_type = 'categorical', report_name = 'poc')

In [None]:
# Instantiate a MonitorReportReader on today's drift report
# (presumably re-created so that it reads the report generated in the previous cell — confirm)
monitor_report = MonitorReportReader(LATEST_DRIFT_REPORT_PATH, PROJECT_NAME)

In [None]:
# Capture the model (target/prediction) drift; the table is exposed as `df_target_drift`
monitor_report.create_model_drift_table()
monitor_report.df_target_drift

In [None]:
# Send alert for model_drift to the configured receivers
monitor_report.send_drift_email_alert(DRIFT_EMAIL_RECEIVER, send_for = 'model_drift')

## 4.2 Production - Day 2 (Drift)

In [None]:
# Get the production data (i.e. new data that our model is expected to classify in production)
# Note: Data has been stored by Ioannis M. for demonstrating purposes
# NOTE(review): the path is relative to the working directory, and read_pickle must only be used on trusted files.
df_production_day2 = pd.read_pickle('src/data/assets/df_day2.pickle')
df_production_day2.head(3)

In [None]:
# Make the predictions on the day-2 production data.
# A 'prediction' column left over from an earlier run of this cell is dropped
# first, so the model only ever receives the feature columns and the cell can be re-run.
day2_features = df_production_day2.drop(columns='prediction', errors='ignore')
df_production_day2['prediction'] = lasso_model.predict(day2_features)
df_production_day2['prediction'] = df_production_day2['prediction'].map({1: 'white', 0: 'red'})
df_production_day2.head(3)

In [None]:
# Remove the ground truth from the reference data (none is available for day 2)
# and explore how the reference data looks.
# The drop is not done in place and tolerates a missing column, so re-running
# the cell no longer raises KeyError.
df_reference = df_reference.drop(columns='wine_colour', errors='ignore')
df_reference.head(3)

In [None]:
# Update the column mapping: no target, as the ground truth is not available for day 2
column_mapping['target'] = None
column_mapping

In [None]:
# Create the model monitor report and generate the data & model dashboard
# for the production day2 (reference data vs. day-2 production data)
model_monitor_reports = ModelMonitorReports(df_reference, df_production_day2, column_mapping)
model_monitor_reports.generate_model_data_drift_report(response_type = 'categorical', report_name = 'poc')

In [None]:
# Instantiate a MonitorReportReader on today's drift report
# (presumably re-created so that it reads the report generated in the previous cell — confirm)
monitor_report = MonitorReportReader(LATEST_DRIFT_REPORT_PATH, PROJECT_NAME)

In [None]:
# Create the model drift dataframe
# NOTE(review): p_value_threshold = 1 presumably flags drift for any p-value, which
# would force the alert for this demo — confirm against MonitorReportReader and
# do not carry this value over to a real deployment.
monitor_report.create_model_drift_table(response_type = 'categorical', p_value_threshold = 1)
monitor_report.df_target_drift

In [None]:
# Send an automated e-mail to the configured receivers to report model drift
monitor_report.send_drift_email_alert(DRIFT_EMAIL_RECEIVER, send_for = 'model_drift')

# References

In [None]:
# https://github.com/evidentlyai/evidently/blob/main/evidently/examples/bicycle_demand_monitoring.ipynb
# https://evidentlyai.com/blog/tutorial-1-model-analytics-in-production
# https://docs.evidentlyai.com/