# 1. Settings

In [None]:
# Install packages not present in prod/base/1.1 (e.g. evidently)
import os

# Note: add further package names to this tuple to install different packages
EXTRA_PACKAGES = ('evidently', 'plotly.express')

for package in EXTRA_PACKAGES:
    # os.system returns the exit status of the command; anything but 0 means pip failed.
    # Report it here instead of letting it surface later as an ImportError.
    exit_status = os.system('sudo pip install ' + package)
    if exit_status != 0:
        print('WARNING: installing {!r} failed (exit status {})'.format(package, exit_status))

In [None]:
# Load IPython's autoreload extension so edited modules (e.g. the src.* helpers imported below)
# are picked up without restarting the kernel; mode 2 reloads all modules before each execution
%load_ext autoreload
%autoreload 2

In [None]:
# Import packages
import pandas as pd
import numpy as np
from datetime import datetime

# Import viz tools
from matplotlib import pyplot as plt
import seaborn as sns

# Modelling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logger
import logging

In [None]:
# Silence every warning for the rest of the session to keep the cell output readable
# NOTE(review): this also hides warnings raised by pandas / scikit-learn in later cells - confirm this is intended
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Import the eztools helpers: Logger is used right here, ConfigReader when config.ini is read
from eztools.operations import Logger, ConfigReader

# Build the notebook logger named 'L&L' ('/mnt/logs/' is presumably the log directory - confirm)
logger = Logger(
    '/mnt/logs/',
    logger_name='L&L',
).get_logger()

In [None]:
# Import packages
from src.etl.get_data import read_csv_data
from src.etl.get_missing_values import get_df_na, get_na_columns, impute_nan, plot_kdensity
from src.etl.get_train_test_set import get_train_test_set
from src.ml.get_lasso_model_predictions import get_lasso_model_predictions
from src.ml.get_model_accuracy import get_model_accuracy

In [None]:
# Location of the project's config.ini
CONFIG_PATH = '/repos/poc-model-drift/src/config/config.ini'

# Parse the file; the result is indexed below as config[section][key]
config = ConfigReader(CONFIG_PATH, config_tuple=False).read_config()

# Unpack the settings used by this notebook
DATA_PATH = config['data']['data_path']

# 2. ML Pipeline

## 2.1 Read data

In [None]:
# Read the dataset located at DATA_PATH (the path comes from config.ini)
df = read_csv_data(DATA_PATH)

In [None]:
# Show the summary produced by df.info() (nothing is plotted here)
df.info()

## 2.2 Missing values

### 2.2.1 Calculate missing values

In [None]:
# Build the missing-value overview of df and display it
df_na = get_df_na(df)
df_na

In [None]:
# Columns reported by get_na_columns as containing NaN values
# NOTE(review): COLS_TO_IMPUTE is only displayed; the imputation cells below hard-code 'pH'
COLS_TO_IMPUTE = get_na_columns(df_na)
COLS_TO_IMPUTE

### 2.2.2 Impute nan values

In [None]:
# Plot the distribution of 'pH', the column imputed in the next cell
plot_kdensity(df, 'pH')

In [None]:
# Fill the NaN values of 'pH' using the 'mean' replacement option; df is rebound to the result
df = impute_nan(df, cols='pH', replacement='mean')

## 2.3 Modelling

### 2.3.1 Lasso Logistic regression

In [None]:
# Split df into train / test features and targets.
# 'wine_colour' is the response and 'white' the positive class
# (later cells decode the classes as 1 -> 'white', 0 -> 'red').
X_train, X_test, y_train, y_test = get_train_test_set(
    df,
    response='wine_colour',
    pos_class='white',
)

In [None]:
# Train the lasso model on the train set and predict the test set (y_pred is scored against y_test below)
y_pred = get_lasso_model_predictions(X_train, X_test, y_train)

# 3. Model evaluation

In [None]:
# Compare the test-set predictions with the true labels and display the result
accuracy = get_model_accuracy(y_test, y_pred)
accuracy

# 4. Model Drift in Production

In [None]:
# Import mlops packages
from src.etl.mlops import create_column_mapping, generate_model_data_drift_report, generate_model_performance_report

### Model Performance 

In [None]:
# As part of the predictions in production, we are expected to use the pre-trained model.
# joblib is imported here because no earlier cell of this notebook imports it
# (without the import this cell fails with a NameError).
import joblib

# NOTE: joblib.load unpickles the file - only load model files that come from a trusted source
lasso_model = joblib.load('./src/data/models/lasso_model.sav')

In [None]:
# Create the reference data (train features + prediction + target).
# The prediction lives in its own Series so that X_train is left untouched: writing a
# 'prediction' column into X_train would feed that column back into predict() on a re-run.
reference_prediction = pd.Series(lasso_model.predict(X_train), index=X_train.index, name='prediction')

df_reference = pd.concat([X_train, reference_prediction, y_train], axis=1).reset_index(drop=True)

# Decode the 0/1 classes into the wine colour labels
df_reference['prediction'] = df_reference['prediction'].map({1: 'white', 0: 'red'})
df_reference['wine_colour'] = df_reference['wine_colour'].map({1: 'white', 0: 'red'})
df_reference.head(3)

In [None]:
# Create the current data (test features + prediction + target).
# The prediction lives in its own Series so that X_test is left untouched: writing a
# 'prediction' column into X_test would feed that column back into predict() on a re-run.
current_prediction = pd.Series(lasso_model.predict(X_test), index=X_test.index, name='prediction')

df_current = pd.concat([X_test, current_prediction, y_test], axis=1).reset_index(drop=True)

# Decode the 0/1 classes into the wine colour labels
df_current['prediction'] = df_current['prediction'].map({1: 'white', 0: 'red'})
df_current['wine_colour'] = df_current['wine_colour'].map({1: 'white', 0: 'red'})
df_current.head(3)

In [None]:
# Build the column mapping that names the target and prediction columns for the reports
column_mapping = create_column_mapping(
    df_reference,
    target='wine_colour',
    prediction='prediction',
)
column_mapping

In [None]:
# Data and target/prediction drift report: train set (reference) vs. test set (current)
generate_model_data_drift_report(
    df_ref=df_reference,
    df_prod=df_current,
    column_mapping=column_mapping,
    response_type='categorical',
)

In [None]:
# Classification performance report: train set (reference) vs. test set (current)
generate_model_performance_report(
    df_ref=df_reference,
    df_prod=df_current,
    column_mapping=column_mapping,
    report_type='classification',
)

## 4.1 Production - Day 1

In [None]:
# Get the reference data: a copy of df (i.e. the imputed data the model pipeline above was fed with)
df_reference = df.copy()

# Get the production data (i.e. new data that our model is expected to classify in production)
# Note: Data has been stored by Ioannis M. for demonstration purposes
# NOTE: read_pickle unpickles the file - only load pickle files that come from a trusted source
df_production_day1 = pd.read_pickle('./src/data/assets/df_day1.pickle')
df_production_day1.head(3)

In [None]:
# Predict the wine colour for the day-1 production data
day1_classes = lasso_model.predict(df_production_day1)

# Store the prediction decoded into its colour label (1 -> 'white', 0 -> 'red')
df_production_day1['prediction'] = pd.Series(day1_classes, index=df_production_day1.index).map({1: 'white', 0: 'red'})
df_production_day1.head(3)

In [None]:
# No ground truth is available for production yet, so the target is removed from the reference too
df_reference.drop(columns='wine_colour', inplace=True)

# Predict on the reference (training) data, just for demonstration purposes,
# and store the prediction decoded into its colour label (1 -> 'white', 0 -> 'red')
reference_classes = lasso_model.predict(df_reference)
df_reference['prediction'] = pd.Series(reference_classes, index=df_reference.index).map({1: 'white', 0: 'red'})

# Explore how the reference data looks
df_reference.head(3)

In [None]:
# Data & prediction drift report: reference data vs. production day 1
# NOTE(review): column_mapping was created earlier with target='wine_colour', a column that has
# been dropped from df_reference at this point - confirm the report copes with a missing target
generate_model_data_drift_report(
    df_ref=df_reference,
    df_prod=df_production_day1,
    column_mapping=column_mapping,
    response_type='categorical',
)

## 4.2 Production - Day 2

In [None]:
# Get the day-2 production data (i.e. new data that our model is expected to classify in production)
# Note: Data has been stored by Ioannis M. for demonstration purposes
df_production_day2 = pd.read_pickle('./src/data/assets/df_day2.pickle')
df_production_day2.head(3)

In [None]:
# Predict the wine colour for the day-2 production data
day2_classes = lasso_model.predict(df_production_day2)

# Store the prediction decoded into its colour label (1 -> 'white', 0 -> 'red')
df_production_day2['prediction'] = pd.Series(day2_classes, index=df_production_day2.index).map({1: 'white', 0: 'red'})
df_production_day2.head(3)

In [None]:
# Display the reference data again; it is reused as-is from the day-1 comparison
df_reference.head(3)

In [None]:
# Data & prediction drift report: reference data vs. production day 2
# NOTE(review): column_mapping still names target='wine_colour', which df_reference does not
# contain at this point - confirm the report copes with a missing target
generate_model_data_drift_report(
    df_ref=df_reference,
    df_prod=df_production_day2,
    column_mapping=column_mapping,
    response_type='categorical',
)

## 4.3 Ground Truth - Day 1

In [None]:
# Get the day-1 data together with its ground truth (the 'wine_colour' column is dropped before predicting below)
# Note: Data has been stored by Ioannis M. for demonstration purposes
df_truth_day1 = pd.read_pickle('./src/data/assets/df_day1_ground_truth.pickle')
df_truth_day1.head(3)

In [None]:
# Predict the wine colour for the day-1 ground-truth data; the target is left out of the model input only
truth_day1_classes = lasso_model.predict(df_truth_day1.drop(columns='wine_colour'))

# Store the prediction decoded into its colour label (1 -> 'white', 0 -> 'red')
df_truth_day1['prediction'] = pd.Series(truth_day1_classes, index=df_truth_day1.index).map({1: 'white', 0: 'red'})
df_truth_day1.head(3)

In [None]:
# Rebuild the reference data from df, this time keeping the target so it can be compared with the ground truth
df_reference = df.copy()

# The response is dropped temporarily, for the model input only; it stays in df_reference
reference_classes = lasso_model.predict(df_reference.drop(columns='wine_colour'))

# Store the prediction decoded into its colour label (1 -> 'white', 0 -> 'red')
df_reference['prediction'] = pd.Series(reference_classes, index=df_reference.index).map({1: 'white', 0: 'red'})
df_reference.head(3)

In [None]:
# Rebuild the column mapping from the reference data that contains both target and prediction
column_mapping = create_column_mapping(
    df_reference,
    target='wine_colour',
    prediction='prediction',
)
column_mapping

In [None]:
# Data and prediction/target drift report: reference data vs. day-1 data with ground truth
generate_model_data_drift_report(
    df_ref=df_reference,
    df_prod=df_truth_day1,
    column_mapping=column_mapping,
    response_type='categorical',
)

# References

In [None]:
# https://github.com/evidentlyai/evidently/blob/main/evidently/examples/bicycle_demand_monitoring.ipynb
# https://evidentlyai.com/blog/tutorial-1-model-analytics-in-production
# https://docs.evidentlyai.com/