In [17]:
# Import python libraries
from loguru import logger

import warnings
warnings.filterwarnings("ignore")

from evidently import ColumnMapping
from evidently.metric_preset import DataDriftPreset, RegressionPreset
from evidently.report import Report

In [18]:
# Import helper functions
from src.etl.utils import read_parquet_file, read_toml_config, load_pickle
from src.ml.inference import get_best_model

In [19]:
# Load the encoder object stored at src/etl/transformers/mean_encoder.pkl so it can be inspected.
# NOTE(review): presumably load_pickle unpickles the file; unpickling can execute arbitrary
# code, so only point this at artifacts produced by this project — confirm.
mean_encoder = load_pickle("./src/etl/transformers/mean_encoder.pkl")

In [20]:
# Bare expression: shows the loaded encoder's repr as the cell output.
mean_encoder

In [3]:
# Relative path of the TOML config file; it is resolved against the directory
# the notebook kernel is running in.
CONFIG_PATH = "src/config/config.toml"

In [4]:
# 1. Load the TOML configuration and pull out the settings this notebook uses
config = read_toml_config(CONFIG_PATH)

# "monitoring" section: locations of the reference and current datasets
monitoring_settings = config["monitoring"]
reference_data_path = monitoring_settings["reference_data_path"]
current_data_path = monitoring_settings["current_data_path"]

# Path handed to the model loader, and the list used as categorical features
model_run_path = config["inference"]["model_run_path"]
categorical_features = config["preprocessing"]["cat_variables"]

[32m2023-08-02 08:46:13.464[0m | [1mINFO    [0m | [36msrc.etl.utils[0m:[36mread_toml_config[0m:[36m9[0m - [1mReading toml config file[0m


In [5]:
# 2. Read reference and current data
# Both paths were taken from the "monitoring" section of the config.
# reference_df is the baseline dataset; current_df is the one compared against it
# in the reports below.
reference_df = read_parquet_file(reference_data_path)
current_df = read_parquet_file(current_data_path)

[32m2023-08-02 08:46:17.512[0m | [1mINFO    [0m | [36msrc.etl.utils[0m:[36mread_parquet_file[0m:[36m14[0m - [1mReading parquet file from path: ./src/data/preprocessed/train_df.parquet[0m
[32m2023-08-02 08:46:17.993[0m | [1mINFO    [0m | [36msrc.etl.utils[0m:[36mread_parquet_file[0m:[36m14[0m - [1mReading parquet file from path: ./src/data/preprocessed/test_df.parquet[0m


In [6]:
# 3. Load model
# Calls the `.fn` attribute of get_best_model rather than get_best_model itself.
# NOTE(review): presumably get_best_model is a Prefect task and `.fn` is its
# undecorated function, so no task run is started here — confirm.
model = get_best_model.fn(model_run_path)

[32m2023-08-02 08:46:19.648[0m | [1mINFO    [0m | [36msrc.ml.inference[0m:[36mget_best_model[0m:[36m29[0m - [1mGet the best registered model from mlflow from ./src/etl/transformers/[0m


In [7]:
# 4. Predict on current and reference data
# The model must only receive feature columns: "price" is the target, and "prediction"
# is the column this cell itself adds below. Dropping "prediction" when it is already
# present keeps the cell idempotent, so re-running it does not feed the earlier
# predictions back into the model as an extra feature.
current_features = current_df.drop(columns="price").drop(columns="prediction", errors="ignore")
reference_features = reference_df.drop(columns="price").drop(columns="prediction", errors="ignore")

current_pred = model.predict(current_features)
reference_pred = model.predict(reference_features)

# Store the predictions on the frames under the column name the report cells
# refer to through the column mapping.
current_df["prediction"] = current_pred
reference_df["prediction"] = reference_pred


In [8]:
# Tell Evidently which columns hold the target, the model output and the
# categorical features
column_mapping = ColumnMapping(
    target="price",
    prediction="prediction",
    categorical_features=categorical_features,
)

In [12]:
# Build the model-performance report from Evidently's regression preset
logger.info("Get the regression report.")
regression_metrics = [RegressionPreset()]
regression_report = Report(metrics=regression_metrics)

# Evaluate current data against the reference data, using the column roles
# declared in column_mapping
regression_report.run(reference_data=reference_df, current_data=current_df, column_mapping=column_mapping)

[32m2023-08-02 08:58:34.248[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mGet the regression report.[0m


In [16]:
# Set up an Evidently workspace in the relative "workspace" directory and add a demo project.
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from evidently.ui.workspace import Workspace

# NOTE(review): each run of this cell calls create_project again — presumably that
# adds another project with the same name; confirm and guard against duplicates if so.
ws = Workspace.create("workspace")
project = ws.create_project("workspace-test-name")
project.description = "My demo"

# Persist the project after setting its description.
# NOTE(review): the report files written by later cells go straight into ./workspace and are
# not attached to this project — presumably they must be added through the workspace API
# to show up in the Evidently UI; confirm.
project.save()

In [14]:
# Write the regression report to a JSON file inside the ./workspace folder.
regression_report.save_json("./workspace/test_suite.json")

In [15]:
# Also write the report through the underscore-prefixed `_save` method.
# NOTE(review): `_save` is private by Python naming convention; presumably it writes the
# snapshot format read by the Evidently UI — confirm, and prefer a public method if the
# installed Evidently version offers one.
regression_report._save('./workspace/snapshot.json')

In [10]:
# Build the data drift report from Evidently's drift preset
logger.info("Get the drift report.")
drift_metrics = [DataDriftPreset()]
drift_report = Report(metrics=drift_metrics)
drift_report.run(reference_data=reference_df, current_data=current_df, column_mapping=column_mapping)

# Write both reports as HTML files: first the regression report built in an
# earlier cell, then the drift report from this cell
logger.info("Saving reports...")
regression_report.save_html("./workspace/test_suite.html")
drift_report.save_html("./workspace/test_drift_report.html")

logger.info("Monitoring procedure finished.")

[32m2023-08-02 08:46:45.020[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mGet the drift report.[0m
[32m2023-08-02 08:46:47.021[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mSaving reports...[0m
[32m2023-08-02 08:46:47.979[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mMonitoring procedure finished.[0m


In [None]:
# Run evidently ui
!evidently ui --workspace /port:0:0

In [None]:
# Requirements
# 1. Select a dataset & describe the problem ✅
# 2. Train it using MLFlow ✅
# 3. Create a pipeline (e.g. main.py) ✅
# 4. Deploy with Docker
#   - TODO: Test docker image starts with evidently UI
# 5. Monitor using Evidently-AI

# Extra
# 7. Integration test???
# 9. Create a Makefile


In [None]:
# Set up
# Create a virtual environment and execute "pip install -r requirements.txt"

In [None]:
# Step 1
# Execute "mlflow ui --backend-store-uri sqlite:///mlflow.db"

In [None]:
# Step 2.
# Execute "prefect server start"

In [None]:
# Step 3.
# python src/etl/preprocessing.py

# a. Run prefect flow of preprocessing
# b. Saves encoder to src/etl/transformers/mean_encoder.pkl
# c. Generates processed train_df.parquet in src/data/preprocessed/train_df.parquet

In [None]:
# Step 4.
# python src/ml/hyperparameter_tuning.py

# a. Run prefect flow of hyperparameter tuning
# b. Stores runs in MLFlow
# c. Uses optuna for hyperparameter tuning
# d. Stores the data used for the hyperparameter tuning in src/data/final/...

In [None]:
# Step 5.
# python src/ml/register_best_model.py

# a. Run prefect flow of registering best model
# b. Stores runs in MLFlow and registers the best model in model registry
# c. Stores the best model in src/etl/transformers/model.pkl


In [None]:
# Step 6.
# python src/ml/inference.py

# a. Takes unseen data from src/data/raw/vehicles_2023-05.parquet and applies all the preprocessing
# b. Loads the best model and produces predictions
# c. Runs the prefect flow that does inference on the unseen data stored in src/data/raw/

In [None]:
# Step 7.
# Execute "evidently ui" to generate the workspace folder to store the monitoring web reports

In [None]:
# Step 8.
# python src/ml/monitoring.py

# Creates performance and data drift reports that can be opened in a web browser for visualisation or viewed through the Evidently UI