In [None]:
# Requirements
# 1. Select a dataset & describe the problem ✅
# 2. Train it using MLFlow ✅
# 3. Create a pipeline (e.g. main.py) ✅
# 4. Deploy with Prefect & Docker (Do the evidently UI)
#   - TODO: Test docker image starts with evidently UI
# 5. Monitor using Evidently-AI

# Extra
# 6. Write unit tests ✅
# 7. Integration test???
# 8. Use black to format the code ✅
# 9. Create a Makefile
# 10. There is a pre-commit hook ✅
# 11. Create a CI-CD pipeline ✅

In [2]:
import os
import sys

import pandas as pd

# Define entry point for paths. The relative paths used below (./src/...)
# resolve against the current working directory.
CWD = os.getcwd()
# Make the working directory importable so `src` can be resolved. The guard
# keeps re-runs of this cell from appending duplicate entries to sys.path.
if CWD not in sys.path:
    sys.path.append(CWD)

# Imported here, after the sys.path adjustment above.
from src.etl.utils import read_parquet_file

# Location of the preprocessed training data, relative to the working directory.
file_path = "./src/data/preprocessed/train_df.parquet"

df = read_parquet_file(file_path)
# Bare last expression so the notebook renders the loaded data.
df

[32m2023-08-01 16:15:02.918[0m | [1mINFO    [0m | [36msrc.etl.utils[0m:[36mread_parquet_file[0m:[36m14[0m - [1mReading parquet file from path: ./src/data/preprocessed/train_df.parquet[0m


Unnamed: 0,year,manufacturer,fuel,odometer,title_status,transmission,type,paint_color,lat,long,price
0,2018.0,56017.679086,43819.143863,20856.0,48474.908901,29053.471939,226318.953830,47512.673861,32.590000,-85.480000,34590
1,2016.0,260223.895202,27196.958714,30176.0,48474.908901,29053.471939,226318.953830,18750.561166,32.590000,-85.480000,30590
2,2020.0,19288.527325,43819.143863,20581.0,48474.908901,29053.471939,16984.277784,154159.260339,32.590000,-85.480000,32990
3,2017.0,29337.855243,43819.143863,39508.0,48474.908901,29053.471939,226318.953830,47512.673861,32.590000,-85.480000,22590
4,2020.0,56017.679086,43819.143863,10688.0,48474.908901,29053.471939,226318.953830,47512.673861,32.590000,-85.480000,27990
...,...,...,...,...,...,...,...,...,...,...,...
184233,2018.0,21029.908981,43819.143863,15080.0,48474.908901,54248.592918,27034.109690,47512.673861,33.779214,-84.411811,30990
184234,2018.0,21110.407358,43819.143863,30814.0,48474.908901,54248.592918,16031.359948,47512.673861,33.779214,-84.411811,33590
184235,2020.0,18475.158377,43819.143863,12029.0,48474.908901,29053.471939,16031.359948,18750.561166,33.786500,-84.445400,30590
184236,2020.0,22300.600817,140154.602573,4174.0,48474.908901,29053.471939,15327.332520,47512.673861,33.779214,-84.411811,34990


In [4]:
# Location of the raw vehicles parquet file, relative to the working directory.
raw_file_path = "./src/data/raw/vehicles_2023-04.parquet"

# Load it with the same project reader used for the preprocessed data.
raw_df = read_parquet_file(raw_file_path)
# Bare last expression so the notebook renders the loaded data.
raw_df

[32m2023-08-01 16:16:06.885[0m | [1mINFO    [0m | [36msrc.etl.utils[0m:[36mread_parquet_file[0m:[36m14[0m - [1mReading parquet file from path: ./src/data/raw/vehicles_2023-04.parquet[0m


Unnamed: 0,price,year,manufacturer,fuel,odometer,title_status,transmission,type,paint_color,lat,long,posting_date
0,34590,2018.0,ford,gas,20856.0,clean,other,pickup,white,32.590000,-85.480000,2021-04-30
1,30590,2016.0,toyota,other,30176.0,clean,other,pickup,red,32.590000,-85.480000,2021-04-30
2,32990,2020.0,jeep,gas,20581.0,clean,other,SUV,silver,32.590000,-85.480000,2021-04-30
3,22590,2017.0,ram,gas,39508.0,clean,other,pickup,white,32.590000,-85.480000,2021-04-29
4,27990,2020.0,ford,gas,10688.0,clean,other,pickup,white,32.590000,-85.480000,2021-04-29
...,...,...,...,...,...,...,...,...,...,...,...,...
184233,30990,2018.0,mercedes-benz,gas,15080.0,clean,automatic,other,white,33.779214,-84.411811,2021-04-04
184234,33590,2018.0,lexus,gas,30814.0,clean,automatic,sedan,white,33.779214,-84.411811,2021-04-04
184235,30590,2020.0,volvo,gas,12029.0,clean,other,sedan,red,33.786500,-84.445400,2021-04-04
184236,34990,2020.0,cadillac,diesel,4174.0,clean,other,hatchback,white,33.779214,-84.411811,2021-04-04


In [10]:
# List the distinct values found in the raw `paint_color` column.
raw_df["paint_color"].unique()

array(['white', 'red', 'silver', 'black', 'blue', 'brown', 'grey',
       'yellow', 'orange', 'custom', 'green', 'purple'], dtype=object)

In [None]:
# Set up
# Create a virtual environment and execute "pip install -r requirements.txt"

In [None]:
# Step 1
# Execute "mlflow ui --backend-store-uri sqlite:///mlflow.db"

In [None]:
# Step 2.
# Execute "prefect server start"

In [None]:
# Step 3.
# python src/etl/preprocessing.py

# a. Run prefect flow of preprocessing
# b. Saves encoder to src/etl/transformers/mean_encoder.pkl
# c. Writes the preprocessed training data to src/data/preprocessed/train_df.parquet

In [None]:
# Step 4.
# python src/ml/hyperparameter_tuning.py

# a. Run prefect flow of hyperparameter tuning
# b. Stores runs in MLFlow
# c. Uses optuna for hyperparameter tuning
# d. Stores the data used for the hyperparameter tuning in src/data/final/...

In [None]:
# Step 5.
# python src/ml/register_best_model.py

# a. Run prefect flow of registering best model
# b. Stores runs in MLFlow and registers the best model in model registry
# c. Stores the best model in src/etl/transformers/model.pkl


In [None]:
# Step 6.
# python src/ml/inference.py

# a. Takes unseen data from src/data/raw/vehicles_2023-05.parquet and applies all the preprocessing
# b. Loads the best model and produces predictions
# c. Runs the prefect flow that does inference on the unseen data stored in src/data/raw/vehicles_2023-05.parquet

In [None]:
# Step 7.
# Execute "evidently ui" to generate the workspace folder to store the monitoring web reports

In [None]:
# Step 8.
# python src/ml/monitoring.py

# Creates performance and data drift reports that can be opened in a web browser or viewed through the Evidently UI