In [None]:
# Generic libraries
import pickle
from loguru import logger
from typing import List

# Preprocessing
import pandas as pd
from feature_engine.encoding import MeanEncoder

# Monitor test suites
from evidently.test_suite import TestSuite
from evidently.tests import (
    # Quality
    TestHighlyCorrelatedColumns,
    # Integrity
    TestNumberOfRows,
    TestColumnNumberOfMissingValues,
    TestNumberOfOutRangeValues,
    # Drift
    TestColumnDrift,
    TestNumberOfDriftedColumns,
)

# Filter warnings
# NOTE: no category, module or message is passed, so this "ignore" rule
# matches every warning raised after this cell has run.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Functions needed to execute notebook
def dump_pickle(obj, filename: str):
    """Serialize ``obj`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. The function always returns ``None``.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(obj, sink)


def load_pickle(filename):
    """Open ``filename`` in binary mode and return the unpickled object.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    that come from a trusted source (e.g. ones written by ``pickle.dump``
    in this same project).
    """
    with open(filename, mode="rb") as source:
        restored = pickle.load(source)
    return restored


def read_parquet_file(path: str) -> pd.DataFrame:
    """Log the source path, then load the parquet file into a dataframe.

    Args:
        path (str): Location of the parquet file to read.

    Returns:
        pd.DataFrame: The file contents, read with the pyarrow engine.
    """
    logger.info(f"Reading parquet file from path: {path}")
    frame = pd.read_parquet(path, engine="pyarrow")
    return frame


def drop_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` without the given columns.

    Args:
        df (pd.DataFrame): Dataframe to remove columns from; it is not modified.
        columns (List[str]): Names of the columns to remove.

    Returns:
        pd.DataFrame: A new dataframe lacking ``columns``.
    """
    trimmed = df.drop(columns=columns)
    return trimmed


def encode_categorical_variables(
    df: pd.DataFrame,
    cat_variables: list[str],
    target: str = "price",
    fit: bool = True,
    path: str = "transformers/mean_encoder.pkl",
) -> pd.DataFrame:
    """Mean-encode categorical features and re-attach the target column.

    Args:
        df (pd.DataFrame): Dataframe holding the features and the ``target`` column.
        cat_variables (list[str]): Categorical variables to encode.
        target (str, optional): The name of the response variable. Defaults to "price".
        fit (bool, optional): If True, fit a new ``MeanEncoder`` on ``df`` and
            pickle it to ``path``. If False, load the previously pickled
            encoder from ``path`` instead of fitting. Defaults to True.
        path (str, optional): Where the pickled encoder is written to / read
            from. Defaults to "transformers/mean_encoder.pkl".

    Returns:
        pd.DataFrame: The transformed features with the unchanged ``target``
            column appended as the last column.
    """
    # The encoder only ever sees the features; the target is passed separately
    # when fitting and concatenated back on at the end.
    features = df.drop(target, axis=1)

    if fit:
        logger.info("Fitting mean encoder transforming categorical variables")
        encoder = MeanEncoder(variables=cat_variables)
        encoder.fit(features, df[target])

        # Persist the fitted encoder so later calls with fit=False can reuse it.
        logger.info(f"Saving encoder to path: {path}")
        dump_pickle(encoder, path)
    else:
        # NOTE: this unpickles the file at ``path``; it should only ever be a
        # file written by a previous fit=True call.
        logger.info(f"Loading encoder from path: {path}")
        encoder = load_pickle(path)

    encoded_df = encoder.transform(features)
    return pd.concat([encoded_df, df[target]], axis=1)

# 0. Settings 

In [None]:
# Paths for reference and current datasets.
# Both are relative paths, so they resolve against the kernel's working
# directory. The "reference" file is the 2023-04 parquet and the "current"
# file is the 2023-05 parquet.
reference_data_path = "data/raw/vehicles_2023-04.parquet"
current_data_path = "data/raw/vehicles_2023-05.parquet"

# 1. Preprocessing

In [None]:
# Load the reference dataset first, then the current one.
reference_df = read_parquet_file(path=reference_data_path)
current_df = read_parquet_file(path=current_data_path)

# Preview the first rows of both frames (reference, then current).
display(reference_df.head(), current_df.head())

In [None]:
# Columns removed from both datasets before encoding.
drop_cols = ["posting_date"]

reference_df = drop_columns(df=reference_df, columns=drop_cols)
current_df = drop_columns(df=current_df, columns=drop_cols)

# Preview the first rows of both frames (reference, then current).
display(reference_df.head(), current_df.head())

In [None]:
# Categorical columns to mean-encode.
cat_variables = [
    "manufacturer", "fuel", "title_status",
    "transmission", "type", "paint_color",
]

# Fit the encoder on the reference data (this also pickles it to disk) ...
reference_df = encode_categorical_variables(
    df=reference_df, cat_variables=cat_variables, fit=True
)
# ... then load that pickled encoder, without refitting, for the current data.
current_df = encode_categorical_variables(
    df=current_df, cat_variables=cat_variables, fit=False
)

# Preview the first rows of both frames (reference, then current).
display(reference_df.head(), current_df.head())


# 2. Test Suites

In [None]:
# Assemble the checks, grouped the same way as in the import cell.
custom_test_suite = TestSuite(
    tests=[
        # Quality
        TestHighlyCorrelatedColumns(),
        # Integrity
        TestNumberOfRows(),
        TestColumnNumberOfMissingValues(column_name="paint_color"),
        TestNumberOfOutRangeValues(column_name="transmission"),
        # Drift
        TestColumnDrift(
            column_name="odometer",
            stattest="psi",
            stattest_threshold=0.3,
        ),
        TestNumberOfDriftedColumns(),
    ]
)

# Compare the current dataset against the reference one, then render the report.
custom_test_suite.run(reference_data=reference_df, current_data=current_df)
custom_test_suite.show(mode="inline")

In [None]:
# Complete view of summary: show the whole "summary" entry of the suite's
# dictionary export as the cell output.
custom_test_suite.as_dict()["summary"]

In [None]:
# Get number of passed tests: read the "success_tests" entry of the suite
# summary and print it.
number_of_passed = custom_test_suite.as_dict()["summary"]["success_tests"]
print(f"The number of passed tests is: {number_of_passed}.")

In [None]:
# Get number of failed tests: read the "failed_tests" entry of the suite
# summary and print it.
number_of_failed = custom_test_suite.as_dict()["summary"]["failed_tests"]
print(f"The number of failed tests is: {number_of_failed}.")
