# Baseline model for batch monitoring example

In [None]:
!pip install -r /workspaces/mlops-zoomcamp/05-monitoring/homework/requirements.txt

In [4]:
import datetime
import os
import pickle

import pandas as pd
import requests

# from evidently import DataDefinition
# from evidently import Dataset
# from evidently import Report
# from evidently.metrics import ValueDrift, DriftedColumnsCount, MissingValueCount
# from evidently.presets import DataSummaryPreset, DataDriftPreset

from evidently import ColumnMapping
from evidently.report import Report

from evidently.metrics import (
    ColumnDriftMetric,
    DatasetDriftMetric,
    DatasetMissingValuesMetric,
    ColumnQuantileMetric,
    DatasetCorrelationsMetric
)

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [5]:
#! mkdir data

In [None]:
# (file name, target directory) pairs for the green taxi trip data used in this notebook
files = [
    ('green_tripdata_2022-01.parquet', './data'),
    ('green_tripdata_2024-01.parquet', './data'),
    ('green_tripdata_2024-02.parquet', './data'),
    ('green_tripdata_2024-03.parquet', './data'),
]

# iter_content() yields 1-byte chunks by default, which makes the download loop
# extremely slow; read in 1 MiB chunks instead.
CHUNK_SIZE = 1024 * 1024

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    # The target directory is not guaranteed to exist on a fresh checkout
    os.makedirs(path, exist_ok=True)
    # The context manager releases the connection even if the download fails midway
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly instead of saving an HTTP error page as a .parquet file
        resp.raise_for_status()
        # Content-Length may be absent; tqdm then shows progress without a total
        total = int(resp.headers.get("Content-Length", 0)) or None
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total,
            unit="B",
            unit_scale=True,
        ) as progress:
            for data in resp.iter_content(chunk_size=CHUNK_SIZE):
                handle.write(data)
                progress.update(len(data))

In [5]:
# Load the March 2024 green taxi trips from the local data directory
march_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')

In [None]:
# Summary statistics of the March data, shown as the cell output
march_data.describe()

In [None]:
# Q1: number of rows in march_data at the time this cell runs
print(f"There are {len(march_data)} rows in the March 2024 Green Taxi Dataset")

## Model training

In [8]:
# Load the January 2022 trips. The `ehail_fee` column is empty, so it is cast to
# float64 for consistency with the March 2024 data.
jan_data = (
    pd.read_parquet('data/green_tripdata_2022-01.parquet')
    .astype({'ehail_fee': 'float64'})
)

In [9]:
def preprocess(data):
    """Add the trip-duration target column and drop outlier rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Trip records with ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``
        and ``passenger_count`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``duration_min`` column (drop-off minus pick-up,
        in minutes), keeping only rows with 0 <= duration_min <= 60 and
        0 < passenger_count <= 8. The input frame is not modified.
    """
    # Create target. assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a column; .dt.total_seconds() is the vectorized form of the
    # per-row timedelta conversion.
    trip_length = data.lpep_dropoff_datetime - data.lpep_pickup_datetime
    data = data.assign(duration_min=trip_length.dt.total_seconds() / 60)
    # Filter out outliers
    keep = (
        data.duration_min.between(0, 60)
        & (data.passenger_count > 0)
        & (data.passenger_count <= 8)
    )
    # Return an independent copy so that adding columns to the result later
    # does not trigger SettingWithCopyWarning.
    return data[keep].copy()

# Rebind both frames to their preprocessed versions (target column added, outliers removed).
# Note: re-running this cell applies preprocess() again to the already-filtered frames.
jan_data = preprocess(jan_data)
march_data = preprocess(march_data)

In [10]:
# Data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

In [11]:
# Positional split of the January data: the first 30,000 rows are used for training,
# the remaining rows for validation.
# .copy() gives each split its own frame, so adding a `prediction` column to it later
# does not trigger SettingWithCopyWarning.
train_data = jan_data[:30000].copy()
val_data = jan_data[30000:].copy()

In [12]:
# Linear regression with default settings
model = LinearRegression()

In [None]:
# Fit on the training split; numeric and categorical columns are passed together
# as-is (no encoding step is applied in the cells shown here).
model.fit(train_data[num_features + cat_features], train_data[target])

In [14]:
# Score all three datasets with the fitted model on the same feature columns ...
train_preds, val_preds, mar_24_preds = (
    model.predict(frame[num_features + cat_features])
    for frame in (train_data, val_data, march_data)
)

# ... and store the predictions next to the data they belong to
train_data['prediction'] = train_preds
val_data['prediction'] = val_preds
march_data['prediction'] = mar_24_preds

In [None]:
# Mean absolute error on the training split, then on the validation split
print(mean_absolute_error(train_data[target], train_data['prediction']))
print(mean_absolute_error(val_data[target], val_data['prediction']))

In [17]:
!mkdir models

In [14]:
# Serialize the fitted model to models/lin_reg.bin with pickle
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump(model, f_out)

In [15]:
# Write the validation split to data/reference.parquet
val_data.to_parquet('data/reference.parquet')

## Evidently Reports

In [16]:
# NOTE(review): these lists hold the same values as num_features / cat_features and are
# not referenced by the cells shown in this notebook (the ColumnMapping calls use
# num_features / cat_features) — consider removing them.
num_feat = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_feat = ["PULocationID", "DOLocationID"]

In [None]:
# Tell Evidently which columns are the prediction and the numeric / categorical
# features; no target is mapped for this report.
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features,
)

# Dataset-level drift: training split as reference, validation split as current data
report = Report(metrics=[DatasetDriftMetric()])
report.run(
    reference_data=train_data,
    current_data=val_data,
    column_mapping=column_mapping,
)
report.show(mode='inline')

In [None]:
# pprint is imported here and reused by later cells
from pprint import pprint

# Pretty-print the report results as a nested dict
pprint(report.as_dict())

Since val_data comes from the same January 2022 dataset as the training data, it makes sense that no dataset drift is detected. However, let's modify the fare_amount values to simulate a case in which taxi fares suddenly increase due to an external influence, such as a tax increase. This will help us observe how Evidently detects and reports the drift.

Here's how we can modify the fare_amount column to simulate this scenario:

* Create a copy of the val_data dataset.
* Apply an increase to the fare_amount values in this copy.

This simulation will allow us to observe how Evidently's DatasetDriftMetric reacts to a sudden change in the dataset.

In [None]:
# Copy of the validation split with every fare raised by 5 to simulate a sudden price increase
drift_data = val_data.assign(fare_amount=val_data['fare_amount'] + 5)

# Same dataset-drift check as before, now against the shifted copy
report = Report(metrics=[DatasetDriftMetric()])
report.run(
    reference_data=train_data,
    current_data=drift_data,
    column_mapping=column_mapping,
)
pprint(report.as_dict())

Now let's run the report using the March 2024 dataset as our current data. Since we're comparing data that are two years apart (2022 vs 2024), we expect Evidently to detect multiple instances of column drift. This comparison will help us identify significant changes in the dataset over time.

In [None]:
# Dataset-level drift: training split as reference, March 2024 as current data
report = Report(metrics=[DatasetDriftMetric()])
report.run(
    reference_data=train_data,
    current_data=march_data,
    column_mapping=column_mapping,
)
pprint(report.as_dict())

### Q2: DatasetCorrelationsMetric
This is part of Question 2: add one metric of your choice (...)

This metric is pretty straightforward: it calculates the correlations between all columns in the dataset, using the following methods:

* Pearson
* Spearman
* Kendall
* Cramer's V

In [None]:
# Column correlations: training split as reference, March 2024 as current data
report = Report(metrics=[DatasetCorrelationsMetric()])
report.run(
    reference_data=train_data,
    current_data=march_data,
    column_mapping=column_mapping,
)
report.show(mode='inline')

In [None]:
# Median (0.5 quantile) of fare_amount: training split vs March 2024
fare_median = ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)
report = Report(metrics=[fare_median])
report.run(
    reference_data=train_data,
    current_data=march_data,
    column_mapping=column_mapping,
)
report.show(mode='inline')

In [None]:
# duration_min has to be declared in the column mapping (as the target) to be able
# to visualize its distribution in the report.
column_mapping = ColumnMapping(
    target='duration_min',
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features,
)

# Median (0.5 quantile) of the trip duration: training split vs March 2024
report = Report(metrics=[ColumnQuantileMetric(column_name='duration_min', quantile=0.5)])
report.run(
    reference_data=train_data,
    current_data=march_data,
    column_mapping=column_mapping,
)
report.show(mode='inline')

In [29]:
# Back to a mapping without a target column
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features,
)

# One report combining all metrics; the order of this list determines the order of
# the entries in report.as_dict()['metrics'].
combined_metrics = [
    ColumnDriftMetric(column_name='prediction'),
    DatasetCorrelationsMetric(),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    ColumnQuantileMetric(column_name='fare_amount', quantile=0.5),
]
report = Report(metrics=combined_metrics)
report.run(
    reference_data=train_data,
    current_data=march_data,
    column_mapping=column_mapping,
)

In [None]:
# Render the most recently run report inline in the notebook
report.show(mode='inline')

In [None]:
report_dict = report.as_dict()
# Look the quantile metric up by name instead of by position, so that reordering the
# metrics list passed to Report does not silently print another metric's result.
quantile_result = next(
    entry['result']
    for entry in report_dict['metrics']
    if entry['metric'] == 'ColumnQuantileMetric'
)
print(f"Example: 50th Percentile for the current fare_amount column = {quantile_result['current']['value']}")

## Visualization with Grafana

In [None]:
# Show the contents of grafana_db_config/init.sql (path is relative to the working directory)
!cat ./grafana_db_config/init.sql

# Evidently Report

In [None]:
# NOTE(review): DataDefinition and Dataset appear only in commented-out imports in the
# import cell; unless they are imported elsewhere, this cell raises NameError on a
# fresh kernel — confirm which Evidently API version this section targets.
# The prediction column is declared as numerical alongside the numeric features.
data_definition = DataDefinition(numerical_columns=num_features + ['prediction'], categorical_columns=cat_features)
# Wrap the training split in an Evidently Dataset using the definition above
train_dataset = Dataset.from_pandas(
    train_data,
    data_definition
)

# Same wrapping for the validation split
val_dataset = Dataset.from_pandas(
    val_data,
    data_definition
)

In [None]:
# NOTE(review): ValueDrift, DriftedColumnsCount and MissingValueCount appear only in a
# commented-out import, and the import cell takes Report from evidently.report —
# confirm the intended imports before running this cell.
report = Report(metrics=[
    ValueDrift(column='prediction'),
    DriftedColumnsCount(),
    MissingValueCount(column='prediction'),
]
)

In [None]:
# Run the report with train_dataset as reference and val_dataset as current data,
# keeping the return value as `snapshot`
snapshot = report.run(reference_data=train_dataset, current_data=val_dataset)

In [None]:
# Display the snapshot as the cell output
snapshot

In [None]:
# Keep the value returned by the snapshot's dict() method for inspection
result = snapshot.dict()

In [None]:
# Display the result as the cell output
result