# Baseline model for batch monitoring example

In [None]:
import os
import requests
import datetime
import pandas as pd
from pathlib import Path

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric
from evidently.metrics import ColumnQuantileMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
# (file name, destination directory) pairs to fetch from the trip-data bucket.
files = [('green_tripdata_2023-03.parquet', os.environ["DATA_DIR"])]

# Bytes requested from the response stream per iteration. Without an explicit
# size, iter_content() yields one byte at a time, which makes the download
# and the progress bar extremely slow.
CHUNK_SIZE = 8192

print("Download files:")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    # The context manager releases the connection even if writing fails;
    # the timeout keeps a stalled connection from hanging the cell forever.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail before creating the file so an HTTP error page is never
        # saved under the parquet file name.
        resp.raise_for_status()
        # The header may be absent; tqdm then shows a bar without a total.
        content_length = resp.headers.get("Content-Length")
        total_bytes = int(content_length) if content_length is not None else None
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total_bytes,
            unit="B",
            unit_scale=True,
        ) as progress:
            for chunk in resp.iter_content(chunk_size=CHUNK_SIZE):
                handle.write(chunk)
                # Advance by bytes written so the bar matches Content-Length.
                progress.update(len(chunk))

In [None]:
# Load the green_tripdata_2023-03 parquet file from the directory named by
# the DATA_DIR environment variable.
parquet_path = Path(os.environ["DATA_DIR"]) / "green_tripdata_2023-03.parquet"
data = pd.read_parquet(parquet_path)

In [None]:
# Summary statistics of the loaded frame, displayed as the cell output.
data.describe()

In [None]:
# (rows, columns) of the loaded frame, before any filtering.
data.shape

In [None]:
# create target: trip duration in minutes (drop-off time minus pick-up time)
# The .dt accessor converts the whole timedelta column at once instead of
# calling a Python lambda for every row.
data["duration_min"] = (
    data.lpep_dropoff_datetime - data.lpep_pickup_datetime
).dt.total_seconds() / 60

In [None]:
# filter out outliers
# Keep rows whose duration lies in [0, 60] minutes (both ends included).
duration_in_range = data.duration_min.between(0, 60)
data = data[duration_in_range]
# From what remains, keep rows with passenger_count above 0 and at most 8.
passengers_in_range = (data.passenger_count > 0) & (data.passenger_count <= 8)
data = data[passengers_in_range]

In [None]:
# Histogram of the target column after the outlier filter.
data.duration_min.hist()

In [None]:
# data labeling
# Column the regression model is trained to predict.
target = "duration_min"
# Input columns, grouped as numerical vs. categorical; the same two lists are
# later handed to the Evidently column mapping.
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [None]:
# First 30000 rows for training, the remainder for validation.
# Explicit copies: a 'prediction' column is assigned to each split later, and
# writing into a slice of `data` would raise SettingWithCopyWarning and leave
# it unclear which frame is actually modified.
train_data = data[:30000].copy()
val_data = data[30000:].copy()

In [None]:
# Baseline model: scikit-learn LinearRegression with default settings.
model = LinearRegression()

In [None]:
# Fit on the numerical and categorical columns together, against the target.
feature_columns = num_features + cat_features
model.fit(train_data[feature_columns], train_data[target])

In [None]:
# Score the training rows and store the result next to them in a
# 'prediction' column.
train_features = train_data[num_features + cat_features]
train_preds = model.predict(train_features)
train_data['prediction'] = train_preds

In [None]:
# Score the validation rows and store the result next to them in a
# 'prediction' column.
val_features = val_data[num_features + cat_features]
val_preds = model.predict(val_features)
val_data['prediction'] = val_preds

In [None]:
# Mean absolute error of the predictions: training split first, then
# validation split, one value per line.
for frame in (train_data, val_data):
    print(mean_absolute_error(frame.duration_min, frame.prediction))

# Dump model and reference data

In [None]:
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError when the file is opened.
Path('models').mkdir(parents=True, exist_ok=True)
# Serialize the fitted model with joblib.
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(model, f_out)

In [None]:
# Create the output directory if needed so the write does not fail on a
# fresh checkout.
Path("data").mkdir(parents=True, exist_ok=True)
# Persist the validation split, including its 'prediction' column, as the
# reference dataset.
val_data.to_parquet("data/reference.parquet")

# Evidently Report

In [None]:
# Tell Evidently the role of each column: which features are numerical, which
# are categorical, and where the model output lives. No target column is
# declared.
column_mapping = ColumnMapping(
    numerical_features=num_features,
    categorical_features=cat_features,
    prediction='prediction',
    target=None,
)

In [None]:
# Metrics to compute, in the order they are passed to the report:
# drift of the 'prediction' column, dataset-level drift, missing values,
# and the 0.5 quantile of fare_amount.
monitoring_metrics = [
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
]
report = Report(metrics=monitoring_metrics)

In [None]:
# Compute the metrics with the training split as reference and the validation
# split as current data, using the column roles from column_mapping.
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [None]:
# Render the report inline in the notebook output.
report.show(mode='inline')

In [None]:
# Export the computed report as a dictionary so that individual metric values
# can be read in the cells below.
result = report.as_dict()

In [None]:
# Display the whole dictionary to inspect its structure.
result

In [None]:
# Prediction drift: drift score of the 'prediction' column.
# Index 0 is the ColumnDriftMetric, the first metric passed to Report.
result['metrics'][0]['result']['drift_score']

In [None]:
# Number of drifted columns.
# Index 1 is the DatasetDriftMetric, the second metric passed to Report.
result['metrics'][1]['result']['number_of_drifted_columns']

In [None]:
# Share of missing values in the current dataset (val_data in the run above).
# Index 2 is the DatasetMissingValuesMetric, the third metric passed to Report.
result['metrics'][2]['result']['current']['share_of_missing_values']