# Baseline model for batch monitoring example

In [1]:
import requests
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnQuantileMetric, ColumnValuePlot, DatasetCorrelationsMetric, DatasetSummaryMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
# NOTE: this whole cell is disabled (every line is commented out).
# When enabled, it streams each listed parquet file from the URL below into the
# given directory, showing a tqdm progress bar sized by the Content-Length header.
# files = [('green_tripdata_2024-03.parquet', './data'), ('green_tripdata_2022-02.parquet', './data'), ('green_tripdata_2022-01.parquet', './data')]

# print("Download files:")
# for file, path in files:
#     url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
#     resp=requests.get(url, stream=True)
#     save_path=f"{path}/{file}"
#     with open(save_path, "wb") as handle:
#         for data in tqdm(resp.iter_content(),
#                         desc=f"{file}",
#                         postfix=f"save to {save_path}",
#                         total=int(resp.headers["Content-Length"])):
#             handle.write(data)

In [2]:
# Load the 2024-03 trip file; the trailing expression displays (rows, columns).
march24_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')
march24_data.shape

(57457, 20)

In [3]:
# create target: trip duration in minutes.
# Computed with the vectorised .dt accessor instead of a per-row Python lambda.
march24_data["duration_min"] = (
    march24_data.lpep_dropoff_datetime - march24_data.lpep_pickup_datetime
).dt.total_seconds() / 60

# filter out outliers: keep trips of 0..60 minutes with more than 0 and at most 8 passengers
march24_data = march24_data[(march24_data.duration_min >= 0) & (march24_data.duration_min <= 60)]
march24_data = march24_data[(march24_data.passenger_count > 0) & (march24_data.passenger_count <= 8)]

# march24_data.duration_min.hist()

# data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

# Positional split: first half of the rows for training, the rest for validation.
test_part_q = 0.5
test_part = int(march24_data.shape[0] * test_part_q)
# .copy() makes both halves independent frames, so adding a 'prediction' column
# later does not write to a slice of march24_data (SettingWithCopyWarning).
train_data = march24_data[:test_part].copy()
val_data = march24_data[test_part:].copy()

model = LinearRegression()

# Categorical ID columns are passed to the regression as plain numeric columns.
model.fit(train_data[num_features + cat_features], train_data[target])


In [4]:
# Score the training half and keep the predictions next to the features.
train_preds = model.predict(train_data[num_features + cat_features])
train_data['prediction'] = train_preds

# Same for the validation half.
val_preds = model.predict(val_data[num_features + cat_features])
val_data['prediction'] = val_preds

# Mean absolute error: training rows first, then validation rows.
print(mean_absolute_error(y_true=train_data['duration_min'], y_pred=train_data['prediction']))
print(mean_absolute_error(y_true=val_data['duration_min'], y_pred=val_data['prediction']))

3.713560693539708
3.7212354172855644


In [5]:
# Persist the fitted model with joblib's dump.
with open("models/lin_reg.bin", "wb") as model_file:
    dump(model, model_file)

# Write the validation frame to the reference parquet file.
val_data.to_parquet("data/reference.parquet")

# Evidently Report

In [None]:
# Column roles handed to Evidently: the prediction column plus the numerical and
# categorical feature lists. target is None, so no target column is mapped.
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features
)

In [None]:
# Report definition. The order of the metrics matters when results are later
# read by position from report.as_dict()['metrics'].
report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    ColumnQuantileMetric(column_name='fare_amount', quantile=0.5),
    # ColumnValuePlot(column_name='fare_amount'), 
    DatasetCorrelationsMetric(),
    # DatasetSummaryMetric()

]
)

In [None]:
# Training half is the reference, validation half is the current data; render in the notebook.
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)
report.show(mode='inline')

# Disabled: read the 'current' result of metrics[3] from the report dictionary.
# result = report.as_dict()
# result['metrics'][3]['result']['current']

In [None]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [None]:
# Workspace handle for the path "workspace".
ws = Workspace("workspace")

# Create the project, set its description and persist it.
project = ws.create_project("NYC Taxi Data Quality Project")
# Typo fixed in the stored description ("descriotion" -> "description").
project.description = "My project description"
project.save()

In [None]:
# Data-quality report stamped 28 March 2024.
regular_report = Report(
    metrics=[DataQualityPreset()],
    timestamp=datetime.datetime(2024, 3, 28),
)

# Current data: validation rows picked up on that day
# (lower bound inclusive, upper bound exclusive); no reference data.
regular_report.run(
    reference_data=None,
    current_data=val_data.loc[
        (val_data.lpep_pickup_datetime >= '2024-03-28')
        & (val_data.lpep_pickup_datetime < '2024-03-29')
    ],
    column_mapping=column_mapping,
)

regular_report

In [None]:
# Add the report just computed to the workspace under this project's id.
ws.add_report(project.id, regular_report)

In [None]:
# configure the dashboard
# Title-only counter panel (no aggregation).
project.dashboard.add_panel(
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="NYC taxi data dashboard",
    )
)

# Two half-width plot panels, both reading a field of DatasetSummaryMetric:
# (panel title, field path, plot type), added in this order.
for panel_title, summary_field, plot_kind in (
    ("Inference Count", "current.number_of_rows", PlotType.BAR),
    ("Number of Missing Values", "current.number_of_missing_values", PlotType.LINE),
):
    project.dashboard.add_panel(
        DashboardPanelPlot(
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            title=panel_title,
            values=[
                PanelValue(
                    metric_id="DatasetSummaryMetric",
                    field_path=summary_field,
                    legend="count",
                ),
            ],
            plot_type=plot_kind,
            size=WidgetSize.HALF,
        ),
    )

project.save()

In [None]:
# Data-quality report stamped 29 March 2024 (DataDriftPreset stays disabled).
regular_report = Report(
    metrics=[
        DataQualityPreset(),
        # DataDriftPreset()
    ],
    timestamp=datetime.datetime(2024, 3, 29),
)

# Current data: validation rows picked up on that day
# (lower bound inclusive, upper bound exclusive); no reference data.
regular_report.run(
    reference_data=None,
    current_data=val_data.loc[
        (val_data.lpep_pickup_datetime >= '2024-03-29')
        & (val_data.lpep_pickup_datetime < '2024-03-30')
    ],
    column_mapping=column_mapping,
)

regular_report

In [None]:
# Add the second daily report to the workspace under the same project id.
ws.add_report(project.id, regular_report)

In [None]:
# Load the 2022-01 trip file; describe() displays summary statistics of its columns.
jan_data = pd.read_parquet('data/green_tripdata_2022-01.parquet')
jan_data.describe()

In [None]:
# (rows, columns) of the frame as loaded.
jan_data.shape

In [None]:
# create target
jan_data["duration_min"] = jan_data.lpep_dropoff_datetime - jan_data.lpep_pickup_datetime
jan_data.duration_min = jan_data.duration_min.apply(lambda td : float(td.total_seconds())/60)

In [None]:
# filter out outliers
jan_data = jan_data[(jan_data.duration_min >= 0) & (jan_data.duration_min <= 60)]
jan_data = jan_data[(jan_data.passenger_count > 0) & (jan_data.passenger_count <= 8)]

In [None]:
# Histogram of the trip durations in minutes.
jan_data.duration_min.hist()

In [None]:
# data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

In [None]:
# (rows, columns) of the current jan_data frame.
jan_data.shape

In [None]:
# Positional split: the first 30000 rows train the model, the remainder validates it.
# .copy() makes both parts independent frames, so adding a 'prediction' column
# later does not write to a slice of jan_data (SettingWithCopyWarning).
train_data = jan_data[:30000].copy()
val_data = jan_data[30000:].copy()

In [None]:
# Linear regression with default settings.
model = LinearRegression()

In [None]:
# Fit on the numerical + categorical feature columns of the training rows.
model.fit(train_data[num_features + cat_features], train_data[target])

In [None]:
# Predict on the training rows and store the result as the 'prediction' column.
train_preds = model.predict(train_data[num_features + cat_features])
train_data['prediction'] = train_preds

In [None]:
# Predict on the validation rows and store the result as the 'prediction' column.
val_preds = model.predict(val_data[num_features + cat_features])
val_data['prediction'] = val_preds

In [None]:
# Mean absolute error: training rows first, then validation rows.
print(mean_absolute_error(y_true=train_data['duration_min'], y_pred=train_data['prediction']))
print(mean_absolute_error(y_true=val_data['duration_min'], y_pred=val_data['prediction']))

# Dump model and reference data

In [None]:
# Persist the fitted model with joblib's dump.
with open("models/lin_reg.bin", "wb") as model_file:
    dump(model, model_file)

In [None]:
# Write the validation frame to the reference parquet file.
val_data.to_parquet('data/reference.parquet')

# Evidently Report

In [None]:
# Column roles handed to Evidently: the prediction column plus the numerical and
# categorical feature lists. target is None, so no target column is mapped.
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features
)

In [None]:
# Three metrics; their position in this list is the index used when reading
# report.as_dict()['metrics'].
report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
]
)

In [None]:
# Training rows are the reference, validation rows are the current data.
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [None]:
# Render the report inside the notebook.
report.show(mode='inline')

In [None]:
# Report results as a dictionary, for reading individual values below.
result = report.as_dict()

In [None]:
# Display the whole result dictionary.
result

In [None]:
#prediction drift
result['metrics'][0]['result']['drift_score']

In [None]:
#number of drifted columns
result['metrics'][1]['result']['number_of_drifted_columns']

In [None]:
#share of missing values
result['metrics'][2]['result']['current']['share_of_missing_values']

# Evidently Dashboard

In [None]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [None]:
# Workspace handle for the path "workspace".
ws = Workspace("workspace")

In [None]:
# Create the project, set its description and persist it.
project = ws.create_project("NYC Taxi Data Quality Project")
# Typo fixed in the stored description ("descriotion" -> "description").
project.description = "My project description"
project.save()

In [None]:
# Data-quality report stamped 28 January 2022.
regular_report = Report(
    metrics=[DataQualityPreset()],
    timestamp=datetime.datetime(2022, 1, 28),
)

# Current data: validation rows picked up on that day
# (lower bound inclusive, upper bound exclusive); no reference data.
regular_report.run(
    reference_data=None,
    current_data=val_data.loc[
        (val_data.lpep_pickup_datetime >= '2022-01-28')
        & (val_data.lpep_pickup_datetime < '2022-01-29')
    ],
    column_mapping=column_mapping,
)

regular_report

In [None]:
# Add the report just computed to the workspace under this project's id.
ws.add_report(project.id, regular_report)

In [None]:
# configure the dashboard
# Title-only counter panel (no aggregation).
project.dashboard.add_panel(
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="NYC taxi data dashboard",
    )
)

# Two half-width plot panels, both reading a field of DatasetSummaryMetric:
# (panel title, field path, plot type), added in this order.
for panel_title, summary_field, plot_kind in (
    ("Inference Count", "current.number_of_rows", PlotType.BAR),
    ("Number of Missing Values", "current.number_of_missing_values", PlotType.LINE),
):
    project.dashboard.add_panel(
        DashboardPanelPlot(
            filter=ReportFilter(metadata_values={}, tag_values=[]),
            title=panel_title,
            values=[
                PanelValue(
                    metric_id="DatasetSummaryMetric",
                    field_path=summary_field,
                    legend="count",
                ),
            ],
            plot_type=plot_kind,
            size=WidgetSize.HALF,
        ),
    )

project.save()

In [None]:
# Data-quality report stamped 29 January 2022.
regular_report = Report(
    metrics=[DataQualityPreset()],
    timestamp=datetime.datetime(2022, 1, 29),
)

# Current data: validation rows picked up on that day
# (lower bound inclusive, upper bound exclusive); no reference data.
regular_report.run(
    reference_data=None,
    current_data=val_data.loc[
        (val_data.lpep_pickup_datetime >= '2022-01-29')
        & (val_data.lpep_pickup_datetime < '2022-01-30')
    ],
    column_mapping=column_mapping,
)

regular_report

In [None]:
# Add the second daily report to the workspace under the same project id.
ws.add_report(project.id, regular_report)