In [1]:
# Evidently is used to evaluate, test, and monitor data and ML model drift.
# Uncomment the line below to install it into the kernel's environment
# (%pip, unlike !pip, targets the environment of the running kernel).
# NOTE(review): 0.4.0 is taken from the "version" field of the JSON export
# further down in this notebook - confirm it is the library version to pin.
# %pip install -q evidently==0.4.0

Evidently is a tool to analyse and monitor data and machine learning models.

In [2]:
# Record the interpreter version that produced the outputs in this notebook.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.9.7


In [3]:
# import libraries
import pandas as pd
import numpy as np
from sklearn import datasets
from evidently.report import Report
from evidently.metrics import DataDriftTable
from evidently.metrics import DatasetDriftMetric

import warnings
# Suppress every warning so that cell outputs stay uncluttered.
# NOTE(review): this blanket filter also hides pandas warnings (for example
# SettingWithCopyWarning) and library deprecation notices - consider narrowing
# it, or removing it while debugging.
warnings.filterwarnings('ignore')

In [4]:
# Load the OpenML "adult" dataset (version 2). It is split into reference and
# current datasets in a later cell.
adult_data = datasets.fetch_openml(
    name="adult",
    version=2,
    as_frame="auto",
)
adult = adult_data.frame

# Display (rows, columns) as the cell output.
adult.shape

(48842, 15)

In [5]:
# Preview the first five rows (head() defaults to n=5).
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [6]:
# Split the data by education level to obtain two populations to compare:
# rows with one of the listed education levels form the "current" dataset,
# all remaining rows form the "reference" dataset.
CURRENT_EDUCATION_LEVELS = ['Some-college', 'HS-grad', 'Bachelors']
is_current = adult.education.isin(CURRENT_EDUCATION_LEVELS)

# .copy() makes both frames independent of `adult`. adult_cur is modified in
# place in a later cell; without an explicit copy that assignment targets a
# filtered selection of `adult`, which is the pattern pandas reports with
# SettingWithCopyWarning.
adult_ref = adult[~is_current].copy()
adult_cur = adult[is_current].copy()

# Display the shapes of both datasets as the cell output.
adult_ref.shape, adult_cur.shape

((14155, 15), (34687, 15))

In [7]:
# Quick data-drift check using Evidently's DataDriftPreset.
from evidently.metric_preset import DataDriftPreset

data_drift_report = Report(metrics=[DataDriftPreset()])

# NOTE(review): the slices below compare only the first 60 rows of adult_cur
# against adult_ref without its first 60 rows. The full-frame run later in the
# notebook uses no slicing - confirm that this asymmetric sampling is intended.
data_drift_report.run(
    current_data=adult_cur.iloc[:60],
    reference_data=adult_ref.iloc[60:],
    column_mapping=None,
)

# Save under a dedicated name: the dataset-level report generated later in this
# notebook is written to "file.html" and would otherwise overwrite this one.
data_drift_report.save_html("data_drift_preset_report.html")

In [8]:
# Preview the current dataset before any values are blanked out
# (head() defaults to n=5).
adult_cur.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
6,29.0,,227026.0,HS-grad,9.0,Never-married,,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K
8,24.0,Private,369667.0,Some-college,10.0,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K


In [9]:
# Blank out two columns in the leading rows of the current dataset.
# Positional columns 3 and 4 are 'education' and 'education-num' in the
# frame previews shown in this notebook.
N_ROWS_TO_BLANK = 2000
COLUMNS_TO_BLANK = slice(3, 5)

adult_cur.iloc[:N_ROWS_TO_BLANK, COLUMNS_TO_BLANK] = np.nan

In [10]:
# Preview the current dataset again to confirm that the two columns are now NaN
# (head() defaults to n=5).
adult_cur.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
1,38.0,Private,89814.0,,,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
3,44.0,Private,160323.0,,,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,,,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
6,29.0,,227026.0,,,Never-married,,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K
8,24.0,Private,369667.0,,,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K


### Drift Detection Dashboard - created using EvidentlyAI

In [11]:
# Build the drift report from two metrics:
#   - DatasetDriftMetric: dataset-level summary (number and share of drifted
#     columns, overall dataset_drift flag)
#   - DataDriftTable: per-column drift details for reference vs. current
drift_metrics = [
    DatasetDriftMetric(),
    DataDriftTable(),
]
data_drift_dataset_report = Report(metrics=drift_metrics)

# Compare the full current dataset against the full reference dataset.
data_drift_dataset_report.run(
    current_data=adult_cur,
    reference_data=adult_ref,
)

# Leave the report as the last expression so the notebook displays it.
data_drift_dataset_report

In [12]:
# Write the dataset-level drift report to an HTML file.
report_path = "file.html"
data_drift_dataset_report.save_html(report_path)

In [13]:
# Serialise the drift report to a JSON string and show it as the cell output.
report_json = data_drift_dataset_report.json()
report_json

'{"version": "0.4.0", "metrics": [{"metric": "DatasetDriftMetric", "result": {"drift_share": 0.5, "number_of_columns": 15, "number_of_drifted_columns": 5, "share_of_drifted_columns": 0.3333333333333333, "dataset_drift": false}}, {"metric": "DataDriftTable", "result": {"number_of_columns": 15, "number_of_drifted_columns": 5, "share_of_drifted_columns": 0.3333333333333333, "dataset_drift": false, "drift_by_columns": {"age": {"column_name": "age", "column_type": "num", "stattest_name": "Wasserstein distance (normed)", "stattest_threshold": 0.1, "drift_score": 0.18534692319041995, "drift_detected": true, "current": {"small_distribution": {"x": [17.0, 24.3, 31.6, 38.9, 46.2, 53.5, 60.8, 68.1, 75.4, 82.7, 90.0], "y": [0.02471021672878118, 0.025839691234843417, 0.0262859521410848, 0.025211766596857754, 0.015942967066340047, 0.010173168977679455, 0.0061528716099474344, 0.0018640278561586543, 0.000568686464590777, 0.0002369526935794904]}}, "reference": {"small_distribution": {"x": [17.0, 24.3, 

In [14]:
# Report as a plain Python dict, convenient for reading individual metric
# values in code. The call is active (not commented out) so that the output
# shown below this cell is reproduced on Restart & Run All.
data_drift_dataset_report.as_dict()

{'metrics': [{'metric': 'DatasetDriftMetric',
   'result': {'drift_share': 0.5,
    'number_of_columns': 15,
    'number_of_drifted_columns': 5,
    'share_of_drifted_columns': 0.3333333333333333,
    'dataset_drift': False}},
  {'metric': 'DataDriftTable',
   'result': {'number_of_columns': 15,
    'number_of_drifted_columns': 5,
    'share_of_drifted_columns': 0.3333333333333333,
    'dataset_drift': False,
    'drift_by_columns': {'age': {'column_name': 'age',
      'column_type': 'num',
      'stattest_name': 'Wasserstein distance (normed)',
      'stattest_threshold': 0.1,
      'drift_score': 0.18534692319041995,
      'drift_detected': True,
      'current': {'small_distribution': {'x': [17.0,
         24.3,
         31.6,
         38.9,
         46.2,
         53.5,
         60.8,
         68.1,
         75.4,
         82.7,
         90.0],
        'y': [0.02471021672878118,
         0.025839691234843417,
         0.0262859521410848,
         0.025211766596857754,
         0.01

Note:

The report may not display correctly inside the notebook. See https://github.com/evidentlyai/evidently/issues/23 for details; if that happens, open the HTML file written by `save_html` instead.

Ref: https://www.datacamp.com/tutorial/understanding-data-drift-model-drift