In [1]:
from pathlib import Path

import pandas as pd

from model_monitoring import ModelMonitoring

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# Load the four CSV inputs used by the monitoring checks below: the cleaned test and
# incoming datasets, plus the processed-feature CSV that accompanies each of them.
# DATA_DIR is the single place to edit if the data folder is moved.
DATA_DIR = Path("../data")

incoming_df = pd.read_csv(DATA_DIR / "incoming_data_cleaned.csv")
test_df = pd.read_csv(DATA_DIR / "test_data_cleaned.csv")
incoming_features_df_processed = pd.read_csv(DATA_DIR / "incoming_features_df_processed.csv")
test_features_df_processed = pd.read_csv(DATA_DIR / "test_features_df_processed.csv")

In [3]:
# Instantiate the ModelMonitoring object; every check in the cells below is a method call on it.
model_monitoring = ModelMonitoring()

In [4]:
# First, run the data checks to ensure the data quality is acceptable and the
# preprocessing pipeline has no problems.
# Arguments, in order: the cleaned test and incoming frames, then the processed
# feature frames for test and incoming.
model_monitoring.data_check(test_df, incoming_df, test_features_df_processed, incoming_features_df_processed)

In [5]:
# Next, call the write_to_config method so that the column names and their thresholds
# are written into config.json.
model_monitoring.write_to_config()

After running `write_to_config`, open the `config.json` file. It contains a dictionary with all the column names as keys and their respective thresholds as values. The user can change the threshold of any individual column in this config file to suit the requirements for that column.

In [6]:
# Once the thresholds in config.json have been configured, run feature_drift_report to
# see how the distribution of the features has drifted between the test data and the
# incoming data. The 'target' and 'prediction' columns are dropped from both frames
# before they are passed in.
# The last argument selects the report format: this notebook uses 'json'; pass 'html'
# instead to request the HTML variant.
non_feature_columns = ['target', 'prediction']
test_df_features_only = test_df.drop(columns=non_feature_columns)
incoming_df_features_only = incoming_df.drop(columns=non_feature_columns)

model_monitoring.feature_drift_report(test_df_features_only, incoming_df_features_only, 'json')

In [7]:
# On top of feature_drift_report, the user can also run prediction_drift_report to see
# how the distribution of the prediction has shifted.
#
# Statistical tests that can be named in the third argument:
#   For categorical:
#     chisquare, z, fisher_exact, g_test, TVD
#   For numerical:
#     ks, wasserstein, anderson, cramer_von_mises, mannw, ed, es, t_test, empirical_mmd
#   For both categorical and numerical:
#     kl_div, psi, jensenshannon, hellinger
# For more information on the stat tests, please refer to:
# https://docs.evidentlyai.com/user-guide/customization/options-for-statistical-tests
#
# NOTE(review): the fourth argument (0.05) is presumably the threshold applied to the
# chosen stat test -- confirm against ModelMonitoring.prediction_drift_report.
# The last argument selects the report format: this notebook uses 'json'; pass 'html'
# instead to request the HTML variant.
model_monitoring.prediction_drift_report(test_df, incoming_df, 'chisquare', 0.05, 'json')

In [8]:
# Lastly, the user can check for dataset drift by running check_dataset_drift, which
# uses feature importance scores as a proxy to determine dataset drift.
model_monitoring.check_dataset_drift()

processed feature importance mapping : {'categorical__Department_Human Resources': 0.00047832981922485163, 'categorical__Department_Research & Development': 0.013079180154469435, 'categorical__Department_Sales': 0.011184133840550987, 'categorical__EducationField_Human Resources': 6.061732173675546e-05, 'categorical__EducationField_Life Sciences': 0.010137571129742893, 'categorical__EducationField_Marketing': 0.0020409604642020653, 'categorical__EducationField_Medical': 0.007474116748387542, 'categorical__EducationField_Other': 0.0006598829980929501, 'categorical__EducationField_Technical Degree': 0.003765665306522748, 'categorical__Gender_Female': 0.008187215571305235, 'categorical__Gender_Male': 0.010449071303759295, 'categorical__JobRole_Healthcare Representative': 0.0034660732451281726, 'categorical__JobRole_Human Resources': 0.00033984528586818575, 'categorical__JobRole_Laboratory Technician': 0.0063745083440093445, 'categorical__JobRole_Manager': 0.000786919301051128, 'categorical