In [64]:
import os
import mlflow
import pandas as pd
from evidently import Report
from evidently import DataDefinition
from evidently import Dataset
from evidently.metrics import ValueDrift, DriftedColumnsCount, MissingValueCount


In [65]:
# The training split acts as the drift reference; the test split supplies the
# rows that are later scored as "current" data.
reference_data = pd.read_parquet("../src/mlops/data/train_dataset.parquet")
raw_data = pd.read_parquet("../src/mlops/data/test_dataset.parquet")
# Renumber rows from 0 so they can be addressed by position.
raw_data = raw_data.reset_index(drop=True)

In [66]:
def identify_categorical_encoded_vars(df, maxcat=8, target='target'):
    """Return the columns of ``df`` that look like encoded categoricals.

    A column qualifies when it holds fewer than ``maxcat`` distinct values
    (missing values count as one distinct value) and is not the target column.

    Args:
        df: pandas DataFrame to inspect.
        maxcat: Exclusive upper bound on the number of distinct values.
        target: Name of the label column, which is never reported.

    Returns:
        List of qualifying column names, in the order they appear in ``df``.
    """
    return [
        col
        for col in df.columns
        # dropna=False counts NaN as a value, matching len(Series.unique()).
        if col != target and df[col].nunique(dropna=False) < maxcat
    ]

In [67]:
def load_model():
    """Load the latest registered model from MLflow as a pyfunc model.

    The tracking server is taken from ``MLFLOW_TRACKING_URI`` (default
    ``http://localhost:5500``). The model name is taken from
    ``EXPERIMENT_NAME`` (default ``xgb_best_model``) and is placed in a
    ``models:/<name>/latest`` URI.

    Returns:
        The object returned by ``mlflow.pyfunc.load_model``.
    """
    tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5500")
    mlflow.set_tracking_uri(tracking_uri)
    registered_name = os.getenv("EXPERIMENT_NAME", "xgb_best_model")
    return mlflow.pyfunc.load_model(f"models:/{registered_name}/latest")

In [69]:
# Low-cardinality columns of the test split are treated as categorical features.
categorical_vars = identify_categorical_encoded_vars(df=raw_data)
categorical_vars

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

In [70]:
# Every remaining column, apart from the label, is treated as numerical.
numerical_vars = [
    col
    for col in raw_data.columns
    if col != 'target' and col not in categorical_vars
]
numerical_vars

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [27]:
# Column roles for Evidently; the model output is registered as a numerical column.
data_definition = DataDefinition(
    categorical_columns=categorical_vars,
    numerical_columns=[*numerical_vars, 'prediction'],
)

In [28]:
# The order matters: calculate_metrics reads the results back by position
# (0 = prediction drift, 1 = drifted column count, 2 = missing predictions).
monitored_metrics = [
    ValueDrift(column='prediction'),
    DriftedColumnsCount(),
    MissingValueCount(column='prediction'),
]
report = Report(metrics=monitored_metrics)

In [29]:
# Fetch the model once; the scoring cells below reuse this object.
model = load_model()

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [54]:
# Score the reference set and binarise the model output at 0.5.
# Any "prediction" column left by an earlier run of this cell is dropped first;
# otherwise a re-run would pass the old predictions to the model as a feature.
reference_features = reference_data.drop("target", axis=1).drop(
    columns="prediction", errors="ignore"
)
reference_data["prediction"] = (model.predict(reference_features) > 0.5).astype(int)

In [61]:
def calculate_metrics(i):
    """Run the drift report for a single row of ``raw_data``.

    The row is scored with ``model`` and compared against ``reference_data``
    using the module-level ``report`` and ``data_definition``.

    Args:
        i: Positional index of the row in ``raw_data`` to score and evaluate.

    Returns:
        Tuple ``(prediction_drift, num_drifted_columns, share_missing_values)``
        read from the report results in the order the metrics were configured.
    """
    # NOTE(review): a one-row "current" window makes the drift statistics
    # degenerate (numpy reports divide-by-zero / degrees-of-freedom warnings);
    # consider evaluating batches of rows instead.
    # Copy the slice so adding a column never writes into a view of raw_data
    # (avoids pandas' SettingWithCopyWarning).
    current_data = raw_data.iloc[[i]].copy()

    # Binarise at 0.5 exactly as is done for reference_data["prediction"], so
    # the drift metric compares like with like.
    scores = model.predict(current_data[numerical_vars + categorical_vars])
    current_data['prediction'] = (scores > 0.5).astype(int)

    current_dataset = Dataset.from_pandas(current_data, data_definition=data_definition)
    reference_dataset = Dataset.from_pandas(reference_data, data_definition=data_definition)
    run = report.run(reference_data=reference_dataset, current_data=current_dataset)
    result = run.dict()

    # Metrics come back in the order they were passed to Report(...).
    prediction_drift = result['metrics'][0]['value']
    num_drifted_columns = result['metrics'][1]['value']['count']
    share_missing_values = result['metrics'][2]['value']['share']
    return prediction_drift, num_drifted_columns, share_missing_values

In [63]:
# Print the per-row drift metrics for test rows 10 through 29.
for i in range(10, 30):
    row_metrics = calculate_metrics(i)
    print(i, row_metrics)


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divi

10 (np.float64(0.0), 2.0, np.float64(0.0))
11 (np.float64(0.0), 1.0, np.float64(0.0))
12 (np.float64(0.0), 2.0, np.float64(0.0))
13 (np.float64(0.0), 1.0, np.float64(0.0))
14 (np.float64(0.0), 1.0, np.float64(0.0))
15 (np.float64(0.0), 1.0, np.float64(0.0))



divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divi

16 (np.float64(0.0), 1.0, np.float64(0.0))
17 (np.float64(0.0), 2.0, np.float64(0.0))
18 (np.float64(0.0), 3.0, np.float64(0.0))
19 (np.float64(0.0), 1.0, np.float64(0.0))
20 (np.float64(0.0), 2.0, np.float64(0.0))
21 (np.float64(0.0), 1.0, np.float64(0.0))



Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


22 (np.float64(0.0), 2.0, np.float64(0.0))
23 (np.float64(0.0), 1.0, np.float64(0.0))
24 (np.float64(0.0), 2.0, np.float64(0.0))
25 (np.float64(0.0), 3.0, np.float64(0.0))
26 (np.float64(0.0), 5.0, np.float64(0.0))
27 (np.float64(0.0), 3.0, np.float64(0.0))



divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide



28 (np.float64(0.0), 2.0, np.float64(0.0))
29 (np.float64(0.0), 1.0, np.float64(0.0))



Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide


divide by zero encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


divide by zero encountered in divide



In [44]:
# Score the whole test split so it carries the "prediction" column that the
# data definition and the report metrics require; raw_data itself is left untouched.
current_data = raw_data.copy()
current_data["prediction"] = (
    model.predict(current_data[numerical_vars + categorical_vars]) > 0.5
).astype(int)
current_dataset = Dataset.from_pandas(current_data, data_definition=data_definition)

In [46]:
# Build the reference dataset here: the only other place it is created is a
# local variable inside calculate_metrics, which is not visible at notebook scope.
reference_dataset = Dataset.from_pandas(reference_data, data_definition=data_definition)
run = report.run(reference_data=reference_dataset, current_data=current_dataset)

In [47]:
# Convert the report run into a plain dict so individual metric values can be read.
result = run.dict()

In [48]:
# Display the full metrics payload for the whole test split.
result

{'metrics': [{'id': 'bfc6e8246d39abff41fc2e002575d9a3',
   'metric_id': 'ValueDrift(column=prediction)',
   'value': np.float64(0.6617391044549499)},
  {'id': '15e89f895b482f9b84ba7274ed18a106',
   'metric_id': 'DriftedColumnsCount(drift_share=0.5)',
   'value': {'count': 1.0, 'share': 0.07142857142857142}},
  {'id': 'd57fce37e7dac2a48797649e0e142902',
   'metric_id': 'MissingValueCount(column=prediction)',
   'value': {'count': 0.0, 'share': np.float64(0.0)}}],
 'tests': []}