# NannyML: Model Evaluation without labels

[HMEQ_Data](https://www.kaggle.com/datasets/ajay1735/hmeq-data) analysis for ML monitoring in production without labels.

### Library installation/Importing

In [None]:
!pip install nannyml

### Data Preparation

In [None]:
import numpy as np
import pandas as pd
import random
import datetime as dt
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
import nannyml as nml

In [None]:
# Load the HMEQ csv from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/hmeq-data/hmeq.csv")
# Preview the first five rows.
df.head(n=5)

Handling categorical data and removing missing values

In [None]:
# One-hot encode the categorical columns, then discard every row that still
# contains a missing value.
df = pd.get_dummies(df)
df = df.dropna()

Due to the heavy class imbalance, oversampling and undersampling are used.

In [None]:
# Class counts of the target before resampling.
bad_counts_before = df["BAD"].value_counts()
bad_counts_before

In [None]:
# Rebalance the target: oversample first (sampling_strategy=0.8), then undersample
# (sampling_strategy=0.9). Both samplers are seeded so that a fresh
# Restart & Run All selects the same rows every time.
over = RandomOverSampler(sampling_strategy=0.8, random_state=0)
X_over, y_over = over.fit_resample(df.drop(columns="BAD"), df["BAD"])
under = RandomUnderSampler(sampling_strategy=0.9, random_state=0)
X_balanced, y_balanced = under.fit_resample(X_over, y_over)
# Features first, target last: the split cell treats every column except the
# last one as a feature.
df = pd.concat([X_balanced, y_balanced], axis=1)

In [None]:
# Class counts of the target after resampling.
bad_counts_after = df["BAD"].value_counts()
bad_counts_after

Dividing the dataset into training, test (reference) and production (analysis) sets

In [None]:
# Half of the data is used for training; the remaining half is split again into a
# test set (80%) and a simulated production set (20%).
# Every split is stratified on the target and seeded, so the partitions are
# identical on each run.
features, target = df.drop(columns="BAD"), df["BAD"]
X_train, X_rest, y_train, y_rest = train_test_split(features,
                                                    target,
                                                    stratify=target,
                                                    test_size=0.5,
                                                    random_state=0)
X_test, X_prod, y_test, y_prod = train_test_split(X_rest,
                                                  y_rest,
                                                  stratify=y_rest,
                                                  test_size=0.2,
                                                  random_state=0)

### Modelling

In [None]:
# Fit a seeded random forest on the training split and display the fitted estimator.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
clf

In [None]:
# Probability cut-off at or above which a row is predicted as the positive class.
DECISION_THRESHOLD = 0.75
# Dedicated seeded generator: the synthetic drift factors are the same on every run
# and the global `random` state is left alone.
drift_rng = random.Random(0)

# Build both partitions on copies (DataFrame.assign returns a new frame), so X_test and
# X_prod stay untouched and re-running this cell does not compound the drift.
reference_part = X_test.assign(partition='reference', BAD=y_test.values)
analysis_part = X_prod.assign(partition='analysis', BAD=y_prod.values)

# Inject artificial drift into three features of the analysis partition only.
analysis_part['YOJ'] = analysis_part['YOJ'] * (drift_rng.uniform(0, 12) + 0.4)
analysis_part['MORTDUE'] = analysis_part['MORTDUE'] * drift_rng.uniform(0, 5)
analysis_part['CLAGE'] = analysis_part['CLAGE'] * 0.1

# Reference rows come first, so the hourly timestamps below place the whole reference
# period before the analysis period.
nanny_ml_df = pd.concat([reference_part, analysis_part])
nanny_ml_df['Time'] = [dt.datetime(2020,1,1) + dt.timedelta(hours=x+5)
                       for x in range(len(nanny_ml_df))]
nanny_ml_df['identifier'] = nanny_ml_df.index

# Score with exactly the columns the model was trained on instead of relying on the
# position of the helper columns appended above.
nanny_ml_df['y_pred_proba'] = clf.predict_proba(nanny_ml_df[X_train.columns])[:,1]
nanny_ml_df['y_pred'] = (nanny_ml_df['y_pred_proba'] >= DECISION_THRESHOLD).astype(int)

reference = nanny_ml_df[nanny_ml_df['partition']=='reference'].copy()
analysis = nanny_ml_df[nanny_ml_df['partition']=='analysis'].copy()

In [None]:
# Realized recall of the classifier on each partition, in order of first appearance.
for name, rows in nanny_ml_df.groupby('partition', sort=False):
    partition_recall = recall_score(rows['BAD'], rows['y_pred'])
    print(name, partition_recall)

### NannyML

In [None]:
# Columns that must not be treated as model features when the metadata is extracted.
non_feature_columns = ['identifier', 'y_pred_proba', 'y_pred', 'BAD']
metadata = nml.extract_metadata(
    reference,
    exclude_columns=non_feature_columns,
    model_type='classification_binary',
)
# Point the metadata explicitly at the target and timestamp columns.
metadata.target_column_name = 'BAD'
metadata.timestamp_column_name = 'Time'

### Performance Estimation

In [None]:
# Fit the CBPE estimator on the reference partition, then estimate recall per chunk
# (chunk_period='W') over the reference and analysis data together.
cbpe = nml.CBPE(model_metadata=metadata, chunk_period='W', metrics=['recall'])
cbpe.fit(reference_data=reference)
reference_then_analysis = pd.concat([reference, analysis])
est_perf = cbpe.estimate(reference_then_analysis)

In [None]:
# Span of each chunk: end index minus start index.
est_perf.data['end_index'].sub(est_perf.data['start_index'])

In [None]:
# Drop the final chunk from the estimation results.
# Caution: this overwrites est_perf.data, so each re-run of the cell removes one more chunk.
est_perf.data = est_perf.data.iloc[:-1].copy()

In [None]:
# Plot the estimated recall over time as produced by the estimator.
fig = est_perf.plot(metric='recall', kind='performance')
fig.show()

### Comparison with the actual performance

In [None]:
df_all = pd.concat([reference, analysis]).reset_index(drop=True)

# Comparing actual recall with the estimated one, against time.
# The realized recall of each chunk is computed on the rows whose timestamp falls
# inside the chunk's [start_date, end_date] window (both ends inclusive), collected in
# a list and written to the results frame in a single assignment.
actual_performance = []
for start_date, end_date in zip(est_perf.data['start_date'], est_perf.data['end_date']):
    in_chunk = df_all['Time'].between(start_date, end_date)
    actual_performance.append(
        recall_score(df_all.loc[in_chunk, 'BAD'], df_all.loc[in_chunk, 'y_pred']))
est_perf.data['realized_recall'] = actual_performance

# Plotting
# The key of the first analysis chunk marks where the reference period ends.
first_analysis = est_perf.data.loc[est_perf.data['partition']=='analysis', 'key'].iloc[0]
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(est_perf.data['key'], est_perf.data['estimated_recall'], label='Estimated Recall')
ax.plot(est_perf.data['key'], est_perf.data['realized_recall'], label='Actual Recall')
ax.tick_params(axis='x', labelrotation=90, labelsize=12)
ax.axvline(x=first_analysis, label='First Analysis Chunk', linestyle=':', color='grey')
ax.set_xlabel('Time', fontsize=14)
ax.set_ylabel('Recall', fontsize=14)
ax.legend(fontsize=12)
ax.set_title('Actual Recall vs Estimated', fontsize=16)
plt.show()

### Drift detection

In [None]:
# Fit the univariate drift calculator on the reference partition, evaluate it on the
# analysis partition, and display the features ranked by alert count.
univariate_calculator = nml.UnivariateStatisticalDriftCalculator(model_metadata=metadata)
univariate_calculator = univariate_calculator.fit(reference_data=reference)
univariate_results = univariate_calculator.calculate(data=analysis)
alert_ranker = nml.Ranker.by('alert_count')
alert_ranker.rank(univariate_results, metadata)

In [None]:
# get columns with d statistics only
d_stat_cols = [x for x in univariate_results.data if 'dstat' in x]
univariate_results.data[d_stat_cols].mean().sort_values(ascending=False)[:3]

In [None]:
# Result columns with the three highest chunk-averaged d statistics.
top_dstat_columns = (univariate_results.data[d_stat_cols]
                     .mean()
                     .sort_values(ascending=False)
                     .index[:3])
# Recover the feature name by stripping only the trailing '_dstat' suffix. Splitting on
# the first underscore would truncate feature names that themselves contain an
# underscore (e.g. one-hot encoded columns).
# NOTE(review): assumes the result columns are named '<feature>_dstat' — confirm.
top_drift_cols = [col.rsplit('_dstat', 1)[0] for col in top_dstat_columns]

# Distribution plot over time for each of the most drifted features.
for label in top_drift_cols:
    fig = univariate_results.plot(
            kind='feature_distribution',
            feature_label=label)
    fig.show()

In [None]:
# Scatter the two most drifted features against each other, one series per analysis chunk.
analysis_res = est_perf.data[est_perf.data['partition']=='analysis']
x_feature, y_feature = top_drift_cols[0], top_drift_cols[1]

plt.figure(figsize=(10,8))
for idx, chunk in analysis_res.iterrows():
    # Rows whose timestamp lies inside the chunk window (both ends inclusive).
    in_window = df_all['Time'].between(chunk['start_date'], chunk['end_date'])
    sub = df_all[in_window]
    plt.scatter(sub[x_feature], sub[y_feature], s=5,
                label="Chunk {}".format(idx))
plt.legend(fontsize=12)
plt.xlabel(x_feature, fontsize=14)
plt.ylabel(y_feature, fontsize=14)
plt.show()

### Reconstruction Error with PCA

In [None]:
# Computing multivariate drift and visualizing results
# across the whole dataset
# Multivariate drift: fit the reconstruction-drift calculator on the reference
# partition, then evaluate and plot it over reference and analysis data together.
rcerror_calculator = nml.DataReconstructionDriftCalculator(model_metadata=metadata)
rcerror_calculator = rcerror_calculator.fit(reference_data=reference)
whole_dataset = pd.concat([reference, analysis], ignore_index=True)
rcerror_results = rcerror_calculator.calculate(data=whole_dataset)
fig = rcerror_results.plot()
fig.show()