In [2]:
import nannyml as nml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# To Do

**Datasets**:
- add time stamps to raw dataset
- create new train dataset and alter it for drift detection

**Univariante Data Drift:**
- Continuous:
  - Kolmogorov-Smirnov Test
  - Jensen-Shannon Distance
  - + 1 more if possible
- Categorical:
  - Chi-squared Test
  - Jensen-Shannon Distance
  - + 1 more if possible

**Multivariante Data Drift:**
- https://nannyml.readthedocs.io/en/latest/tutorials/detecting_data_drift/multivariate_drift_detection.html

**Performance Estimation:**
- https://nannyml.readthedocs.io/en/latest/tutorials/performance_estimation/regression_performance_estimation.html
- https://nannyml.readthedocs.io/en/latest/how_it_works/performance_estimation.html

**Data Quality checks:**
- look for further unit tests
- Missing value detection
- Unseen value detection

In [3]:
reference_df, analysis_df, _ = nml.load_synthetic_car_loan_dataset()
display(reference_df.head())

column_names = ['car_value', 
                'salary_range', 
                'debt_to_income_ratio', 
                'loan_length', 
                'repaid_loan_on_prev_car', 
                'size_of_downpayment', 
                'driver_tenure', 
                'y_pred_proba', 
                'y_pred']

Unnamed: 0,car_value,salary_range,debt_to_income_ratio,loan_length,repaid_loan_on_prev_car,size_of_downpayment,driver_tenure,repaid,timestamp,y_pred_proba,y_pred
0,39811.0,40K - 60K €,0.63295,19.0,False,40%,0.212653,1.0,2018-01-01 00:00:00.000,0.99,1
1,12679.0,40K - 60K €,0.718627,7.0,True,10%,4.927549,0.0,2018-01-01 00:08:43.152,0.07,0
2,19847.0,40K - 60K €,0.721724,17.0,False,0%,0.520817,1.0,2018-01-01 00:17:26.304,1.0,1
3,22652.0,20K - 20K €,0.705992,16.0,False,10%,0.453649,1.0,2018-01-01 00:26:09.456,0.98,1
4,21268.0,60K+ €,0.671888,21.0,True,30%,5.695263,1.0,2018-01-01 00:34:52.608,0.99,1


# Univariante Drift Detection

In [8]:
univariate_calculator = nml.UnivariateDriftCalculator(
    column_names=column_names,
    treat_as_categorical=['y_pred'],
    timestamp_column_name='timestamp',
    continuous_methods=['kolmogorov_smirnov', 'jensen_shannon'],
    categorical_methods=['chi2', 'jensen_shannon'],
    chunk_size=5000,
)

univariate_calculator.fit(reference_df)
results = univariate_calculator.calculate(analysis_df)
display(results.filter(period='analysis', column_names=['debt_to_income_ratio']).to_df())

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio,debt_to_income_ratio
Unnamed: 0_level_1,chunk,chunk,chunk,chunk,chunk,chunk,chunk,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,kolmogorov_smirnov,jensen_shannon,jensen_shannon,jensen_shannon,jensen_shannon
Unnamed: 0_level_2,key,chunk_index,start_index,end_index,start_date,end_date,period,value,upper_threshold,lower_threshold,alert,value,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,2018-10-30 18:00:00,2018-11-30 00:27:16.848,analysis,0.01576,0.018584,,False,0.031661,0.1,,False
1,[5000:9999],1,5000,9999,2018-11-30 00:36:00,2018-12-30 07:03:16.848,analysis,0.01268,0.018584,,False,0.030011,0.1,,False
2,[10000:14999],2,10000,14999,2018-12-30 07:12:00,2019-01-29 13:39:16.848,analysis,0.01734,0.018584,,False,0.031129,0.1,,False
3,[15000:19999],3,15000,19999,2019-01-29 13:48:00,2019-02-28 20:15:16.848,analysis,0.0128,0.018584,,False,0.029464,0.1,,False
4,[20000:24999],4,20000,24999,2019-02-28 20:24:00,2019-03-31 02:51:16.848,analysis,0.01918,0.018584,,True,0.030809,0.1,,False
5,[25000:29999],5,25000,29999,2019-03-31 03:00:00,2019-04-30 09:27:16.848,analysis,0.00824,0.018584,,False,0.028681,0.1,,False
6,[30000:34999],6,30000,34999,2019-04-30 09:36:00,2019-05-30 16:03:16.848,analysis,0.01058,0.018584,,False,0.043628,0.1,,False
7,[35000:39999],7,35000,39999,2019-05-30 16:12:00,2019-06-29 22:39:16.848,analysis,0.01002,0.018584,,False,0.029253,0.1,,False
8,[40000:44999],8,40000,44999,2019-06-29 22:48:00,2019-07-30 05:15:16.848,analysis,0.01068,0.018584,,False,0.030628,0.1,,False
9,[45000:49999],9,45000,49999,2019-07-30 05:24:00,2019-08-29 11:51:16.848,analysis,0.0068,0.018584,,False,0.02833,0.1,,False


## Continuous

Jensen-Shannon Distance

In [17]:
figure = results.filter(column_names=results.continuous_column_names, 
                        methods=['jensen_shannon'])\
                        .plot(kind='drift')
figure.show()

In [None]:
figure = results.filter(column_names=results.continuous_column_names,
                        methods=['jensen_shannon'])\
                        .plot(kind='distribution')
figure.show()

Kolmogorov-Smirnov Test

In [None]:
figure = results.filter(column_names=results.continuous_column_names, 
                        methods=['kolmogorov_smirnov'])\
                        .plot(kind='drift')
figure.show()

In [None]:
figure = results.filter(column_names=results.continuous_column_names, 
                        methods=['kolmogorov_smirnov'])\
                        .plot(kind='distribution')
figure.show()

Wasserstein Distance  
Hellinger

## Categorical

Chi Squared

In [None]:
figure = results.filter(column_names=results.categorical_column_names, 
                        methods=['chi2'])\
                        .plot(kind='drift')
figure.show()

In [None]:
figure = results.filter(column_names=results.categorical_column_names,
                        methods=['chi2'])\
                        .plot(kind='distribution')
figure.show()

Jensen Shannon Distance

In [None]:
figure = results.filter(column_names=results.categorical_column_names, 
                        methods=['jensen_shannon'])\
                        .plot(kind='drift') 
figure.show()

In [None]:
figure = results.filter(column_names=results.categorical_column_names,
                        methods=['jensen_shannon'])\
                        .plot(kind='distribution')
figure.show()

L-Infinity  
Hellinger

# Multivariante Drift Detection

In [11]:
non_feature_columns = ['timestamp', 'y_pred_proba', 'y_pred', 'repaid']

# Define feature columns
feature_column_names = [
    col for col in reference_df.columns
    if col not in non_feature_columns
]

multivariante_calculator = nml.DataReconstructionDriftCalculator(
    column_names=feature_column_names,
    timestamp_column_name='timestamp',
    chunk_size=5000
)
multivariante_calculator.fit(reference_df)
results = multivariante_calculator.calculate(analysis_df)

# display(results.filter(period='analysis').to_df())

# display(results.filter(period='reference').to_df())

figure = results.plot()
figure.show()

# Data Quality Checks

## Missing Values Detection

In [14]:
# impute in first 1000 rows in columns loan_length value NaN
analysis_df.loc[0:1000, 'loan_length'] = np.nan

In [16]:
calc = nml.MissingValuesCalculator(
    column_names=feature_column_names,
)

calc.fit(reference_df)
results = calc.calculate(analysis_df)
display(results.filter(period='all').to_df())

for column_name in results.column_names:
    results.filter(column_names=column_name).plot().show()

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,car_value,car_value,car_value,...,size_of_downpayment,size_of_downpayment,size_of_downpayment,driver_tenure,driver_tenure,driver_tenure,driver_tenure,driver_tenure,driver_tenure,driver_tenure
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,value,sampling_error,upper_confidence_boundary,...,upper_threshold,lower_threshold,alert,value,sampling_error,upper_confidence_boundary,lower_confidence_boundary,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
1,[5000:9999],1,5000,9999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
2,[10000:14999],2,10000,14999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
3,[15000:19999],3,15000,19999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
4,[20000:24999],4,20000,24999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
5,[25000:29999],5,25000,29999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
6,[30000:34999],6,30000,34999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
7,[35000:39999],7,35000,39999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
8,[40000:44999],8,40000,44999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False
9,[45000:49999],9,45000,49999,,,reference,0.0,0.0,0.0,...,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0,0.0,False


## Unseen Values Detection

In [21]:
categorical_columns = ['salary_range', 'repaid_loan_on_prev_car']

In [22]:
calc = nml.UnseenValuesCalculator(
    column_names=categorical_columns,
)

calc.fit(reference_data=reference_df)
results = calc.calculate(analysis_df)
display(results.filter(period='all').to_df())

for column_name in results.column_names:
    results.filter(column_names=column_name).plot().show()

Unnamed: 0_level_0,chunk,chunk,chunk,chunk,chunk,chunk,chunk,salary_range,salary_range,salary_range,salary_range,repaid_loan_on_prev_car,repaid_loan_on_prev_car,repaid_loan_on_prev_car,repaid_loan_on_prev_car
Unnamed: 0_level_1,key,chunk_index,start_index,end_index,start_date,end_date,period,value,upper_threshold,lower_threshold,alert,value,upper_threshold,lower_threshold,alert
0,[0:4999],0,0,4999,,,reference,0.0,0,,False,0.0,0,,False
1,[5000:9999],1,5000,9999,,,reference,0.0,0,,False,0.0,0,,False
2,[10000:14999],2,10000,14999,,,reference,0.0,0,,False,0.0,0,,False
3,[15000:19999],3,15000,19999,,,reference,0.0,0,,False,0.0,0,,False
4,[20000:24999],4,20000,24999,,,reference,0.0,0,,False,0.0,0,,False
5,[25000:29999],5,25000,29999,,,reference,0.0,0,,False,0.0,0,,False
6,[30000:34999],6,30000,34999,,,reference,0.0,0,,False,0.0,0,,False
7,[35000:39999],7,35000,39999,,,reference,0.0,0,,False,0.0,0,,False
8,[40000:44999],8,40000,44999,,,reference,0.0,0,,False,0.0,0,,False
9,[45000:49999],9,45000,49999,,,reference,0.0,0,,False,0.0,0,,False
