# DICE Basic Results

In [6]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '../../..'))

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn import neighbors

import matplotlib.pyplot as plt
import seaborn as sns
from models import model_interface, model_loader, model_constants
from data import data_loader
from data.adapters import continuous_adapter

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Preliminaries -- load everything

In [7]:
DATASET, DATASET_INFO = data_loader.load_data(data_loader.DatasetName('credit_card_default'))
MODEL = model_loader.load_model(model_constants.ModelType('logistic_regression'), data_loader.DatasetName('credit_card_default'))
ADAPTER = continuous_adapter.StandardizingAdapter(
    label_column = DATASET_INFO.label_column, positive_label=DATASET_INFO.positive_label
).fit(DATASET)

KDE = neighbors.KernelDensity(bandwidth=0.251188643150958)  # From previous cross validation
KDE.fit(ADAPTER.transform(DATASET.drop(columns='Y')))

index_df = pd.read_csv('./index_df.csv')
path_df = pd.read_csv('./path_df.csv')
index_df

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,batch_id,run_id,run_seed,confidence_cutoff,dataset_name,max_iterations,model_type,noise_ratio,num_paths,rescale_ratio
0,3,112,171,0.6,credit_card_default,30,logistic_regression,,5,
1,10,302,1701,0.8,credit_card_default,30,logistic_regression,,4,
2,15,459,1498,0.9,credit_card_default,30,logistic_regression,,5,
3,0,14,51,0.6,credit_card_default,30,logistic_regression,,2,
4,8,252,4470,0.8,credit_card_default,30,logistic_regression,,2,
...,...,...,...,...,...,...,...,...,...,...
475,9,283,4526,0.8,credit_card_default,30,logistic_regression,,3,
476,3,97,5653,0.6,credit_card_default,30,logistic_regression,,5,
477,7,213,478,0.7,credit_card_default,30,logistic_regression,,5,
478,8,254,51,0.8,credit_card_default,30,logistic_regression,,2,


In [8]:
def get_path_length(path):
    path = ADAPTER.transform(path)
    if len(path) == 1:
        return np.nan
    length = 0
    for i in range(1, len(path)):
        diff = path.iloc[i] - path.iloc[i-1]
        length += np.linalg.norm(diff.to_numpy())
    return length


def get_final_density(path):
    return KDE.score_samples(ADAPTER.transform(path.iloc[-1:]))


def analyze_paths(paths, target_certainty):
    metrics = {
        'success_ratio': 0,
    }
    path_lengths = []
    cfe_densities = []
    for i, path in enumerate(paths):
        if MODEL.predict_pos_proba_series(path.iloc[-1]) > target_certainty:
            metrics['success_ratio'] += 1
        path_length = get_path_length(path)
        metrics[f'path_{i}_length'] = path_length
        if not np.isnan(path_length):
            path_lengths.append(path_length)
        cfe_densities.append(get_final_density(path))
     
    metrics['path_length'] = np.mean(path_lengths)
    metrics['success_ratio'] = metrics['success_ratio'] / len(paths)
    metrics['min_density'] = np.min(cfe_densities)
    return metrics


def aggregate_metrics(run_metrics):
    aggregated_metrics = {}
    for metrics in run_metrics:
        for metric, metric_value in metrics.items():
            aggregated_metrics[metric] = aggregated_metrics.get(metric, []) + [metric_value]
    
    return pd.DataFrame(aggregated_metrics)


def analyze_runs(runs: pd.DataFrame, target_certainty, num_paths):
    run_metrics = []
    for run_id in runs.run_id.unique():
        paths = []
        run = runs[runs.run_id == run_id]
        for path_id in range(num_paths):
            path = run[run.path_id == path_id].sort_values(['step_id']).drop(columns=['batch_id', 'run_id', 'step_id', 'path_id'])
            paths.append(path)
        run_metrics.append(analyze_paths(paths, target_certainty))
    return aggregate_metrics(run_metrics)

# Analyze the results. This will take a while...
Expect about 5-10 minutes.

In [9]:
final_results = None

batch_ids = index_df.batch_id.unique()


for i, batch_id in enumerate(batch_ids):
    print(f"Process batch {i+1}/{len(batch_ids)}")
    batched_runs = path_df[path_df.batch_id == batch_id]
    target_certainty = index_df[index_df.batch_id == batch_id].confidence_cutoff.iloc[0]
    num_paths = index_df[index_df.batch_id == batch_id].num_paths.iloc[0]

    metrics_df = analyze_runs(batched_runs, target_certainty, num_paths)
    metrics_df['batch_id'] = batch_id
    if final_results is not None:
        final_results = pd.concat([final_results, metrics_df]).reset_index(drop=True)
    else:
        final_results = metrics_df

Process batch 1/16
Process batch 2/16
Process batch 3/16
Process batch 4/16
Process batch 5/16
Process batch 6/16
Process batch 7/16
Process batch 8/16
Process batch 9/16
Process batch 10/16
Process batch 11/16
Process batch 12/16
Process batch 13/16
Process batch 14/16
Process batch 15/16
Process batch 16/16


# Let's look at the average

Like STEP, DICE always crosses the decision boundary.

However,
* STEP path lengths are between 4 and 12, DICE path lengths between 30 and 40
* STEP densities are between -25 and -40. This is pretty close to the densities within the dataset
* DICE densities are between -4000 and -10000. These are very low-probability

In [10]:
mean_results = final_results.groupby('batch_id').mean().sort_values('success_ratio')
mean_results = pd.merge(mean_results, index_df[['batch_id', 'num_paths', 'confidence_cutoff']], on='batch_id').drop_duplicates('batch_id')
mean_results.sort_values('path_length')

Unnamed: 0,batch_id,success_ratio,path_0_length,path_1_length,path_2_length,path_3_length,path_4_length,path_length,min_density,num_paths,confidence_cutoff
0,0,1.0,32.617622,28.416914,,,,30.517268,-4046.42801,2,0.6
30,1,1.0,32.617622,28.416914,30.64749,,,30.560676,-5334.512451,3,0.6
60,2,1.0,32.617622,28.416914,30.64749,30.70379,,30.596454,-6853.076081,4,0.6
90,3,1.0,32.617622,28.416914,30.64749,30.70379,31.054807,30.688125,-7426.156258,5,0.6
180,6,1.0,35.185955,36.845441,31.776641,32.807716,,34.153938,-5886.527933,4,0.7
210,7,1.0,35.185955,36.845441,31.776641,32.807716,35.368014,34.396753,-7304.616781,5,0.7
150,5,1.0,35.185955,36.845441,31.776641,,,34.602679,-5287.978398,3,0.7
120,4,1.0,35.185955,36.845441,,,,36.015698,-4232.702496,2,0.7
270,9,1.0,39.470745,36.112184,36.936314,,,37.506414,-6733.730461,3,0.8
240,8,1.0,39.470745,36.112184,,,,37.791464,-4904.034013,2,0.8


# Examples of path lengths

In [11]:
def get_paths(batch_id, path_df):
    run_id = path_df[path_df.batch_id == batch_id].run_id.sample(1).iloc[0]
    selected_paths = path_df[(path_df.run_id == run_id)]
    paths = []
    for path_id in selected_paths.sort_values('path_id').path_id.unique():
        paths.append(selected_paths[selected_paths.path_id == path_id])
    return paths

In [12]:
short_batch_id = final_results[final_results.path_length < 4].iloc[0].batch_id

short_path = get_paths(short_batch_id, path_df)[0]

print("Path length", get_path_length(short_path))

print("\nPATH START:\n")
print(short_path.iloc[0])
print("\nPATH END:\n")
print(short_path.iloc[-1])

Path length 56.94607720601974

PATH START:

LIMIT_BAL    60000.0
AGE             28.0
PAY_1            1.0
PAY_2            0.0
PAY_3            0.0
PAY_4            0.0
PAY_5            2.0
PAY_6            2.0
BILL_AMT1        0.0
BILL_AMT2     3132.0
BILL_AMT3     3144.0
BILL_AMT4     3638.0
BILL_AMT5     3436.0
BILL_AMT6    22219.0
PAY_AMT1      3132.0
PAY_AMT2      1053.0
PAY_AMT3       547.0
PAY_AMT4         0.0
PAY_AMT5     19000.0
PAY_AMT6         0.0
step_id          0.0
path_id          0.0
run_id           6.0
batch_id         0.0
Name: 1514, dtype: float64

PATH END:

LIMIT_BAL      60000.0
AGE               28.0
PAY_1              1.0
PAY_2              0.0
PAY_3              0.0
PAY_4              0.0
PAY_5              2.0
PAY_6              2.0
BILL_AMT1          0.0
BILL_AMT2       3132.0
BILL_AMT3       3144.0
BILL_AMT4       3638.0
BILL_AMT5       3436.0
BILL_AMT6      22219.0
PAY_AMT1        3132.0
PAY_AMT2     1312916.0
PAY_AMT3         547.0
PAY_AMT4           0.0