In [299]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from models.core import logistic_regression
from data import data_loader
from models import model_constants
from data.adapters import continuous_adapter
from recourse_methods import mrmc_method


# Load the dataset, fit the StandardizingAdapter on it, and load the
# logistic regression model registered for the same dataset.
dataset_name = data_loader.DatasetName.CREDIT_CARD_DEFAULT

dataset, datainfo = data_loader.load_data(dataset_name)
adapter = continuous_adapter.StandardizingAdapter(
    label_name=datainfo.label_name,
    positive_label=datainfo.positive_label,
).fit(dataset)

model = logistic_regression.LogisticRegression(
    dataset_name,
    model_constants.ModelName.DEFAULT,
).load_model()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test parameters used

In [302]:
# Name of the method whose experiment results are inspected below.
method = 'dice'

# index_df holds the parameters of each test/trial; data_df holds the path rows.
index_df = pd.read_csv(f'../experiments/{method}_results/index_df.csv')
data_df = pd.read_csv(f'../experiments/{method}_results/data_df.csv')
if method == 'mrmc':
    # Cluster results are only read for the 'mrmc' method.
    cluster_df = pd.read_csv(f'../experiments/{method}_results/cluster_df.csv')

print("Data collected on tests with these input parameters:")
# One row per test: keep only the parameter columns and order them for reading.
(
    index_df
    .drop_duplicates('test_id')
    .drop(columns=['test_id', 'trial_id', 'seed'])
    .sort_values(['confidence_cutoff', 'noise_ratio', 'rescale_ratio'])
)

Data collected on tests with these input parameters:


Unnamed: 0,confidence_cutoff,max_iterations,noise_ratio,num_paths,rescale_ratio
5,0.6,30,0.0,3,0.8
9,0.6,30,0.0,3,0.9
0,0.6,30,0.0,3,1.0
2,0.6,30,0.5,3,0.8
19,0.6,30,0.5,3,0.9
3,0.6,30,0.5,3,1.0
4,0.6,30,1.0,3,0.8
8,0.6,30,1.0,3,0.9
11,0.6,30,1.0,3,1.0


## Experiment results

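The parameters varied across tests are the confidence cutoff, the rescale ratio, and the noise ratio (see the table above). For each path, we report the fraction of the 30 trials whose final point exceeds the desired model confidence, along with the average path length and the average number of steps per path.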

In [301]:
def path_length(path):
    """Return the summed Euclidean distance between consecutive rows of a path.

    The path is first passed through the notebook-level ``adapter.transform``;
    distances are then measured between each row and the one before it, in
    row order. A path with fewer than two rows has length 0.
    """
    embedded = adapter.transform(path)
    total = 0
    for step in range(len(embedded) - 1):
        delta = embedded.iloc[step + 1] - embedded.iloc[step]
        total += np.sqrt((delta ** 2).sum())
    return total

# For every test (one parameter combination), average three per-path statistics
# over the test's trials: success rate, path length, and number of steps.
id_columns = ['path_id', 'step_id', 'test_id', 'trial_id']
parameter_columns = ['confidence_cutoff', 'rescale_ratio', 'noise_ratio']
num_paths = len(data_df.path_id.unique())

for test_id in index_df.sort_values(parameter_columns).test_id.unique():
    test_df = index_df[index_df.test_id == test_id]
    confidence = test_df.confidence_cutoff.iloc[0]
    trial_ids = test_df.trial_id.unique()
    # Average over the trials this test actually has instead of assuming 30.
    num_trials = len(trial_ids)

    # Accumulators are indexed by path_id, so path ids must be 0..num_paths-1.
    test_success_counts = [0] * num_paths
    test_path_lengths = [0] * num_paths
    test_path_counts = [0] * num_paths

    for trial_id in trial_ids:
        trial_df = data_df[data_df.trial_id == trial_id]
        for path_id in trial_df.path_id.unique():
            path = trial_df[trial_df.path_id == path_id].sort_values('step_id')
            # A path succeeds when its last step exceeds the confidence cutoff.
            final_point = path.drop(columns=id_columns).iloc[-1]
            if model.predict_pos_proba_series(final_point) > confidence:
                test_success_counts[path_id] += 1
            test_path_counts[path_id] += len(path)
            # NOTE(review): `path` still carries the id columns here, unlike the
            # model call above. Confirm adapter.transform ignores them; otherwise
            # step_id differences leak into the measured length.
            test_path_lengths[path_id] += path_length(path)

    test_success_counts = [count / num_trials for count in test_success_counts]
    test_path_lengths = [length / num_trials for length in test_path_lengths]
    test_path_counts = [count / num_trials for count in test_path_counts]
    print(test_df[parameter_columns].iloc[:1])
    print("Successes:\t\t", test_success_counts)
    print("Path Lengths:\t\t", test_path_lengths)
    print("Path Counts:\t\t", test_path_counts)
    print("\n")

   confidence_cutoff  rescale_ratio  noise_ratio
5                0.6            0.8          0.0
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [40.47968026194978, 44.69895332760224, 41.95355593650513]
Path Counts:		 [2.7333333333333334, 2.7666666666666666, 2.7]


   confidence_cutoff  rescale_ratio  noise_ratio
2                0.6            0.8          0.5
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [43.668992992702194, 42.14190471306161, 45.32475887670175]
Path Counts:		 [2.8333333333333335, 2.8, 2.8]


   confidence_cutoff  rescale_ratio  noise_ratio
4                0.6            0.8          1.0
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [52.48739126938504, 54.115221721579, 48.41684953044366]
Path Counts:		 [3.1666666666666665, 3.2666666666666666, 2.966666666666667]


   confidence_cutoff  rescale_ratio  noise_ratio
9                0.6            0.9          0.0
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [47.148410521982306, 47.39445484755855, 43.95279770799545]
Path Cou

## Let's view the paths

In [296]:
# Parameters of the test whose paths are inspected; they must exist in index_df.
view_rescale_ratio = 0.8
view_confidence_cutoff = 0.7

# Compare floats with a tolerance rather than exact equality.
matching_tests = index_df[
    np.isclose(index_df.rescale_ratio, view_rescale_ratio)
    & np.isclose(index_df.confidence_cutoff, view_confidence_cutoff)
]
if matching_tests.empty:
    # Fail with the available choices instead of an opaque IndexError from .iloc[0].
    available = (
        index_df[['confidence_cutoff', 'rescale_ratio']]
        .drop_duplicates()
        .to_dict('records')
    )
    raise ValueError(
        f"No test with rescale_ratio={view_rescale_ratio} and "
        f"confidence_cutoff={view_confidence_cutoff}; available combinations: {available}"
    )
test_id = matching_tests.test_id.iloc[0]

# Take the trial with the smallest seed so the selection is deterministic.
trial_ids = index_df[index_df.test_id == test_id].sort_values('seed').trial_id
trial_id = trial_ids.iloc[0]

paths = data_df[data_df.trial_id == trial_id]

# Model confidence at every step, ordered by path and then by step.
model.predict_pos_proba(
    paths
    .sort_values(['path_id', 'step_id'])
    .drop(columns=['step_id', 'trial_id', 'test_id', 'path_id'])
)

1327    0.588060
1328    0.094658
1329    0.794367
1330    0.588060
1331    0.300223
1332    0.704046
1333    0.588060
1334    0.286625
1335    0.997854
dtype: float64

In [298]:
# Feature values along path 0 of the selected trial, without the id columns.
paths.loc[paths.path_id == 0].drop(columns=['step_id', 'path_id', 'test_id', 'trial_id'])

Unnamed: 0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
1327,80000.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,398.0,379.0,298.0,-300.0,-300.0,-300.0,381.0,607.0,0.0,0.0,0.0,598.0
1328,80000.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,398.0,379.0,1225933.2,593817.6,-300.0,-300.0,381.0,607.0,0.0,0.0,0.0,598.0
1329,80000.0,43.0,0.0,0.0,0.0,0.0,4.0,0.0,398.0,379.0,1225933.2,593817.6,-300.0,-300.0,381.0,612679.0,0.0,0.0,0.0,598.0
