In [228]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from models.core import logistic_regression
from data import data_loader
from models import model_constants
from data.adapters import continuous_adapter
from recourse_methods import mrmc_method

# Build the logistic regression wrapper for the credit-card-default dataset
# and load its stored model via load_model().
model = logistic_regression.LogisticRegression(
    data_loader.DatasetName.CREDIT_CARD_DEFAULT,
    model_constants.ModelName.DEFAULT,
).load_model()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test parameters used

In [229]:
# Read the three result tables written to the mrmc_results directory.
cluster_df = pd.read_csv('../experiments/mrmc_results/cluster_df.csv')
index_df = pd.read_csv('../experiments/mrmc_results/index_df.csv')
data_df = pd.read_csv('../experiments/mrmc_results/data_df.csv')

print("Data collected on tests with these input parameters:")
# One row per test, with the bookkeeping columns removed so that only the
# input parameters remain, ordered by confidence cutoff and then noise ratio.
(
    index_df
    .drop_duplicates('test_id')
    .drop(columns=['test_id', 'trial_id', 'seed', 'cluster_seed'])
    .sort_values(['confidence_cutoff', 'noise_ratio'])
)

Data collected on tests with these input parameters:


Unnamed: 0,confidence_cutoff,max_iterations,noise_ratio,num_paths,step_size,volcano_cutoff,volcano_degree
16,0.5,30,0.0,3,0.5,0.2,2
5,0.5,30,0.25,3,0.5,0.2,2
4,0.5,30,0.5,3,0.5,0.2,2
13,0.5,30,0.75,3,0.5,0.2,2
6,0.5,30,1.0,3,0.5,0.2,2
0,0.5,30,1.25,3,0.5,0.2,2
20,0.6,30,0.0,3,0.5,0.2,2
1,0.6,30,0.25,3,0.5,0.2,2
11,0.6,30,0.5,3,0.5,0.2,2
10,0.6,30,0.75,3,0.5,0.2,2


## Experiment results

The only parameters varied are the confidence cutoff and the noise ratio. For each path, we count how many of the 30 trials end at a point whose positive-class probability under the model exceeds the confidence cutoff.

In [230]:
print("Confidence\tNoise\t\tSuccess counts (path1, path2, path3, etc...)")

# Columns that identify a row; they are dropped before the point is handed to the model.
non_feature_columns = ['path_id', 'step_id', 'test_id', 'trial_id']
ordered_test_ids = index_df.sort_values(['confidence_cutoff', 'noise_ratio']).test_id.unique()

for test_id in ordered_test_ids:
    test_rows = index_df[index_df.test_id == test_id]
    cutoff_value = test_rows.confidence_cutoff.iloc[0]
    noise_value = test_rows.noise_ratio.iloc[0]

    # One counter per path; path_id is used directly as the list position.
    successes = [0] * len(data_df.path_id.unique())

    for trial_id in test_rows.trial_id.unique():
        trial_rows = data_df[data_df.trial_id == trial_id]
        for path_id in trial_rows.path_id.unique():
            # The last step of the path is the point whose confidence is checked.
            final_point = (
                trial_rows[trial_rows.path_id == path_id]
                .sort_values('step_id')
                .drop(columns=non_feature_columns)
                .iloc[-1]
            )
            if model.predict_pos_proba_series(final_point) > cutoff_value:
                successes[path_id] += 1

    print(f"{cutoff_value}\t\t{noise_value}\t\t{successes}")

Confidence	Noise		Success counts (path1, path2, path3, etc...)
0.5		0.0		[30, 30, 30]
0.5		0.25		[30, 30, 30]
0.5		0.5		[30, 30, 30]
0.5		0.75		[30, 30, 30]
0.5		1.0		[30, 30, 30]
0.5		1.25		[30, 29, 30]
0.6		0.0		[30, 30, 30]
0.6		0.25		[30, 30, 30]
0.6		0.5		[30, 30, 30]
0.6		0.75		[30, 30, 30]
0.6		1.0		[30, 30, 29]
0.6		1.25		[28, 29, 29]
0.7		0.0		[30, 30, 30]
0.7		0.25		[30, 30, 30]
0.7		0.5		[30, 30, 30]
0.7		0.75		[30, 29, 30]
0.7		1.0		[30, 29, 30]
0.7		1.25		[28, 29, 29]
0.8		0.0		[30, 30, 30]
0.8		0.25		[30, 30, 30]
0.8		0.5		[30, 30, 30]
0.8		0.75		[30, 30, 29]
0.8		1.0		[29, 29, 27]
0.8		1.25		[28, 29, 20]


## What do the clusters look like for confidence 0.8?

In [233]:
CUTOFF = 0.8
# NOTE(review): `noise` is never applied to the selection below -- the trial is
# chosen on the confidence cutoff alone. Confirm whether a noise_ratio filter
# was intended.
noise = 0

# Pick the first recorded trial that was run with this confidence cutoff.
# This must read the CUTOFF constant defined above: the name `cutoff` is not
# defined anywhere in the notebook, so using it raises NameError on a fresh
# kernel (Restart & Run All).
idx = index_df[index_df.confidence_cutoff == CUTOFF].trial_id.iloc[0]

# All cluster rows recorded for that trial.
cdf = cluster_df[(cluster_df.trial_id == idx)]

# First cluster row for each of the three path ids.
clusters = []
for path_id in [0,1,2]:
    clusters.append(cdf[(cdf.trial_id == idx) & (cdf.path_id == path_id)].iloc[0])


dataset, datainfo = data_loader.load_data(data_loader.DatasetName.CREDIT_CARD_DEFAULT)
adapter = continuous_adapter.StandardizingAdapter(label_name=datainfo.label_name, positive_label=datainfo.positive_label).fit(dataset)

print("Positive Probability for each cluster:")
# NOTE(review): the cluster rows displayed below already look like original-unit
# values (e.g. LIMIT_BAL in the hundreds of thousands) and every probability
# prints as exactly 1.0 -- confirm that inverse_transform is really needed here
# and is not being applied to data that is already un-standardized.
print(model.predict_pos_proba(adapter.inverse_transform(cdf).drop(columns=['path_id', 'test_id', 'trial_id'])))

print("Clusters:")
cdf

Positive Probability for each cluster:
9     1.0
10    1.0
11    1.0
dtype: float64
Clusters:


Unnamed: 0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,path_id,trial_id,test_id
9,457932.58427,37.235955,0.013483,0.004494382,0.004494,0.008988764,0.004494382,2.0816680000000002e-17,47592.840449,50067.719101,...,65427.049438,34055.442697,39715.782022,32219.979775,28967.566292,34402.642697,38479.746067,0,551,18
10,246371.191136,34.468144,0.019391,0.008310249,0.027701,0.02216066,0.005540166,0.02770083,60296.188366,94612.088643,...,87218.955679,72082.432133,55815.409972,34335.196676,26916.950139,15654.210526,23039.628809,1,551,18
11,475784.313725,37.401961,0.019608,6.938894e-18,0.039216,8.673617e-18,-3.469447e-18,1.2143060000000001e-17,287406.333333,288360.313725,...,299121.558824,71171.941176,119761.107843,62895.941176,43182.137255,31688.745098,42412.931373,2,551,18


## How large are the clusters?

In [234]:
# filter_data / process_data are project helpers; presumably they select and
# prepare the points the clusters were built from -- TODO confirm. The result is
# passed back through adapter.inverse_transform before being compared to cdf.
points = mrmc_method.MRMC.filter_data(dataset, CUTOFF, model)
points = adapter.inverse_transform(mrmc_method.MRM.process_data(points, adapter))

# Column k holds every point's mean squared difference to the cluster of path k.
bookkeeping_columns = ['trial_id', 'test_id', 'path_id']
sq_dists = np.zeros((points.shape[0], 3))
for path_id in data_df.path_id.unique():
    center = cdf[cdf.path_id == path_id].drop(columns=bookkeeping_columns).iloc[0]
    sq_dists[:, path_id] = ((points - center) ** 2).mean(axis=1)

# Assign each point to the cluster with the smallest distance, then count members.
nearest_cluster = np.argmin(sq_dists, axis=1)
cluster_sizes = [
    (nearest_cluster == column).sum()
    for column in range(sq_dists.shape[1])
]


print("Cluster Sizes:")
cluster_sizes

Cluster Sizes:


[446, 360, 102]