In [32]:
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..'))

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from models.core import logistic_regression
from data import data_loader
from models import model_constants
from data.adapters import continuous_adapter
from recourse_methods import mrmc_method


dataset, datainfo = data_loader.load_data(data_loader.DatasetName.CREDIT_CARD_DEFAULT)
adapter = continuous_adapter.StandardizingAdapter(label_name=datainfo.label_name, positive_label=datainfo.positive_label).fit(dataset)

model = logistic_regression.LogisticRegression(data_loader.DatasetName.CREDIT_CARD_DEFAULT, model_constants.ModelName.DEFAULT).load_model()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test parameters used

In [33]:
method = 'mrmc'

data_df = pd.read_csv(f'../experiments/{method}_results/data_df.csv')
index_df = pd.read_csv(f'../experiments/{method}_results/index_df.csv')
if method == 'mrmc':
    cluster_df = pd.read_csv(f'../experiments/{method}_results/cluster_df.csv')

print("Data collected on tests with these input parameters:")
index_df.drop_duplicates('test_id').drop(columns=['test_id', 'trial_id', 'seed']).sort_values(['confidence_cutoff', 'noise_ratio', 'rescale_ratio'])

Data collected on tests with these input parameters:


Unnamed: 0,cluster_seed,confidence_cutoff,max_iterations,noise_ratio,num_paths,rescale_ratio,step_size,volcano_cutoff,volcano_degree
15,10288294,0.6,30,0.0,3,0.8,0.5,0.2,2
0,10288294,0.6,30,0.0,3,0.9,0.5,0.2,2
1,10288294,0.6,30,0.0,3,1.0,0.5,0.2,2
4,10288294,0.6,30,0.5,3,0.8,0.5,0.2,2
16,10288294,0.6,30,0.5,3,0.9,0.5,0.2,2
17,10288294,0.6,30,0.5,3,1.0,0.5,0.2,2
6,10288294,0.6,30,1.0,3,0.8,0.5,0.2,2
9,10288294,0.6,30,1.0,3,0.9,0.5,0.2,2
7,10288294,0.6,30,1.0,3,1.0,0.5,0.2,2


In [37]:
trial_id = index_df[index_df.test_id == 1].trial_id.iloc[0]

data_df[data_df.trial_id == trial_id]

Unnamed: 0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,step_id,path_id,trial_id,test_id
0,50000.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,6350.0,3239.0,...,1100.0,1000.0,0.0,0.0,0.0,0.0,0,0,32,1
1,83167.469017,35.533889,3.8e-05,0.001191,0.000581,0.001314,0.000258,0.000929,12159.613557,9333.032338,...,1966.455193,1971.052262,1179.038176,1007.53555,1002.141483,1012.472695,1,0,32,1
2,50000.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,6350.0,3239.0,...,1100.0,1000.0,0.0,0.0,0.0,0.0,0,1,32,1
3,105262.057864,37.731778,0.000265,0.002201,0.002519,0.002031,0.001045,0.0017,8462.285436,5452.191707,...,1817.422197,1815.403188,894.072617,775.714474,776.179607,856.550934,1,1,32,1
4,50000.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,6350.0,3239.0,...,1100.0,1000.0,0.0,0.0,0.0,0.0,0,2,32,1
5,65352.943983,37.841954,0.000109,4.2e-05,0.001682,0.001614,0.000985,0.002114,19204.975898,15837.389281,...,1785.654136,1685.371403,601.436107,554.070327,564.922694,562.246616,1,2,32,1
6,80706.901851,37.67971,0.000208,8e-05,0.003326,0.003187,0.001931,0.004159,32066.20225,28438.818183,...,2457.92822,2355.278824,1194.211376,1099.891494,1121.296886,1115.243129,2,2,32,1


## Experiment results

The only parameters varied are confidence cutoff and noise. For each path, we count the number of trials out of 30 which achieve the desired model confidence.

In [31]:
def path_length(path):
    path = adapter.transform(path)
    length = 0
    for i in range(1, len(path)):
        length += np.sqrt(((path.iloc[i] - path.iloc[i-1])**2).sum())
    return length

for test_id in index_df.sort_values(['confidence_cutoff', 'rescale_ratio','noise_ratio']).test_id.unique():
    test_success_counts = [0] * len(data_df.path_id.unique())  # = [1,1,0]
    test_path_lengths = [0] * len(data_df.path_id.unique())
    test_path_counts = [0] * len(data_df.path_id.unique())
    test_df = index_df[index_df.test_id == test_id]
    confidence = test_df.confidence_cutoff.iloc[0]
    for trial_id in index_df[index_df.test_id == test_id].trial_id.unique():
        trial_df = data_df[data_df.trial_id == trial_id]
        for path_id in trial_df.path_id.unique():
            path = trial_df[trial_df.path_id == path_id].sort_values('step_id')
            if model.predict_pos_proba_series(path.drop(columns=['path_id', 'step_id', 'test_id', 'trial_id']).iloc[-1]) > confidence:
                test_success_counts[path_id] += 1
            test_path_counts[path_id] += len(path)
            test_path_lengths[path_id] += path_length(path)

    test_success_counts = [count / 30 for count in test_success_counts]
    test_path_lengths = [length / 30 for length in test_path_lengths]
    test_path_counts = [count / 30 for count in test_path_counts]
    print(test_df[['confidence_cutoff', 'rescale_ratio', 'noise_ratio']].iloc[:1])
    print("Successes:\t\t", test_success_counts)
    print("Path Lengths:\t\t", test_path_lengths)
    print("Path Counts:\t\t", test_path_counts)
    print("\n")

    confidence_cutoff  rescale_ratio  noise_ratio
15                0.6            0.8          0.0
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [8.221351605558677, 7.754637322273686, 9.729197751556342]
Path Counts:		 [8.633333333333333, 8.2, 10.033333333333333]


   confidence_cutoff  rescale_ratio  noise_ratio
4                0.6            0.8          0.5
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [9.082977974700201, 8.7239669875579, 10.698527416840552]
Path Counts:		 [9.433333333333334, 9.1, 10.933333333333334]


   confidence_cutoff  rescale_ratio  noise_ratio
6                0.6            0.8          1.0
Successes:		 [0.9333333333333333, 0.9666666666666667, 0.9666666666666667]
Path Lengths:		 [11.344747193696694, 11.093439502697079, 13.606516412693187]
Path Counts:		 [11.533333333333333, 11.3, 13.633333333333333]


   confidence_cutoff  rescale_ratio  noise_ratio
0                0.6            0.9          0.0
Successes:		 [1.0, 1.0, 1.0]
Path Lengths:		 [7.45678214781684

## Let's view some paths

In [21]:
test_id = index_df[(index_df.rescale_ratio == 0.8) & (index_df.confidence_cutoff == 0.7)].test_id.iloc[0]

trial_ids = index_df[index_df.test_id == test_id].sort_values('seed').trial_id

trial_id = trial_ids.iloc[0]

data_df[data_df.trial_id == trial_id]

paths = data_df[data_df.trial_id == trial_id]

model.predict_pos_proba(paths[paths.path_id == 0].sort_values('step_id').drop(columns=['step_id', 'trial_id', 'test_id', 'path_id']))

2444    0.588060
2445    0.613158
2446    0.637870
2447    0.662089
2448    0.685652
2449    0.708152
dtype: float64

In [22]:
paths[paths.path_id == 0].drop(columns=['step_id', 'path_id', 'test_id', 'trial_id'])

Unnamed: 0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
2444,80000.0,43.0,0.0,0.0,0.0,0.0,0.0,0.0,398.0,379.0,298.0,-300.0,-300.0,-300.0,381.0,607.0,0.0,0.0,0.0,598.0
2445,104129.817013,41.332555,0.000197,0.000341,0.000328,0.0006,0.000131,0.000745,4997.669242,5077.208852,5068.964084,4082.29454,4070.845318,4020.806115,2897.868322,2975.157472,1616.224745,1467.123492,1354.545724,1964.481762
2446,129000.489271,39.618805,0.000363,0.000632,0.000611,0.001123,0.000244,0.001381,9417.489864,9625.151864,9662.250193,8277.168491,8246.104958,8145.420834,5401.692056,5304.752971,3207.233266,2903.242759,2691.42512,3288.675302
2447,154843.93519,37.842284,0.000488,0.000856,0.000836,0.001541,0.000334,0.001877,13577.712109,13954.823836,14001.483823,12209.90325,12150.022803,12001.018088,7876.991525,7573.81755,4765.597634,4299.154791,4008.007674,4556.591903
2448,181891.77891,35.97631,0.000563,0.000993,0.000984,0.001819,0.000394,0.00219,17366.902294,17964.587972,17975.022401,15784.269676,15685.042348,15495.075062,10285.874337,9738.446949,6284.81257,5645.905103,5306.999012,5749.504883
2449,210051.502092,33.969364,0.000577,0.001016,0.001033,0.001912,0.000416,0.00227,20682.180926,21536.488526,21456.726793,18938.64093,18783.400178,18556.942042,12526.140068,11703.716908,7771.660359,6949.888911,6614.09855,6849.545316


## Thoughts

I had promising results earlier. Where did they go?
* I don't know -- I don't remember where that code is in the repo and I haven't found it yet
* They're in a notebook and may be in a different branch of the repo. Or did I forget to check them in? I don't remember deleting them

What do the current results indicate?
* Both DICE and MRMC succeed with similar rates
* Both success rates are unaffected by random perturbations
* DICE enforces sparsity but makes larger steps
* DICE typically crosses the boundary in just 1-4 large steps
* DICE path lengths are much longer, but this may be tuneable with hyperparameters

Questions
* Is the code correct? Especially the random noise. I should be able to check by examining the output data
* Can DICE step size be tuned? IE reweight optimization loss function
* How does randomness effect DICE? How does it effect MRMC?

## What do the clusters look like for confidence 0.8?

In [233]:
CUTOFF = 0.8
noise = 0

idx = index_df[index_df.confidence_cutoff == cutoff].trial_id.iloc[0]

cdf = cluster_df[(cluster_df.trial_id == idx)]

clusters = []
for path_id in [0,1,2]:
    clusters.append(cdf[(cdf.trial_id == idx) & (cdf.path_id == path_id)].iloc[0])


dataset, datainfo = data_loader.load_data(data_loader.DatasetName.CREDIT_CARD_DEFAULT)
adapter = continuous_adapter.StandardizingAdapter(label_name=datainfo.label_name, positive_label=datainfo.positive_label).fit(dataset)

print("Positive Probability for each cluster:")
print(model.predict_pos_proba(adapter.inverse_transform(cdf).drop(columns=['path_id', 'test_id', 'trial_id'])))

print("Clusters:")
cdf

Positive Probability for each cluster:
9     1.0
10    1.0
11    1.0
dtype: float64
Clusters:


Unnamed: 0,LIMIT_BAL,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,path_id,trial_id,test_id
9,457932.58427,37.235955,0.013483,0.004494382,0.004494,0.008988764,0.004494382,2.0816680000000002e-17,47592.840449,50067.719101,...,65427.049438,34055.442697,39715.782022,32219.979775,28967.566292,34402.642697,38479.746067,0,551,18
10,246371.191136,34.468144,0.019391,0.008310249,0.027701,0.02216066,0.005540166,0.02770083,60296.188366,94612.088643,...,87218.955679,72082.432133,55815.409972,34335.196676,26916.950139,15654.210526,23039.628809,1,551,18
11,475784.313725,37.401961,0.019608,6.938894e-18,0.039216,8.673617e-18,-3.469447e-18,1.2143060000000001e-17,287406.333333,288360.313725,...,299121.558824,71171.941176,119761.107843,62895.941176,43182.137255,31688.745098,42412.931373,2,551,18


## How large are the clusters?

In [234]:
cluster_sizes = [0,0,0]

d = mrmc_method.MRMC.filter_data(dataset, CUTOFF, model)
d = adapter.inverse_transform(mrmc_method.MRM.process_data(d, adapter))
dists = np.zeros((d.shape[0], 3))
for path_id in data_df.path_id.unique():
    cluster = cdf[cdf.path_id == path_id].drop(columns=['trial_id', 'test_id', 'path_id']).iloc[0]
    dists[:,path_id] = ((d - cluster)**2).mean(axis=1)

cluster_assignment = np.argmin(dists, axis=1)

for path_id in range(dists.shape[1]):
    cluster_sizes[path_id] = (cluster_assignment == path_id).sum()


print("Cluster Sizes:")
cluster_sizes

Cluster Sizes:


[446, 360, 102]