## Define some utility functions for working with tries

In [17]:
import json
import os
import subprocess
import numpy as np
import json
from treefarms import TREEFARMS

def construct_tree_rset(
    df,
    lam, db, eps,
    original_config_path='./config.json',
    save_dir=None,
    config_idx=None,
    verbose=False,
    ignore_trivial=False
    ):
    
    print(df.iloc[:, -1].unique())
    if len(df.iloc[:, -1].unique()) == 1:
        print("Single class dataset")
        save_to = os.path.join(save_dir, f'trie_bootstrap_{config_idx}_eps_{eps}_db_{db}_reg_{lam}.json')
        
        if df.iloc[:, -1].unique()[0] == 0:
            trie = {"-1":{"complexity":0.05,"loss":0.0,"objective":0.05}}
        else:
            trie = {"-2":{"complexity":0.05,"loss":0.0,"objective":0.05}}
        
        with open(save_to, 'w') as fp:
            json.dump(trie, fp)
        return None
    
    if verbose:
        print("save_dir: {}".format(save_dir))
        print("config_idx: {}".format(config_idx))

    with open(original_config_path) as f:
        configJson = json.load(f)
        configJson['depth_budget'] = db

        configJson['rashomon_bound_multiplier'] = 0
        configJson['rashomon_bound'] = 0
        configJson['rashomon_bound_adder'] = eps

        configJson['regularization'] = lam
        configJson['rashomon_ignore_trivial_extensions'] = ignore_trivial
        
        configJson['rashomon_trie'] = os.path.join(save_dir, 
                                        f'trie_bootstrap_{config_idx}_eps_{eps}_db_{db}_reg_{lam}.json')
        #configJson['rashomon_model_set_suffix'] = f'_bootstrap_{config_idx}_eps_{eps}_db_{db}_reg_{lam}.json'
        configJson['verbose'] = verbose

        print(configJson)
        tf = TREEFARMS(configJson)
    f.close()
    
    tf.fit(df.iloc[:, :-1], df.iloc[:, -1])
    return tf

In [18]:
def trie_intersect_recursive(t1, t2):
    t1_keys = set(t1.keys())
    t2_keys = set(t2.keys())

    shared_keys = t1_keys.intersection(t2_keys)

    result = {}
    if 'loss' in shared_keys:
        for key in ['loss', 'complexity', 'objective']:
            result[f't1_{key}'] = t1[key]
            result[f't2_{key}'] = t2[key]
        return result

    for key in shared_keys:
        tmp_res = trie_intersect_recursive(t1[key], t2[key])

        # If there ar intersecting subtries
        if len(list(tmp_res.keys())) > 0:
            result[key] = tmp_res
        
    return result



In [19]:
from treefarms.model.tree_classifier import TreeClassifier
def get_tree_classifier(trie, tree_index=0):
    return TreeClassifier(get_tree_recursive(trie, tree_index=tree_index)[0])

def get_tree_recursive(trie, tree_index=0, d=0):
    cur_key = list(trie.keys())[min(tree_index, len(list(trie.keys()))-1)]

    keys = list(trie.keys())
    if ('loss' in keys) or ('t1_loss' in keys):
        return
    subtrees = get_tree_recursive(trie[cur_key], tree_index, d=d+1)
    new_subtrees = []
    subtree_ind = 0
    for i, key in enumerate(cur_key.split()):
        if key == '-1':
            new_subtrees.append({
                "prediction": 0,
                "name": "Prediction"
            })
        elif key == '-2':
            new_subtrees.append({
                "prediction": 1,
                "name": "Prediction"
            })
        else:
            new_subtrees.append({
                "feature": int(key),
                "relation": "==",
                "reference": "true",
                "true": subtrees[subtree_ind],
                "false": subtrees[subtree_ind+1]
            })
            subtree_ind += 2
    return new_subtrees
'''
The expected format for decision_tree_classifier is 
{
  "feature": 7,
  "relation": "==",
  "reference": "true",
  "true": {
    "prediction": 1,
    "name": "Prediction"
  },
  "false": {
    "prediction": 0,
    "name": "Prediction"
  }
}
'''

'\nThe expected format for decision_tree_classifier is \n{\n  "feature": 7,\n  "relation": "==",\n  "reference": "true",\n  "true": {\n    "prediction": 1,\n    "name": "Prediction"\n  },\n  "false": {\n    "prediction": 0,\n    "name": "Prediction"\n  }\n}\n'

## Define a simple model using a tree to pick the missingness model

In [20]:
class MissingnessSet:
    def __init__(self, missingness_patterns, model_list):
        '''
        missingness_patterns: list of n lists of arrays; each array 
            represents a missingness pattern, each inner list (potentially
            list of 1) contains multiple patterns to send to a model
        model_list: list of n models, in the order of their corresponding missingness
            patterns
        '''
        self.missingness_patterns = missingness_patterns
        self.model_list = model_list

    def predict(self, X, missingness_patterns=None):
        missingness_keys = []
        for k in X.columns:
            if "-1" in k:
                missingness_keys.append(k)
        if missingness_patterns is None:
            missingness_patterns = np.unique((X[missingness_keys]), axis=0)

        preds = np.zeros(X.shape[0])

        # For each unique missingness pattern
        for i in range(missingness_patterns.shape[0]): 
            mp_mask = (X[missingness_keys] == missingness_patterns[i].reshape((1, -1)).repeat(X.shape[0], axis=0)).all(axis=1)
            cur_df = X[mp_mask]
            missingness_pattern = missingness_patterns[i]

            # Figure out which model to use, and predict the relevant samples
            found = False
            for mp_ind, mp_list in enumerate(self.missingness_patterns):
                if (missingness_pattern == mp_list).all():
                    #print(f"Matched to model {mp_ind}")
                    cur_model = self.model_list[mp_ind]
                    found = True
                    break
                    
            if found:
                pred_for_mp = cur_model.predict(cur_df)
                preds[mp_mask] = pred_for_mp
            else:
                print(f"No model found for mp {missingness_patterns[i]}")
            #print(pred_for_mp.shape, pred_for_mp.mean())
            #print(preds[mp_mask].shape)

        return preds
            
        

## Load and process our target data

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('fico_full.csv')

train_df, test_df = train_test_split(df, test_size=0.2, random_state=20, stratify=df['PoorRiskPerformance'])

In [22]:
def binarize_and_split_according_to_train(train_df, test_df):
    thresholds = []
    for c in train_df.columns:
        if c == 'PoorRiskPerformance':
            continue
        # We should change this to not compute quantiles over negative values
        t = list(train_df[c][train_df[c] > 0].quantile([0.2, 0.4, 0.6, 0.8, 1]).unique())
        t.append(-1)
        thresholds.append(t)

    train_df_x = train_df.iloc[:, :-1]
    missingness_patterns = np.unique((train_df_x < 0), axis=0)
    missingness_datasets = []
    full_train = None
    null_mp_ind = None
    for i in range(missingness_patterns.shape[0]): 
        cur_df = train_df[((train_df_x < 0) == missingness_patterns[i].reshape((1, -1)).repeat(train_df_x.shape[0], axis=0)).all(axis=1)]
        
        if missingness_patterns[i].mean() == 1:
            null_mp_ind = i
            continue
        elif missingness_patterns[i].mean() == 0:
            full_train = cur_df
        else:
            missingness_datasets.append(cur_df)

    print("null_mp_ind", null_mp_ind)
    datasets = []
    for dataset in [full_train] + missingness_datasets + [test_df]:
        cur_dataset_binned = {}
        for i, c in enumerate(train_df.columns):
            if c == 'PoorRiskPerformance':
                continue
            for v in thresholds[i]:
                new_col_name = f'{c} <= {v}'

                new_row_train = np.zeros(dataset.shape[0])
                new_row_train[dataset[c] <= v] = 1

                cur_dataset_binned[new_col_name] = new_row_train

        cur_dataset_binned['PoorRiskPerformance'] = dataset['PoorRiskPerformance']
        datasets.append(pd.DataFrame(cur_dataset_binned))

    return datasets[0], datasets[1:-1], datasets[-1], np.concatenate((missingness_patterns[:null_mp_ind], 
                                                                 missingness_patterns[null_mp_ind+1:]), axis=0)


In [23]:
full_train, partial_trains, full_test, missingness_patterns = binarize_and_split_according_to_train(train_df, test_df)


null_mp_ind 73


In [24]:
missingness_patterns.shape

(73, 23)

In [25]:
print(len(partial_trains))

72


In [26]:
import os
import json
from treefarms.model.model_set import ModelSetContainer
from treefarms.model.tree_classifier import TreeClassifier

def construct_missingness_models(full_train, partial_trains, 
                                 missingness_patterns, 
                                 compute_tries=False, 
                                 dataset_name='fico',
                                lam=0.02, db=4, eps=0.04):
    if compute_tries:
        if not os.path.isdir(f'./experiments/datasets/{dataset_name}/'):
            os.makedirs(f'./experiments/datasets/{dataset_name}/')
            
        construct_tree_rset(
            full_train,
            lam, db, eps,
            original_config_path='./config.json',
            save_dir=f'./experiments/datasets/{dataset_name}',
            config_idx='full',
            verbose=True
        )
        
        for i, df in enumerate(partial_trains):
            construct_tree_rset(
                partial_trains[i],
                lam, db, eps,
                original_config_path='./config.json',
                save_dir=f'./experiments/datasets/{dataset_name}',
                config_idx=f'{i}',
                verbose=False
            )
    
    
    t1 = f'./experiments/datasets/{dataset_name}/trie_bootstrap_full_eps_{eps}_db_{db}_reg_{lam}.json'
    with open(t1) as f1:
        trie_1 = json.load(f1)
        t = get_tree_classifier(trie_1, tree_index=0)

    m_models = [t]
    counter = 0
    good_inds = []

    for i in range(len(missingness_patterns[1:])):
        t2 = f'./experiments/datasets/fico/trie_bootstrap_{i}_eps_0.05_db_4_reg_0.05.json'
        with open(t2) as f2:
            trie_2 = json.load(f2)
        ti = trie_intersect_recursive(trie_1, trie_2)
        if len(ti) == 0:
            counter += 1
            t = get_tree_classifier(trie_2, tree_index=0)
        else:
            good_inds.append(i)
            t = get_tree_classifier(ti, tree_index=0)
        m_models.append(t)

    print(f"Found {counter} of {len(missingness_patterns)} patterns had no overlap")
    overall_model = MissingnessSet(missingness_patterns, m_models)
    return overall_model, good_inds

In [None]:
overall_model, good_inds = construct_missingness_models(full_train, partial_trains, 
                                 missingness_patterns, 
                                 compute_tries=True, 
                                 dataset_name='fico',
                                lam=0.05, db=4, eps=0.025)

[1 0]
save_dir: ./experiments/datasets/fico
config_idx: full
{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': True, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_full_eps_0.025_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rashomon_bound_multiplier': 0, 'rashomon_bound_adder': 0.025, 'rashomon_i

treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 3
[1 0]
{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_13_eps_0.025_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rashomon_bound_multiplier': 0,

treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 16
[0 1]
{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_23_eps_0.025_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rashomon_bound_multiplier': 0

treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 2
[1 0]
{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_34_eps_0.025_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rashomon_bound_multiplier': 0,

treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 92
[1]
Single class dataset
[1 0]
{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_45_eps_0.025_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rash

treefarms reported successful execution
training completed. Number of trees in the Rashomon set: 4
[1]
Single class dataset
[0 1]
{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_60_eps_0.025_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rasho

In [20]:
good_patterns = overall_model.missingness_patterns[[0] + [i + 1 for i in good_inds]]
X = full_test
missingness_keys = []
for k in X.columns:
    if "-1" in k:
        missingness_keys.append(k)
if missingness_patterns is None:
    missingness_patterns = np.unique((X[missingness_keys]), axis=0)

preds = np.zeros(X.shape[0])

# For each unique missingness pattern
test_df_filtered = None
for i in range(missingness_patterns.shape[0]): 
    mp_mask = (X[missingness_keys] == missingness_patterns[i].reshape((1, -1)).repeat(X.shape[0], axis=0)).all(axis=1)
    
    for row in good_patterns:
        if (row == missingness_patterns[i]).all():
            if test_df_filtered is None:
                test_df_filtered = X[mp_mask]
            else:
                test_df_filtered = pd.concat([test_df_filtered, X[mp_mask]])
    cur_df = X[mp_mask]

In [39]:
partial_trains[5]

Unnamed: 0,PercentTradesWBalance <= 48.0,PercentTradesWBalance <= 60.0,PercentTradesWBalance <= 75.0,PercentTradesWBalance <= 86.0,PercentTradesWBalance <= 100.0,PercentTradesWBalance <= -1,ExternalRiskEstimate <= 63.0,ExternalRiskEstimate <= 69.0,ExternalRiskEstimate <= 75.0,ExternalRiskEstimate <= 82.0,...,NumInstallTradesWBalance <= 2.0,NumInstallTradesWBalance <= 3.0,NumInstallTradesWBalance <= 23.0,NumInstallTradesWBalance <= -1,NumBank2NatlTradesWHighUtilization <= 1.0,NumBank2NatlTradesWHighUtilization <= 2.0,NumBank2NatlTradesWHighUtilization <= 3.0,NumBank2NatlTradesWHighUtilization <= 16.0,NumBank2NatlTradesWHighUtilization <= -1,PoorRiskPerformance
1801,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1


In [81]:
t = construct_tree_rset(
    full_train,
    lam=0.05, db=4, eps=0.05,
    original_config_path='./config.json',
    save_dir=f'./experiments/datasets/fico',
    config_idx=f'full',
    verbose=False,
    ignore_trivial=False
)
for i in range(158):
    print(t[i])

{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_full_eps_0.05_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rashomon_bound_multiplier': 0, 'rashomon_bound_adder': 0.05, 'rashomon_ignore_trivial_extensions': False}
treefarms reported successfu

In [79]:
t = construct_tree_rset(
    full_train,
    lam=0.05, db=4, eps=0.05,
    original_config_path='./config.json',
    save_dir=f'./experiments/datasets/fico',
    config_idx=f'full',
    verbose=False,
    ignore_trivial=True
)

{'balance': False, 'cancellation': True, 'look_ahead': True, 'similar_support': False, 'feature_exchange': False, 'continuous_feature_exchange': False, 'rule_list': False, 'diagnostics': False, 'verbose': False, 'regularization': 0.05, 'uncertainty_tolerance': 0.0, 'upperbound': 0.0, 'model_limit': 10000, 'precision_limit': 0, 'stack_limit': 0, 'tile_limit': 0, 'time_limit': 0, 'worker_limit': 1, 'costs': '', 'model': '', 'rashomon_model': '', 'rashomon_trie': './experiments/datasets/fico/trie_bootstrap_full_eps_0.05_db_4_reg_0.05.json', 'rashomon_model_set_suffix': '', 'profile': '', 'timing': '', 'trace': '', 'tree': '', 'datatset_encoding': '', 'depth_budget': 4, 'minimum_captured_points': 0, 'memory_checkpoints': [], 'output_objective_model_set': False, 'output_covered_sets': [], 'covered_sets_thresholds': [], 'rashomon': True, 'rashomon_bound': 0, 'rashomon_bound_multiplier': 0, 'rashomon_bound_adder': 0.05, 'rashomon_ignore_trivial_extensions': True}
treefarms reported successful

In [80]:
for i in range(49):
    print(t[i])

if feature_29 = true then:
    predicted Prediction: 1

else if feature_29 != true then:
    predicted Prediction: 0
if true then:
    predicted Prediction: 1
if feature_6 = true then:
    predicted Prediction: 1

else if feature_6 != true then:
    predicted Prediction: 0
if feature_33 = true then:
    predicted Prediction: 1

else if feature_33 != true then:
    predicted Prediction: 0
if feature_7 = true then:
    predicted Prediction: 1

else if feature_7 != true then:
    predicted Prediction: 0
if feature_16 = true then:
    predicted Prediction: 1

else if feature_16 != true and feature_6 = true then:
    predicted Prediction: 1

else if feature_16 != true and feature_6 != true then:
    predicted Prediction: 0
if feature_4 = true then:
    predicted Prediction: 1

else if feature_4 != true and feature_6 = true then:
    predicted Prediction: 1

else if feature_4 != true and feature_6 != true then:
    predicted Prediction: 0
if feature_5 = true then:
    predicted Prediction: 1

In [59]:
overall_model.model_list[6]

{
  "prediction": 0,
  "name": "Prediction"
}

In [35]:
for i in range(len(partial_trains)):
    print((overall_model.model_list[i+1].predict(partial_trains[i].iloc[:, :-1]) == partial_trains[i].iloc[:, -1]).mean())

0.4954128440366973
0.4017094017094017
0.7272727272727273
0.3763440860215054
0.6470588235294118
0.0
0.0
0.9411764705882353
0.0
1.0
0.0
0.7647058823529411
0.691743119266055
0.782608695652174
0.5735294117647058
0.7142857142857143
0.6610169491525424
0.0
0.2
0.0
0.25
0.6374201045903545
0.7096774193548387
0.34163208852005533
0.6538461538461539
0.6573705179282868
0.6666666666666666
0.75
0.8571428571428571
1.0
1.0
0.0
0.8823529411764706
0.6012269938650306
0.8181818181818182
0.724
0.5555555555555556
0.6730769230769231
0.625
1.0
0.46153846153846156
0.0
0.4
0.39285714285714285
0.0
0.5555555555555556
0.0
0.6666666666666666
0.0
0.8
0.0
0.6
0.5
0.75
0.0
0.6666666666666666
0.75
0.45454545454545453
0.5862068965517241
0.0
0.6428571428571429
1.0
1.0
0.0
0.5
0.6666666666666666
0.0
0.3333333333333333
0.0
0.0
0.5
0.0


In [24]:
all_train = pd.concat([full_train] + partial_trains)
(overall_model.predict(all_train.iloc[:, :-1]) == all_train.iloc[:, -1]).mean()

0.41941624365482233

In [207]:
(overall_model.predict(full_test.iloc[:, :-1]) == full_test.iloc[:, -1]).mean()

No model found for mp [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0.]
No model found for mp [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1.]
No model found for mp [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


0.5511472275334608

In [208]:
(overall_model.predict(test_df_filtered.iloc[:, :-1]) == test_df_filtered.iloc[:, -1]).mean()

0.5614035087719298

In [163]:
print(overall_model.model_list[1])

if feature_16 = true then:
    predicted Prediction: 1

else if feature_16 != true and feature_6 = true then:
    predicted Prediction: 1

else if feature_16 != true and feature_6 != true then:
    predicted Prediction: 0


In [150]:
t[8553]

{
  "feature": 8,
  "relation": "==",
  "reference": "true",
  "true": {
    "prediction": 1,
    "name": "Prediction"
  },
  "false": {
    "feature": 6,
    "relation": "==",
    "reference": "true",
    "true": {
      "prediction": 1,
      "name": "Prediction"
    },
    "false": {
      "prediction": 0,
      "name": "Prediction"
    }
  }
}

In [31]:
all_train = pd.concat([full_train] + partial_trains)

In [32]:
full_train

Unnamed: 0,PercentTradesWBalance <= 48.0,PercentTradesWBalance <= 60.0,PercentTradesWBalance <= 75.0,PercentTradesWBalance <= 86.0,PercentTradesWBalance <= 100.0,PercentTradesWBalance <= -1,ExternalRiskEstimate <= 63.0,ExternalRiskEstimate <= 69.0,ExternalRiskEstimate <= 75.0,ExternalRiskEstimate <= 82.0,...,NumInstallTradesWBalance <= 2.0,NumInstallTradesWBalance <= 3.0,NumInstallTradesWBalance <= 23.0,NumInstallTradesWBalance <= -1,NumBank2NatlTradesWHighUtilization <= 1.0,NumBank2NatlTradesWHighUtilization <= 2.0,NumBank2NatlTradesWHighUtilization <= 3.0,NumBank2NatlTradesWHighUtilization <= 16.0,NumBank2NatlTradesWHighUtilization <= -1,PoorRiskPerformance
9517,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
7262,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0
5701,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0
366,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
937,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4382,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4729,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
8076,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
7685,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1


In [74]:
partial_trains[0]

Unnamed: 0,PercentTradesWBalance <= 48.0,PercentTradesWBalance <= 60.0,PercentTradesWBalance <= 75.0,PercentTradesWBalance <= 86.0,PercentTradesWBalance <= 100.0,PercentTradesWBalance <= -1,ExternalRiskEstimate <= 63.0,ExternalRiskEstimate <= 69.0,ExternalRiskEstimate <= 75.0,ExternalRiskEstimate <= 82.0,...,NumInstallTradesWBalance <= 2.0,NumInstallTradesWBalance <= 3.0,NumInstallTradesWBalance <= 23.0,NumInstallTradesWBalance <= -1,NumBank2NatlTradesWHighUtilization <= 1.0,NumBank2NatlTradesWHighUtilization <= 2.0,NumBank2NatlTradesWHighUtilization <= 3.0,NumBank2NatlTradesWHighUtilization <= 16.0,NumBank2NatlTradesWHighUtilization <= -1,PoorRiskPerformance
5603,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
7225,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
6561,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
7127,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
6594,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3147,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
757,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
9593,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
4368,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1


In [49]:
preds = overall_model.predict(all_train.iloc[:, :-1], missingness_patterns)
(preds == all_train.iloc[:, -1]).mean()

9517     True
7262     True
5701     True
366      True
937      True
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
91

9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
9176    False
8721    False
89      False
8771    False
Length: 7880, dtype: bool
9517    False
7262    False
5701    False
366     False
937     False
        ...  
4106    False
91

0.5052030456852792

In [51]:
(overall_model.model_list[0].predict(full_train.iloc[:, :-1]) == full_train.iloc[:, -1]).mean()

0.6156547917711992

In [71]:
print((overall_model.predict(partial_trains[2].iloc[:, :-1]) == partial_trains[2].iloc[:, -1]).mean())
    

Matched to model 3
0.7272727272727273


In [56]:
overall_model.model_list[1]

{
  "feature": 16,
  "relation": "==",
  "reference": "true",
  "true": {
    "feature": 6,
    "relation": "==",
    "reference": "true",
    "true": {
      "prediction": 0,
      "name": "Prediction"
    },
    "false": {
      "prediction": 1,
      "name": "Prediction"
    }
  },
  "false": {
    "prediction": 1,
    "name": "Prediction"
  }
}

In [60]:
partial_trains[0]

Unnamed: 0,PercentTradesWBalance <= 48.0,PercentTradesWBalance <= 60.0,PercentTradesWBalance <= 75.0,PercentTradesWBalance <= 86.0,PercentTradesWBalance <= 100.0,PercentTradesWBalance <= -1,ExternalRiskEstimate <= 63.0,ExternalRiskEstimate <= 69.0,ExternalRiskEstimate <= 75.0,ExternalRiskEstimate <= 82.0,...,NumInstallTradesWBalance <= 2.0,NumInstallTradesWBalance <= 3.0,NumInstallTradesWBalance <= 23.0,NumInstallTradesWBalance <= -1,NumBank2NatlTradesWHighUtilization <= 1.0,NumBank2NatlTradesWHighUtilization <= 2.0,NumBank2NatlTradesWHighUtilization <= 3.0,NumBank2NatlTradesWHighUtilization <= 16.0,NumBank2NatlTradesWHighUtilization <= -1,PoorRiskPerformance
5603,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
7225,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
6561,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
7127,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
6594,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3147,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
757,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0
9593,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1
4368,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1


In [61]:
full_train

Unnamed: 0,PercentTradesWBalance <= 48.0,PercentTradesWBalance <= 60.0,PercentTradesWBalance <= 75.0,PercentTradesWBalance <= 86.0,PercentTradesWBalance <= 100.0,PercentTradesWBalance <= -1,ExternalRiskEstimate <= 63.0,ExternalRiskEstimate <= 69.0,ExternalRiskEstimate <= 75.0,ExternalRiskEstimate <= 82.0,...,NumInstallTradesWBalance <= 2.0,NumInstallTradesWBalance <= 3.0,NumInstallTradesWBalance <= 23.0,NumInstallTradesWBalance <= -1,NumBank2NatlTradesWHighUtilization <= 1.0,NumBank2NatlTradesWHighUtilization <= 2.0,NumBank2NatlTradesWHighUtilization <= 3.0,NumBank2NatlTradesWHighUtilization <= 16.0,NumBank2NatlTradesWHighUtilization <= -1,PoorRiskPerformance
9517,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
7262,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0
5701,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0
366,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
937,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4382,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4729,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
8076,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
7685,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1
