In [6]:
import copy
import numpy as np
import pandas as pd

from src.common.functions import get_feature_importance
%cd /home/heza7322/PycharmProjects/missing-value-handling-in-carts
#%cd /Users/Henning/PycharmProjects/missing-value-handling-in-carts
import matplotlib.pyplot as plt
from src.binary_tree import BinaryTree
from src.trinary_tree import TrinaryTree
from src.weighted_tree import WeightedTree
from src.common.functions import get_indices, calculate_loss, fit_response

/home/heza7322/PycharmProjects/missing-value-handling-in-carts


In [7]:
def split_into_folds(X, y, n_folds, X_test = None, rng=None):
    # X_test if this should be another set
    X_test = X_test if X_test is not None else X
    rng = rng or np.random.default_rng(11)
    shuffled_indices = rng.permutation(X.index)
    fold_indices = np.array_split(shuffled_indices, n_folds)
    folds = []
    for i, test_index in enumerate(fold_indices):
        # Concatenate all indices from the other folds to use for training
        train_index = np.concatenate([fold_indices[j] for j in range(n_folds) if j != i])
        folds.append({
            'Train': {
                'X': X.loc[train_index],
                'y': y.loc[train_index]
            },
            'Test': {
                'X': X_test.loc[test_index],
                'y': y.loc[test_index]
            }
        })

    return folds

def tune_max_depth(folds, max_max_depth = 10, min_samples_leaf = 20):
    max_depths = range(max_max_depth)
    losses = pd.Series(index=max_depths, dtype=float)
    losses.loc[:] = 0
    for max_depth in max_depths:
        for fold in folds:
            X_train, y_train = fold['Train']['X'],fold['Train']['y']
            X_test, y_test  = fold['Test']['X'],fold['Test']['y']
            tree = BinaryTree(min_samples_leaf = min_samples_leaf, max_depth = max_depth)
            losses[max_depth] += evaluate_tree(tree, X_train, y_train, X_test, y_test)

        if max_depth>0:
            if losses[max_depth]>losses[max_depth-1]:
               break

    return losses.idxmin()

def evaluate_tree(tree, X_train, y_train, X_test, y_test):
    tree.fit(X_train,y_train)
    if y_test.dtype == 'float' or y.dtype == 'int':
        y_hat = tree.predict(X_test)
        return calculate_loss(y = y_test, y_hat = y_hat)
    else:
        y_prob = tree.predict(X_test, prob = True)
        return calculate_loss(y = y_test, y_prob = y_prob)

# Remove numbers randomly
def remove_random_values(X, missing_probs = [0,0.25,0.5]):
    Xs = {0: X.copy()}
    n_to_remove = [0]
    for i in range(1,len(missing_probs)):
        n_to_remove += [int(missing_probs[i]*len(X)) - sum(n_to_remove[:i])]
        Xs[i] = Xs[i-1].copy()
        for feature in X.columns:
            to_remove = Xs[i].loc[~Xs[i][feature].isna()].sample(n_to_remove[i]).index
            Xs[i].loc[to_remove,feature] = np.nan

    return Xs

# Create folds where only the test set contains missing values
def split_missing_data_into_folds(Xs, y, n_folds, rng = None, missing_in = 'all'):
    rng = rng or np.random.default_rng()
    if missing_in == 'all':
        return {i: split_into_folds(X = Xs[i], y = y, n_folds = n_folds) for i in range(len(Xs))}
    elif missing_in == 'test':
        # Assume Xs[0] is the full set
        X_test = Xs[0]
        return  {i: split_into_folds(X = Xs[i], y = y, n_folds = n_folds, X_test = X_test) for i in range(len(Xs))}

def setup_trees(min_samples_leaf = 20, max_depth = 3, tree_types = 'all'):
    trees = {'majority': BinaryTree(max_depth=max_depth, min_samples_leaf=min_samples_leaf, missing_rule='majority'),
              'mia':      BinaryTree(max_depth=max_depth, min_samples_leaf=min_samples_leaf,missing_rule='mia'),
              'trinary':  TrinaryTree(max_depth=max_depth, min_samples_leaf=min_samples_leaf),
              'weighted': WeightedTree(max_depth=max_depth, min_samples_leaf=min_samples_leaf)}
    if tree_types == 'all':
        return trees

    return {tree_type: trees[tree_type] for tree_type in tree_types}

def test_tree_set(trees, missing_folds):
    # Fit trees for every step
    losses = pd.DataFrame(columns = [tree_type for tree_type in trees],
                           index = range(len(missing_folds)))
    losses.loc[:] = 0
    for i, folds in enumerate(missing_folds.values()):
        for fold in folds:
            X_train, y_train = fold['Train']['X'],fold['Train']['y']
            X_test, y_test  = fold['Test']['X'],fold['Test']['y']
            for tree_type in trees:
                losses.loc[i,tree_type] += evaluate_tree(trees[tree_type], X_train, y_train, X_test, y_test)

    return losses

In [8]:
# TODO: Something is wrong since the losses are DECREASING with increasing missingness and also there are nans???

In [36]:
data_folder = 'data/cleaned'
data_set = 'titanic'

# Load data
df = pd.read_csv(f'{data_folder}/{data_set}.csv',index_col = 0)
X = df.drop('y',axis=1)
y = df['y']

# Set up split
random_seed = 100
rng = np.random.default_rng(seed = random_seed)

# Create folds
n_folds = 10
folds = split_into_folds(X  = X, y = y,n_folds = n_folds ,rng = rng)

# Tune hyperparameter
max_depth = tune_max_depth(folds = folds)

# Remove values from covariates
missing_probs = [0,0.1,0.2,0.3]
Xs = remove_random_values(X, missing_probs = missing_probs)
# Create folds for every level of missingness
missing_folds = split_missing_data_into_folds(Xs, y, n_folds, rng, missing_in = 'all')
#missing_folds_test = split_missing_data_into_folds(Xs, y, n_folds, rng, missing_in = 'test')

# Set up trees
trees = setup_trees(max_depth = max_depth)

# Get results
losses = test_tree_set(trees,missing_folds)
#losses_test = test_tree_set(trees,missing_folds_test)

In [38]:
max_depth

5

In [29]:
fold = 1
missingness_index = 0
X_train = missing_folds[missingness_index][fold]['Train']['X']
y_train = missing_folds[missingness_index][fold]['Train']['y']
X_test  = missing_folds[missingness_index][fold]['Test']['X']
y_test  = missing_folds[missingness_index][fold]['Test']['y']

tree = trees['majority']
tree.fit(X_train,y_train)
y_prob = tree.predict(X_test, prob = True)
calculate_loss(y = y_test, y_prob = y_prob)

32.852449680013294

In [40]:
y_prob


Unnamed: 0,left,right,balance
597,0.833333,0.083333,0.083333
621,0.833333,0.083333,0.083333
1,0.138889,0.694444,0.166667
412,0.041667,0.958333,0.000000
384,0.440000,0.400000,0.160000
...,...,...,...
422,0.041667,0.958333,0.000000
87,0.023810,0.976190,0.000000
469,0.125000,0.687500,0.187500
155,0.138889,0.694444,0.166667
