In [1]:
import copy
import numpy as np
import pandas as pd

from src.common.functions import get_feature_importance
%cd /home/heza7322/PycharmProjects/missing-value-handling-in-carts
#%cd /Users/Henning/PycharmProjects/missing-value-handling-in-carts
from src.binary_tree import BinaryTree
from src.trinary_tree import TrinaryTree
from src.weighted_tree import WeightedTree
from src.common.functions import get_indices, calculate_loss, fit_response


/home/heza7322/PycharmProjects/missing-value-handling-in-carts


In [22]:
def create_missing_Xs(X, ps, seed = None):
    """
    Takes a pandas DataFrame X and a list of probabilities ps, and returns a dictionary
    of modified versions of X, with values randomly set to NaN based on the corresponding
    probabilities in ps.
    """
    if seed is None:
        rng = np.random.default_rng()
    else:
        rng = np.random.default_rng(seed)
    n = len(X)
    Xs = {}

    X = X.copy()
    n_to_drop = 0
    for i, p in enumerate(ps):
        n_to_drop = int(p * n) - n_to_drop
        for j in X.columns:
            to_remove = np.random.choice(X.loc[~X[j].isna(),j].index,
                                         size = n_to_drop,
                                         replace = False)
            X.loc[to_remove,j] = np.nan
        Xs[p] = X.copy()
    return Xs

def split_dataset_into_folds(X, y, n_folds, seed = None):
    if seed is None:
        rng = np.random.default_rng()
    else:
        rng = np.random.default_rng(seed)
    orig_index = X.index
    shuffled_index = rng.permutation(orig_index)
    test_indices = np.array_split(shuffled_index,n_folds)

    folds = {fold: {} for fold in range(n_folds)}
    for fold, test_index  in enumerate(test_indices):
        train_index = [i for i in orig_index if i not in test_index]
        folds[fold]['Train'] = X.loc[train_index], y.loc[train_index]
        folds[fold]['Test'] = X.loc[test_index], y.loc[test_index]
    return folds

def split_missing_datasets_into_folds(Xs,y, n_folds, seed = None):
    missing_folds = {p: split_dataset_into_folds(Xs[p],y, n_folds = n_folds, seed=seed) for p in Xs}
    return missing_folds

def tune_max_depth(folds, max_max_depth = 10, min_samples_leaf = 20):
    max_depths = np.arange(max_max_depth+1)
    losses = pd.Series(index = max_depths, dtype = float)

    for max_depth in max_depths:
        tree = BinaryTree(max_depth = max_depth, min_samples_leaf= min_samples_leaf)
        losses[max_depth] = calculate_cv_loss(missing_folds[0],tree)

        if losses.iloc[max_depth] >= losses.iloc[max_depth-1]:
            break

    return losses.idxmin()

def setup_equal_trees(max_depth = None, min_samples_leaf = None,tree_types = 'all'):
    trees = {'Majority': BinaryTree(max_depth = max_depth, min_samples_leaf=min_samples_leaf, missing_rule='majority'),
         'MIA': BinaryTree(max_depth=max_depth, min_samples_leaf=min_samples_leaf, missing_rule='mia'),
         'Trinary': TrinaryTree(max_depth = max_depth, min_samples_leaf=min_samples_leaf),
         'Weighted': WeightedTree(max_depth=max_depth, min_samples_leaf=min_samples_leaf)}

    if tree_types == 'all':
        return trees

    return {tree_type: trees[tree_type] for tree_type in tree_types}

def calculate_missing_cvs_loss(missing_folds, trees):
    losses = pd.DataFrame(index = Xs.keys(),columns = trees.keys(), dtype= float)
    for p in missing_folds:
        for tree_type in trees:
            losses.loc[p,tree_type] = calculate_cv_loss(missing_folds[p],trees[tree_type])

    return losses

def calculate_cv_loss(folds,tree):
    y = pd.concat([folds[fold]['Test'][1] for fold in folds]).sort_index()
    if y.dtype == 'object':
        y_prob = pd.DataFrame(columns = y.unique(),index = y.index, dtype = 'float')
    else:
        y_hat = pd.Series(index = y.index, dtype = 'float')
    for i,fold in folds.items():
        X_train,y_train = fold['Train']
        X_test,_        = fold['Test']

        tree.fit(X_train,y_train)
        if y.dtype == 'object':
            y_prob.loc[X_test.index] = tree.predict(X_test, prob = True)
            return calculate_loss(y = y, y_prob = y_prob)
        else:
            y_hat.loc[X_test.index] = tree.predict(X_test)
            return calculate_loss(y = y, y_hat = y_hat)

In [11]:
# Load data
data_folder = 'data/cleaned'
data_set = 'auto_mpg'
X = pd.read_csv(f'{data_folder}/{data_set}.csv',index_col = 0)
y = X.pop('y')

# Set up variables
seed_missingness = 10
seed_fold_split = 11
ps = [0,0.25,0.5]
n_folds = 3
min_samples_leaf = 20
max_max_depth = 2

# Run pipeline
Xs = create_missing_Xs(X, ps, seed = seed_missingness)
missing_folds = split_missing_datasets_into_folds(Xs, y, n_folds, seed = seed_fold_split)
max_depth = tune_max_depth(missing_folds[0], max_max_depth = max_max_depth, min_samples_leaf=min_samples_leaf)
trees = setup_equal_trees(max_depth = max_depth, min_samples_leaf=min_samples_leaf)
losses = calculate_missing_cvs_loss(missing_folds, trees)

In [15]:
losses

Unnamed: 0,Majority,MIA,Trinary,Weighted
0.0,7.319821,7.319821,7.319821,7.319821
0.25,12.029909,12.029909,6.958593,11.42131
0.5,16.967845,16.967845,9.068999,15.818863


In [92]:
# Set up variables
data_folder = 'data/cleaned'
data_sets = ['titanic','iris']
tree_types = ['Majority','Trinary']
seed_missingness = 10
seed_fold_split = 11
ps = [0,0.25]
n_folds = 2
min_samples_leaf = 20
max_max_depth = 2

idx = pd.IndexSlice

losses  = {}
for data_set in data_sets:
    X = pd.read_csv(f'{data_folder}/{data_set}.csv',index_col = 0)
    y = X.pop('y')

    Xs = create_missing_Xs(X, ps, seed = seed_missingness)
    missing_folds = split_missing_datasets_into_folds(Xs, y, n_folds, seed = seed_fold_split)
    max_depth = tune_max_depth(missing_folds[0], max_max_depth = max_max_depth, min_samples_leaf=min_samples_leaf)
    trees = setup_equal_trees(max_depth = max_depth, min_samples_leaf=min_samples_leaf, tree_types=tree_types)
    losses[data_set] = calculate_missing_cvs_loss(missing_folds, trees)

In [98]:
pd.concat(losses)

Unnamed: 0,Unnamed: 1,Majority,Trinary
titanic,0.0,,
titanic,0.25,,
iris,0.0,,
iris,0.25,,


In [96]:
losses.keys()

dict_keys(['titanic', 'iris'])

In [90]:
losses.loc[idx[data_set]]


Unnamed: 0,Majority,Trinary
0.0,,
0.25,,


In [91]:
these_losses


Unnamed: 0,Majority,Trinary
0.0,1.0,2.0
0.25,3.0,5.0
