In [1]:
import numpy as np

In [2]:
# Toy dataset: four integer features that cycle with different periods,
# plus a three-class target that cycles with period 3.
np.random.seed(64)
n_rows = 90
seq1 = np.arange(n_rows) % 5
seq2 = np.arange(n_rows) % 4
seq3 = np.arange(n_rows) % 7
seq4 = np.arange(n_rows) % 2
# Stack the features as rows, then transpose so each column is one feature.
data = np.vstack((seq1, seq2, seq3, seq4)).T
# Labels 0/1/2; a binary variant would be (np.arange(n_rows) % 3 == 0).astype(int).
target = np.arange(n_rows) % 3

In [3]:
# Show the labels of the rows where seq3 equals 5 (sanity check of boolean-mask filtering).
target[seq3 == 5]

array([2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2])

In [4]:
def gini(target_vec):
    """Compute the Gini impurity of a vector of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``target_vec`` that carry label ``k``.

    The caller is responsible for filtering ``target_vec`` down to the rows
    of interest (e.g. ``target[feature == value]``) before calling.

    Parameters
    ----------
    target_vec : array-like
        Class labels; anything ``np.asarray`` accepts (ndarray, list, ...).

    Returns
    -------
    float
        ``0.0`` when every label is the same (or the input is empty), and
        ``1 - 1/K`` when ``K`` labels are equally frequent.
    """
    labels = np.asarray(target_vec)
    if labels.size == 0:
        # Nothing to be mixed: report a pure node rather than letting the
        # formula fall through to 1 - 0 = 1 (maximum impurity).
        return 0.0

    # One vectorised pass gives the per-class counts; no Python-level loop.
    _, counts = np.unique(labels, return_counts=True)
    proportions = counts / labels.size
    return 1.0 - np.sum(proportions ** 2)

# Example: gini(target[seq4 == 1])

def split_decider(data_array, target_vec, verbose=False, weighted=False):
    """Choose the ``feature == value`` split with the lowest Gini impurity.

    Every distinct value of every column of ``data_array`` is tried as a
    candidate equality split, and the candidate with the smallest score wins.
    Ties keep the first candidate encountered (lowest column, then lowest
    value).

    Parameters
    ----------
    data_array : np.ndarray
        2-D array; each column is a feature, each row a sample.
    target_vec : np.ndarray
        Labels aligned with the rows of ``data_array``.
    verbose : bool, default False
        Print the score of every candidate split.
    weighted : bool, default False
        How a candidate is scored.

        * ``False`` (original behaviour): the impurity of only the rows where
          ``feature == value``. This ignores how mixed the remaining rows are
          and therefore favours small, nearly pure subsets.
        * ``True``: the size-weighted average of the impurity of the matching
          rows and of the remaining rows, i.e. the quality of the whole
          two-way partition.

    Returns
    -------
    dict
        ``{'feature': column_index, 'value': split_value}``. When there are
        no candidates (no rows or no columns) both entries are ``0``.
    """
    min_impurity = np.inf
    split_feature_index = 0
    split_feature_val = 0
    for col in range(data_array.shape[1]):
        feature = data_array[:, col]
        for feature_val in np.unique(feature):
            matches = feature == feature_val
            # Impurity of the target restricted to rows matching the candidate.
            gini_impurity = gini(target_vec[matches])
            if weighted:
                n_total = matches.size
                n_match = np.count_nonzero(matches)
                n_rest = n_total - n_match
                # An empty complement contributes nothing to the average.
                rest_impurity = gini(target_vec[~matches]) if n_rest else 0.0
                gini_impurity = (
                    n_match * gini_impurity + n_rest * rest_impurity
                ) / n_total
            if verbose:
                print(f'feature: {col}, class: {feature_val}, gini: {gini_impurity}')
            # Strict '<' so the earliest candidate wins on ties.
            if gini_impurity < min_impurity:
                min_impurity = gini_impurity
                split_feature_index = col
                split_feature_val = feature_val

    return {'feature': split_feature_index, 'value': split_feature_val}

# Run the split search on the toy dataset; the returned dict is displayed as the cell output.
split_decider(data, target)

{'feature': 2, 'value': np.int64(0)}

In [10]:
def decision_tree(data_array, target_vec):
    """Recursively partition the data using splits chosen by ``split_decider``.

    At each node the best ``feature == value`` split is looked up, the rows
    are divided into those that match and those that do not, and both halves
    are processed recursively until a stopping rule fires.

    A node becomes a leaf when any of the following holds:

    * it has no data at all;
    * the matching rows are pure (Gini impurity of their labels is 0);
    * the matching rows make up at most 5% of the node's rows;
    * the split matches every row, so recursing could never make progress
      (e.g. identical feature rows with different labels).

    NOTE(review): the purity and size rules look only at the matching side,
    so a node stops without exploring the non-matching rows as soon as the
    matching side is pure. Revisit once the tree structure is recorded.

    Parameters
    ----------
    data_array : np.ndarray
        2-D array; each column is a feature, each row a sample.
    target_vec : np.ndarray
        Labels aligned with the rows of ``data_array``.

    Returns
    -------
    str
        Placeholder markers: ``'lol'`` if this node is a leaf, ``'dawg'`` if
        it was split further. The tree itself is not built or returned yet.
    """
    n_rows = data_array.shape[0]
    if data_array.size == 0:
        return 'lol'

    instructions = split_decider(data_array, target_vec)
    true_mask = data_array[:, instructions['feature']] == instructions['value']
    false_mask = ~true_mask
    # Count the rows that actually match; len(true_mask) would be the total.
    n_true = np.count_nonzero(true_mask)

    too_small = n_true <= .05 * n_rows
    # Without this guard a split matching every row recurses on the same data forever.
    no_progress = n_true == n_rows
    if too_small or no_progress or gini(target_vec[true_mask]) == 0:
        return 'lol'

    decision_tree(data_array[true_mask, :], target_vec[true_mask])
    decision_tree(data_array[false_mask, :], target_vec[false_mask])
    return 'dawg'

# Grow the tree on the toy dataset; the returned placeholder string is displayed as the cell output.
decision_tree(data, target)

'dawg'