In [55]:
import pandas as pd
import numpy as np
import os
import re
import math


In [56]:
# Hard-coded patient records; 'Heart Disease' is the column later used as the target.
data = {
    'Chest pain': ['Yes', 'No', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes'],
    'Blocked Arteries': ['Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes'],
    'Patient Weight': [205, 180, 210, 167, 156, 125, 168, 172],
    'Heart Disease': ['Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'No', 'No']
}

# Swap the numeric weight for a yes/no flag (weight above 176). Popping the raw
# column and then assigning the flag leaves the new column last in the dict.
new_column_name = 'weight_>_176'
new_column_values = ['no' if weight <= 176 else 'yes' for weight in data.pop('Patient Weight')]
data[new_column_name] = new_column_values

# Stringify and lowercase every cell value.
lowercase_data = {
    column: [str(cell).lower() for cell in cells]
    for column, cells in data.items()
}

# Lowercase the column names and collapse whitespace runs into underscores.
snake_case_data = {
    re.sub(r'\s+', '_', column.lower()): cells
    for column, cells in lowercase_data.items()
}

pandas_df = pd.DataFrame(snake_case_data)

pandas_df.head(10)

Unnamed: 0,chest_pain,blocked_arteries,heart_disease,weight_>_176
0,yes,yes,yes,yes
1,no,yes,yes,yes
2,yes,no,yes,yes
3,yes,yes,yes,no
4,no,yes,no,no
5,no,yes,no,no
6,yes,no,no,no
7,yes,yes,no,no


In [67]:
class AdaboostDataset:
    """Wraps a DataFrame and hands out uniform starting sample weights (1 / n_rows)."""

    def __init__(self, pandas_df):
        """Store the DataFrame to be weighted; it is kept by reference, not copied."""
        self.df = pandas_df

    def get_intial_sample_weights(self, just_weight=False):
        """Return the uniform initial weight, or a copy of the data carrying it.

        Args:
            just_weight: When True, return only the scalar weight ``1 / n_rows``.

        Returns:
            A float when ``just_weight`` is True; otherwise a copy of the wrapped
            DataFrame with a trailing ``sample_weight`` column set to that weight.

        Raises:
            ValueError: If the wrapped DataFrame has no rows.
        """
        # Read the row count from this instance's own frame, not a notebook global.
        n_rows = self.df.shape[0]
        if n_rows == 0:
            raise ValueError("Cannot compute initial sample weights for an empty DataFrame.")
        initial_weight = 1.0 / n_rows

        if just_weight:
            return initial_weight

        df_weighted = self.df.copy()
        # Position == current column count, i.e. append as the last column.
        df_weighted.insert(df_weighted.shape[1], 'sample_weight', initial_weight)
        return df_weighted

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    get_initial_sample_weights = get_intial_sample_weights
        

# Wrap the prepared frame, then preview its first rows with the starting weights attached.
ada_df = AdaboostDataset(pandas_df)
ada_df.get_intial_sample_weights(just_weight=False).head(5)

Unnamed: 0,chest_pain,blocked_arteries,heart_disease,weight_>_176,sample_weight
0,yes,yes,yes,yes,0.125
1,no,yes,yes,yes,0.125
2,yes,no,yes,yes,0.125
3,yes,yes,yes,no,0.125
4,no,yes,no,no,0.125


In [65]:
class TreeNode:
    """One side of a binary split: the rows that landed here and their Gini impurity.

    Attributes (all set in ``__init__``):
        node_split_by, split_by_value: feature name and feature value recorded for this node.
        target_var, target_a, target_b: target column name and its two class labels.
        node_df: the rows belonging to this node.
        node_population: number of rows in ``node_df``.
        gini_impurity: ``1 - p_a**2 - p_b**2``; ``0.0`` when the node is empty.
    """

    def __init__(self, node_split_by, split_by_value, node_df, target_var, target_a, target_b):
        """Record the node's metadata and compute its Gini impurity.

        Args:
            node_split_by: Name of the feature this node was split on.
            split_by_value: Feature value that selected the rows of this node.
            node_df: DataFrame holding the rows of this node.
            target_var: Name of the target column in ``node_df``.
            target_a: First target class label.
            target_b: Second target class label.

        Raises:
            KeyError: If ``target_var`` is not a column of ``node_df``.
        """
        self.node_split_by = node_split_by
        self.split_by_value = split_by_value
        self.target_var = target_var
        self.target_a = target_a
        self.target_b = target_b

        self.node_df = node_df
        self.node_population = self.node_df.shape[0]

        # Look the column up before the empty-node guard so a wrong column name
        # still fails loudly even when the node has no rows.
        target_column = self.node_df[self.target_var]

        if self.node_population == 0:
            # An empty node has no class mix to measure; treat it as pure instead
            # of dividing by zero. Its weight in any population-weighted sum is 0.
            self.gini_impurity = 0.0
            return

        # int(...) keeps the counts (and so the impurity) plain Python numbers.
        count_a = int((target_column == self.target_a).sum())
        count_b = int((target_column == self.target_b).sum())

        probability_a = count_a / self.node_population
        probability_b = count_b / self.node_population

        self.gini_impurity = 1 - (probability_a ** 2) - (probability_b ** 2)


class NodeSplit:
    """Binary split of a DataFrame on one feature, scored by population-weighted Gini impurity."""

    def __init__(self, df, split_by_feature, feature_value_a, feature_value_b, target_var, target_a, target_b):
        """Partition ``df`` on ``split_by_feature`` and score the split.

        Args:
            df: DataFrame to split.
            split_by_feature: Column whose value decides which side a row goes to.
            feature_value_a: Value selecting the rows of ``node_a``.
            feature_value_b: Value selecting the rows of ``node_b``.
            target_var: Name of the target column.
            target_a: First target class label.
            target_b: Second target class label.
        """
        self.df = df
        self.split_by = split_by_feature
        self.target_var = target_var
        self.target_a = target_a
        self.target_b = target_b
        self.feature_value_a = feature_value_a
        self.feature_value_b = feature_value_b

        # Rows matching neither feature value end up in neither side.
        self.df_a = self.df[self.df[split_by_feature] == feature_value_a]
        self.df_b = self.df[self.df[split_by_feature] == feature_value_b]

        self.node_a = TreeNode(split_by_feature, feature_value_a, self.df_a, target_var, target_a, target_b)
        # node_b is labelled with feature_value_b; it used to be mislabelled
        # with feature_value_a, which made node_b.split_by_value wrong.
        self.node_b = TreeNode(split_by_feature, feature_value_b, self.df_b, target_var, target_a, target_b)

        node_a_pop = self.node_a.node_population
        node_b_pop = self.node_b.node_population
        total_pop = node_a_pop + node_b_pop

        # Each side's impurity counts in proportion to its share of the rows.
        node_a_coeff = node_a_pop / total_pop
        node_b_coeff = node_b_pop / total_pop

        weighted_impurity = (node_a_coeff * self.node_a.gini_impurity) + (node_b_coeff * self.node_b.gini_impurity)
        # Stored rounded to two decimals, which is what to_str() reports.
        self.total_gini_impurity = np.round(weighted_impurity, 2)

    def to_str(self):
        """Return a one-line summary: the split feature and its rounded Gini index."""
        return f"NodeSplit: {self.split_by}" + f" - GiniIndex: {self.total_gini_impurity}"

    def get_gini_index(self):
        """Return the rounded, population-weighted Gini impurity of the split."""
        return self.total_gini_impurity
    

# Score each candidate feature as a yes/no split against the heart_disease target.
s0, s1, s2 = [
    NodeSplit(pandas_df, feature, 'yes', 'no', 'heart_disease', 'yes', 'no')
    for feature in ('chest_pain', 'blocked_arteries', 'weight_>_176')
]

# One summary line per split, in the same order as above.
print(*(candidate.to_str() for candidate in (s0, s1, s2)), sep='\n')

NodeSplit: chest_pain - GiniIndex: 0.47
NodeSplit: blocked_arteries - GiniIndex: 0.5
NodeSplit: weight_>_176 - GiniIndex: 0.2


In [71]:
class Stump:
    """Holds a dataset object and the name of its target column; the split search is still a stub."""

    def __init__(self, ada_df, target_var):
        """Store the dataset object and target column name unchanged."""
        self.target_var = target_var
        self.ada_df = ada_df

    def find_lowest_gini(self):
        """Announce that the function has started; no search is performed yet."""
        start_message = 'Starting Function: find_lowest_gini()'
        print(start_message)
    

# Build a stump for the heart_disease target and invoke its (stub) Gini search.
stump_0 = Stump(ada_df, 'heart_disease')
stump_0.find_lowest_gini()

Starting Function: find_lowest_gini()
