In [53]:
import numpy as np
import pandas as pd
import itertools
from sklearn.metrics import accuracy_score
import datetime
from sklearn.model_selection import train_test_split
from more_itertools import set_partitions


In [74]:
# Data set to load. Swap the two assignments to run the notebook on Iris.csv instead.
#file_path = 'Iris.csv'
file_path = 'eye-color.data'

In [75]:
# Load the comma-separated table named by file_path.
df = pd.read_csv(file_path, sep=',')
# Kept from the Iris.csv variant of this notebook (see the commented-out path above):
#df.drop(['Id'], axis = 1, inplace = True)
# Every column except the last one is a feature; the last column is the label.
X_set = df.iloc[:,:-1]
y_set = df.iloc[:,-1]

In [76]:
# Column labels in file order; build_split uses this list to translate a
# positional column index into a column name.
header = list(df.columns)
# The same table as a NumPy array of raw values.
dataset = df.values

In [77]:
# Rows are instances; columns are the features followed by the class label.
num_rows, num_cols = np.shape(dataset)
# Both sizes are set to the full row count, i.e. the tree is evaluated on the
# same rows it was trained on.
num_test_rows = num_rows
num_training_rows = num_rows

# The class label is the last column; every other column index is a feature.
class_index = num_cols - 1
feature_indices = [index for index in range(num_cols) if index != class_index]
# From here on num_cols counts the feature columns only.
num_cols = class_index

# training takes the first num_training_rows rows and test the last
# num_test_rows rows, so the sizes printed below match what the two arrays
# actually hold (the previous slice dataset[num_test_rows:] was always empty).
training = dataset[:num_training_rows]
test = dataset[num_rows - num_test_rows:]
print('Number of training instances: ' + str(num_training_rows))
print('Number of test instances: ' + str(num_test_rows))

Number of training instances: 8
Number of test instances: 8


In [81]:
def build_split(data,column_to_split,split_values):
    '''Split data into two groups by membership of a categorical column.

    data            -- DataFrame to split
    column_to_split -- positional index of the column to test
    split_values    -- pair (left_values, right_values); each element is a
                       collection of the column values that go to that side

    Returns (left_split, right_split): the rows whose value is in
    left_values and the rows whose value is in right_values. Rows whose
    value appears in neither collection are dropped from both groups.
    '''
    # Select the column by position on the frame that was passed in, instead
    # of going through the notebook-level `header` list, so the function does
    # not depend on which cells have been run.
    column = data.iloc[:, column_to_split]

    left_split = data.loc[column.isin(split_values[0])]
    right_split = data.loc[column.isin(split_values[1])]

    return left_split,right_split

def build_split_numeric(data,column_to_split,split_value):
    '''Split data in two around a numeric threshold.

    column_to_split is the column label. Rows whose value is strictly below
    split_value form the left group; rows whose value is greater than or
    equal to split_value form the right group.
    '''
    values = data[column_to_split]
    is_below = values < split_value
    is_at_or_above = values >= split_value

    return data[is_below], data[is_at_or_above]

def multi_gini_index(group1,group2):
    '''Weighted Gini impurity of a two-way split.

    Both arguments are the DataFrames produced by a split and must carry a
    'class' column. Each group's impurity (1 - sum of squared class
    proportions) is weighted by that group's share of the total row count.
    '''
    impurities = [
        1 - group['class'].value_counts(normalize=True).pow(2).sum()
        for group in (group1, group2)
    ]

    total_rows = len(group1) + len(group2)
    share_group1 = len(group1) / total_rows
    share_group2 = len(group2) / total_rows

    return impurities[0] * share_group1 + impurities[1] * share_group2

def single_gini_index(group):
    '''Gini impurity of one group: 1 minus the sum of the squared class
    proportions found in its 'class' column (0 means the group is pure).'''
    shares = group['class'].value_counts(normalize=True)
    return 1 - (shares ** 2).sum()

def find_best_split_point(passed_data, feature_subset):
    '''Pick the lowest-Gini binary split of passed_data over the given columns.

    passed_data    -- DataFrame holding the feature columns and a 'class' column
    feature_subset -- iterable of positional column indices to consider

    Numeric columns are split once, at their median (left: < median, right:
    >= median). Every other column is treated as categorical: each way of
    dividing its distinct values into two non-empty sets is tried, with the
    smaller set placed on the left. A categorical column holding a single
    value cannot be split, so it is scored with the impurity of the whole
    frame and paired with an empty right group; recursive_splitter turns
    such a node into a leaf.

    Returns a dict that acts as a tree node, with keys
      'column_name'  -- positional index of the chosen column
      'type'         -- 0 for a categorical split, 1 for a numeric split
      'dsplit_value' -- [left_values, right_values] for type 0,
                        the threshold for type 1
      'gini'         -- weighted Gini impurity of the chosen split
      'groups'       -- (left, right) DataFrames produced by the split
    Ties keep the first candidate found. When no candidate exists (for
    example an empty feature_subset) 'gini' is infinity and the other
    entries are None.
    '''
    best = {'column_name': None, 'type': None, 'dsplit_value': None,
            'gini': float('inf'), 'groups': None}

    def consider(gini, column_index, split_type, split_value, groups):
        # Strict comparison: on a tie the earlier candidate is kept.
        if gini < best['gini']:
            best.update({'column_name': column_index, 'type': split_type,
                         'dsplit_value': split_value, 'gini': gini,
                         'groups': groups})

    for attribute_index in feature_subset:
        column = passed_data.iloc[:, attribute_index]

        # Decide by "is it numeric" rather than "is it object dtype", so
        # string/category columns that are not stored as object still take
        # the categorical path instead of failing in median().
        if pd.api.types.is_numeric_dtype(column.dtype):
            col_name = passed_data.columns[attribute_index]
            split_point = float(column.median())
            left_split, right_split = build_split_numeric(passed_data, col_name, split_point)
            consider(multi_gini_index(left_split, right_split),
                     attribute_index, 1, split_point, (left_split, right_split))
            continue

        # unique() keeps first-appearance order, which makes the order of the
        # candidate partitions (and so tie-breaking) reproducible across runs;
        # a set of strings would be ordered by the per-process hash seed.
        attribute_values = list(column.unique())

        if len(attribute_values) == 1:
            # Nothing to split on. Keep the [left_values, right_values] shape
            # used by real splits so that make_prediction_tree's membership
            # test runs against a list of values, and reuse the frame's own
            # dtypes for the empty right group.
            consider(single_gini_index(passed_data), attribute_index, 0,
                     [attribute_values, []],
                     (passed_data, passed_data.iloc[0:0]))
            continue

        for part in set_partitions(attribute_values, 2):
            # Normalise so the smaller value set is always the left branch.
            if len(part[1]) < len(part[0]):
                part = [part[1], part[0]]
            left_split, right_split = build_split(passed_data, attribute_index, part)
            consider(multi_gini_index(left_split, right_split),
                     attribute_index, 0, part, (left_split, right_split))

    return best

def recursive_splitter(node,random_features):
    '''Grow the tree below node in place.

    node            -- dict returned by find_best_split_point; its 'groups'
                       entry is consumed (removed) by this call
    random_features -- positional column indices offered to every further
                       split

    Each branch becomes either a class label (when its group is pure) or a
    child node dict that is split recursively.

    Returns node once 'left' and 'right' are filled in. When one of the two
    groups is empty the node cannot separate anything: both branches are set
    to the majority class of the populated group and a one-element list
    [predicted_class] is returned instead, which the caller unwraps so the
    branch is stored as a plain leaf label.
    '''
    left_group, right_group = node.pop('groups')

    if left_group.empty or right_group.empty:
        # Predict from whichever side holds the rows. Reading it directly
        # gives the same counts as concatenating it with the empty frame,
        # without pandas' deprecated dtype handling for empty concat inputs.
        populated = right_group if left_group.empty else left_group
        predicted_class = populated['class'].value_counts().index[0]
        node['left'] = node['right'] = predicted_class
        return [predicted_class]

    for side, group in (('left', left_group), ('right', right_group)):
        if single_gini_index(group) == 0:
            # Pure group: store its only class as a leaf.
            node[side] = group['class'].value_counts().index[0]
        else:
            child = recursive_splitter(find_best_split_point(group, random_features),
                                       random_features)
            # A list signals that the child collapsed into a leaf.
            node[side] = child[0] if isinstance(child, list) else child

    return node


def make_prediction_tree(data_row,root_node):
    '''Walk the tree from root_node down to a leaf and return the leaf's label.

    At a categorical node (type 0) the walk goes left when the row's value
    for the node's column is in the node's left value set, right otherwise.
    At any other node the walk goes left when the value is strictly below
    the node's threshold, right otherwise. A branch that is a dict is
    another node; anything else is the prediction.
    '''
    node = root_node
    while True:
        if node['type'] == 0:
            goes_left = data_row[node['column_name']] in node['dsplit_value'][0]
        else:
            goes_left = data_row[node['column_name']] < node['dsplit_value']

        branch = node['left'] if goes_left else node['right']
        if type(branch) is not dict:
            return branch
        node = branch

In [88]:
# Scratch list; no other visible cell reads `aa`.
# TODO(review): presumably leftover experimentation - confirm and delete.
aa = ['Green', 'Brown', 'Tall']

In [89]:
# list.remove mutates `aa` in place, so re-running this cell on its own
# raises ValueError because 'Brown' is already gone.
aa.remove('Brown')

In [90]:
# Display the list after the removal.
aa

['Green', 'Tall']

In [73]:
# NOTE(review): root_node is first assigned in a later cell, so on a fresh
# kernel (Restart & Run All) this cell raises NameError. The numeric row and
# the recorded output look like a leftover from the Iris.csv variant of the
# notebook - confirm, then move it below the tree-building cells or remove it.
make_prediction_tree([5.9, 3.0, 5.1, 1.8], root_node)

'Iris-virginica'

In [82]:
# Copy of the loaded frame; `dd` is what the tree-building cell below is given.
dd = df.copy()

In [83]:
# Choose the root split over all feature columns. The returned node still
# carries its 'groups' (the two row subsets produced by the split).
root_node = find_best_split_point(dd, feature_indices)

In [85]:
# Inspect the root node, including its 'groups' frames, before the tree is grown.
root_node

{'column_name': 0,
 'type': 0,
 'dsplit_value': [['Blue'], ['Green', 'Brown']],
 'gini': 0.1999999999999999,
 'groups': (    Eye    Hair  Height class
  0  Blue  Blonde    Tall    C+
  1  Blue   Brown  Medium    C+
  7  Blue   Brown  Medium    C+,
       Eye    Hair  Height class
  2  Brown   Brown  Medium    C-
  3  Green   Brown  Medium    C-
  4  Green   Brown    Tall    C+
  5  Brown   Brown     Low    C-
  6  Green  Blonde     Low    C-)}

In [86]:
# Grow the full tree below the root. recursive_splitter works in place and
# removes root_node['groups'], so this cell is not re-runnable on its own:
# re-create root_node with the find_best_split_point cell first, otherwise
# the second run raises KeyError: 'groups'.
recursive_splitter(root_node, feature_indices)

{'column_name': 0,
 'type': 0,
 'dsplit_value': [['Blue'], ['Green', 'Brown']],
 'gini': 0.1999999999999999,
 'left': 'C+',
 'right': {'column_name': 2,
  'type': 0,
  'dsplit_value': [['Tall'], ['Medium', 'Low']],
  'gini': 0.0,
  'left': 'C+',
  'right': 'C-'}}

In [87]:
# Classify one row given positionally as [Eye, Hair, Height]; the node's
# 'column_name' values index into this list.
make_prediction_tree(['Green', 'Brown', 'Tall'], root_node)

'C+'