In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.metrics import accuracy_score
import datetime
from sklearn.model_selection import train_test_split
from more_itertools import set_partitions
import math
import random
from sklearn.utils import resample


In [2]:
#file_path = 'Iris.csv'
file_path = 'eye-color.data'

In [3]:
df = pd.read_csv(file_path, sep=',')

In [4]:
headers = list(df.columns)
headers.remove('class')

In [4]:
train, test = train_test_split(df, test_size=0.1, random_state=42)

In [6]:
headers

['Eye', 'Hair', 'Height']

In [76]:
class DT:
    def __init__(self):
        """
        :param k: Number of Clusters
        :param max_it: Maximum number of iterations if hasn't reached convergence yet.
        """
        self.data_column_names = None
        self.feature_indices = None
        self.forest = []
        self.predictions = []
        self.feature_importance = []
        self.number_trees = None
        self.number_features = None
        self.num_cols = None
        self.type_cols = None
        
    def make_and_test_forest(self,train_data,test_data,number_features=2,number_trees=2):
        the_forest = self.plant_forest(train_data,number_features,number_trees)
        accuracy,self.predictions,self.multiple_predictions = self.make_prediction_forest(the_forest,test_data)
        if sum(self.feature_importance) != 0:
            self.feature_importance = [x/sum(self.feature_importance) for x in self.feature_importance]
        else:
            self.feature_importance = [0]*len(self.feature_indices)
        results = {'Accuracy': accuracy, 'F': self.number_features, 'NT': self.number_trees}
        for i in range(len(self.feature_importance)):
            results[self.data_column_names[i]] = self.feature_importance[i]
        return results
    
    def plant_forest(self,train_data,number_features=2,number_trees=2):
        self.forest = []
        self.data_column_names = list(train_data.columns)
        self.data_column_names.remove('class')
        train_dataset = train_data.values
        num_rows, num_cols = np.shape(train_dataset)
        self.num_cols = num_cols -1
        class_index = self.num_cols
        self.feature_indices = list(range(class_index)) + list(range(class_index + 1, self.num_cols))
        self.number_features = number_features
        self.number_trees = number_trees
        self.type_cols = list(train_data.dtypes)
        self.feature_importance = [0]*len(self.feature_indices)
  

        for i in range(self.number_trees):
            if self.number_features == 'uniform':
                number_features = int(np.random.uniform(1,self.num_cols))
                random_features = random.sample(self.feature_indices,number_features)
            else:
                random_features = random.sample(self.feature_indices,self.number_features)
            the_tree = self.plant_tree(train_data,random_features)
            self.forest.append(the_tree)

        return self.forest        
        
    def plant_tree(self,train_data, random_features):
        ''' start the process of recursion on the training data and let the tree
        grow to its max depth using subset of random features'''

        #get column names minus class
        #choose random set of features from column names

        root_node = self.find_best_split_point(train_data,random_features)
        self.recursive_splitter(root_node,random_features)
        return root_node
    def build_split(self,data,column_to_split,split_values):
        '''build 2 groups of data by splitting data on the column_to_split 
           at the split_value'''
        left_split = data.loc[data[self.data_column_names[column_to_split]].isin(split_values[0])]
        right_split = data.loc[data[self.data_column_names[column_to_split]].isin(split_values[1])]

        return left_split,right_split

    def build_split_numeric(self,data,column_to_split,split_value):
        '''build 2 groups of data by splitting data on the column_to_split 
           at the split_value'''
        left_split = data[data[column_to_split]<split_value]
        right_split = data[data[column_to_split]>=split_value]

        return left_split,right_split

    def multi_gini_index(self,group1,group2):
        '''Calculate Gini Impurity, func expects to be passed 
           the 2 groups of data that are the result of a split'''
        class_proportions_group1 = group1['class'].value_counts(normalize=True)    
        class_proportions_group2 = group2['class'].value_counts(normalize=True)    

        instance_proportion_group1 = len(group1)/(len(group1)+len(group2))
        instance_proportion_group2 = len(group2)/(len(group1)+len(group2))

        gini1 = (1 - class_proportions_group1.pow(2).sum())*(instance_proportion_group1)
        gini2 = (1 - class_proportions_group2.pow(2).sum())*(instance_proportion_group2)
        gini = gini1+gini2

        return gini

    def single_gini_index(self,group):
        '''Calculate Gini Impurity of a single group'''
        class_proportions = group['class'].value_counts(normalize=True)    

        gini = (1 - class_proportions.pow(2).sum())

        return gini

    def find_best_split_point(self,passed_data, feature_subset):
        '''find best split point iterating over range of values returned from the 
        get_range_to_split_on function and return a dictionary which functions as a node '''
        print(feature_subset)
        best_split_gini = 10
        attribute_index = None
        best_split_value = None
        best_split_groups  = None
        best_split_column = None
        best_split_type = None
        
        gini_X = self.single_gini_index(passed_data)
        for attribute_index in feature_subset:
            if self.type_cols[attribute_index] == 'O':
                attribute_values = list(set([x[attribute_index] for x in passed_data.values]))
                if len(attribute_values) == 1:
                    gini_XA = self.single_gini_index(passed_data)
                    if gini_XA < best_split_gini:
                        best_split_gini = gini_XA
                        best_split_column  = attribute_index
                        best_split_value = attribute_values
                        best_split_groups = passed_data, pd.DataFrame(columns = passed_data.columns)
                        best_split_type = 0
                else:    
                    partitions = list(set_partitions(attribute_values, 2))
                    for part in partitions:
                        if len(part[1]) < len(part[0]):
                            part = [part[1], part[0]]    
                        left_split, right_split = self.build_split(passed_data, attribute_index, part)
                        gini_XA =  self.multi_gini_index(left_split, right_split)
                        if gini_XA < best_split_gini:
                            best_split_gini = gini_XA
                            best_split_column  = attribute_index
                            best_split_value = part
                            best_split_groups = left_split, right_split
                            best_split_type = 0
            else:
                col_name = passed_data.columns[attribute_index]
                split_point = float(passed_data[col_name].median())
                left_split, right_split = self.build_split_numeric(passed_data,col_name,split_point)
                gini_XA = self.multi_gini_index(left_split, right_split)

                if gini_XA < best_split_gini:
                    best_split_gini = gini_XA
                    best_split_column = attribute_index
                    best_split_value = split_point
                    best_split_groups = left_split, right_split
                    best_split_type = 1
        
        gini_A = gini_X - best_split_gini
        self.feature_importance[best_split_column] += gini_A
        return {'column_id': best_split_column,'column_name':self.data_column_names[best_split_column],'type':best_split_type,'dsplit_value':best_split_value,
                     'gini':best_split_gini, 'groups': best_split_groups}

    def recursive_splitter(self,node,random_features):
        '''this function recursively splits the data starting with the root node which its passed
        untill the groups are homogenous or further splits result in empty nodes'''
        left_group,right_group = node['groups']
        #delete the groups entry in original node
        del node['groups']
        #check if the groups of the node are empty
        if left_group.empty or right_group.empty:
            #combine as we will use original to predict
            combined = pd.concat([left_group,right_group])
            predicted_class = combined['class'].value_counts().index[0]
            node['left']=node['right']=predicted_class
            return [predicted_class]
        #check if the groups of the node are homogenous otherwise call recursive_spltter again
        if self.single_gini_index(left_group) == 0:
            predicted_class = left_group['class'].value_counts().index[0]
            node['left'] = predicted_class
        else:
            node['left'] = self.find_best_split_point(left_group,random_features)
            curr_node = self.recursive_splitter(node['left'],random_features)
            if type(curr_node) == list:
                node['left'] = curr_node[0]

        if self.single_gini_index(right_group) == 0:
            predicted_class = right_group['class'].value_counts().index[0]
            node['right'] = predicted_class
        else:
            node['right'] = self.find_best_split_point(right_group,random_features)
            curr_node = self.recursive_splitter(node['right'],random_features)
            if type(curr_node) == list:
                node['right'] = curr_node[0]        
        return node

    def make_prediction_tree(self,data_row,root_node):
        '''recursively traverse the tree from root to leaf turning left if feature value
        to test is less than dsplit_value or right otherwise until we reach a leaf node'''

        if  root_node['type'] == 0:
            #check if feature of data_row is less than dsplit_value else move to right branch
            if data_row[root_node['column_id']] in root_node['dsplit_value'][0]:
                #check if at a branch or a leaf if branch recursively call predict else return leaf prediction
                if type(root_node['left']) is dict:
                    return self.make_prediction_tree(data_row,root_node['left'])
                else:
                    return root_node['left']
            else:
                if type(root_node['right']) is dict:
                    return self.make_prediction_tree(data_row,root_node['right'])
                else:
                    return root_node['right']
        else:
            if data_row[root_node['column_id']] < root_node['dsplit_value']:
                #check if at a branch or a leaf if branch recursively call predict else return leaf prediction
                if type(root_node['left']) is dict:
                    return self.make_prediction_tree(data_row,root_node['left'])
                else:
                    return root_node['left']
            else:
                if type(root_node['right']) is dict:
                    return self.make_prediction_tree(data_row,root_node['right'])
                else:
                    return root_node['right']

    def make_prediction_forest(self,forest,test_data):

        classes = test_data['class']
        classes = classes.reset_index(drop=True)

        forest_predictions = []
        multiple_forest_predictions = []
        for index,row in test_data.iterrows():
            tree_predictions = []
            for tree in forest:
                tree_predictions.append(self.make_prediction_tree(row,tree))
            multiple_forest_predictions.append(tree_predictions)
            tree_predictions_series = pd.Series(tree_predictions)
            predicted_class = tree_predictions_series.value_counts().index[0]    
            forest_predictions.append(predicted_class)

        forest_pred_series = pd.Series(forest_predictions)

        results = forest_pred_series==classes

        successes = 0

        for i in results:
            if i==True: successes+=1

        accuracy = successes/len(classes)     


        return accuracy,forest_pred_series, multiple_forest_predictions
    

In [77]:
class RDT(DT):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def plant_forest(self,train_data,number_features=2,number_trees=2):
        self.forest = []
        self.data_column_names = list(train_data.columns)
        self.data_column_names.remove('class')
        train_dataset = train_data.values
        num_rows, num_cols = np.shape(train_dataset)
        self.num_cols = num_cols -1
        class_index = self.num_cols
        self.feature_indices = list(range(class_index)) + list(range(class_index + 1, self.num_cols))
        self.number_features = number_features
        self.number_trees = number_trees
        self.type_cols = list(train_data.dtypes)
        self.feature_importance = [0]*len(self.feature_indices)
  
        #resample training data for bagging
        resampled_training_sets = []
        for i in range(self.number_trees):
            dataset = resample(train_data, replace=True)
            #dataset = train_data.sample(frac=fraction_samples,replace=True)
            resampled_training_sets.append(dataset) 
        for dataset in resampled_training_sets:
            the_tree = self.plant_tree(train_data)
            self.forest.append(the_tree)

        return self.forest        
        
    def plant_tree(self,train_data):
        ''' start the process of recursion on the training data and let the tree
        grow to its max depth using subset of random features'''

        #get column names minus class
        #choose random set of features from column names
        random_features = random.sample(self.feature_indices,self.number_features)
        root_node = self.find_best_split_point(train_data,random_features)
        self.recursive_splitter(root_node)
        return root_node


    def recursive_splitter(self,node):
        '''this function recursively splits the data starting with the root node which its passed
        untill the groups are homogenous or further splits result in empty nodes'''
        random_features = random.sample(self.feature_indices,self.number_features)
        left_group,right_group = node['groups']
        #delete the groups entry in original node
        del node['groups']
        #check if the groups of the node are empty
        if left_group.empty or right_group.empty:
            #combine as we will use original to predict
            combined = pd.concat([left_group,right_group])
            predicted_class = combined['class'].value_counts().index[0]
            node['left']=node['right']=predicted_class
            return [predicted_class]
        #check if the groups of the node are homogenous otherwise call recursive_spltter again
        if self.single_gini_index(left_group) == 0:
            predicted_class = left_group['class'].value_counts().index[0]
            node['left'] = predicted_class
        else:
            node['left'] = self.find_best_split_point(left_group,random_features)
            curr_node = self.recursive_splitter(node['left'])
            if type(curr_node) == list:
                node['left'] = curr_node[0]

        if self.single_gini_index(right_group) == 0:
            predicted_class = right_group['class'].value_counts().index[0]
            node['right'] = predicted_class
        else:
            node['right'] = self.find_best_split_point(right_group,random_features)
            curr_node = self.recursive_splitter(node['right'])
            if type(curr_node) == list:
                node['right'] = curr_node[0]        
        return node

    def make_prediction_tree(self,data_row,root_node):
        '''recursively traverse the tree from root to leaf turning left if feature value
        to test is less than dsplit_value or right otherwise until we reach a leaf node'''

        if  root_node['type'] == 0:
            #check if feature of data_row is less than dsplit_value else move to right branch
            if data_row[root_node['column_id']] in root_node['dsplit_value'][0]:
                #check if at a branch or a leaf if branch recursively call predict else return leaf prediction
                if type(root_node['left']) is dict:
                    return self.make_prediction_tree(data_row,root_node['left'])
                else:
                    return root_node['left']
            else:
                if type(root_node['right']) is dict:
                    return self.make_prediction_tree(data_row,root_node['right'])
                else:
                    return root_node['right']
        else:
            if data_row[root_node['column_id']] < root_node['dsplit_value']:
                #check if at a branch or a leaf if branch recursively call predict else return leaf prediction
                if type(root_node['left']) is dict:
                    return self.make_prediction_tree(data_row,root_node['left'])
                else:
                    return root_node['left']
            else:
                if type(root_node['right']) is dict:
                    return self.make_prediction_tree(data_row,root_node['right'])
                else:
                    return root_node['right']


In [20]:
df_results = pd.DataFrame(columns=['Accuracy', 'F', 'NT'] + headers)

NameError: name 'headers' is not defined

In [50]:
df

Unnamed: 0,Eye,Hair,Height,class
0,Blue,Blonde,Tall,C+
1,Blue,Brown,Medium,C+
2,Brown,Brown,Medium,C-
3,Green,Brown,Medium,C-
4,Green,Brown,Tall,C+
5,Brown,Brown,Low,C-
6,Green,Blonde,Low,C-
7,Blue,Brown,Medium,C+


In [82]:
rf = DT()

In [83]:
forest_results = rf.make_and_test_forest(df,df,number_features='uniform',number_trees=3)

[1]
[1]
[1]
[1]
[1]
[1]
[1, 0]
[1, 0]
[1, 0]
[1, 0]


In [73]:
forest_results

{'Accuracy': 1.0,
 'F': 2,
 'NT': 1,
 'Eye': 0.4428969359331476,
 'Hair': 0.0,
 'Height': 0.5571030640668524}

In [84]:
rf.forest

[{'column_id': 1,
  'column_name': 'Hair',
  'type': 0,
  'dsplit_value': [['Blonde'], ['Brown']],
  'gini': 0.5,
  'left': 'C-',
  'right': 'C-'},
 {'column_id': 1,
  'column_name': 'Hair',
  'type': 0,
  'dsplit_value': [['Blonde'], ['Brown']],
  'gini': 0.5,
  'left': 'C-',
  'right': 'C-'},
 {'column_id': 0,
  'column_name': 'Eye',
  'type': 0,
  'dsplit_value': [['Blue'], ['Green', 'Brown']],
  'gini': 0.1999999999999999,
  'left': 'C+',
  'right': {'column_id': 0,
   'column_name': 'Eye',
   'type': 0,
   'dsplit_value': [['Green'], ['Brown']],
   'gini': 0.26666666666666666,
   'left': {'column_id': 1,
    'column_name': 'Hair',
    'type': 0,
    'dsplit_value': [['Blonde'], ['Brown']],
    'gini': 0.3333333333333333,
    'left': 'C-',
    'right': 'C-'},
   'right': 'C-'}}]

In [58]:
class RDT(DT):
    def __init__(self, **kwargs):
        """
        :param k: Number of Clusters
        :param max_it: Maximum number of iterations if hasn't reached convergence yet.
        """
        self.data_column_names = None
        self.feature_indices = None
        self.forest = []
        self.predictions = []
        self.feature_importance = []
        self.number_trees = None
        self.number_features = None
        self.num_cols = None
        self.type_cols = None
        
    def make_and_test_forest(self,train_data,test_data,number_features,number_trees=50,fraction_samples=1):
        the_forest = self.plant_forest(train_data,number_features,number_trees,fraction_samples)
        accuracy,self.predictions,self.multiple_predictions = self.make_prediction_forest(the_forest,test_data)
        self.feature_importance = [x/sum(self.feature_importance) for x in self.feature_importance]
        results = {'Accuracy': accuracy, 'F': self.number_features, 'NT': self.number_trees}
        for i in range(len(self.feature_importance)):
            results[self.data_column_names[i]] = self.feature_importance[i]
        return results
    
    def plant_forest(self,train_data,number_features=2,number_trees=2,fraction_samples=1):
        self.forest = []
        self.data_column_names = list(train_data.columns)
        self.data_column_names.remove('class')
        train_dataset = train_data.values
        num_rows, num_cols = np.shape(train_dataset)
        self.num_cols = num_cols -1
        class_index = self.num_cols
        self.feature_indices = list(range(class_index)) + list(range(class_index + 1, self.num_cols))
        self.number_features = number_features
        self.number_trees = number_trees
        self.type_cols = list(train_data.dtypes)
        self.feature_importance = [0]*len(self.feature_indices)
  
        #resample training data for bagging
        resampled_training_sets = []
        for i in range(self.number_trees):
            dataset = resample(train_data, replace=True)
            #dataset = train_data.sample(frac=fraction_samples,replace=True)
            resampled_training_sets.append(dataset) 
        for dataset in resampled_training_sets:
            the_tree = self.plant_tree(train_data)
            self.forest.append(the_tree)

        return self.forest        
        
    def plant_tree(self,train_data):
        ''' start the process of recursion on the training data and let the tree
        grow to its max depth using subset of random features'''

        #get column names minus class
        #choose random set of features from column names
        random_features = random.sample(self.feature_indices,self.number_features)
        root_node = self.find_best_split_point(train_data,random_features)
        self.recursive_splitter(root_node)
        return root_node
    def build_split(self,data,column_to_split,split_values):
        '''build 2 groups of data by splitting data on the column_to_split 
           at the split_value'''
        left_split = data.loc[data[self.data_column_names[column_to_split]].isin(split_values[0])]
        right_split = data.loc[data[self.data_column_names[column_to_split]].isin(split_values[1])]

        return left_split,right_split

    def build_split_numeric(self,data,column_to_split,split_value):
        '''build 2 groups of data by splitting data on the column_to_split 
           at the split_value'''
        left_split = data[data[column_to_split]<split_value]
        right_split = data[data[column_to_split]>=split_value]

        return left_split,right_split

    def multi_gini_index(self,group1,group2):
        '''Calculate Gini Impurity, func expects to be passed 
           the 2 groups of data that are the result of a split'''
        class_proportions_group1 = group1['class'].value_counts(normalize=True)    
        class_proportions_group2 = group2['class'].value_counts(normalize=True)    

        instance_proportion_group1 = len(group1)/(len(group1)+len(group2))
        instance_proportion_group2 = len(group2)/(len(group1)+len(group2))

        gini1 = (1 - class_proportions_group1.pow(2).sum())*(instance_proportion_group1)
        gini2 = (1 - class_proportions_group2.pow(2).sum())*(instance_proportion_group2)
        gini = gini1+gini2

        return gini

    def single_gini_index(self,group):
        '''Calculate Gini Impurity of a single group'''
        class_proportions = group['class'].value_counts(normalize=True)    

        gini = (1 - class_proportions.pow(2).sum())

        return gini

    def find_best_split_point(self,passed_data, feature_subset):
        '''find best split point iterating over range of values returned from the 
        get_range_to_split_on function and return a dictionary which functions as a node '''

        best_split_gini = 10
        attribute_index = None
        best_split_value = None
        best_split_groups  = None
        best_split_column = None
        best_split_type = None
        
        gini_X = self.single_gini_index(passed_data)
        for attribute_index in feature_subset:
            if self.type_cols[attribute_index] == 'O':
                attribute_values = list(set([x[attribute_index] for x in passed_data.values]))
                if len(attribute_values) == 1:
                    gini_XA = self.single_gini_index(passed_data)
                    if gini_XA < best_split_gini:
                        best_split_gini = gini_XA
                        best_split_column  = attribute_index
                        best_split_value = attribute_values
                        best_split_groups = passed_data, pd.DataFrame(columns = passed_data.columns)
                        best_split_type = 0
                else:    
                    partitions = list(set_partitions(attribute_values, 2))
                    for part in partitions:
                        if len(part[1]) < len(part[0]):
                            part = [part[1], part[0]]    
                        left_split, right_split = self.build_split(passed_data, attribute_index, part)
                        gini_XA =  self.multi_gini_index(left_split, right_split)
                        if gini_XA < best_split_gini:
                            best_split_gini = gini_XA
                            best_split_column  = attribute_index
                            best_split_value = part
                            best_split_groups = left_split, right_split
                            best_split_type = 0
            else:
                col_name = passed_data.columns[attribute_index]
                split_point = float(passed_data[col_name].median())
                left_split, right_split = self.build_split_numeric(passed_data,col_name,split_point)
                gini_XA = self.multi_gini_index(left_split, right_split)

                if gini_XA < best_split_gini:
                    best_split_gini = gini_XA
                    best_split_column = attribute_index
                    best_split_value = split_point
                    best_split_groups = left_split, right_split
                    best_split_type = 1
        
        gini_A = gini_X - best_split_gini
        self.feature_importance[best_split_column] += gini_A
        return {'column_id': best_split_column,'column_name':self.data_column_names[best_split_column],'type':best_split_type,'dsplit_value':best_split_value,
                     'gini':best_split_gini, 'groups': best_split_groups}

    def recursive_splitter(self,node):
        '''this function recursively splits the data starting with the root node which its passed
        untill the groups are homogenous or further splits result in empty nodes'''
        random_features = random.sample(self.feature_indices,self.number_features)
        print(random_features)
        left_group,right_group = node['groups']
        #delete the groups entry in original node
        del node['groups']
        #check if the groups of the node are empty
        if left_group.empty or right_group.empty:
            #combine as we will use original to predict
            combined = pd.concat([left_group,right_group])
            predicted_class = combined['class'].value_counts().index[0]
            node['left']=node['right']=predicted_class
            return [predicted_class]
        #check if the groups of the node are homogenous otherwise call recursive_spltter again
        if self.single_gini_index(left_group) == 0:
            predicted_class = left_group['class'].value_counts().index[0]
            node['left'] = predicted_class
        else:
            node['left'] = self.find_best_split_point(left_group,random_features)
            curr_node = self.recursive_splitter(node['left'])
            if type(curr_node) == list:
                node['left'] = curr_node[0]

        if self.single_gini_index(right_group) == 0:
            predicted_class = right_group['class'].value_counts().index[0]
            node['right'] = predicted_class
        else:
            node['right'] = self.find_best_split_point(right_group,random_features)
            curr_node = self.recursive_splitter(node['right'])
            if type(curr_node) == list:
                node['right'] = curr_node[0]        
        return node

    def make_prediction_tree(self,data_row,root_node):
        '''recursively traverse the tree from root to leaf turning left if feature value
        to test is less than dsplit_value or right otherwise until we reach a leaf node'''

        if  root_node['type'] == 0:
            #check if feature of data_row is less than dsplit_value else move to right branch
            if data_row[root_node['column_id']] in root_node['dsplit_value'][0]:
                #check if at a branch or a leaf if branch recursively call predict else return leaf prediction
                if type(root_node['left']) is dict:
                    return self.make_prediction_tree(data_row,root_node['left'])
                else:
                    return root_node['left']
            else:
                if type(root_node['right']) is dict:
                    return self.make_prediction_tree(data_row,root_node['right'])
                else:
                    return root_node['right']
        else:
            if data_row[root_node['column_id']] < root_node['dsplit_value']:
                #check if at a branch or a leaf if branch recursively call predict else return leaf prediction
                if type(root_node['left']) is dict:
                    return self.make_prediction_tree(data_row,root_node['left'])
                else:
                    return root_node['left']
            else:
                if type(root_node['right']) is dict:
                    return self.make_prediction_tree(data_row,root_node['right'])
                else:
                    return root_node['right']

    def make_prediction_forest(self,forest,test_data):

        classes = test_data['class']
        classes = classes.reset_index(drop=True)

        forest_predictions = []
        multiple_forest_predictions = []
        for index,row in test_data.iterrows():
            tree_predictions = []
            for tree in forest:
                tree_predictions.append(self.make_prediction_tree(row,tree))
            multiple_forest_predictions.append(tree_predictions)
            tree_predictions_series = pd.Series(tree_predictions)
            predicted_class = tree_predictions_series.value_counts().index[0]    
            forest_predictions.append(predicted_class)

        forest_pred_series = pd.Series(forest_predictions)

        results = forest_pred_series==classes

        successes = 0

        for i in results:
            if i==True: successes+=1

        accuracy = successes/len(classes)     


        return accuracy,forest_pred_series, multiple_forest_predictions
    

{'Accuracy': 1.0,
 'F': 2,
 'NT': 5,
 'Eye': 0.45034788108791907,
 'Hair': 0.03162555344718533,
 'Height': 0.5180265654648957}

In [147]:
df_results = df_results.append(forest_results, ignore_index=True)

In [148]:
df_results.to_csv('eye_dt.csv')

Unnamed: 0,Accuracy,F,NT,Eye,Hair,Height
0,1.0,2.0,1.0,0.483871,0.0,0.516129
1,0.875,2.0,1.0,0.760766,0.239234,0.0
2,0.875,2.0,1.0,0.760766,0.239234,0.0
3,0.75,2.0,1.0,0.0,0.0,1.0
