## Decision Tree Classifier

In [17]:
from sklearn.datasets import load_breast_cancer, load_wine
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import statistics
from scipy import stats
import pandas as pd 

## Datasets

In [18]:
# Load the dataset once and reuse the result; the original called
# load_breast_cancer() three times to build a single frame.
_breast_cancer = load_breast_cancer()
df_breast_cancer = pd.DataFrame(_breast_cancer['data'], columns=_breast_cancer['feature_names'])
df_breast_cancer['target'] = _breast_cancer['target']

In [19]:
# Load the dataset once and reuse the result; the original called
# load_wine() three times to build a single frame.
_wine = load_wine()
df_wine = pd.DataFrame(_wine['data'], columns=_wine['feature_names'])
df_wine['target'] = _wine['target']

# Drop every row labelled 2 so that only the labels 0 and 1 remain.
df_wine = df_wine[df_wine['target'] != 2]

## Definition

In [150]:
class Node:
    """One node of a binary decision tree over a pandas DataFrame.

    Attributes:
        data: DataFrame holding the rows that reached this node.
        features: iterable of candidate feature column names.
        target: name of the label column in ``data``.
        is_leaf: True once ``find_best_split`` found no useful split.
        split_feat: column chosen for the split, or None.
        split_value: threshold; rows with ``row[split_feat] <= split_value``
            belong to ``left``, all other rows to ``right``.
        left, right: child nodes, or None while no split has been chosen.
    """

    def __init__(self, data, features, target):

        self.data = data
        self.is_leaf = False
        self.features = features
        self.target = target
        self.split_feat = None
        self.split_value = None
        # Children only exist after find_best_split picks a split.
        self.left = None
        self.right = None

    def gini_calc(self, data, target):
        """Return the Gini impurity ``sum(p_i * (1 - p_i))`` of ``data[target]``.

        ``p_i`` is the fraction of rows carrying class ``i``. Empty data
        yields 0.
        """
        if len(data) == 0:
            return 0

        # Class frequencies, not sums of label values: summing the labels
        # only works for class 1 of a 0/1 problem and zeroes out class 0.
        class_probs = data[target].value_counts(normalize=True)
        return float((class_probs * (1 - class_probs)).sum())

    def entropy_calc(self, data, target):
        """Return the Shannon entropy (base 2) of ``data[target]``.

        Empty data yields 0.
        """
        if len(data) == 0:
            return 0

        class_probs = data[target].value_counts(normalize=True)
        # Guard against zero-frequency entries so log2 never sees 0.
        class_probs = class_probs[class_probs > 0]
        return float(-(class_probs * np.log2(class_probs)).sum())

    def find_best_split(self, depth, max_depth):
        """Choose the feature whose mean-value split gives the best Gini gain.

        Each feature is split at its mean over ``self.data``. When a split
        with strictly positive gain exists, ``split_feat``, ``split_value``,
        ``left`` and ``right`` are set; otherwise the node becomes a leaf.
        The node also becomes a leaf when ``depth > max_depth`` or when it
        holds no rows.

        Args:
            depth: depth of this node in the tree.
            max_depth: deepest level at which a node may still be split.
        """
        data = self.data
        target = self.target
        n_rows = len(data)

        if depth > max_depth or n_rows == 0:
            if self.split_feat is None:
                self.is_leaf = True
            return

        # The parent impurity does not depend on the candidate feature.
        root_gini = self.gini_calc(data, target)
        max_gini_gain = 0
        best_left = None
        best_right = None

        for feat in self.features:
            # The same value is used to partition the rows and is stored as
            # the threshold, so training and prediction always agree.
            split_point = data[feat].mean()

            # Disjoint partition: rows equal to the split point go left only,
            # everything else (including NaN) goes right, mirroring the
            # `row[feat] <= split_value` test used at prediction time.
            goes_left = data[feat] <= split_point
            left_data = data[goes_left]
            right_data = data[~goes_left]

            split_gini = (
                len(left_data) / n_rows * self.gini_calc(left_data, target)
                + len(right_data) / n_rows * self.gini_calc(right_data, target)
            )
            gain = root_gini - split_gini

            if max_gini_gain < gain:
                max_gini_gain = gain
                self.split_feat = feat
                self.split_value = split_point
                best_left = left_data
                best_right = right_data

        if self.split_feat is None:
            self.is_leaf = True
            return

        # Build the children once, for the winning split only.
        self.left = Node(best_left, self.features, self.target)
        self.right = Node(best_right, self.features, self.target)
        return

In [151]:
def train_DT(orig_dataset, features, target, depth, max_depth):
    """Recursively grow a decision tree and return its root ``Node``.

    A node is created on a copy of ``orig_dataset`` and asked for its best
    split. If no split feature was chosen the node is returned as is;
    otherwise both children are rebuilt by recursing on their data with
    ``depth + 1``.
    """
    root = Node(orig_dataset.copy(), features, target)
    root.find_best_split(depth, max_depth)

    if root.split_feat is not None:
        child_depth = depth + 1
        root.left = train_DT(root.left.data, features, target, child_depth, max_depth)
        root.right = train_DT(root.right.data, features, target, child_depth, max_depth)

    return root

In [152]:
def traverse_DT(node, index = 0, is_left=False, is_root=False):
    """Print the tree in pre-order, one line per visited node.

    Leaves print ``Leaf``. Every other node prints ``index``, its role
    (Root / Left / Right), its split feature and its split value, after
    which both children are visited with ``index + 1``.
    """
    if node is None:
        return

    if node.is_leaf:
        print('Leaf')
        return

    if is_root:
        role = ' Root '
    elif is_left:
        role = ' Left '
    else:
        role = ' Right '
    print(index, role, node.split_feat, node.split_value)

    # Descend only when both children are present.
    children = (node.left, node.right)
    if any(child is None for child in children):
        return

    for child, child_is_left in zip(children, (True, False)):
        traverse_DT(child, index + 1, is_left=child_is_left)

In [153]:
def predict(row, node):
    """Return the predicted label for one row by walking the tree.

    Starting at ``node``, go left while ``row[split_feat] <= split_value``
    and right otherwise, until a leaf is reached. The prediction is the
    most frequent label among the rows stored in that leaf.

    Args:
        row: mapping-like object indexable by feature name (e.g. a Series).
        node: root of the (sub)tree to evaluate.
    """
    while not node.is_leaf:
        if row[node.split_feat] <= node.split_value:
            node = node.left
        else:
            node = node.right

    # pandas returns the modes sorted, so the first entry is the smallest
    # label on ties. The previous `stats.mode(...)[0][0]` indexing breaks on
    # SciPy releases where `mode` returns a scalar for 1-D input.
    return node.data[node.target].mode().iloc[0]

def predict_Decision_Tree(node_decision_tree, data):
    """Predict a 0/1 label for every row of ``data``.

    Each row is pushed through the tree with ``predict``; the returned
    value is mapped to 1 when it is at least 0.5 and to 0 otherwise.
    Returns a list with one entry per row, in iteration order.
    """
    return [
        1 if predict(row, node_decision_tree) >= 0.5 else 0
        for _, row in data.iterrows()
    ]

## Execution

### Breast Cancer Data

In [183]:
# Fit a depth-limited tree on every non-target column of the breast cancer frame.
node_decision_tree_BC = train_DT(
    orig_dataset=df_breast_cancer,
    features=df_breast_cancer.drop('target', axis=1).columns,
    target='target',
    depth=1,
    max_depth=4,
)

In [184]:
#traverse_DT(node_decision_tree_BC, is_root=True)

In [185]:
# Predict on the same frame the tree was fitted on.
y_pred = predict_Decision_Tree(
    node_decision_tree=node_decision_tree_BC,
    data=df_breast_cancer,
)

In [186]:
# Compare the predictions with the labels of the frame they were computed on.
print('Accuracy', accuracy_score(df_breast_cancer['target'], y_pred))
print('Confusion Matrix\n ', confusion_matrix(df_breast_cancer['target'], y_pred))

Accuracy 0.961335676625659
Confusion Matrix
  [[193  19]
 [  3 354]]


### Wine Data

In [158]:
# Fit a depth-limited tree on every non-target column of the wine frame.
node_decision_tree_W = train_DT(
    orig_dataset=df_wine,
    features=df_wine.drop('target', axis=1).columns,
    target='target',
    depth=1,
    max_depth=4,
)

In [159]:
#traverse_DT(node_decision_tree_W, is_root=True)

In [160]:
# Predict on the same frame the tree was fitted on.
y_pred = predict_Decision_Tree(
    node_decision_tree=node_decision_tree_W,
    data=df_wine,
)

In [161]:
# Compare the predictions with the labels of the frame they were computed on.
print('Accuracy', accuracy_score(df_wine['target'], y_pred))
print('Confusion Matrix\n ', confusion_matrix(df_wine['target'], y_pred))

Accuracy 0.9923076923076923
Confusion Matrix
  [[58  1]
 [ 0 71]]


## Random Forest Classifier 

In [162]:
# Single deeper tree (max_depth 7) on the breast cancer frame, evaluated
# on that same frame.
node_decision_tree_BC = train_DT(
    orig_dataset=df_breast_cancer,
    features=df_breast_cancer.drop('target', axis=1).columns,
    target='target',
    depth=1,
    max_depth=7,
)
y_pred = predict_Decision_Tree(
    node_decision_tree=node_decision_tree_BC,
    data=df_breast_cancer,
)

In [207]:
import random
def RandomForestClassifier(orig_dataset, features, target, max_depth=4, n_estimators=100, feature_sampling = 0.8):
    """Train a bagged ensemble of decision trees and collect their predictions.

    For each estimator a random subset of the features is drawn, the rows
    are resampled with replacement (same size as the dataset), a tree is
    trained on that sample, and the tree then predicts every row of the
    un-resampled dataset.

    Args:
        orig_dataset: DataFrame containing the feature columns and ``target``.
        features: iterable of feature column names to sample from.
        target: name of the label column.
        max_depth: depth limit passed to ``train_DT``.
        n_estimators: number of trees to train.
        feature_sampling: fraction of ``features`` drawn for each tree.

    Returns:
        A list with ``n_estimators`` entries; each entry is the list of
        predictions of one tree for all rows of ``orig_dataset``.
    """
    dataset = orig_dataset.copy()
    feature_pool = list(features)

    # Keep the sample size within [1, len(feature_pool)]: a size of 0 would
    # train trees without features and an oversized one makes
    # random.sample raise.
    n_selected = min(len(feature_pool), max(1, int(feature_sampling * len(feature_pool))))

    bagging_y_pred = []
    for _ in range(n_estimators):
        # Feature sampling for this tree.
        rand_features = random.sample(feature_pool, n_selected)

        bagging_dataset = dataset[rand_features].copy()
        bagging_dataset[target] = dataset[target]

        # Bootstrap sample of the rows.
        sample_dataset = bagging_dataset.sample(n=len(bagging_dataset), replace=True)

        # Use the `target` argument rather than a hard-coded column name.
        tree = train_DT(sample_dataset, sample_dataset.drop(target, axis=1).columns,
                        target, depth = 1, max_depth = max_depth)

        bagging_y_pred.append(predict_Decision_Tree(tree, bagging_dataset))

    return bagging_y_pred

In [208]:
# Train the forest with its default settings and keep the per-tree predictions.
bagging_y = RandomForestClassifier(
    orig_dataset=df_breast_cancer,
    features=df_breast_cancer.drop('target', axis=1).columns,
    target='target',
)

In [209]:
# bagging_y holds one prediction list per tree. Transposing gives one row per
# sample, and the row-wise mode is the majority vote. Column 0 is the first
# mode pandas reports for each row (presumably the smallest label on a tie;
# confirm against the pandas docs).
y_pred = pd.DataFrame(bagging_y).T.mode(axis=1)[0] 

In [210]:
# Compare the majority-vote predictions with the breast cancer labels.
print('Accuracy', accuracy_score(df_breast_cancer['target'], y_pred))
print('Confusion Matrix\n ', confusion_matrix(df_breast_cancer['target'], y_pred))

Accuracy 0.9824253075571178
Confusion Matrix
  [[205   7]
 [  3 354]]
