In [60]:
# Import packages
import numpy as np
import pprint
import ipdb
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log

In [61]:
# Load training data, split into train and validation sets
data = pd.read_csv("train.csv")
train_data = data.sample(frac=0.8)
val_data = data.drop(train_data.index)

In [62]:
def entropy_before_split(train_data):
    """ Finds Entropy of dataset before any split """
    
    dependent_variable = "left"
    entropy = 0
    labels = train_data[dependent_variable].unique()
    for label in labels:
        temp = train_data[dependent_variable].value_counts()[label] / len(train_data[dependent_variable])
        entropy += -temp * log(temp + eps)
    return entropy

In [63]:
def entropy_on_attribute_split(train_data, attribute):
    """ Finds resulting entropy of dataset if it is split using attribute """
    entropy_after_split = 0
    dependent_variable = "left"
    class_labels = train_data[dependent_variable].unique()
    attribute_labels = train_data[attribute].unique()

    for attribute_label in attribute_labels:
        entropy = 0
        for class_label in class_labels:
            numer = len(train_data[attribute][train_data[attribute] == attribute_label][train_data[dependent_variable] == class_label])
            denom = len(train_data[attribute][train_data[attribute] == attribute_label])
            temp = numer / (denom + eps)
            entropy += -temp * log(temp + eps)
        temp2 = denom / len(train_data)
        entropy_after_split += -temp2 * entropy
    return abs(entropy_after_split)

In [64]:
def split_criteria(train_data):
    """ Finds the best attribute to split on """
    
    # Define categorical attributes
    attributes = ["Work_accident", "promotion_last_5years", "sales", "salary"]    
    Info_gain = {}
    for key in attributes:
        Info_gain[key] = entropy_before_split(train_data) - entropy_on_attribute_split(train_data, key)
    first = max(Info_gain, key=lambda k: Info_gain[k])
#     print (Info_gain)
    return first, Info_gain[first]

In [65]:
def split_dataset(train_data, feature, label):
    """ This splits the dataset on given feature and all of its values """
    return train_data[train_data[feature] == label].reset_index(drop = True)

In [66]:
def most_probable(train_data):
    dependent_variable = "left"
    count_left = len(train_data[train_data[dependent_variable] == 0])
    count_right = len(train_data[train_data[dependent_variable] == 1])
    if count_left > count_right:
        return 0
    else:
        return 1

In [67]:
def Decision_tree(train_data):
    """ Builds tree recursively """
    D_tree = {}
    dependent_variable = "left"
    root, gain = split_criteria(train_data)
    
    if gain == 0.0:
        return most_probable(train_data)
    
    labels = train_data[root].unique()
    D_tree[root] = {}
    
    for label in labels:
        split_data = split_dataset(train_data, root, label)
        unique_labels = split_data[dependent_variable].unique()
        
        if len(unique_labels) == 1:
            D_tree[root][label] = unique_labels[0]
        else:
            D_tree[root][label] = Decision_tree(split_data)
    return D_tree
    

In [69]:
def predict(inst,tree):
    #This function is used to predict for any input variable  
    #Recursively we go through the tree that we built earlier
    for nodes in tree.keys():        
        
        value = inst[nodes]
        tree = tree[nodes][value]
        prediction = 0
            
        if type(tree) is dict:
            prediction = predict(inst, tree)
        else:
            prediction = tree
            break;                            
        
    return prediction

In [70]:
def validate_tree(val_data):
    tree = Decision_tree(train_data)
    predicted = []
    for index, row in val_data.iterrows():
        predicted.append(predict(row, tree))
    actual = val_data["left"].tolist()
    true_pos = 0
    true_neg = 0
    false_pos = 0
    false_neg = 0
    
    for i in range(0, len(predicted)):
        if (predicted[i] == 0 and actual[i] == 0):
            true_neg += 1
        elif (predicted[i] == 0 and actual[i] == 1):
            false_neg += 1
        elif (predicted[i] == 1 and actual[i] == 0):
            false_pos += 1
        else:
            true_pos += 1
    return true_pos, true_neg, false_pos, false_neg 
    
        

In [71]:
def accuracy(val_data):
    true_pos, true_neg, false_pos, false_neg = validate_tree(val_data)
    total_instances = true_neg + true_pos + false_neg + false_pos
    accuracy_estimate = (true_neg + true_pos) / (total_instances)
    precision_estimate = true_pos / (true_pos + false_pos)
    recall_estimate = (true_pos) / (true_pos + false_neg)
    f1_score = (1 / recall_estimate) + (1 / precision_estimate)
    f1_score = 2 / f1_score
    print ("Accuracy : ", accuracy_estimate)
    print ("Precision : ", precision_estimate)
    print ("Recall : ", recall_estimate)
    print ("F1_Score : ", f1_score)

In [74]:
accuracy(val_data)

Accuracy :  0.7682384341637011
Precision :  1.0
Recall :  0.0019157088122605363
F1_Score :  0.0038240917782026767
