### Part 3: Effectiveness of the Misclassification Rate and Gini Index measures
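> Note: The notebook uses the Gini index by default. To use the misclassification rate instead, uncomment the commented-out lines in `gini_attribute` and `information_gain` (and comment out the Gini lines they replace).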

In [65]:
import numpy as np
import pandas as pd
import pprint
import matplotlib.pyplot as plt

In [66]:
# Load the training CSV into the working frame used by every later cell.
df = pd.read_csv("decision_Tree/train.csv")

In [67]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.1,0.9,7,286,4,0,1,0,sales,low
1,0.89,0.93,4,249,3,0,0,0,sales,low
2,0.38,0.5,2,132,3,0,1,0,accounting,low
3,0.95,0.71,4,151,4,0,0,0,sales,medium
4,0.84,0.84,5,163,3,0,0,0,technical,low


In [68]:
def convert_sat_to_num(s):
    """Bin a satisfaction score into an ordinal code.

    Returns 0 when ``s`` is below 0.247, 2 when it is above 0.64, and 1
    otherwise (both thresholds themselves fall in the middle bin).
    """
    if s < 0.247:
        return 0
    if s > 0.64:
        return 2
    return 1

# Overwrite the raw score with its bin code. This mutates `df` in place, so
# re-running the cell would bin the already-binned codes a second time.
df["satisfaction_level"] = df["satisfaction_level"].map(convert_sat_to_num)

def convert_lastEval_to_num(s):
    """Bin a last-evaluation score into an ordinal code.

    Returns 0 when ``s`` is below 0.4, 2 when it is above 0.72, and 1
    otherwise (both thresholds themselves fall in the middle bin).
    """
    return 0 if s < 0.4 else (2 if s > 0.72 else 1)

# Overwrite the raw score with its bin code. This mutates `df` in place, so
# re-running the cell would bin the already-binned codes a second time.
df["last_evaluation"] = df["last_evaluation"].map(convert_lastEval_to_num)


def convert_number_project_to_num(s):
    """Bin a project count into an ordinal code.

    Fewer than 3 projects maps to 0, more than 5 maps to 2, and 3 to 5
    inclusive maps to 1.
    """
    low_cutoff, high_cutoff = 3, 5
    if s < low_cutoff:
        code = 0
    elif s > high_cutoff:
        code = 2
    else:
        code = 1
    return code

# Overwrite the raw count with its bin code. This mutates `df` in place, so
# re-running the cell would bin the already-binned codes a second time.
df["number_project"] = df["number_project"].map(convert_number_project_to_num)


def convert_hours_to_num(s):
    """Bin average monthly hours into an ordinal code.

    Below 100 hours maps to 0, above 210 maps to 2, and everything from
    100 to 210 inclusive maps to 1.
    """
    bucket = 1  # middle bin unless one of the outer tests fires
    if s < 100:
        bucket = 0
    elif s > 210:
        bucket = 2
    return bucket

# Overwrite the raw hours with their bin code. This mutates `df` in place, so
# re-running the cell would bin the already-binned codes a second time.
df["average_montly_hours"] = df["average_montly_hours"].map(convert_hours_to_num)


def convert_timeSpend_to_num(s):
    """Bin years spent at the company into an ordinal code.

    Fewer than 3 maps to 0, more than 4 maps to 2, and 3 or 4 maps to 1.
    """
    if not s < 3:
        return 2 if s > 4 else 1
    return 0

# Overwrite the raw tenure with its bin code. This mutates `df` in place, so
# re-running the cell would bin the already-binned codes a second time.
df["time_spend_company"] = df["time_spend_company"].map(convert_timeSpend_to_num)

In [69]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0,2,2,2,1,0,1,0,sales,low
1,2,2,1,2,1,0,0,0,sales,low
2,1,1,0,1,1,0,1,0,accounting,low
3,2,1,1,1,1,0,0,0,sales,medium
4,2,2,1,1,1,0,0,0,technical,low


In [70]:
# Ordered (unshuffled) hold-out split: the first 80% of the rows are used for
# training and the remaining 20% for testing.
split = int(0.8 * len(df))

training_data = df.iloc[:split]
testing_data = df.iloc[split:]

In [71]:
def count_Unique(col):
    """Tally how often each distinct value occurs in ``col``.

    Returns a dict mapping value -> occurrence count, with keys in
    first-seen order.
    """
    tally = {}
    for item in col:
        tally[item] = tally.get(item, 0) + 1
    return tally

> GiniIndex = $2p(1-p)$

> MisclassificationRate = $\min(p, 1-p)$

In [72]:
def misClassificationRate(Y):
    """Return the misclassification rate ``min(p, 1 - p)`` of binary labels.

    ``p`` is the fraction of entries in ``Y`` equal to 0.

    Parameters
    ----------
    Y : sized iterable of labels (list, pandas Series, ...)

    Returns
    -------
    float
        Impurity in [0, 0.5]; 0.0 for an empty or single-class ``Y``.
    """
    total = len(Y)
    if total == 0:
        # Nothing to misclassify; also avoids dividing by zero.
        return 0.0
    # Count zeros directly so a sample with no 0 labels yields p == 0
    # instead of a KeyError on a missing dictionary key.
    zero_count = sum(1 for label in Y if label == 0)
    p = zero_count / total
    return min(p, 1 - p)

def gini_index(Y):
    """Return the binary Gini index ``2 * p * (1 - p)`` of the labels.

    ``p`` is the fraction of entries in ``Y`` equal to 0.

    Parameters
    ----------
    Y : sized iterable of labels (list, pandas Series, ...)

    Returns
    -------
    float
        Impurity in [0, 0.5]; 0.0 for an empty or single-class ``Y``.
    """
    total = len(Y)
    if total == 0:
        # An empty node has no impurity; also avoids dividing by zero.
        return 0.0
    # Count zeros directly so a sample with no 0 labels yields p == 0
    # instead of a KeyError on a missing dictionary key.
    zero_count = sum(1 for label in Y if label == 0)
    p = zero_count / total
    return 2 * p * (1 - p)

In [73]:
def gini_attribute(df, attribute):
    """Weighted impurity of ``df`` after splitting on ``attribute``.

    For every distinct value of the attribute, the impurity of the matching
    rows' ``left`` labels is computed and weighted by that branch's share of
    the rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current node; must contain ``attribute`` and ``left``.
    attribute : str
        Column to split on.

    Returns
    -------
    float
        Sum over branches of ``(branch_size / len(df)) * branch_impurity``.
    """
    node_size = len(df)
    if node_size == 0:
        return 0.0

    # Build both masks on the full frame so they share one index; masking an
    # already-filtered Series with a full-length mask forces pandas to
    # reindex the mask.
    is_left = df['left'] == 1
    gini_global = 0.0
    for value in df[attribute].unique():
        in_branch = df[attribute] == value
        branch_size = int(in_branch.sum())
        left_count = int((in_branch & is_left).sum())

        if left_count == 0 or left_count == branch_size:
            # Pure (or empty) branch contributes no impurity.
            local = 0.0
        else:
            prob = left_count / branch_size
            local = 2 * prob * (1 - prob)
            #local = min(prob, 1 - prob)

        # Weight by the share of *this node's* rows, not by the size of a
        # global training frame.
        gini_global += (branch_size / node_size) * local

    return gini_global

In [74]:
def information_gain(df, attribute):
    """Pick the attribute whose split yields the largest impurity reduction.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current node; must contain a ``left`` column.
    attribute : list of str
        Candidate column names.

    Returns
    -------
    str
        The chosen column name, or the sentinel ``'empty'`` when no candidate
        attributes remain. On ties the later candidate wins.
    """
    if not attribute:
        return 'empty'

    total_gini = gini_index(df['left'])
    #total_gini = misClassificationRate(df['left'])

    # Start with no winner rather than a baseline of 0: if every gain is
    # marginally negative (floating-point round-off) a real attribute must
    # still be returned, because the caller removes it from the candidates.
    best_gain = None
    select_attr = None

    for candidate in attribute:
        weighted_gini = gini_attribute(df, candidate)
        info_gain = total_gini - weighted_gini
        if best_gain is None or info_gain >= best_gain:
            best_gain = info_gain
            select_attr = candidate

    return select_attr

In [75]:
def make_tree(df, attribute, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of the current node; must contain the candidate attribute
        columns and the ``left`` target column.
    attribute : list of str
        Attributes still available for splitting.
    tree : dict, optional
        Existing dictionary to add this node to; a new one is created when
        omitted.

    Returns
    -------
    dict or int
        ``{attribute: {value: subtree_or_label}}``, or a bare 0/1 majority
        label when no attributes remain (ties resolve to 0).
    """
    node = information_gain(df, attribute)

    # Compare the sentinel by value; identity comparison on a string literal
    # relies on interning and triggers a SyntaxWarning on modern Python.
    if node == 'empty':
        left_count = (df['left'] == 1).sum()
        stayed_count = (df['left'] == 0).sum()
        return 1 if left_count > stayed_count else 0

    # Candidates for the children: everything except the chosen attribute.
    remaining = list(attribute)
    remaining.remove(node)

    if tree is None:
        tree = {}
    # Make sure the node entry exists even when the caller supplied `tree`.
    tree.setdefault(node, {})

    for value in np.unique(df[node]):
        # Keep every column so the function does not depend on a fixed
        # schema; only `remaining` and 'left' are read further down.
        subset = df[df[node] == value]
        labels = np.unique(subset['left'])

        if len(labels) == 1:
            # Pure branch: store the single class label as a leaf.
            tree[node][value] = labels[0]
        else:
            tree[node][value] = make_tree(subset, remaining)

    return tree

In [76]:
# Candidate split attributes: every feature column except the target `left`.
featureSets = [
    'sales',
    'promotion_last_5years',
    'salary',
    'Work_accident',
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
]
tree = make_tree(training_data, featureSets)

pprint.pprint(tree)

{'number_project': {0: {'time_spend_company': {0: {'last_evaluation': {0: 0,
                                                                       1: {'average_montly_hours': {0: 0,
                                                                                                    1: 0,
                                                                                                    2: {'satisfaction_level': {1: {'salary': {'low': 0,
                                                                                                                                              'medium': {'sales': {'sales': 1,
                                                                                                                                                                   'technical': 0}}}},
                                                                                                                               2: 0}}}},
                                                                

                                                                                                                                                                                                                              1: 0}},
                                                                                                                                                                                            'medium': 0}}}},
                                                                                                                                                     2: {'last_evaluation': {1: {'salary': {'high': 0,
                                                                                                                                                                                            'low': {'promotion_last_5years': {0: 0}},
                                                                                                                                        

                                               2: {'sales': {'IT': {'salary': {'high': 0,
                                                                               'low': {'average_montly_hours': {1: 1,
                                                                                                                2: 0}},
                                                                               'medium': {'Work_accident': {0: {'time_spend_company': {1: {'average_montly_hours': {1: {'last_evaluation': {1: {'promotion_last_5years': {0: 1}}}}}}}},
                                                                                                            1: 0}}}},
                                                             'RandD': 0,
                                                             'accounting': 0,
                                                             'hr': 0,
                                                             'management': {'time_spend_company': {0:

In [77]:
def predict(inst, tree, default=1):
    """Classify one record by walking the nested-dict decision tree.

    Parameters
    ----------
    inst : mapping-like
        Record indexable by attribute name (for example a DataFrame row).
    tree : dict or leaf label
        ``{attribute: {value: subtree_or_label}}``; a bare label is returned
        unchanged.
    default : optional
        Label returned when the record carries an attribute value the tree
        has no branch for (or the tree is an empty dict). Defaults to 1,
        the fallback this function has always used.

    Returns
    -------
    The leaf label reached, or ``default``.
    """
    node = tree
    while isinstance(node, dict):
        if not node:
            return default
        # Each internal node is a single-key dict: {attribute: branches}.
        feature = next(iter(node))
        value = inst[feature]
        try:
            node = node[feature][value]
        except KeyError:
            # Value never seen under this node during training. Only this
            # case is treated as "unknown"; other errors still surface.
            return default
    return node

In [78]:
# Run every held-out row through the learned tree, keeping row order.
prediction = [
    predict(testing_data.iloc[row], tree)
    for row in range(len(testing_data))
]

### Accuracy, Precision, Recall and F1 Score to measure the effectiveness of our classifier

In [79]:
actual_result = testing_data['left'].tolist()

# Confusion-matrix tallies with `left == 1` as the positive class.
Tp = 0
Tn = 0
Fp = 0
Fn = 0
for actual, predicted in zip(actual_result, prediction):
    if actual == predicted and actual == 1:
        Tp += 1
    elif actual == predicted and actual == 0:
        Tn += 1
    elif actual == 0 and predicted == 1:
        Fp += 1
    else:
        Fn += 1

# Each ratio falls back to 0.0 when its denominator is zero (for example no
# positive predictions) instead of raising ZeroDivisionError.
accuracy = (Tp + Tn) / len(actual_result) * 100 if actual_result else 0.0
precision = Tp / (Tp + Fp) if (Tp + Fp) else 0.0
recall = Tp / (Tp + Fn) if (Tp + Fn) else 0.0
F1_Score = 2 * precision * recall / (recall + precision) if (recall + precision) else 0.0

In [80]:
# Accuracy is reported as a percentage; the other three are plain ratios.
print(accuracy, "%")
for metric_value in (precision, recall, F1_Score):
    print(metric_value)

95.06227758007117 %
0.8767361111111112
0.926605504587156
0.9009812667261374
