### Imports

In [1]:
import sys
import math
from math import log
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

### Decision tree node class (`display` prints the built tree)
### The `pre` method predicts the label for a given row of test data

In [4]:
class Make_Dec_Tree:
    """A single node of the decision tree.

    Attributes:
        colss: for an internal node, the name of the attribute tested at this
            node; for a leaf node, the value returned as the prediction.
        leaf: dict mapping each value of the tested attribute to the child
            node for that value. It stays empty on leaf nodes.
    """

    def __init__(self, colss):
        self.colss = colss
        self.leaf = {}

    def pre(self, x):
        """Predict the label for one sample.

        Args:
            x: a mapping-like sample (e.g. a pandas row or a dict) indexed by
                attribute name.

        Returns:
            The value stored in the leaf that the sample reaches, or ``None``
            when the sample holds an attribute value for which this node has
            no child branch.
        """
        # A node without children is a leaf: its stored value is the answer.
        if not self.leaf:
            return self.colss

        # Internal node: follow the branch matching the sample's value.
        value = x[self.colss]
        if value in self.leaf:
            return self.leaf[value].pre(x)

        # No branch exists for this value, so no prediction can be made.
        return None

    def display(self, level=0):
        """Print the subtree rooted at this node, indenting 4 spaces per level.

        Args:
            level: depth of this node, used only to compute the indentation.
        """
        if not self.leaf:
            print(": ", self.colss, end="")
            return

        indent = "\n" + " " * (level * 4)
        for value, child in self.leaf.items():
            print(indent, self.colss, "=", value, end="")
            child.display(level + 1)
     
  

### Calculation of entropy

In [5]:
def entropyfn(df, label):
    """Return the base-2 Shannon entropy of the ``label`` column of ``df``.

    Args:
        df: pandas DataFrame that contains the column ``label``.
        label: name of the target column whose class distribution is measured.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the relative frequencies ``p``
        of the distinct values in ``df[label]`` (0 for an empty frame).

    Raises:
        KeyError: if ``label`` is not a column of ``df``.
    """
    # Select the target by name: it is not guaranteed to be the last column,
    # so positional indexing would measure the wrong column's distribution.
    label_column = df[label].values
    _, counts = np.unique(label_column, return_counts=True)
    probabilities = counts / counts.sum()
    return np.sum(probabilities * -np.log2(probabilities))

# def InfoGain(data,split_attribute_name,label_name="class"):
#     total_entropy = entropyfn(data[label_name])
#     vals,counts= np.unique(data[split_attribute_name],return_counts=True)
#     Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropyfn(data.where(data[split_attribute_name]==vals[i]).dropna()[label_name]) for i in range(len(vals))])
#     Information_Gain = total_entropy - Weighted_Entropy
#     return Information_Gain



### Choosing attribute with max info gain

In [6]:
def choose_best_Attribute(df, label, attributes):
    """Return the attribute whose split yields the largest information gain.

    For each candidate attribute, ``df`` is partitioned by that attribute's
    values; the entropy of every partition (as computed by ``entropyfn``) is
    weighted by the partition's share of the rows and subtracted from the
    entropy of ``df`` itself.

    Args:
        df: pandas DataFrame holding the candidate attributes and the label.
        label: name of the target column, forwarded to ``entropyfn``.
        attributes: non-empty list of candidate column names.

    Returns:
        The name of the attribute with the highest gain; on ties, the one
        that appears first in ``attributes``.
    """
    total_rows = len(df)
    parent_entropy = entropyfn(df, label)

    def split_entropy(attribute):
        # Size-weighted sum of the entropies of the partitions by `attribute`.
        weighted = 0
        for _, subset in df.groupby(attribute):
            # The split column is removed from the partition before scoring.
            subset = subset.drop([attribute], axis=1)
            weighted += (len(subset) / total_rows) * entropyfn(subset, label)
        return weighted

    gains = [
        (attribute, parent_entropy - split_entropy(attribute))
        for attribute in attributes
    ]
    best_attribute, _ = max(gains, key=lambda pair: pair[1])
    return best_attribute


In [7]:
def check_sklearn():
    """Cross-check against scikit-learn's built-in decision tree.

    Reads the training CSV, integer-encodes the ``sales`` and ``salary``
    columns, drops the columns that are excluded from the comparison, trains a
    ``DecisionTreeClassifier`` on a random ~80% of the rows and prints the
    accuracy, confusion matrix and classification report for the remaining
    rows. Nothing is returned.
    """
    data = pd.read_csv("../input_data/train1.csv")
    # Imported locally under the name `tree`, so it stays scoped to this function.
    from sklearn import tree
    classifier = tree.DecisionTreeClassifier()

    # One encoder per categorical column, each turning its values into integers.
    sales_encoder = LabelEncoder()
    salary_encoder = LabelEncoder()
    data['sales_n'] = sales_encoder.fit_transform(data['sales'])
    data['salary_n'] = salary_encoder.fit_transform(data['salary'])

    # Remove the raw categorical columns and the columns left out of the model.
    data = data.drop(['sales', 'salary'], axis='columns')
    unused_columns = ["satisfaction_level", "last_evaluation", "number_project",
                      "average_montly_hours", "time_spend_company"]
    data = data.drop(unused_columns, axis='columns')

    # Random row mask: True rows go to training, the rest to validation.
    in_train = np.random.rand(len(data)) < 0.8
    train_part = data[in_train]
    test_part = data[~in_train]

    # Fit on the training rows.
    train_target = train_part['left']
    train_features = train_part.drop(['left'], axis='columns')
    classifier.fit(train_features, train_target)

    # Score on the held-out rows.
    test_target = test_part['left']
    test_features = test_part.drop(['left'], axis='columns')
    predictions = classifier.predict(test_features)

    print( accuracy_score(test_target, predictions))
    print (confusion_matrix(test_target, predictions))
    print (classification_report(test_target, predictions))

### Building Decision Tree

In [10]:
def Decision_tree_algo(df, label, attributes1):
    """Recursively build a decision tree over categorical attributes.

    Args:
        df: pandas DataFrame with the label column and the candidate
            attribute columns.
        label: name of the target column.
        attributes1: list of attribute names still available for splitting.
            The list is copied, so the caller's list is never modified.

    Returns:
        A ``Make_Dec_Tree`` node. It is a leaf holding a label value when all
        rows share one label or no attributes remain (majority label);
        otherwise it is an internal node for the attribute picked by
        ``choose_best_Attribute`` with one child per observed value.
    """
    attributes = attributes1[:]

    # Pure node: every row carries the same label.
    if df[label].nunique() == 1:
        return Make_Dec_Tree(df[label].iloc[0])

    # Nothing left to split on: fall back to the most frequent label.
    if len(attributes) == 0:
        return Make_Dec_Tree(df[label].value_counts().idxmax())

    bestAttribute = choose_best_Attribute(df, label, attributes)
    attributes.remove(bestAttribute)
    root_attr = Make_Dec_Tree(bestAttribute)

    for key, temp in df.groupby(bestAttribute):
        if len(temp) == 0:
            # Defensive fallback for an empty partition: it has no labels of
            # its own, so use the majority label of the parent rows instead.
            root_attr.leaf[key] = Make_Dec_Tree(df[label].value_counts().idxmax())
        else:
            root_attr.leaf[key] = Decision_tree_algo(
                temp.drop([bestAttribute], axis=1), label, attributes
            )

    return root_attr

### Main script: train the tree, display it, and evaluate it on the held-out split

In [11]:
# Load the data and hold out 20% of the rows for evaluation.
train = pd.read_csv('../input_data/train1.csv')
# A fixed seed keeps the split, and therefore the reported metrics, reproducible.
train, test = train_test_split(train, test_size = 0.2, random_state = 42)
label = "left"

# Candidate split attributes: every column except the label and the columns
# listed here, which are left out of the tree.
excluded_columns = ["satisfaction_level","last_evaluation","number_project","average_montly_hours","time_spend_company"]
attributes = [
    column for column in train.columns.tolist()
    if column not in excluded_columns and column != label
]

# Renumber the test rows 0..n-1 so they can be addressed by position below.
test = test.reset_index(drop=True)
tree = Decision_tree_algo(train,label,attributes)
tree.display()

# Confusion-matrix counts, treating label value 1 as the positive class.
tp,fp,tn,fn=0,0,0,0
correct = 0
for i in range(0,len(test)):
    # Predict once per row; the tree returns None for unseen attribute values,
    # which then matches none of the four confusion-matrix cases.
    predicted = tree.pre(test.loc[i])
    actual = test.loc[i,label]
    if predicted==1 and actual==1:
        tp+=1
    elif predicted==1 and actual==0:
        fp+=1
    elif predicted==0 and actual==0:
        tn+=1
    elif predicted==0 and actual==1:
        fn+=1
    if str(predicted) == str(actual):
        correct += 1

print("\nThe accuracy is: ", correct/len(test))

# Default every metric to 0 so that a zero denominator (e.g. no positive
# predictions at all) cannot leave a name undefined when it is printed.
predicted_positives = (tp+fp)
actual_positives = (tp+fn)
rc = tp/actual_positives if actual_positives else 0
pc = tp/predicted_positives if predicted_positives else 0
f1 = (2*rc*pc)/(rc+pc) if (rc+pc) else 0
print("\nRecall: ", rc)
print("\nPrecision: ", pc)
print("\nF1-Score: ", f1)

print("\nTrue pos: ",tp)
print("\nFalse pos: ",fp)
print("\nTrue neg: ",tn)
print("\nFalse neg: ",fn)


 salary = high
     sales = IT
         Work_accident = 0
             promotion_last_5years = 0:  0
         Work_accident = 1:  0
     sales = RandD:  0
     sales = accounting
         Work_accident = 0
             promotion_last_5years = 0:  0
             promotion_last_5years = 1:  0
         Work_accident = 1:  0
     sales = hr
         Work_accident = 0
             promotion_last_5years = 0:  0
         Work_accident = 1:  0
     sales = management
         promotion_last_5years = 0
             Work_accident = 0:  0
             Work_accident = 1:  0
         promotion_last_5years = 1:  0
     sales = marketing
         Work_accident = 0
             promotion_last_5years = 0:  0
             promotion_last_5years = 1:  0
         Work_accident = 1:  0
     sales = product_mng
         Work_accident = 0
             promotion_last_5years = 0:  0
         Work_accident = 1:  0
     sales = sales
         Work_accident = 0
             promotion_last_5years = 0:  0
         

In [12]:
check_sklearn()

0.7490892531876139
[[1645    0]
 [ 551    0]]
              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1645
           1       0.00      0.00      0.00       551

   micro avg       0.75      0.75      0.75      2196
   macro avg       0.37      0.50      0.43      2196
weighted avg       0.56      0.75      0.64      2196



  'precision', 'predicted', average, warn_for)
