# Part-1

### Train decision tree only on categorical data. Report precision, recall, f1 score and accuracy.


In [1]:
import numpy as np
import pandas as pd
import pprint
eps = np.finfo(float).eps
from numpy import log2 as log
targetattr = "left"
data = pd.read_csv("./../input_data/train.csv")
dataSet = pd.DataFrame(data,columns=['Work_accident', 'promotion_last_5years', 'sales','salary','left'])
# reading csv file  
trainingSet , validationSet = np.split(dataSet,[int(0.8*len(dataSet))])
# print trainingSet

#### maxInfoGainNode function decides which attribute is selected as node

In [2]:
def maxInfoGainNode(df):
    attributes = list(df.loc[:, df.columns != targetattr])
    IG = []
    entropybefore = entropy(df)
    for attr in attributes:
        IG.append(entropybefore - I(df,attr))
    maxGain = max(IG)
    if maxGain == 0.0:
        return None 
    else:
        return attributes[IG.index(maxGain)]


#### entropy function

In [3]:
def entropy(df):
    entropy = 0
    uniqueSet = df[targetattr].unique()
    for elem in uniqueSet:
        fraction = df[targetattr].value_counts()[elem]/float(len(df[targetattr]))
        if fraction == 0 or fraction == 1:
            entropy += 0.0
        else:
            entropy += -fraction*np.log2(fraction)
    return entropy

#### Impurity function to calculate impurity for weighted entropy

In [4]:
def I(df , attribute):
    target_variables = df[targetattr].unique()  #This gives all 'Yes' and 'No'
    variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in Temperature)
    den = len(df[attribute])
    impurity = 0
    for variable in variables:
        num = len(df[attribute][df[attribute] == variable])
        fraction = num/(den + eps)
        if num == 0:
            impurity += 0.0
        else:
            impurity += fraction*entropy(df[df[attribute] == variable]) 
    return impurity

#### get_subtable to filter data frame

In [5]:
def get_subtable(df, node,value):
    return df[df[node] == value].reset_index(drop=True)


#### Tree Builder Function

In [6]:
def treeBuilder(df, tree=None):     
    #Here we build our decision tree   
    #Get attribute with maximum information gain
    node = maxInfoGainNode(df)
    
    if node == None:
        clValue,counts = np.unique(df[targetattr],return_counts=True)  
        return clValue[np.argmax(counts)]
    
    #Get distinct value of that attribute e.g Salary is node and Low,Med and High are values
    attValue = np.unique(df[node])
    
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree={}
        tree[node] = {}
    
   #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 
    
    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable[targetattr],return_counts=True)                        
        
        if len(counts)==1:#Checking purity of subset
            tree[node][value] = clValue[0]
        else:        
            tree[node][value] = treeBuilder(subtable) #Calling the function recursively 
                   
    return tree


#### Run the code here

In [7]:
tree = treeBuilder(trainingSet)
pprint.pprint(tree)


{'Work_accident': {0: {'salary': {'high': {'sales': {'IT': 0,
                                                     'RandD': 0,
                                                     'accounting': {'promotion_last_5years': {0: 0,
                                                                                              1: 0}},
                                                     'hr': {'promotion_last_5years': {0: 0,
                                                                                      1: 0}},
                                                     'management': {'promotion_last_5years': {0: 0,
                                                                                              1: 0}},
                                                     'marketing': 0,
                                                     'product_mng': 0,
                                                     'sales': {'promotion_last_5years': {0: 0,
                                                

#### predict function predicts the value of class attribute for a given row of input and tree

In [8]:
def predict(tree , row):
    #This function is used to predict for any input variable 
    
    #Recursively we go through the tree that we built earlier
    prediction = 0
    for nodes in tree.keys():        
        
        value = row[nodes]
        tree = tree[nodes][value]
            
        if type(tree) is dict:
            prediction = predict(tree , row)
        else:
            prediction = tree
            break;                            
        
    return prediction

#### validate function validates the given Data Frame

In [9]:
def validate(tree, df):
    true_positive = 0
    true_negative = 0 
    false_negative = 0
    false_positive = 0
    result = []
    for index, row in df.iterrows():
        value = predict(tree, row)
        result.append(value)
        if value == row[targetattr]:
            if value == 1:
                true_positive += 1
            else:
                true_negative += 1
        else:
            if value == 1:
                false_positive += 1
            else:
                false_negative += 1
    return true_positive, true_negative , false_negative, false_positive, result

In [10]:
def accuracy(true_positive , true_negative , false_negative, false_positive):
    return ((true_positive + true_negative)*100)/(true_positive + true_negative + false_positive + false_negative + eps)

In [11]:
def recall(true_positive , false_negative):
    return true_positive*100/(true_positive +  false_negative+ eps)

In [12]:
def precision(true_positive , false_positive):
    return true_positive*100/(true_positive +  false_positive + eps)

In [13]:
def f1score(recall , prescision):
    return 2/(1/(recall)+1/(prescision))

In [14]:
def RESULT(tree, validationSet):
    true_positive , true_negative , false_negative, false_positive, result = validate(tree , validationSet)
    acc = accuracy(true_positive , true_negative , false_negative, false_positive)
    rec = recall(true_positive , false_negative)
    pre = precision(true_positive , false_positive)
    f1 = f1score(rec, pre)

    print("ACCURACY%: ", acc)
    print("RECALL: ", rec)
    print("PRECISION: ", pre)
    print("F1-score: ", f1)
    print(result)
    
RESULT(tree , validationSet)

ACCURACY%:  75.80071174377224
RECALL:  0.1834862385321101
PRECISION:  99.99999999999997
F1-score:  0.3663003663003664
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

###### TESTING function resturns list of output


In [17]:
def TESTING(tree, df):
    result = []
    for index, row in df.iterrows():
        value = predict(tree, row)
        result.append(value)
    return result

### TEST YOUR DATASET HERE....

In [28]:
testingData = pd.read_csv("./../input_data/sample_test.csv")
res = TESTING(tree, testingData)
testingData[targetattr] = res
testingData.to_csv("./../output_data/q-1-1_output.csv",index = False)
print (res)
print(testingData)


[0, 0]
   satisfaction_level  last_evaluation  number_project  average_montly_hours  \
0                0.69             0.69               3                   236   
1                0.36             0.54               2                   153   

   time_spend_company  Work_accident  promotion_last_5years        sales  \
0                   4              0                      0  product_mng   
1                   3              1                      0   accounting   

   salary  left  
0  medium     0  
1  medium     0  
