### Aim
To implement the ID3 decision-tree classification algorithm in Python.

In [1]:
from sklearn import tree
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score
import ID3
from numpy import transpose
import sys
import DataLoader

### Some Preprocessing of data
The data is converted to integer type using scikit-learn's `LabelEncoder`.

In [2]:
# Training split and held-out split, each returned as a (features, labels) pair.
X, Y = DataLoader.getDataset()
X_test, Y_test = DataLoader.getDatasetTest()

### ID3 implementation using scikit-learn library

In [3]:
# criterion='entropy' makes scikit-learn choose splits by information gain.
# fit() returns the fitted estimator itself, so construction and training chain.
clf = tree.DecisionTreeClassifier(criterion='entropy').fit(X, Y)

In [4]:
# Predict the class of every held-out sample with the fitted scikit-learn tree.
Y_predicted = clf.predict(X_test)
print(Y_predicted)

[0 0 1 ... 1 0 0]


In [5]:
# Fraction of held-out samples whose predicted class equals the true class.
accuracy_score(Y_test, Y_predicted)

0.7314047048707082

### ID3 implementation using our code

<code>
ID3 Library
import pandas as pd
import math
import numpy as np
from numpy import transpose
import matplotlib.pyplot as plt
from Tree import Node
from sklearn.preprocessing import LabelEncoder
import random

def calculateInfoD(label):
    """Return info(D): the base-2 entropy of the class values in ``label``.

    Distinct values are found with ``==`` comparisons rather than hashing,
    so the labels do not have to be hashable. An empty ``label`` yields 0.
    """
    # Tally every distinct value as a mutable [value, count] pair, kept in
    # first-seen order.
    tallies = []
    for item in label:
        hits = [pair for pair in tallies if pair[0] == item]
        if hits:
            for pair in hits:
                pair[1] += 1
        else:
            tallies.append([item, 1])
    # info(D) = sum over classes of p * log2(1 / p), where p = count / size.
    size = len(label)
    return sum((count/size)*math.log((size/count),2) for _, count in tallies)
def calculateInfoA(attribute,label):
    """Return info_A(D): the entropy of ``label`` left after splitting on ``attribute``.

    ``attribute`` and ``label`` are parallel sequences: position ``i`` of each
    describes the same sample. The result is the sum, over every distinct
    attribute value, of that value's share of the samples multiplied by the
    entropy of the labels of the samples carrying it.
    """
    # Tally every distinct attribute value as a [value, count] pair, kept in
    # first-seen order.
    value_counts = []
    for item in attribute:
        hits = [pair for pair in value_counts if pair[0] == item]
        if hits:
            for pair in hits:
                pair[1] += 1
        else:
            value_counts.append([item, 1])
    size = len(attribute)
    weighted_entropy = 0
    for value, count in value_counts:
        # Labels of the samples whose attribute equals this value.
        subset = [label[pos] for pos in range(len(label)) if attribute[pos] == value]
        weighted_entropy += (count/size)*calculateInfoD(subset)
    return weighted_entropy
def calculateGain(attribute,label,InfoD):
    """Return the information gain of ``attribute``: ``InfoD`` minus info_A(D)."""
    info_after_split = calculateInfoA(attribute,label)
    return InfoD - info_after_split
def getNode(dataset,deleteRows,label,InfoD,deleteCols):
    """Return the index of the not-yet-used attribute with the highest gain.

    ``dataset`` is indexed as ``dataset[attribute][sample]``: each row holds
    one attribute and each column one sample.

    Parameters:
        dataset: 2-D array of attribute values, attributes along axis 0.
        deleteRows: sample indices to leave out. Despite the name, they are
            removed along axis 1 of ``dataset`` and from ``label``.
        label: class value of each sample, parallel to the columns of ``dataset``.
        InfoD: entropy value each attribute's info_A(D) is subtracted from.
        deleteCols: attribute indices that must not be chosen again.

    Returns:
        The attribute index with the largest gain among those not listed in
        ``deleteCols`` (the lowest index wins ties). If every attribute is
        listed in ``deleteCols``, 0 is returned.

    Raises:
        ValueError: if ``dataset`` has no attributes at all.
    """
    subset = np.delete(dataset,deleteRows,1)
    subset_label = np.delete(label,deleteRows)
    best_index = None
    best_gain = None
    for i in range(len(subset)):
        # Excluded attributes are skipped outright. Giving them a placeholder
        # gain of 0 let them win whenever every real candidate had a gain
        # <= 0, which is possible because InfoD is not recomputed for the
        # current subset of samples.
        if i in deleteCols:
            continue
        gain = calculateGain(subset[i],subset_label,InfoD)
        # Strict '>' keeps the first (lowest-index) attribute on ties.
        if best_gain is None or gain > best_gain:
            best_index = i
            best_gain = gain
    if best_index is None:
        if len(subset) == 0:
            raise ValueError("getNode() needs a dataset with at least one attribute")
        # Every attribute is excluded: keep the historical result for this case.
        return 0
    return best_index
def makeTree(dataset,deleteRows,label,InfoD,parentNode,edgeNum,deleteCols,dataRow):
    """Recursively grow the subtree attached to ``parentNode`` via edge ``edgeNum``.

    A node is created for the attribute chosen by ``getNode`` and attached to
    ``parentNode``. One branch is then made per distinct value of that
    attribute. Recursion stops only once every attribute has been used on the
    current path; at that point each branch ends in a leaf node (value -2)
    whose edge dictionary carries the predicted class under ``"answer"``.

    Parameters:
        dataset: 2-D numpy array indexed ``dataset[attribute][sample]``.
        deleteRows: sample indices excluded on the way down to this node.
        label: class value of each sample, parallel to the columns of ``dataset``.
        InfoD: entropy value forwarded to ``getNode``.
        parentNode: node the new subtree is attached to.
        edgeNum: attribute value labelling the edge from ``parentNode``.
        deleteCols: attribute indices already used on this path. The list is
            extended in place, so callers pass a copy.
        dataRow: attribute values fixed along this path, one slot per
            attribute. It is updated in place, so callers pass a copy.

    Returns:
        0 when this call created the leaf level, otherwise None.
    """
    nodeNum = getNode(dataset,deleteRows,label,InfoD,deleteCols)
    node = Node(nodeNum,parentNode)
    edge = {"edgeNum":edgeNum, "node":node}
    parentNode.genChildren(edge)
    parentNode = node
    # Distinct values of the chosen attribute over the whole dataset, in
    # first-seen order; one branch is created per value.
    elements = []
    for x in dataset[nodeNum]:
        if x not in elements:
            elements.append(x)
    if nodeNum not in deleteCols:
        deleteCols.append(nodeNum)
    # '==' rather than 'is': identity of two equal ints is an interpreter
    # detail and is not guaranteed.
    if len(deleteCols) == len(dataset):
        for element in elements:
            dataRow[nodeNum] = element
            # Look for a training sample that matches the complete path.
            store = None
            for idx,x in enumerate(dataset.T):
                if np.array_equal(dataRow,x):
                    store = idx
                    break
            if store is None:
                # No sample matches this combination: fall back to the label
                # of a random sample. randrange covers every sample index and
                # also works when there is exactly one sample.
                store = random.randrange(len(dataset[0]))
            temp_node = Node(-2,parentNode)
            edge = {"edgeNum":element, "node":temp_node, "answer":label[store]}
            parentNode.genChildren(edge)
        return 0
    # Samples already excluded higher up stay excluded in every branch.
    inherited = set(deleteRows)
    for x in elements:
        dataRow[nodeNum] = x
        # Start a fresh exclusion set per branch. Sharing one list across the
        # loop would leave every branch after the first with no samples.
        branchDeleteRows = set(inherited)
        for i in range(len(dataset[nodeNum])):
            if x != dataset[nodeNum][i]:
                branchDeleteRows.add(i)
        makeTree(dataset,sorted(branchDeleteRows),label,InfoD,parentNode,x,deleteCols[:],dataRow[:])
def decisionTreeClassifier(dataset,label):
    """Build a decision tree for ``dataset`` / ``label`` and return its root node.

    ``dataset`` is indexed as ``dataset[attribute][sample]`` and ``label``
    holds the class of each sample. The returned root is a placeholder node
    (value -1); the real tree hangs off it as its first child.
    """
    InfoD = calculateInfoD(label)
    root = Node(-1,None)
    # One slot per attribute, -1 meaning "no value fixed yet on this path".
    unset_row = [-1] * len(dataset)
    makeTree(dataset,[],label,InfoD,root,0,[],list(unset_row))
    return root
def predict(dataset,root):
    """Classify every sample (column) of ``dataset`` with the tree under ``root``.

    Parameters:
        dataset: 2-D array indexed ``dataset[attribute, sample]``.
        root: placeholder root node; its first child edge leads to the first
            real decision node.

    Returns:
        A list with one predicted class per sample, in column order.

    Each sample is walked down the tree by following, at every decision node,
    the edge whose ``"edgeNum"`` equals the sample's value for that node's
    attribute. If no edge matches (a value the tree has no branch for), the
    node's first edge is followed, so the walk always ends at a leaf.
    """
    predicted_values = []
    for x in range(len(dataset[0])):
        edge = root.children[0]
        curr_node = edge['node']
        # Leaves are marked with value -2. Compare with '!=' because identity
        # of equal ints is an interpreter detail.
        while curr_node.value != -2:
            value = dataset[curr_node.value,x]
            # Default to the first branch, then switch to the matching one.
            edge = curr_node.children[0]
            for candidate in curr_node.children:
                if candidate['edgeNum'] == value:
                    edge = candidate
                    break
            curr_node = edge['node']
        # The edge that led into the leaf carries the predicted class.
        predicted_values.append(edge['answer'])
    return predicted_values
</code>

In [None]:
# ID3 indexes its input as dataset[attribute][sample], so the feature tables
# are converted to NumPy arrays and transposed; labels become plain lists.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# array and matches the Y.values usage below.
# NOTE: this cell rebinds X, Y, X_test and Y_test, so it must run exactly once
# per kernel session.
X = transpose(X.values)
Y = Y.values.tolist()
Y_test = Y_test.values.tolist()
X_test = transpose(X_test.values)
root = ID3.decisionTreeClassifier(X,Y)

# Y_predicted = ID3.predict(X_test,root)
# print(Y_predicted)

In [None]:
# `df` and `Y_predict` are not defined anywhere in this notebook, so the
# original call raised NameError on a fresh kernel. Score the ID3 tree built
# above on the held-out split instead.
Y_predicted_id3 = ID3.predict(X_test, root)
accuracy_score(Y_test, Y_predicted_id3)

### Conclusion
Thus we have implemented ID3 using scikit-learn on the income prediction dataset and obtained an accuracy of 73.14%. We also implemented the algorithm ourselves in Python and verified it on a different dataset, on which we obtained an accuracy of 95%.