# `Jai Chaudhry 2K18-SE-069`

## Experiment - 5

- Write a program to demonstrate the working of the decision-tree-based CART algorithm.
- Using the data set from the previous experiment, build the decision tree and classify a new sample.
- Then compute the Accuracy, Precision, Recall and ROC_AUC score of the decision tree.

In [147]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log


- `eps` here is the machine epsilon for `float`: the gap between 1.0 and the next larger representable number.
- At times we get log(0) or 0 in the denominator; adding `eps` avoids that.

# Define the Dataset : (Class Dataset)

In [148]:
# Play-tennis toy dataset: each column is written as one comma-separated
# string of 14 values and split into a list.
outlook = 'sunny,sunny,overcast,rain,rain,rain,overcast,sunny,sunny,rain,sunny,overcast,overcast,rain'.split(',')
temp='hot,hot,hot,mild,cold,cold,cold,mild,cold,mild,mild,mild,hot,mild'.split(',')
humidity = 'high,high,high,high,normal,normal,normal,high,normal,normal,normal,high,normal,high'.split(',')
windy = 'weak,strong,weak,weak,weak,strong,strong,weak,weak,weak,strong,strong,weak,strong'.split(',')
# 'play' is the class label that the tree is later built to predict.
play = 'no,no,yes,yes,yes,no,yes,no,yes,yes,yes,yes,yes,no'.split(',')

In [149]:
# Assemble the lists into a DataFrame. 'play' is deliberately the last column:
# find_gini_index treats the last column of the frame as the class label.
dataset = {'outlook':outlook,'temp':temp,'humidity':humidity,'windy':windy,'play':play}
df = pd.DataFrame(dataset,columns=['outlook','temp','humidity','windy','play'])


In [150]:
df.head(14)


Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rain,mild,high,weak,yes
4,rain,cold,normal,weak,yes
5,rain,cold,normal,strong,no
6,overcast,cold,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cold,normal,weak,yes
9,rain,mild,normal,weak,yes


## Decision Tree Algorithm
- compute the entropy for data-set
- for every attribute/feature:
       1.calculate entropy for all categorical values
       2.take average information entropy for the current attribute
       3.calculate split info for the current attribute
       4.calculate gain for the current attribute
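- pick the highest gain attribute.
- Repeat until we get the tree we desired

Note: the steps above describe the entropy / gain formulation. The code below implements CART, so for every attribute it computes the weighted Gini index instead and picks the attribute with the lowest value.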

# Find Gini Index for a particular attribute

In [151]:
def find_gini_index(df,attribute,printy=False):
    """Return the weighted Gini index obtained by splitting ``df`` on ``attribute``.

    The class label is taken from the last column of ``df``. For every
    distinct (non-missing) value of ``attribute`` the Gini impurity
    ``1 - sum(p_class ** 2)`` of the matching rows is computed and weighted by
    the share of rows (out of ``len(df)``) that carry that value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose last column is the class label.
    attribute : str
        Name of the column to evaluate as a split.
    printy : bool, default False
        When True, print the (unweighted) impurity of every attribute value.

    Returns
    -------
    number
        Weighted Gini index; ``0`` when ``df`` has no rows.
    """
    Class = df.keys()[-1]

    column = df[attribute]
    labels = df[Class]
    total = len(df)
    target_variables = labels.unique()
    gini = 0

    # Missing values never compare equal to themselves, so such a "group"
    # would be empty; skipping it avoids a zero denominator without the eps
    # fudge, which distorted the result for single-row groups.
    for variable in column.dropna().unique():
        in_group = column == variable
        den = int(in_group.sum())

        impurity = 1
        for target_var in target_variables:
            # One combined mask instead of chained boolean indexing.
            num = int((in_group & (labels == target_var)).sum())
            fraction = num / den
            impurity -= fraction**2
        if printy:
            print(variable,": ",impurity)
        gini += impurity * (den/total)

    return gini


In [152]:
# Testing
# Show the column names, then print the per-value impurity (printy=True) and
# the weighted Gini index for three of the attributes.
print(df.keys())

print("Gini : ",find_gini_index(df,'temp',True))
print()
print("Gini : ",find_gini_index(df,'humidity',True))
print()
print("Gini : ",find_gini_index(df,'windy',True))


Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')
hot :  0.5
mild :  0.4444444444444444
cold :  0.375
Gini :  0.44047619047619047

high :  0.48979591836734704
normal :  0.24489795918367352
Gini :  0.3673469387755103

weak :  0.375
strong :  0.5
Gini :  0.42857142857142855


# Get The one that is the best divider for current node

In [153]:
# ok   
def find_winner(df):
    """Return the attribute of ``df`` with the lowest weighted Gini index.

    Every column except the last one (the class label) is scored with
    ``find_gini_index``; on ties the first such column wins (``np.argmin``).

    Raises
    ------
    ValueError
        If ``df`` has no attribute column besides the class column.
    """
    attributes = df.keys()[:-1]
    if len(attributes) == 0:
        raise ValueError("find_winner needs at least one attribute column besides the class column")

    gini_scores = [find_gini_index(df, key) for key in attributes]

    return attributes[np.argmin(gini_scores)]


In [154]:
find_winner(df)


'outlook'

# After choosing an Attr. for division, 
### For a particular branch from this attr. for a specific value x 
### Get the subset of data that has this attr.'s value as x 

In [155]:
def get_subtable(df, node ,value): 
    """Return the rows of ``df`` whose ``node`` column equals ``value``, re-indexed from 0."""
    matches = df[node].eq(value)
    subset = df.loc[matches]
    return subset.reset_index(drop=True)


In [156]:
df.drop(['play'],axis=1).head(3)


Unnamed: 0,outlook,temp,humidity,windy
0,sunny,hot,high,weak
1,sunny,hot,high,strong
2,overcast,hot,high,weak


# Main Driver Function to create Tree

In [180]:
def buildTree(df,TgtClass,previous_winner=0,tree=None,verbose=True): 
    """Recursively build a decision tree (nested dicts) using the Gini index.

    The result has the form ``{attribute: {value: subtree_or_label, ...}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the last column must be the class label because
        ``find_winner`` scores every column except the last one.
    TgtClass : str
        Name of the class-label column.
    previous_winner : hashable, default 0
        Attribute the caller split on. If the best attribute for ``df`` is
        that same attribute again, recursion stops with a majority-class leaf.
    tree : dict, optional
        Existing dict to add the node to; a new one is created when omitted.
    verbose : bool, default True
        Print the attribute chosen at every call (the original behaviour).
        Pass False to silence the output on large datasets.

    Returns
    -------
    dict
        The (sub)tree rooted at the chosen attribute.
    """
    # Attribute with the MINIMUM weighted Gini index (find_winner uses argmin).
    node = find_winner(df)

    if verbose:
        print(node)

    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied tree that lacks this node.
    tree.setdefault(node, {})

    # The caller already split on this attribute and it is still the best
    # choice, so stop here and label the branch with the majority class.
    if previous_winner == node:
        clValue,counts = np.unique(df[TgtClass],return_counts=True)
        # Positional access: does not depend on the frame having a 0 label.
        tree[node][df[node].iloc[0]] = clValue[np.argmax(counts)]
        return tree

    # Distinct values of the chosen attribute, one branch per value.
    attValue = np.unique(df[node])

    for value in attValue:
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable[TgtClass],return_counts=True)

        if len(counts)==1:
            # Pure subset: the branch becomes a leaf.
            tree[node][value] = clValue[0]
        else:
            tree[node][value] = buildTree(subtable,TgtClass,node,verbose=verbose)

    return tree


## Running It 

In [158]:
# Build the tree for the toy dataset; 'play' is the class column.
t = buildTree(df,'play')

In [159]:
import pprint      
pprint.pprint(t)    


{'outlook': {'overcast': 'yes',
             'rain': {'windy': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}


In [160]:
# NOTE(review): `pydot` and `visit` are only imported/defined in the later
# "Visualizing Decision Tree" cell (In [141]); when the notebook is run top to
# bottom on a fresh kernel this cell fails with a NameError.
graph = pydot.Dot(graph_type='graph')
visit(t)
graph.write_png('Play_Tennis_CART.png')




# Play_Tennis_CART
<img src="Play_Tennis_CART.png" >

# Now Implementing Decision Tree on Adult DataSet

# Loading Data

In [161]:
import copy 
import pandas as pd
# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run
# elsewhere without editing it. '?' entries are read as missing values.
data = pd.read_csv('C:\\Users\\Jai\\Desktop\\adult.csv',na_values='?')

# Split the column names by dtype: numeric vs. everything else.
num_cols = list(data.select_dtypes(include=["number"]).columns)
cat_cols = list(data.select_dtypes(exclude=["number"]).columns)

print((num_cols))
print()
print(cat_cols)



['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']


# PreProcessing

In [162]:
# Row count before and after dropping rows with any missing value.
# NOTE(review): `data` is overwritten in place, so re-running this cell alone
# prints the same count twice.
print(len(data))
data = data.dropna() 
print(len(data))     
data.head()          


48842
45222


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


# Getting Numerical and Categorical Columns 


In [163]:
# RUN THIS ONLY ONCE
# (a later cell rebinds `data` to the merged, discretised frame, after which
# select_dtypes here would no longer see the original numeric columns)

# Numeric Data
ndata = data.select_dtypes(include=['number']) 
# Untouched backup of the numeric columns; later cells restore ndata from it.
indata = copy.deepcopy(ndata)

# Categorical Data
cdata = data.select_dtypes(exclude=['number']) 

print(len(data))
cdata.head(10)     


45222


Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,<=50K
1,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,<=50K
2,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,>50K
3,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,>50K
5,Private,10th,Never-married,Other-service,Not-in-family,White,Male,United-States,<=50K
7,Self-emp-not-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,>50K
8,Private,Some-college,Never-married,Other-service,Unmarried,White,Female,United-States,<=50K
9,Private,7th-8th,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States,<=50K
10,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
11,Federal-gov,Bachelors,Married-civ-spouse,Adm-clerical,Husband,White,Male,United-States,<=50K


In [164]:
# Numeric Data
ndata =  copy.deepcopy(indata)
ndata.head()


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
0,25,226802,7,0,0,40
1,38,89814,9,0,0,50
2,28,336951,12,0,0,40
3,44,160323,10,7688,0,40
5,34,198693,6,0,0,30


# Converting Numeric Values to ranged Classes values for Decision tree (Using Mean)

In [165]:
# a = np.array(ndata,dtype=object)
# print(a.shape)
# print(a[0])
# a[0][0] = str(23) + "s"
# print(a[0][0])

# Start again from the untouched numeric backup so this cell can be re-run.
ndata =  copy.deepcopy(indata)

cols = ndata.columns
# b keeps the original numeric values for the comparisons; a is an
# object-dtype copy whose cells are overwritten with string labels.
b = np.array(ndata)
a = np.array(ndata,dtype=object)
print(a.shape)

for i in range(0,len(cols)):
    
    col = cols[i]
    # Threshold for this column: its mean passed through np.round.
    # NOTE(review): np.round without a decimals argument rounds to a whole
    # number, but the saved output shows thresholds with 5 decimals; confirm
    # which version of this line produced the stored results.
    meann = np.round(ndata[col].mean())
    print(col,meann)
    
    # Replace every value by ">=<threshold>" or "<<threshold>".
    for j in range(0,len(ndata)):
        if b[j][i] >= meann:
            a[j][i] = ">=" + str(meann)
        else:
            a[j][i] = "<" + str(meann)


# Rebuild a DataFrame of strings; from here on ndata and newdata are the same object.
newdata = pd.DataFrame(a).applymap(str)
ndata=newdata
ndata.columns = cols

print(len(ndata))
newdata.head(10)

(45222, 6)
age 38.54794
fnlwgt 189734.73431
educational-num 10.11846
capital-gain 1101.43034
capital-loss 88.59542
hours-per-week 40.93802
45222


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
0,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802
1,<38.54794,<189734.73431,<10.11846,<1101.43034,<88.59542,>=40.93802
2,<38.54794,>=189734.73431,>=10.11846,<1101.43034,<88.59542,<40.93802
3,>=38.54794,<189734.73431,<10.11846,>=1101.43034,<88.59542,<40.93802
4,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802
5,>=38.54794,<189734.73431,>=10.11846,>=1101.43034,<88.59542,<40.93802
6,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802
7,>=38.54794,<189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802
8,>=38.54794,<189734.73431,<10.11846,>=1101.43034,<88.59542,<40.93802
9,<38.54794,>=189734.73431,>=10.11846,<1101.43034,<88.59542,<40.93802


In [166]:
# ## Assigning categorical values (like interval) to numeric data
# ## Again run this only once

# ndata =  copy.deepcopy(indata)
# print(ndata.head())


# for i in ndata.columns:
    
#     mn = ndata[i].mean()
#     print(i,mn)
    
# #     ndata[i] = ndata[i].apply(str)

# #     ndata[i][ndata[i]<mn] = 0
# #     ndata[i][ndata[i]>=mn] = 1

# #     ndata.loc[(ndata[i]) < mn, i] = 0
# #     ndata.loc[(ndata[i]) >= mn, i] = 1
    
# newdata.head()

# Show how many rows fall on each side of the threshold for every numeric column.
for col in cols:
    print(ndata[col].value_counts())
    print()


<38.54794     24238
>=38.54794    20984
Name: age, dtype: int64

<189734.73431     25426
>=189734.73431    19796
Name: fnlwgt, dtype: int64

<10.11846     30343
>=10.11846    14879
Name: educational-num, dtype: int64

<1101.43034     41534
>=1101.43034     3688
Name: capital-gain, dtype: int64

<88.59542     43082
>=88.59542     2140
Name: capital-loss, dtype: int64

<40.93802     31445
>=40.93802    13777
Name: hours-per-week, dtype: int64



# Merging the Transformed Numeric Data with Categorical Data

In [167]:
# Give both frames the same fresh 0..n-1 index so that the column-wise concat
# in the next cell pairs rows by position (cdata still carries the index labels
# left over from dropna, as its displayed head shows).
ndata.reset_index(drop=True, inplace=True)
cdata.reset_index(drop=True,inplace=True)
frames = [ndata,cdata]
print(len(ndata),len(cdata))


45222 45222


In [168]:
# Column-wise concat: discretised numeric columns first, categorical columns
# after, so 'income' (last column of cdata) stays the last column overall.
result = pd.concat(frames,axis = 1)
# NOTE(review): rebinds `data`; the raw frame loaded from CSV is no longer reachable.
data =  result

print(len(data))

data.head()


45222


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,<=50K
1,<38.54794,<189734.73431,<10.11846,<1101.43034,<88.59542,>=40.93802,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,<=50K
2,<38.54794,>=189734.73431,>=10.11846,<1101.43034,<88.59542,<40.93802,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,>50K
3,>=38.54794,<189734.73431,<10.11846,>=1101.43034,<88.59542,<40.93802,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,>50K
4,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802,Private,10th,Never-married,Other-service,Not-in-family,White,Male,United-States,<=50K


# All set to Run on the Dataset

In [187]:
# Upper bound on the number of leading rows used to build the tree.
number_of_tuples_i_want = 30000

In [188]:
npdata = np.array(data)  
print(npdata.shape)      

(45222, 15)


In [189]:
# Take the first number_of_tuples_i_want rows (or all rows if there are fewer).
# The slice is positional from the top of the frame, not a random sample.
newdata = data[:min(number_of_tuples_i_want,len(data))]     
print(len(newdata))
newdata.head(5)

30000


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,<=50K
1,<38.54794,<189734.73431,<10.11846,<1101.43034,<88.59542,>=40.93802,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States,<=50K
2,<38.54794,>=189734.73431,>=10.11846,<1101.43034,<88.59542,<40.93802,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,>50K
3,>=38.54794,<189734.73431,<10.11846,>=1101.43034,<88.59542,<40.93802,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States,>50K
4,<38.54794,>=189734.73431,<10.11846,<1101.43034,<88.59542,<40.93802,Private,10th,Never-married,Other-service,Not-in-family,White,Male,United-States,<=50K


In [190]:
print(newdata['income'][0])
# newdata.head()


<=50K


# Convert The Dataframe values to string so that it can be plotted using pydot

In [191]:
# Cast every cell to str; visit() later concatenates tree keys and leaf labels with '+'.
newdata = newdata.applymap(str)

In [192]:
# Build the tree on the truncated Adult data; buildTree prints the attribute
# chosen at every recursive call, which produces the long output below.
tree = buildTree(newdata,'income')  

relationship
education
occupation
workclass
age
hours-per-week
fnlwgt
capital-gain
native-country
age
age
fnlwgt
race
age
age
race
hours-per-week
capital-loss
native-country
fnlwgt
age
age
age
age
capital-loss
fnlwgt
age
age
age
age
hours-per-week
age
fnlwgt
age
age
workclass
fnlwgt
capital-loss
workclass
age
age
capital-loss
native-country
fnlwgt
capital-gain
age
age
workclass
capital-loss
fnlwgt
hours-per-week
workclass
race
native-country
age
age
age
age
capital-gain
age
age
native-country
capital-loss
capital-gain
workclass
capital-loss
workclass
race
hours-per-week
age
age
capital-gain
fnlwgt
hours-per-week
age
age
age
age
age
native-country
hours-per-week
age
age
age
age
age
capital-gain
occupation
fnlwgt
workclass
native-country
age
age
workclass
age
native-country
age
capital-loss
race
hours-per-week
fnlwgt
age
age
fnlwgt
age
age
fnlwgt
hours-per-week
capital-loss
race
age
age
capital-loss
hours-per-week
race
age
age
hours-per-week
workclass
age
age
age
fnlwgt
hours-per-week
ho

capital-gain
workclass
age
fnlwgt
race
age
age
hours-per-week
age
age
native-country
age
age
hours-per-week
capital-loss
age
age
workclass
workclass
fnlwgt
hours-per-week
capital-gain
age
age
capital-gain
capital-loss
workclass
age
capital-gain
fnlwgt
race
age
age
fnlwgt
capital-loss
hours-per-week
age
age
capital-gain
capital-loss
age
race
fnlwgt
hours-per-week
age
age
age
age
hours-per-week
age
age
age
age
race
fnlwgt
age
age
fnlwgt
hours-per-week
age
age
age
age
hours-per-week
age
age
age
age
hours-per-week
age
fnlwgt
age
age
fnlwgt
age
hours-per-week
age
age
capital-loss
capital-gain
age
race
fnlwgt
hours-per-week
fnlwgt
age
age
age
age
fnlwgt
age
age
age
age
race
age
age
fnlwgt
hours-per-week
age
age
age
age
hours-per-week
age
age
age
age
hours-per-week
age
fnlwgt
age
age
age
age
fnlwgt
age
age
age
fnlwgt
race
age
age
age
fnlwgt
age
age
capital-gain
capital-loss
age
fnlwgt
hours-per-week
age
age
fnlwgt
hours-per-week
age
age
age
age
race
fnlwgt
capital-loss
capital-gain
hours-per-

age
age
age
fnlwgt
age
age
age
age
hours-per-week
age
fnlwgt
age
age
fnlwgt
age
age
fnlwgt
age
age
age
age
age
hours-per-week
age
fnlwgt
age
age
age
age
fnlwgt
capital-gain
hours-per-week
fnlwgt
capital-loss
age
age
race
age
fnlwgt
age
age
age
age
fnlwgt
capital-loss
age
age
capital-loss
age
age
hours-per-week
age
fnlwgt
capital-loss
age
age
race
age
age
race
capital-loss
fnlwgt
age
age
age
capital-gain
age
age
age
age
age
workclass
capital-gain
capital-loss
fnlwgt
age
age
hours-per-week
native-country
capital-gain
capital-loss
race
age
fnlwgt
age
age
age
age
fnlwgt
age
age
age
age
capital-loss
race
fnlwgt
fnlwgt
age
age
native-country
capital-loss
hours-per-week
age
age
capital-loss
age
hours-per-week
workclass
fnlwgt
age
age
race
capital-gain
fnlwgt
age
age
age
age
fnlwgt
age
age
capital-gain
age
age
capital-gain
fnlwgt
workclass
race
age
age
age
age
workclass
race
age
age
age
age
age
age
workclass
workclass
hours-per-week
race
age
age
fnlwgt
age
age
age
age
hours-per-week
race
fnlwg

age
age
fnlwgt
workclass
hours-per-week
age
age
age
age
age
age
age
age
hours-per-week
workclass
age
age
age
native-country
workclass
race
fnlwgt
age
hours-per-week
hours-per-week
age
age
race
age
capital-gain
capital-gain
age
hours-per-week
fnlwgt
age
age
age
age
fnlwgt
age
age
capital-loss
fnlwgt
hours-per-week
age
age
age
age
hours-per-week
age
age
age
age
fnlwgt
age
age
age
age
workclass
hours-per-week
fnlwgt
race
age
capital-loss
age
age
race
age
age
capital-gain
race
fnlwgt
capital-loss
age
age
hours-per-week
age
age
age
age
fnlwgt
capital-loss
native-country
age
hours-per-week
age
age
age
age
hours-per-week
age
age
age
age
native-country
age
capital-loss
age
age
capital-loss
hours-per-week
age
age
age
age
fnlwgt
hours-per-week
age
native-country
age
age
education
capital-loss
occupation
workclass
occupation
fnlwgt
marital-status
occupation
workclass
fnlwgt
age
hours-per-week
occupation
hours-per-week
workclass
capital-gain
hours-per-week
race
age
age
occupation
occupation
hours-

fnlwgt
gender
age
age
capital-loss
age
age
capital-loss
gender
capital-loss
fnlwgt
age
hours-per-week
race
age
age
age
hours-per-week
age
age
hours-per-week
age
age
age
capital-loss
fnlwgt
hours-per-week
age
age
age
race
hours-per-week
age
age
age
age
age
age
hours-per-week
fnlwgt
age
age
fnlwgt
age
age
age
hours-per-week
gender
age
age
workclass
capital-loss
race
fnlwgt
hours-per-week
age
age
workclass
hours-per-week
gender
age
workclass
fnlwgt
marital-status
age
age
marital-status
fnlwgt
age
workclass
age
age
occupation
fnlwgt
race
hours-per-week
workclass
age
marital-status
marital-status
age
age
workclass
gender
fnlwgt
hours-per-week
age
age
fnlwgt
marital-status
hours-per-week
gender
age
hours-per-week
hours-per-week
marital-status
gender
capital-gain
capital-gain
native-country
workclass
occupation
hours-per-week
fnlwgt
capital-loss
marital-status
occupation
occupation
fnlwgt
race
hours-per-week
age
gender
hours-per-week
age
age
hours-per-week
age
age
age
age
hours-per-week
age
a

age
age
workclass
race
fnlwgt
age
age
workclass
hours-per-week
age
age
fnlwgt
capital-loss
occupation
age
hours-per-week
race
fnlwgt
workclass
age
age
native-country
age
age
race
age
race
age
fnlwgt
workclass
age
fnlwgt
capital-gain
workclass
fnlwgt
age
capital-gain
hours-per-week
age
capital-gain
capital-loss
age
age
age
occupation
workclass
capital-loss
age
capital-loss
race
hours-per-week
fnlwgt
native-country
hours-per-week
fnlwgt
age
age
age
age
capital-gain
race
fnlwgt
hours-per-week
age
age
age
age
age
age
native-country
capital-gain
workclass
race
fnlwgt
fnlwgt
race
hours-per-week
capital-gain
age
marital-status
capital-loss
age
age
age
age
age
age
age
age
hours-per-week
gender
age
age
capital-gain
capital-loss
age
fnlwgt
fnlwgt
age
age
age
age
fnlwgt
race
fnlwgt
fnlwgt
workclass
age
age
race
age
fnlwgt
capital-gain
workclass
age
age
fnlwgt
native-country
workclass
hours-per-week
capital-loss
capital-gain
age
fnlwgt
age
age
fnlwgt
age
age
age
age
fnlwgt
capital-gain
age
fnlwgt


# Saving Our Decision Tree

In [140]:
import pickle

# Persist the tree and read it back to check that the round trip is lossless.
# NOTE(review): the file is named 'C45' although this notebook builds a CART
# (Gini-based) tree; confirm whether the name is intentional.
with open('C45.pickle', 'wb') as handle:
    pickle.dump(tree, handle, protocol=pickle.HIGHEST_PROTOCOL)

# `b` is reused here; it previously held the numeric array in the discretisation cell.
with open('C45.pickle', 'rb') as handle:
    b = pickle.load(handle)
    
# True when the reloaded tree equals the in-memory one.
tree == b

True

# Visualizing Decision Tree

In [141]:
import pydot

# NOTE(review): this rebinds the built-in name `dict` to an empty dict
# instance for the rest of the notebook; visit() below relies on it through
# `type(dict)`.
dict = {}

# Small hand-written nested dict, usable as a sample input for visit()
# (see the commented-out `visit(menu)` call further down).
menu = {'dinner':
            {'chicken':'good',
             'beef':'average',
             'vegetarian':{
                   'tofu':'good',
                   'salad':{
                            'caeser':'bad',
                            'italian':'average'}
                   },
             'pork':'bad'}
        }


def draw(parent_name, child_name):
    """Add an edge from ``parent_name`` to ``child_name`` to the module-level ``graph``."""
    graph.add_edge(pydot.Edge(parent_name, child_name))


def visit(node, parent=None):
    """Walk a nested-dict tree and add one edge per parent/child pair via ``draw``.

    Parameters
    ----------
    node : mapping
        Current (sub)tree; keys are node names, values are either nested
        mappings (sub-trees) or leaf labels.
    parent : str, optional
        Name of the key that led to ``node``; None for the root call.

    Notes
    -----
    Edges are keyed by plain key names, so equal keys occurring in different
    branches refer to the same graph node.
    """
    # Local import: the name `dict` is rebound at module level in this
    # notebook, so the sub-tree check must not rely on it.
    from collections.abc import Mapping

    for k,v in node.items():
        if isinstance(v, Mapping):
            # We start with the root node whose parent is None
            # we don't want to graph the None node
            if parent:
                draw(parent, k)
            visit(v, k)

        else:
            # Same guard for a leaf sitting directly under the root.
            if parent:
                draw(parent, k)
            # drawing the label using a distinct name; str() tolerates
            # non-string keys and labels.
            draw(k, str(k)+'_'+str(v))


# Fresh pydot graph; draw() adds its edges to this module-level object.
graph = pydot.Dot(graph_type='graph')

# visit(menu)
visit(tree)
# # visit(t)
# print(tree)
# Write the collected edges to a PNG file.
graph.write_png('Adult_Dataset_CART.png')

<img src = "Adult_Dataset_CART.png"> 

# Classifying a new sample

In [193]:
# Maximum recursion depth accepted by predict().
# Presumably two dict levels (attribute, value) for each of 14 attributes; TODO confirm.
limit = 2*14

# Module-level diagnostics updated by predict():
err = 0          # calls abandoned because the depth limit was exceeded
exception = 0    # calls abandoned because a lookup in the tree/sample failed
returning = 0    # never incremented by predict(); kept because later cells print it

def predict(tree,prev,sample,cnt):
    """Classify ``sample`` by walking the nested-dict decision tree.

    Parameters
    ----------
    tree : dict or leaf label
        Current (sub)tree. Anything without a ``keys`` method is treated as a
        leaf and returned unchanged.
    prev : str
        Name of the attribute whose value selects the branch at this level;
        ``"-1"`` when the current level is expected to be an attribute node.
    sample : mapping
        Attribute name -> value for the row being classified.
    cnt : int
        Current recursion depth; callers start with 0.

    Returns
    -------
    The leaf label reached, or ``"idk"`` when the depth limit is exceeded or
    the sample's value has no branch in the tree.
    """
    global err, exception

    if cnt > limit:
        err += 1
        return "idk"

    # Leaves (plain labels) have no .keys(); that ends the walk.
    try:
        tkeys = list(tree.keys())
    except AttributeError:
        return tree

    try:
        if len(tkeys)==1:
            # A single key is treated as an attribute node: step into it and
            # remember the attribute name for the next level.
            subtree, next_prev = tree[tkeys[0]], tkeys[0]
        else:
            # Several keys: branch on the sample's value for attribute `prev`.
            subtree, next_prev = tree[sample[prev]], "-1"
    except (LookupError, TypeError):
        # Unknown attribute or value for this tree: count it and give up.
        exception += 1
        return "idk"

    return predict(subtree,next_prev,sample,cnt+1)


In [197]:
# Reset the diagnostics counters that predict() updates.
err = 0
exception = 0
returning = 0

# Single-row check: compare the stored label with the tree's prediction.
sample = data.iloc[1422].to_dict()
# pprint.pprint(sample)

print("Actual : ",sample['income'])

print("Prediction : ",predict(tree,"-1",sample,0) )

# Count how many of the first 45000 rows get no prediction ("idk").
# NOTE(review): the tree was built from the first number_of_tuples_i_want rows
# of this same frame, so this range includes the training rows.
cnt = 0
for i in range(0,45000):
    sample = data.iloc[i].to_dict()
    pred = predict(tree,"-1",sample,0)
    
    if pred == "idk":
        cnt+=1

print(cnt)
print(err)
print(exception)
print(returning)

Actual :  <=50K
Prediction :  <=50K
1007
463
544
0


# Get Prediction for a Certain Size

In [199]:
# Number of leading rows of `data` to score.
size = 45000

# True labels for those rows as a NumPy array.
y_actual = data['income'].iloc[0:size]
y_actual = np.array(y_actual)

print(y_actual.shape)

# Object array pre-filled with the true labels; every entry is overwritten below.
preds = np.array((y_actual),dtype=object)
for i in range(0,size):
    # NOTE(review): the bare except turns any failure into "idk", including
    # unexpected errors that would otherwise surface.
    try:
        preds[i] = predict(tree,"-1",data.iloc[i].to_dict(),0)
    except:
        preds[i] = "idk"


(45000,)


# Convert to Binary

In [201]:
# NOTE(review): these are aliases, not copies; the loop below rewrites
# y_actual and preds in place.
testy = y_actual
yhat_classes = preds

# Encode '<=50K' as '0' and '>50K' as '1' for both truth and predictions.
for i in range(testy.shape[0]):
    if testy[i]=='<=50K':
        testy[i]='0'

    elif testy[i]=='>50K':
        testy[i]='1'


    if yhat_classes[i]=='<=50K':
        yhat_classes[i]='0'

    elif yhat_classes[i]=='>50K':
        yhat_classes[i]='1'
        
    # NOTE(review): an "idk" prediction is replaced by the opposite of the TRUE
    # label, so it always counts as a misclassification. This uses the ground
    # truth to construct the prediction vector.
    elif yhat_classes[i]=='idk':
        yhat_classes[i] = str((1-int(testy[i])))


testy = np.array(testy,dtype=int)
yhat_classes = np.array(yhat_classes,dtype=int)

print(testy.dtype,"\n")

# Distribution of the encoded predictions.
# NOTE(review): `temp` was the temperature list of the toy dataset earlier in the notebook.
temp = pd.DataFrame(yhat_classes)
temp[0].value_counts()

int32 



0    34962
1    10038
Name: 0, dtype: int64

# Calculating  _  Accuracy | Precision | Recall | F1 score | ROC_AUC_Score

In [202]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# All metrics below compare the 0/1-encoded labels (1 stands for '>50K') and are
# scaled to percentages.
# NOTE(review): the scored rows include the rows the tree was built from, so
# these figures are not a held-out estimate.

# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(testy, yhat_classes)*100
print('Accuracy      : %f' % (accuracy),"%")


# precision tp / (tp + fp)
precision = precision_score(testy, yhat_classes)*100
print('Precision     : %f' % precision,"%")


# recall: tp / (tp + fn)
recall = recall_score(testy, yhat_classes)*100
print('Recall        : %f' % recall,"%")


# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(testy, yhat_classes)*100
print('F1 score      : %f' % f1,"%")
 

# ROC AUC
# Computed from the hard 0/1 predictions, not from class scores.
auc = roc_auc_score(testy, yhat_classes)*100
print("roc_auc_score :",auc,"%")

Accuracy      : 85.637778 %
Precision     : 73.411038 %
Recall        : 66.012721 %
F1 score      : 69.515589 %
roc_auc_score : 79.0624527405946 %


<img src = "Adult_Dataset_CART.png"> 

# `Jai Chaudhry 2K18-SE-069`