
Only numpy and pandas can be used, so these need to be imported

In [1]:

import numpy as np
import pandas as pd


We use entropy as the impurity measure for the cost, so we first define a function that calculates the entropy of the `target` column.

In [2]:

def entropy(data):
    """Return the Shannon entropy (in bits) of the ``'target'`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'target'`` column with the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct label. The result is 0 for an empty frame or a frame
        holding a single class.

    Notes
    -----
    Missing labels (NaN) are ignored: the frequencies are computed over the
    non-missing labels only.
    """
    # Count the labels once instead of once per distinct label;
    # normalize=True turns the counts into relative frequencies.
    probabilities = data['target'].value_counts(normalize=True)

    total = 0.0
    for p in probabilities:
        # p is never 0 here (only observed labels are listed), so log2 is safe.
        total -= p * np.log2(p)

    return total


Moreover, the CART algorithm needs a function to find possible decisions

In [3]:

def possible_decisions(data):
    """List every candidate ``(feature, value)`` test for ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the features plus the ``'target'`` label
        column.

    Returns
    -------
    list of tuple
        One ``(feature, value)`` pair for each distinct value of each
        feature column, in column order and then order of first appearance.
        The list is empty when there are no feature columns or no rows.
    """
    # Select the features by name rather than by position so the label is
    # never offered as a split candidate, wherever 'target' sits in the frame.
    feature_columns = [column for column in data.columns if column != 'target']

    return [
        (feature, value)
        for feature in feature_columns
        for value in data[feature].unique()
    ]


Now we're ready to create the split function

In [4]:

def split(data):
    """Split ``data`` on the ``(feature, value)`` test with the highest gain.

    Every candidate from ``possible_decisions(data)`` partitions the rows
    into ``feature != value`` (left) and ``feature == value`` (right). The
    candidate with the largest information gain (parent entropy minus the
    size-weighted entropy of both sides) is selected; ties keep the first
    candidate found.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; must contain a ``'target'`` column.

    Returns
    -------
    tuple
        ``(data_left, data_right, best_feature, best_value)``.
        When no candidate has strictly positive information gain (for
        example a pure node, an empty frame, or uninformative features),
        there is nothing to split on: the result is
        ``(data, <empty frame with the same columns>, None, None)``.
    """
    # The parent entropy and row count do not depend on the candidate.
    parent_entropy = entropy(data)
    n_rows = len(data)

    # Best candidate so far; the defaults describe "no split".
    max_information_gain = 0
    best_feature = None
    best_value = None
    best_left = data
    best_right = data.iloc[0:0]

    for feature, value in possible_decisions(data):
        data_left = data[data[feature] != value]
        data_right = data[data[feature] == value]

        # Size-weighted average impurity of the two sides.
        weighted_entropy = (len(data_left)/n_rows*entropy(data_left)
                            + len(data_right)/n_rows*entropy(data_right))
        information_gain = parent_entropy - weighted_entropy

        if information_gain > max_information_gain:
            max_information_gain = information_gain
            best_feature = feature
            best_value = value
            # Keep the winning partition so it is not computed a second time.
            best_left = data_left
            best_right = data_right

    return best_left, best_right, best_feature, best_value


And finally, creating the decision tree

In [5]:

def decision_tree(data, depth=0, max_depth=3):
    """Recursively grow a binary decision tree on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain a ``'target'`` column.
    depth : int, default 0
        Depth of the node being built (the root is at depth 0).
    max_depth : int, default 3
        Depth at which growth stops and a leaf is returned.

    Returns
    -------
    dict
        A leaf is ``{'leaf': True, 'target': majority_label}``.
        An internal node is ``{'leaf': False, 'feature': f, 'value': v,
        'left': subtree, 'right': subtree}``, where ``left`` was grown on
        the rows with ``f != v`` and ``right`` on the rows with ``f == v``.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there is no majority label to predict).

    Notes
    -----
    Prints the entropy of each leaf at the moment the leaf is created.
    """
    if len(data) == 0:
        raise ValueError('cannot build a decision tree from an empty dataset')

    # Majority-class leaf, returned whenever growth stops at this node.
    leaf = {'leaf': True, 'target': data['target'].value_counts().idxmax()}

    # Stop when the depth budget is used up (>= also covers a start depth
    # above the limit), when the node is already pure, or when there is
    # nothing left to test.
    if (depth >= max_depth
            or len(data['target'].unique()) <= 1
            or len(possible_decisions(data)) == 0):
        print(entropy(data))
        return leaf

    #split data
    data_left, data_right, feature, value = split(data)

    # Defensive: a missing split feature or an empty side means there is no
    # useful split, so this node becomes a leaf.
    if feature is None or len(data_left) == 0 or len(data_right) == 0:
        print(entropy(data))
        return leaf

    #create decision tree
    node = {'leaf': False, 'feature': feature, 'value': value}
    node['left'] = decision_tree(data_left, depth+1, max_depth)
    node['right'] = decision_tree(data_right, depth+1, max_depth)

    return node


Let's import the training data and create a decision tree.

Note: the output below is the entropy at each leaf node, which is used to check answer B.

In [6]:

# Load the training set. reset_index(drop=True) only renumbers the row labels
# 0..n-1 and discards the old labels; it does not remove any CSV column.
# NOTE(review): the absolute path is machine-specific.
data_train = (
    pd.read_csv('/Users/irisroeloffzen/Downloads/HW1 2/heart_train_data.csv')
    .reset_index(drop=True)
)

# Grow the tree on the training rows, at most three levels deep
T = decision_tree(data_train, max_depth=3)

0.34351974100740124
0.7642045065086203
0.9182958340544896
0.9494520153879484
0.2811937964320427
0.6840384356390417
0.9321115676166747
0.7219280948873623



Now that the decision tree is made, we can create a function to predict with it and test it, and a function to calculate the accuracy

In [7]:

def predict(T, x):
    """Return the label the tree ``T`` assigns to the sample ``x``.

    ``T`` is a nested dict: a leaf has ``'leaf'`` set to a true value and a
    ``'target'`` label; an internal node has ``'feature'``, ``'value'``,
    ``'left'`` and ``'right'``. ``x`` is indexed by feature name.
    """
    node = T
    # Walk down until a leaf is reached: go left when the sample's feature
    # differs from the node's value, right when it matches.
    while not node['leaf']:
        goes_left = x[node['feature']] != node['value']
        node = node['left'] if goes_left else node['right']
    return node['target']
        
def test(T, data):
    """Predict a label for every row of ``data`` with the tree ``T``.

    Returns a list holding one prediction per row, in row order.
    """
    return [predict(T, data.iloc[row]) for row in range(len(data))]

def accuracy(predictions, data):
    """Return the fraction of ``predictions`` that match ``data['target']``.

    ``predictions[i]`` is compared with the ``'target'`` value of the i-th
    row of ``data``; the number of matches is divided by
    ``len(predictions)``.
    """
    n_correct = sum(
        1
        for position, predicted in enumerate(predictions)
        if predicted == data.iloc[position]['target']
    )
    return n_correct/len(predictions)


Now let's import our test data

In [8]:

# Load the held-out set. reset_index(drop=True) only renumbers the row labels
# 0..n-1 and discards the old labels; it does not remove any CSV column.
# NOTE(review): the absolute path is machine-specific.
data_test = (
    pd.read_csv('/Users/irisroeloffzen/Downloads/HW1 2/heart_validate_data.csv')
    .reset_index(drop=True)
)


Now, to answer the HW question, let's calculate the accuracy

In [9]:

# Keep the score under its own name. Assigning it to `accuracy` would replace
# the accuracy() function with a float, so re-running this cell (or calling
# accuracy() again later) would fail with "'float' object is not callable".
validation_accuracy = accuracy(test(T, data_test), data_test)
print(validation_accuracy)

0.7252747252747253



Let's see what the decision at the root is

In [10]:

# Show the root decision: the feature being tested, then the value it is
# compared against
print(f"{T['feature']} {T['value']}")

cp 0



Now let's see whether a patient who has atypical angina, exercise-induced angina, and a fixed-defect thallium heart scan would be diagnosed as having heart disease by the decision tree classifier.

In [11]:

# One-row frame describing the patient; only these three features are given,
# so predict() raises a KeyError if the path through the tree tests any other
# feature.
data_question = {'cp': 1.0, 'exang': 1.0, 'thal': 2.0}
data_q = pd.DataFrame(data_question, index=[0])

# Keep the result under its own name. Assigning it to `test` would replace the
# test() function with a list and break any later call or re-run of this cell.
question_predictions = test(T, data_q)
print(question_predictions)


[1]
