In [1]:
import pandas as pd
import math
import numpy as np

Question 2-1: ID3 tree

In [2]:
#calculate entropy
def entropy(data, label):
    """Return the base-2 Shannon entropy of column ``label`` in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to measure.
    label : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct value in the column.
    """
    # Relative frequency of each distinct value (value_counts sorts by count).
    class_share = data[label].value_counts() / len(data)

    # Accumulate p * log2(p) one class at a time, in value_counts order.
    weighted_logs = (share * math.log2(share) for share in class_share)
    return -sum(weighted_logs)

def ME_fun(data, label):
    """Return the majority error of column ``label`` in ``data``.

    Majority error is the fraction of rows that do NOT belong to the most
    frequent class, i.e. ``(total - max_count) / total``.

    The previous ``min_count / total`` formula only matched this definition
    when exactly two classes were present: it reported 1.0 for a pure node
    (a single class) and under-counted errors with three or more classes.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to measure.
    label : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    float
        Majority error in ``[0, 1)``; ``0.0`` for an empty frame.
    """
    class_counts = data[label].value_counts()
    total = int(class_counts.sum())
    if total == 0:
        # Nothing to misclassify; avoids a 0/0 division.
        return 0.0

    # Integer subtraction first keeps the two-class result bit-identical to
    # the old min/total value.
    minority_rows = total - int(class_counts.max())
    return minority_rows / total

def Gini(data, label):
    """Return the Gini impurity of column ``label`` in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to measure.
    label : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the relative frequency ``p`` of every
        distinct value in the column.
    """
    class_share = data[label].value_counts(normalize=True)
    squared_total = sum(share * share for share in class_share)
    return 1 - squared_total

def _impurity_function(type):
    """Return the impurity measure selected by ``type`` (entropy by default)."""
    if type == "me":
        return ME_fun
    if type == "gini":
        return Gini
    # 'en' and any unrecognized value both use entropy.
    return entropy


def information_gain_fun(data, type, y):
    """Return the column of ``data`` that gives the largest impurity reduction.

    Every column except ``y`` is treated as a categorical split candidate.
    For each candidate the gain is::

        impurity(data) - sum(len(subset) / len(data) * impurity(subset))

    where the subsets are the rows sharing one value of the candidate.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain ``y`` and at least one other column.
    type : str
        Impurity criterion: ``'en'`` (entropy), ``'me'`` (majority error) or
        ``'gini'``. Any other value falls back to entropy. (The name shadows
        the builtin inside this function; it is kept for compatibility.)
    y : str
        Name of the label column. It may sit at any position in ``data``.

    Returns
    -------
    The name of the best column. Ties go to the first such column in
    ``data.columns`` order.

    Raises
    ------
    ValueError
        If ``data`` has no column other than ``y``.
    """
    impurity = _impurity_function(type)
    parent_impurity = impurity(data, y)
    n_rows = len(data)

    best_feature = None
    best_gain = None
    for column in data.columns:
        if column == y:
            # The label is never a split candidate, wherever it is located.
            continue

        # Weighted impurity after splitting on this column. Values are
        # visited in sorted order so the floating-point sum is reproducible.
        expected_impurity = 0
        for value in sorted(data[column].unique()):
            subset = data[data[column] == value]
            expected_impurity += len(subset) / n_rows * impurity(subset, y)

        gain = parent_impurity - expected_impurity
        # Strict '>' keeps the first column on ties.
        if best_gain is None or gain > best_gain:
            best_feature, best_gain = column, gain

    if best_feature is None:
        raise ValueError("data has no candidate feature columns besides %r" % (y,))
    return best_feature

def ID3 (data, type, y, features, current_depth, max_depth, feature_values=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    A leaf is a bare label value. An internal node is a one-key dict
    ``{feature: {value: subtree_or_label, ...}}``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows reaching this node; must contain column ``y``.
    type : str
        Impurity criterion forwarded to ``information_gain_fun``
        ('en', 'me' or 'gini').
    y : str
        Name of the label column.
    features : list
        Feature names still available for splitting at this node.
    current_depth : int
        Depth of this node (the root call passes 0).
    max_depth : int
        Nodes at this depth or deeper become leaves.
    feature_values : dict, optional
        Maps each feature to every value it may take. When omitted it is
        collected from ``data`` on the first call and passed unchanged to
        all recursive calls.

    Returns
    -------
    A label value (leaf) or a nested dict (internal node).

    Notes
    -----
    Every internal node gets one branch per value in ``feature_values``. A
    value with no rows at this node maps to the node's majority label.
    Previously only values present at the node got a branch, so ``predict``
    returned ``None`` for rows whose value was seen elsewhere in the
    training data but not in that node's subset.
    """
    # All rows share one label: that label is the leaf.
    if data[y].nunique() == 1:
        return data[y].unique()[0]

    # No features left or depth budget spent: majority-label leaf.
    if len(features) == 0 or current_depth >= max_depth:
        return data[y].mode()[0]

    if feature_values is None:
        # Root call: record the full domain of every feature once.
        feature_values = {f: list(data[f].unique()) for f in features if f in data.columns}

    # Only columns listed in `features` may be split on. The label goes last
    # because the split selector may treat the final column as the label.
    candidate_columns = [c for c in data.columns if c in features and c != y]
    if not candidate_columns:
        return data[y].mode()[0]
    best_feature = information_gain_fun(data[candidate_columns + [y]], type, y)

    majority_label = data[y].mode()[0]
    remaining_features = [f for f in features if f != best_feature]
    branch_values = feature_values.get(best_feature)
    if branch_values is None:
        # Caller-supplied mapping does not cover this feature.
        branch_values = data[best_feature].unique()

    branches = {}
    for value in branch_values:
        subset = data[data[best_feature] == value]
        if subset.empty:
            # Value never occurs at this node: fall back to the majority.
            branches[value] = majority_label
        else:
            child_data = subset.loc[:, subset.columns != best_feature]
            branches[value] = ID3(child_data, type, y, remaining_features,
                                  current_depth + 1, max_depth, feature_values)
    return {best_feature: branches}

def predict(trees, predictors):
    """Walk a nested-dict decision tree and return the label it reaches.

    Parameters
    ----------
    trees : dict or label
        An internal node ``{feature: {value: subtree_or_label}}`` or a bare
        leaf label.
    predictors : mapping
        Feature name -> value for the observation being classified.

    Returns
    -------
    The leaf label reached, or ``None`` when the observation carries a
    value that has no branch at some node.
    """
    node = trees
    # Descend until something that is not a dict (a leaf) is reached.
    while isinstance(node, dict):
        split_feature = next(iter(node))      # the node's single key
        branches = node[split_feature]
        observed = predictors[split_feature]
        if observed not in branches:
            return None
        node = branches[observed]
    return node
    
def evaluation(trees, verify_data, features, y):
    """Return the misclassification rate of ``trees`` on ``verify_data``.

    Parameters
    ----------
    trees : dict or label
        Decision tree in the nested-dict form consumed by ``predict``.
    verify_data : pandas.DataFrame
        Rows to score; must contain ``features`` and ``y``.
    features : list
        Column names handed to ``predict`` for each row.
    y : str
        Name of the true-label column.

    Returns
    -------
    float
        Number of rows whose prediction differs from the true label,
        divided by the number of rows. A ``None`` prediction counts as an
        error.
    """
    total_rows = len(verify_data)

    # One unit per row whose predicted label disagrees with the true label.
    wrong = sum(
        1
        for _, record in verify_data.iterrows()
        if predict(trees, record[features].to_dict()) != record[y]
    )
    return wrong / total_rows

Question 2-2: car data

In [21]:
#load data
# Folder and column names are shared by the train and test files.
CAR_DATA_DIR = "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\car"
CAR_COLUMNS = ['buying','maint','doors','persons','lug_boot','safety','label']
train_data = pd.read_csv(CAR_DATA_DIR + "\\train.csv", header = None, names = CAR_COLUMNS)
test_data = pd.read_csv(CAR_DATA_DIR + "\\test.csv", header = None, names = CAR_COLUMNS)

#Main function
features = ['buying','maint','doors','persons','lug_boot','safety']
max_depth_lst = list(range(1, 7))  #depths 1-6
type3 = ['en', 'me', 'gini'] #three types of approach to calculate the information gain
y = 'label'
current_depth = 0
comparison_lst = []

# The loop variable is `criterion`, not `type`, so the builtin type() is not
# rebound for the rest of the notebook.
for criterion in type3:
    for max_depth in max_depth_lst:
        trees = ID3(train_data, criterion, y, features, current_depth, max_depth)
        predicted_error_train = evaluation(trees, train_data, features, y)
        predicted_error_test = evaluation(trees, test_data, features, y)

        comparison_lst.append({
            "Criterion": criterion,
            "Max Depth": max_depth,
            "training_error":predicted_error_train,
            "testing_error":predicted_error_test
        })

car_compare_table = pd.DataFrame(comparison_lst)
car_compare_table.to_csv('car_comparison_table.csv')
# Only the last expression of a cell is displayed, so show the table last.
car_compare_table

Question 3: Bank data

In [4]:
#load data
# Folder and column names are shared by the train and test files.
BANK_DATA_DIR = "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\bank"
BANK_COLUMNS = ['age','job','marital','education','default','balance','housing','loan','contact',
                'day','month','duration','campaign','pdays','previous','poutcome','y']

train_data = pd.read_csv(BANK_DATA_DIR + "\\train.csv", header = None, names = BANK_COLUMNS)

test_data = pd.read_csv(BANK_DATA_DIR + "\\test.csv", header = None, names = BANK_COLUMNS)

In [17]:
#regard unknown as a particular attribute
def data_preprocessing_attribute(data, features, continuous, thresholds=None):
    """Binarize the continuous features of ``data`` around a cut-off.

    Each feature listed in both ``features`` and ``continuous`` is replaced
    by ``"no"`` where the value is below the cut-off and ``"yes"`` otherwise
    (missing values compare as not-below and therefore become ``"yes"``).
    All other columns, including any ``'unknown'`` entries, are left alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified IN PLACE and also returned; pass
        a copy to keep the original.
    features : list
        Column names to consider.
    continuous : list
        Subset of ``features`` holding numeric values.
    thresholds : dict, optional
        Feature name -> cut-off. When given, the listed features use these
        cut-offs instead of the median of ``data``. This lets a test frame
        be binarized with cut-offs computed on the training frame. Features
        absent from the mapping still use their own median.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, transformed.
    """
    for var in features:
        if var not in continuous:
            continue

        if thresholds is not None and var in thresholds:
            cutoff = thresholds[var]
        else:
            cutoff = data[var].median()

        # Vectorized equivalent of: "no" if x < cutoff else "yes"
        data[var] = np.where(data[var] < cutoff, "no", "yes")

    return data

#regard unknown as a missing value and replace it with the majority of other values of the same attributes
def data_preprocessing_missing(data, features, continuous):
    """Binarize continuous features and impute ``'unknown'`` in the others.

    * A feature listed in ``continuous`` becomes ``"no"`` where the value is
      below that column's median and ``"yes"`` otherwise.
    * In any other feature, ``'unknown'`` entries (and missing values) are
      replaced by the most frequent remaining value of that column. A column
      with no known value at all is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified IN PLACE and also returned; pass
        a copy to keep the original.
    features : list
        Column names to process.
    continuous : list
        Subset of ``features`` holding numeric values.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, transformed.
    """
    for var1 in features:
        column = data[var1]

        if var1 in continuous:
            median = column.median()
            data[var1] = np.where(column < median, "no", "yes")
            continue

        is_missing = column.isna() | (column == 'unknown')
        modes = column[~is_missing].mode()
        if modes.empty:
            # Every entry is unknown/missing: there is nothing to impute with.
            continue

        # Assign the result back to the frame. Calling
        # `data[var1].replace(..., inplace=True)` works on a temporary Series
        # and does not update `data` under pandas Copy-on-Write.
        data[var1] = column.where(~is_missing, modes.iloc[0])

    return data

In [24]:
#Main function
features = ['age', 'job', 'marital','education', 'default', 'balance', 'housing','loan', 'contact', 'day','month', 
            'duration','campaign','pdays','previous', 'poutcome']

continuous = ['age', 'balance', 'day','duration','campaign','pdays','previous']

#preprocess copies of the raw frames ('unknown' is kept as its own category)
# NOTE: each call computes medians from the frame it is given, so the train
# and test frames may be binarized around different cut-offs.
train_data_att = data_preprocessing_attribute(train_data.copy(), features, continuous)
test_data_att = data_preprocessing_attribute(test_data.copy(), features, continuous)

max_depth_lst = list(range(1, 17))  #range from 1-16
type3 = ['en', 'me', 'gini']
y = 'y'
current_depth = 0
comparison_lst = []

# The loop variable is `criterion`, not `type`, so the builtin type() is not
# rebound for the rest of the notebook.
for criterion in type3:
    for max_depth in max_depth_lst:
        trees = ID3(train_data_att, criterion, y, features, current_depth, max_depth)
        predicted_error_train = evaluation(trees, train_data_att, features, y)
        predicted_error_test = evaluation(trees, test_data_att, features, y)

        comparison_lst.append({
            "Criterion": criterion,
            "Max Depth": max_depth,
            "training_error":predicted_error_train,
            "testing_error":predicted_error_test
        })

bank_att_compare_table = pd.DataFrame(comparison_lst)
bank_att_compare_table.to_csv('bank_att_comparison_table_v3.csv')

In [19]:
#preprocess copies of the raw frames ('unknown' is imputed with the column mode)
# NOTE: each call computes medians and modes from the frame it is given, so
# the train and test frames may be transformed with different statistics.
train_data_miss = data_preprocessing_missing(train_data.copy(), features, continuous)
test_data_miss = data_preprocessing_missing(test_data.copy(), features, continuous)

max_depth_lst = list(range(1, 17)) #range from 1-16
type3 = ['en', 'me', 'gini']
y = 'y'
current_depth = 0
comparison_lst = []

# The loop variable is `criterion`, not `type`, so the builtin type() is not
# rebound for the rest of the notebook.
for criterion in type3:
    for max_depth in max_depth_lst:
        trees = ID3(train_data_miss, criterion, y, features, current_depth, max_depth)
        predicted_error_train = evaluation(trees, train_data_miss, features, y)
        predicted_error_test = evaluation(trees, test_data_miss, features, y)

        comparison_lst.append({
            "Criterion": criterion,
            "Max Depth": max_depth,
            "training_error":predicted_error_train,
            "testing_error":predicted_error_test
        })

bank_miss_compare_table = pd.DataFrame(comparison_lst)
bank_miss_compare_table.to_csv('bank_miss_comparison_table_v3.csv')
# Only the last expression of a cell is displayed, so show the table last.
bank_miss_compare_table