In [1]:
import pandas as pd
import math
import numpy as np

In [3]:
# Load the car-evaluation data.
# Neither CSV has a header row, so the column names are supplied explicitly;
# the same list is shared by both reads to keep train and test aligned.
car_columns = ['buying','maint','doors','persons','lug_boot','safety','label']

train_data = pd.read_csv(
    "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\car\\train.csv",
    header=None,
    names=car_columns,
)
test_data = pd.read_csv(
    "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\car\\test.csv",
    header=None,
    names=car_columns,
)

Processing data

In [4]:
# Count the missing values in each column of the training set.
train_data.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
label       0
dtype: int64

In [5]:
# Replace every string level with an integer code, then cast all columns to int.
# A single mapping is shared by train and test so the two encodings cannot drift
# apart; 'med' maps to 2 for every attribute that uses it.
value_codes = {
    "low": 1, "med": 2, 'high': 3, 'vhigh': 4,   # buying / maint / safety levels
    '5more': 5, 'more': 5,                       # doors / persons open-ended levels
    "small": 1, "big": 3,                        # lug_boot levels
    'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4, # label
}

# Non-inplace calls rebind the names instead of mutating the loaded frames.
train_data = train_data.replace(value_codes).astype('int')
test_data = test_data.replace(value_codes).astype('int')

| label values

unacc(1), acc(2), good(3), vgood(4)

| attributes

buying:   vhigh(4), high(3), med(2), low(1).

maint:    vhigh(4), high(3), med(2), low(1).

doors:    2, 3, 4, 5more(5).

persons:  2, 4, more(5).

lug_boot: small(1), med(2), big(3).

safety:   low(1), med(2), high(3).

Functions for calculating information gain (entropy, majority error, Gini index)

In [6]:
#calculate entropy
def entropy (data, label):
    """Return the Shannon entropy (base 2) of column ``label`` in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to measure.
    label : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes. Returns 0.0 when
        there are no rows or only one class is present. Missing labels (NaN)
        are ignored, so proportions are taken over the non-missing labels.
    """
    if len(data) == 0:
        return 0.0

    # value_counts drops NaN by default; zero-count entries (possible with a
    # categorical dtype) are removed so log2 is never evaluated at 0.
    proportions = data[label].value_counts(normalize=True)
    proportions = proportions[proportions > 0]

    # A pure (or label-less) node has no uncertainty; returning early also
    # avoids producing -0.0.
    if len(proportions) <= 1:
        return 0.0

    return float(-(proportions * np.log2(proportions)).sum())

In [7]:
def ME_fun(data, label):
    """Return the majority error of column ``label`` in ``data``.

    Majority error is the fraction of rows that would be misclassified if
    every row were assigned the most frequent label: ``1 - max_count / total``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to measure.
    label : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        A value in ``[0, 1)``; 0.0 for a pure node or when there are no
        (non-missing) labels.
    """
    counts = data[label].value_counts()
    total = counts.sum()
    if total == 0:
        return 0.0

    # Everything that is not the majority class counts as an error. Using the
    # smallest class instead would give 1 for a pure node and be wrong for
    # more than two classes.
    return float(1 - counts.max() / total)

In [8]:
def Gini(data, label):
    """Return the Gini index of column ``label`` in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to measure.
    label : str
        Name of the column holding the class labels.

    Returns
    -------
    float
        ``1 - sum(p_k ** 2)`` over the observed classes; 0.0 for a pure node
        or when there are no (non-missing) labels.
    """
    proportions = data[label].value_counts(normalize=True)
    if proportions.empty:
        return 0.0

    # The proportions must be squared before summing: the plain proportions
    # always sum to 1, which would make the index identically 0.
    return float(1 - (proportions ** 2).sum())

In [9]:
def information_gain_fun(data, type, y):
    """Return the name of the column of ``data`` with the largest gain.

    For every column other than ``y`` the gain is the impurity of the whole
    frame minus the size-weighted impurity of the subsets obtained by
    splitting on that column's distinct values.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate feature columns plus the label column ``y``.
    type : str
        Impurity measure: ``'en'`` (entropy), ``'me'`` (majority error) or
        ``'gini'`` (Gini index). Any other value falls back to entropy.
        (The name shadows the builtin ``type``; it is kept so existing
        keyword calls continue to work.)
    y : str
        Name of the label column.

    Returns
    -------
    The column name with the highest gain; on ties the earliest column wins.

    Raises
    ------
    ValueError
        If ``data`` has no column other than ``y``.
    """
    # Resolve the impurity measure once instead of branching inside the loops.
    impurity = {'en': entropy, 'me': ME_fun, 'gini': Gini}.get(type, entropy)

    total_impurity = impurity(data, y)
    n_rows = len(data)

    best_feature = None
    best_gain = None

    # Skip the label by name rather than assuming it is the last column.
    for feature in data.columns:
        if feature == y:
            continue

        expected_impurity = 0
        for value in sorted(data[feature].unique()):
            subset = data[data[feature] == value]
            expected_impurity += (len(subset) / n_rows) * impurity(subset, y)

        gain = total_impurity - expected_impurity
        # Strict '>' keeps the first column among equal gains.
        if best_gain is None or gain > best_gain:
            best_feature, best_gain = feature, gain

    if best_feature is None:
        raise ValueError("information_gain_fun: no feature columns to evaluate")

    return best_feature


In [10]:
def ID3 (data, type, y, features, current_depth, max_depth):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain the label column ``y``.
    type : str
        Impurity measure forwarded to ``information_gain_fun``
        (``'en'``, ``'me'`` or ``'gini'``).
    y : str
        Name of the label column.
    features : list
        Column names that may still be used for splitting.
    current_depth : int
        Number of splits already made above this node (0 at the root).
    max_depth : int
        Maximum number of split levels allowed in the tree.

    Returns
    -------
    Either a leaf (a raw label value taken from ``data[y]``) or a nested dict
    ``{feature: {feature_value: subtree_or_leaf, ...}}``. Returns ``None``
    when ``data`` has no rows.
    """
    if len(data) == 0:
        return None

    labels = data[y]

    # Pure node: every leaf is a raw label value, so predictions can be
    # compared directly with the true labels.
    if labels.nunique() == 1:
        return labels.dropna().unique()[0]

    # Only columns that are both requested and present may be split on;
    # the data's column order is kept so tie-breaking stays stable.
    usable = [col for col in data.columns if col in features and col != y]

    # No feature left, or depth budget used up: fall back to the majority
    # label. '>=' caps the tree at max_depth split levels when the root is
    # called with current_depth=0 ('>' would allow max_depth + 1 levels).
    if len(usable) == 0 or current_depth >= max_depth:
        return labels.mode()[0]

    # Hand over only the candidate columns, with the label last.
    candidates = data[usable + [y]]
    best_feature = information_gain_fun(candidates, type, y)
    remaining = [f for f in features if f != best_feature]

    branches = {}
    # Values come from the data itself, so each subset is non-empty.
    for value in sorted(candidates[best_feature].unique()):
        subset = candidates[candidates[best_feature] == value]
        branches[value] = ID3(
            subset.drop(columns=best_feature),
            type, y, remaining, current_depth + 1, max_depth,
        )

    return {best_feature: branches}

In [12]:
def predict (tree, predictors):
    """Follow ``tree`` down to a leaf for one observation.

    Parameters
    ----------
    tree : dict or leaf value
        A nested dict ``{feature: {feature_value: subtree_or_leaf}}``, or a
        bare leaf value.
    predictors : mapping
        Feature name -> value for the observation (for example a pandas
        Series for one row, or a dict).

    Returns
    -------
    The leaf reached, or ``None`` if the observation has a feature value for
    which the tree has no branch (or the tree is an empty dict).
    """
    node = tree

    # Walk the tree that was passed in, one split per iteration.
    while isinstance(node, dict):
        if not node:
            return None

        feature = next(iter(node))      # the single feature this node splits on
        branches = node[feature]
        value = predictors[feature]

        if value not in branches:
            return None                 # value not seen when the tree was built

        node = branches[value]

    return node


In [13]:
def _same_label(predicted, truth):
    """Return True when a predicted leaf denotes the label ``truth``.

    A leaf may be a raw label value or a display string of the form
    ``"y = <label>"``; both forms are accepted.
    """
    if isinstance(predicted, str) and predicted.startswith("y = "):
        return predicted[len("y = "):] == str(truth)
    return predicted == truth


def evaluation(verify_data, features, y, tree=None): #test
    """Return the fraction of rows in ``verify_data`` that are misclassified.

    Parameters
    ----------
    verify_data : pandas.DataFrame
        Rows to score; must contain the ``features`` columns and ``y``.
    features : list
        Feature column names handed to ``predict`` for each row.
    y : str
        Name of the label column.
    tree : dict or leaf value, optional
        Decision tree to evaluate. When omitted, the notebook-level variable
        ``trees`` is used.

    Returns
    -------
    float
        Misclassified rows divided by total rows. A prediction of ``None``
        (no matching branch) counts as an error. Returns 0.0 for an empty
        frame.
    """
    if tree is None:
        tree = trees  # fall back to the tree built at notebook level

    num_rows = len(verify_data)
    if num_rows == 0:
        return 0.0

    inaccurate_case = 0
    for i in range(num_rows):
        row = verify_data.iloc[i]       # score the i-th row, not always the first
        true_y = row[y]
        predict_value = predict(tree, row[features])

        if not _same_label(predict_value, true_y):
            inaccurate_case += 1

    return inaccurate_case / num_rows

: 

In [14]:
# Main: configure the run, build the tree, and measure its prediction error.
features = ['buying','maint','doors','persons','lug_boot','safety']
data = train_data
y = 'label'
type = 'en'   # impurity measure: 'en', 'me', 'gini' (NOTE: shadows the builtin `type`)
max_depth = 2  # maximum depth passed to ID3
verify_data = train_data #test_data

# max_depth must be passed by keyword here: a positional argument may not
# follow the keyword argument current_depth=0.
trees = ID3(data, type, y, features, current_depth=0, max_depth=max_depth)
predicted_error = evaluation(verify_data, features, y)

Question 3

In [45]:
# Load the bank data.
# Neither CSV has a header row, so the column names are supplied explicitly;
# the same list is shared by both reads to keep train and test aligned.
# Note: this rebinds train_data / test_data.
bank_columns = ['age','job','marital','education','default','balance','housing','loan','contact','day','month','duration','campaign','pdays','previous','poutcome','y']

train_data = pd.read_csv(
    "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\bank\\train.csv",
    header=None,
    names=bank_columns,
)

test_data = pd.read_csv(
    "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\bank\\test.csv",
    header=None,
    names=bank_columns,
)

In [46]:
# Continuous variables: age, balance, day, duration, campaign, pdays, previous.
# Transform continuous variables into binary categories and encode the
# categorical variables as integer codes.
def data_preprocessing(data, continuous, category):
    """Binarize continuous columns and integer-encode categorical columns.

    ``data`` is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform.
    continuous : list
        Columns to binarize: values below the column's threshold become 0,
        all other values become 1.
    category : list
        Columns to replace by integer codes.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, transformed.
    """
    for var in continuous:
        modes = data[var].mode()
        if modes.empty:
            continue  # empty or all-missing column: nothing to threshold

        # NOTE(review): the threshold is the column's most frequent value
        # (the smallest one when several tie). Binarization is often done at
        # the median instead - confirm that the mode is intended.
        threshold = modes[0]

        # "not (x < threshold)" rather than "x >= threshold" so that missing
        # values map to 1, exactly as a per-element `0 if x < t else 1` does.
        data[var] = (~(data[var] < threshold)).astype('int64')

    for var2 in category:
        # Sort the levels so a given category always receives the same code,
        # regardless of the order in which rows appear. With first-appearance
        # order, two frames holding the same categories could be encoded
        # differently. Missing values are excluded and get code -1.
        levels = sorted(data[var2].dropna().unique())
        data[var2] = pd.Categorical(data[var2], categories=levels, ordered=True).codes

    return data

In [47]:
# Numeric columns that data_preprocessing turns into 0/1 indicators.
continuous = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]

# String-valued columns (including the label 'y') that data_preprocessing
# replaces with integer codes.
category = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome', 'y',
]

# Each split is preprocessed on its own.
update_train = data_preprocessing(train_data, continuous, category)
update_test = data_preprocessing(test_data, continuous, category)