In [1]:
import numpy as np
import pandas as pd

In [2]:
# Training data for the decision tree.
# Each tuple is one household, given in the same order as `columns` below;
# "Outcome" is the class label, the other four columns are the features.
df = pd.DataFrame(
    [
        ("Suburban", "Detached", "High", "No", "Not Responded"),
        ("Suburban", "Semi-detached", "High", "Yes", "Responded"),
        ("Rural", "Semi-detached", "Low", "No", "Responded"),
        ("Urban", "Detached", "Low", "Yes", "Not Responded"),
        ("Suburban", "Semi-detached", "High", "No", "Responded"),
        ("Suburban", "Semi-detached", "High", "Yes", "Responded"),
        ("Suburban", "Detached", "Low", "No", "Not Responded"),
        ("Rural", "Detached", "Low", "Yes", "Responded"),
        ("Rural", "Semi-detached", "High", "No", "Responded"),
        ("Rural", "Semi-detached", "High", "Yes", "Responded"),
        ("Urban", "Detached", "Low", "Yes", "Not Responded"),
        ("Urban", "Semi-detached", "Low", "Yes", "Not Responded"),
        ("Urban", "Semi-detached", "High", "No", "Responded"),
        ("Urban", "Detached", "Low", "No", "Responded"),
    ],
    columns=["District", "House Type", "Income", "Previous Customer", "Outcome"],
)

df

Unnamed: 0,District,House Type,Income,Previous Customer,Outcome
0,Suburban,Detached,High,No,Not Responded
1,Suburban,Semi-detached,High,Yes,Responded
2,Rural,Semi-detached,Low,No,Responded
3,Urban,Detached,Low,Yes,Not Responded
4,Suburban,Semi-detached,High,No,Responded
5,Suburban,Semi-detached,High,Yes,Responded
6,Suburban,Detached,Low,No,Not Responded
7,Rural,Detached,Low,Yes,Responded
8,Rural,Semi-detached,High,No,Responded
9,Rural,Semi-detached,High,Yes,Responded


In [3]:
# Feature matrix: every column of `df` except the "Outcome" label,
# selected explicitly by name so the column order is fixed.
features_data = df.loc[:, ["District", "House Type", "Income", "Previous Customer"]]

features_data

Unnamed: 0,District,House Type,Income,Previous Customer
0,Suburban,Detached,High,No
1,Suburban,Semi-detached,High,Yes
2,Rural,Semi-detached,Low,No
3,Urban,Detached,Low,Yes
4,Suburban,Semi-detached,High,No
5,Suburban,Semi-detached,High,Yes
6,Suburban,Detached,Low,No
7,Rural,Detached,Low,Yes
8,Rural,Semi-detached,High,No
9,Rural,Semi-detached,High,Yes


In [4]:
# Target vector: the "Outcome" class label for every row of `df`.
target = df.loc[:, "Outcome"]

target

0     Not Responded
1         Responded
2         Responded
3     Not Responded
4         Responded
5         Responded
6     Not Responded
7         Responded
8         Responded
9         Responded
10    Not Responded
11    Not Responded
12        Responded
13        Responded
Name: Outcome, dtype: object

In [5]:
# Calculate entropy function
def calculate_entropy(df_label):
    """Return the base-2 Shannon entropy of the values in ``df_label``.

    Parameters
    ----------
    df_label : array-like
        Class labels (anything ``np.unique`` accepts, e.g. a pandas Series
        or a list).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the distinct values, where ``p`` is the
        relative frequency of each value.
    """
    # Only the frequency of each distinct value matters, not the value itself.
    _, occurrences = np.unique(df_label, return_counts=True)
    # Relative frequency of every class.
    probabilities = occurrences / np.sum(occurrences)
    # Every class returned by np.unique has a count >= 1, so log2 never sees 0.
    return - np.sum(probabilities * np.log2(probabilities))

In [6]:
# Calculate information gain function
def calculate_information_gain(dataset, root_feature, target_feature):
    """Return the information gain of splitting ``dataset`` on ``root_feature``.

    The gain is the entropy of ``target_feature`` in ``dataset`` minus the
    weighted average of the entropies of ``target_feature`` within each group
    of rows sharing the same ``root_feature`` value.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows belonging to the node being split.
    root_feature : str
        Name of the column to split on.
    target_feature : str
        Name of the class-label column.

    Returns
    -------
    numpy.float64
        Parent entropy minus the weighted child entropy.
    """
    # Entropy of the node itself. It must come from the `dataset` argument:
    # reading a notebook-level frame here would give every subset the entropy
    # of the full data and tie the function to a global variable.
    dataset_entropy = calculate_entropy(dataset[target_feature])
    # Distinct values of the split feature and how many rows carry each one.
    feature, feature_count = np.unique(dataset[root_feature], return_counts=True)
    total_rows = np.sum(feature_count)

    # Calculate the weighted features entropy.
    # Rows are picked with a boolean mask so that a missing value in an
    # unrelated column does not remove the row from its child node (the
    # weights above count every row, so the children must as well).
    weight_feature_entropy = np.sum(
        [(feature_count[i] / total_rows)
         * calculate_entropy(dataset.loc[dataset[root_feature] == feature[i], target_feature])
         for i in range(len(feature))]
    )
    # Information gain = parent node entropy - weighted sum of child entropies
    feature_information_gain = dataset_entropy - weight_feature_entropy
    return feature_information_gain

In [7]:
# Define decision tree
def create_decision_tree(dataset, original_df, feature_list, label, parent = None):
    """Recursively build an ID3-style decision tree as nested dictionaries.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows that reach the current node.
    original_df : pandas.DataFrame
        The complete training data; only used to pick a fallback class when
        ``dataset`` is empty.
    feature_list : list of str
        Column names still available for splitting.
    label : str
        Name of the class-label column.
    parent : optional
        Majority class of the parent node, returned when no features are left.
        When it is ``None`` (e.g. a root call with an empty ``feature_list``)
        the majority class of ``dataset`` is returned instead.

    Returns
    -------
    dict or class label
        A leaf is a single class label. An inner node is
        ``{feature_name: {feature_value: subtree, ...}}``.
    """
    # No rows reached this node: fall back to the most frequent class of the
    # full training data. This has to be tested before anything indexes the
    # (empty) list of classes of `dataset`.
    if len(dataset) == 0:
        all_classes, all_counts = np.unique(original_df[label], return_counts=True)
        return all_classes[np.argmax(all_counts)]

    node_classes, node_counts = np.unique(dataset[label], return_counts=True)

    # Every row has the same class: this is a pure leaf.
    if len(node_classes) == 1:
        return node_classes[0]

    # Most frequent class among the rows of this node. Classes and counts come
    # from the same np.unique call, so the argmax index is valid for both.
    majority_class = node_classes[np.argmax(node_counts)]

    # Nothing left to split on: use the parent's majority class when known.
    if len(feature_list) == 0:
        return parent if parent is not None else majority_class

    # Select the feature with the highest information gain on this node.
    item_values = [calculate_information_gain(dataset, feature, label) for feature in feature_list]
    optimum_feature = feature_list[np.argmax(item_values)]
    # Make tree structure
    decision_tree = {optimum_feature: {}}
    # Features still available below this node.
    features = [i for i in feature_list if i != optimum_feature]

    # Expand tree: one branch per value of the chosen feature.
    for value in np.unique(dataset[optimum_feature]):
        # Boolean mask keeps rows even if an unrelated column has a missing value.
        min_data = dataset[dataset[optimum_feature] == value]
        # Pass the caller's `original_df` down, not a notebook-level frame.
        min_tree = create_decision_tree(min_data, original_df, features, label, majority_class)
        decision_tree[optimum_feature][value] = min_tree
    return decision_tree

In [8]:
# Root-level information gain of every candidate feature, rounded for display.
feature_names = ["District", "House Type", "Income", "Previous Customer"]
gains = {name: round(calculate_information_gain(df, name, "Outcome"), 5) for name in feature_names}

print('Information Gain for the "District" = ', gains["District"])
print('Information Gain for the "House Type" = ', gains["House Type"])
print('Information Gain for the "Income" = ', gains["Income"])
print('Information Gain for the "Previous Customer" = ', gains["Previous Customer"], '\n')

# Build the tree from the full data set using all four features.
tree = create_decision_tree(df, df, feature_names, "Outcome")
tree

Information Gain for the "District" =  0.24675
Information Gain for the "House Type" =  0.23612
Information Gain for the "Income" =  0.15184
Information Gain for the "Previous Customer" =  0.01611 



{'District': {'Rural': 'Responded',
  'Suburban': {'House Type': {'Detached': 'Not Responded',
    'Semi-detached': 'Responded'}},
  'Urban': {'Previous Customer': {'No': 'Responded', 'Yes': 'Not Responded'}}}}