In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [11]:
import numpy as np
import pandas as pd

# Define the entropy function
def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts=True)
    entropy = np.sum([(-counts[i]/np.sum(counts)) * np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy

# Define the Information Gain function
def InfoGain(data, split_attribute_name, target_name="label"):
    total_entropy = entropy(data[target_name])
    vals, counts = np.unique(data[split_attribute_name], return_counts=True)
    Weighted_Entropy = np.sum([(counts[i]/np.sum(counts)) * entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name]) for i in range(len(vals))])
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

# Define the ID3 algorithm
def ID3(data, originaldata, features, target_attribute_name="label", parent_node_class = None):
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
    elif len(data) == 0:
        return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
    elif len(features) == 0:
        return parent_node_class
    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        item_values = [InfoGain(data, feature, target_attribute_name) for feature in features]
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]
        tree = {best_feature:{}}
        features = [i for i in features if i != best_feature]
        for value in np.unique(data[best_feature]):
            sub_data = data.where(data[best_feature] == value).dropna()
            subtree = ID3(sub_data, originaldata, features, target_attribute_name, parent_node_class)
            tree[best_feature][value] = subtree
        return tree

# Load the data
train_data = pd.read_csv('/mnt/data/adult.train.10k.discrete', header=None)
test_data = pd.read_csv('/mnt/data/adult.test.10k.discrete', header=None)

# Define column names
columns = ["label", "workclass", "education", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
train_data.columns = columns
test_data.columns = columns

# Apply the ID3 algorithm
features = train_data.columns.tolist()[1:]
tree = ID3(train_data, train_data, features)

def predict(tree, instance):
    for nodes in tree.keys():
        value = instance[nodes]
        tree = tree[nodes][value]
        prediction = 0

        if type(tree) is dict:
            prediction = predict(tree, instance)
        else:
            prediction = tree
            break;

    return prediction

 #accuracy
def calculate_accuracy(df, tree):
    df["predicted"] = df.apply(predict, axis=1, args=(tree,))
    accuracy = np.sum(df["label"] == df["predicted"]) / len(df)
    return accuracy

train_accuracy = calculate_accuracy(train_data, tree)
test_accuracy = calculate_accuracy(test_data, tree)

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Train Accuracy: 0.8731
Test Accuracy: 0.8731
