In [129]:
import numpy as np
import pandas as pd

In [130]:
# Load the dataset from the Excel workbook and drop the "RID" column
# (presumably a record id -- confirm) so it is not offered as a split
# attribute: the functions below treat every column except the last as an
# attribute and the last column as the class label.
df = pd.read_excel("dataset.xlsx").drop(columns=["RID"])

In [131]:
def entropy(df):
    """Return the Shannon entropy (in bits) of the class labels in ``df``.

    The class label is taken to be the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class label.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. Zero for a frame with no rows or a single distinct
        label. Missing labels are ignored (``value_counts`` drops them).
    """
    labels = df[df.columns[-1]]
    # pandas' value_counts is used instead of collections.Counter so the
    # function only relies on names this notebook actually imports
    # (Counter is never imported and would raise NameError).
    probabilities = labels.value_counts(normalize=True)
    # Drop zero-frequency entries (e.g. unused categories of a categorical
    # column): 0 * log2(0) evaluates to NaN instead of the intended 0.
    probabilities = probabilities[probabilities > 0]
    return float(-(probabilities * np.log2(probabilities)).sum())

In [132]:
def infoGain(df,attributeIndex):
    """Return the information gain obtained by splitting ``df`` on one attribute.

    Information gain is the entropy of the whole frame minus the
    size-weighted entropy of the partitions induced by the attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class label.
    attributeIndex : hashable
        Column *label* of the attribute to split on. Despite the name it is
        used as a key into ``df``, not as a positional index.

    Returns
    -------
    float
        ``entropy(df) - sum(|subset| / |df| * entropy(subset))``.
    """
    totalEntropy = entropy(df)
    column = df[attributeIndex]
    totalRows = len(df)

    def weightedSubsetEntropy(value):
        # Entropy of the rows carrying this attribute value, scaled by the
        # share of rows that fall into the partition.
        subset = df[column == value]
        return (len(subset) / totalRows) * entropy(subset)

    # unique() yields values in first-appearance order. Iterating a set of
    # strings instead would depend on the per-process hash seed, so the
    # floating-point sum could differ between kernel sessions.
    weightedEntropy = sum(weightedSubsetEntropy(value) for value in column.unique())
    return totalEntropy - weightedEntropy

In [144]:
def entropyPerAttribute(data):
    """Compute the size-weighted label entropy after splitting on each attribute.

    Every column except the last is treated as an attribute; the last
    column is the class label consumed by ``entropy``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of attributes followed by the class label.

    Returns
    -------
    dict
        Maps each attribute's column label to the sum, over its distinct
        values, of ``|partition| / |data| * entropy(partition)``.
    """
    n_rows = len(data)
    weighted_by_attribute = {}
    for column_name in data.columns[:-1]:
        column = data[column_name]
        running_total = 0
        for level in column.unique():
            partition = data[column == level]
            # A missing value never equals itself, so its mask selects no
            # rows; such empty partitions contribute nothing.
            if partition.empty:
                continue
            running_total += (len(partition) / n_rows) * entropy(partition)
        weighted_by_attribute[column_name] = running_total
    return weighted_by_attribute

In [134]:
def bestAttributeId3(df):
    """Pick the attribute with the largest information gain (ID3 criterion).

    Every column except the last is a candidate. Ties go to the left-most
    column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of attributes followed by the class label.

    Returns
    -------
    The column label of the winning attribute.

    Raises
    ------
    ValueError
        If ``df`` has no attribute columns to choose from.
    """
    columns = df.columns
    gains = [infoGain(df, columns[position]) for position in range(len(columns) - 1)]
    # max() keeps the first maximal position, so ties favour earlier columns.
    winner = max(range(len(gains)), key=gains.__getitem__)
    return columns[winner]

In [145]:
# ID3 criterion: report the attribute with the highest information gain.
print(f"Best Attribute for splitting: {bestAttributeId3(df)}")

Best Attribute for splitting: Age


In [136]:
# Size-weighted label entropy remaining after splitting on each attribute
# (i.e. the term subtracted from the total entropy in infoGain); a lower
# value means the split yields purer partitions.
entropy_values = entropyPerAttribute(df)
print(f"Entropy for each attribute: {entropy_values}")

Entropy for each attribute: {'Age': 0.6935361388961918, 'Income': 0.9110633930116763, 'Student': 0.7884504573082896, 'Credit_rating': 0.8921589282623617}


In [137]:
def gini_index(data):
    """Return the Gini impurity of the class labels in ``data``.

    The class label is the last column (selected by position).

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class label.

    Returns
    -------
    ``1 - sum(p ** 2)`` over the relative frequency ``p`` of each distinct
    label; ``1`` when ``data`` has no rows.
    """
    target = data.iloc[:, -1]
    n_rows = len(target)
    squared_shares = ((freq / n_rows) ** 2 for freq in target.value_counts())
    return 1 - sum(squared_shares)

In [138]:
def gini_attribute(data, attribute):
    """Return the size-weighted Gini impurity of splitting ``data`` on ``attribute``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class label.
    attribute : hashable
        Column label of the attribute to split on.

    Returns
    -------
    Sum over the attribute's distinct values of
    ``|partition| / |data| * gini_index(partition)``.
    """
    column = data[attribute]
    n_rows = len(data)

    def weighted_impurity(level):
        # Impurity of the rows carrying this value, scaled by their share.
        partition = data[column == level]
        return (len(partition) / n_rows) * gini_index(partition)

    return sum(weighted_impurity(level) for level in column.unique())

In [139]:
def best_attribute_cart(data):
    """Pick the attribute whose split has the lowest weighted Gini impurity.

    Every column except the last is a candidate. Ties go to the left-most
    column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of attributes followed by the class label.

    Returns
    -------
    The column label of the winning attribute.

    Raises
    ------
    ValueError
        If ``data`` has no attribute columns to choose from.
    """
    candidates = list(data.columns[:-1])
    scores = [gini_attribute(data, name) for name in candidates]
    # min() keeps the first minimal position, so ties favour earlier columns.
    best_position = min(range(len(scores)), key=scores.__getitem__)
    return candidates[best_position]

In [140]:
def cart_tree(data, depth=0, max_depth=5):
    """Recursively build a decision tree using Gini-based attribute selection.

    At each node the attribute chosen by ``best_attribute_cart`` is used to
    split the rows, with one branch per distinct value of that attribute.
    The chosen column is removed from the rows passed to the children.

    Parameters
    ----------
    data : pandas.DataFrame
        Table of attributes followed by the class label in the last column.
    depth : int, default 0
        Depth of the current node; callers normally leave this at 0.
    max_depth : int, default 5
        Depth at which recursion stops and a majority-label leaf is
        returned instead of splitting further.

    Returns
    -------
    A class label for a leaf, or a nested dict of the form
    ``{attribute: {value: subtree_or_label, ...}}`` for an internal node.
    """
    labels = data.iloc[:, -1]
    if len(labels.unique()) == 1:  # Pure node: every row has the same label
        return data.iloc[0, -1]
    # Stop at the depth limit, or when only the label column is left and
    # there is nothing to split on; fall back to the most frequent label.
    if depth >= max_depth or len(data.columns) == 1:
        return labels.mode()[0]

    best_attr = best_attribute_cart(data)
    branches = {}
    for value in data[best_attr].unique():
        # Remove the used attribute so it cannot be selected again below.
        subset = data[data[best_attr] == value].drop(columns=[best_attr])
        branches[value] = cart_tree(subset, depth + 1, max_depth)

    return {best_attr: branches}

In [141]:
# CART criterion: report the attribute with the lowest weighted Gini index.
print(f"Best Attribute for splitting: {best_attribute_cart(df)}")

Best Attribute for splitting: Age


In [142]:
# Build the nested-dict decision tree from the full dataset, starting at depth 0.
cart_result = cart_tree(df)

In [143]:
# Display the tree: {attribute: {value: subtree or predicted label}}.
cart_result

{'Age': {'youth': {'Student': {'no': 'no', 'yes': 'yes'}},
  'middle_aged': 'yes',
  'senior': {'Credit_rating': {'fair': 'yes', 'excellent': 'no'}}}}