In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
df = pd.read_csv(url, header=None, na_values="?")
df.dropna(inplace=True)
df = df.astype(int)

# Define features and target
df.columns = ['ID', 'Clump_Thickness', 'Uniformity_Cell_Size', 'Uniformity_Cell_Shape', 'Marginal_Adhesion',
              'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses', 'Class']
X = df.iloc[:, 1:-1]
y = df['Class']

# Train decision tree
clf = DecisionTreeClassifier(max_depth=2, min_samples_leaf=2, min_samples_split=5, criterion='gini', random_state=42)
clf.fit(X, y)

# Compute Gini, Entropy, and Misclassification Error
def calculate_metrics(tree):
    root = tree.tree_
    gini = root.impurity[0]
    entropy = -(gini * np.log2(gini + 1e-9) + (1 - gini) * np.log2(1 - gini + 1e-9))
    misclassification = 1 - np.max(root.value[0]) / np.sum(root.value[0])
    return gini, entropy, float(misclassification)

gini, entropy, misclassification = calculate_metrics(clf)
print(f"Gini: {gini:.4f}, Entropy: {entropy:.4f}, Misclassification Error: {misclassification:.4f}")

# Compute Information Gain
info_gain = entropy - np.sum(clf.tree_.weighted_n_node_samples[1:] / clf.tree_.weighted_n_node_samples[0] * clf.tree_.impurity[1:])
print(f"Information Gain: {info_gain:.4f}")

# Get first split feature and threshold
feature_index = clf.tree_.feature[0]
feature_name = X.columns[feature_index]
threshold = clf.tree_.threshold[0]

print(f"First split feature: {feature_name}")
print(f"Decision boundary value: {threshold:.2f}")


Gini: 0.4550, Entropy: 0.9941, Misclassification Error: 0.3499
Information Gain: 0.7825
First split feature: Uniformity_Cell_Size
Decision boundary value: 2.50
