In [13]:
import numpy as np
from collections import Counter

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier as SkDecisionTree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier  # pip install xgboost if needed


In [14]:
# Breast-cancer dataset bundled with scikit-learn
data = load_breast_cancer()
X, y = data.data, data.target  # feature matrix, labels (0/1)

# Hold out 20% of the rows for testing; the split is seeded and stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_test.shape)


(455, 30) (114, 30)


In [15]:
def entropy(y):
    """Compute the Shannon entropy (in bits) of label array y.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels. Any label type that ``np.unique`` can sort is accepted
        (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    float
        ``-sum(p_k * log2(p_k))`` over the classes that occur in ``y``.
        Zero for a single-class input and for an empty input.
    """
    n_samples = len(y)
    if n_samples == 0:
        # No samples means no uncertainty; this also avoids dividing by zero.
        return 0.0

    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 never sees 0. Unlike np.bincount it is not
    # restricted to non-negative integer labels.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / n_samples
    return -np.sum(probs * np.log2(probs))


def gini(y):
    """Compute the Gini impurity of label array y.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels. Any label type that ``np.unique`` can sort is accepted
        (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    float
        ``1 - sum(p_k ** 2)`` over the classes that occur in ``y``.
        Zero for a single-class input and for an empty input.
    """
    n_samples = len(y)
    if n_samples == 0:
        # An empty node holds no mixed labels, so report it as pure instead of
        # letting the formula evaluate to 1 - 0 = 1.
        return 0.0

    # np.unique works for arbitrary label values, whereas np.bincount is
    # limited to non-negative integers.
    _, counts = np.unique(y, return_counts=True)
    probs = counts / n_samples
    return 1.0 - np.sum(probs**2)


In [16]:
def best_split(X, y, criterion="entropy"):
    """
    Find best feature index and threshold to split on.

    Every unique value of every feature is tried as a threshold ``t``; rows
    with ``X[:, feature] <= t`` go left and the remaining rows go right. The
    split with the largest impurity reduction (parent impurity minus the
    size-weighted impurity of the two children) wins. On ties the first
    candidate in (feature index, ascending threshold) order is kept.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : array-like of shape (n_samples,)
        Labels aligned with the rows of ``X``.
    criterion : {"entropy", "gini"}, default="entropy"
        Impurity measure to minimise.

    Returns
    -------
    (best_feature, best_threshold, best_impurity_gain)
        ``(None, None, 0.0)`` when there are fewer than two samples or when
        no candidate split yields a strictly positive gain.

    Raises
    ------
    ValueError
        If ``criterion`` is not one of the supported names.
    """
    impurity_funcs = {"entropy": entropy, "gini": gini}
    if criterion not in impurity_funcs:
        # Fail loudly rather than silently treating a typo as "gini".
        raise ValueError(
            f"criterion must be 'entropy' or 'gini', got {criterion!r}"
        )
    impurity_func = impurity_funcs[criterion]

    # Boolean-mask indexing below requires arrays, so accept plain lists too.
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples, n_features = X.shape
    if n_samples <= 1:
        return None, None, 0.0

    parent_impurity = impurity_func(y)
    best_gain = 0.0
    best_feature = None
    best_threshold = None

    for feature_idx in range(n_features):
        values = X[:, feature_idx]

        # Candidate thresholds are the sorted unique values. The last one can
        # never produce a two-sided split (everything would fall on one
        # side), so it is skipped up front.
        for t in np.unique(values)[:-1]:
            left_mask = values <= t
            n_left = int(np.count_nonzero(left_mask))
            n_right = n_samples - n_left

            # Still possible for a side to be empty (e.g. NaN thresholds).
            if n_left == 0 or n_right == 0:
                continue  # invalid split

            impurity_left = impurity_func(y[left_mask])
            impurity_right = impurity_func(y[~left_mask])

            child_impurity = (n_left / n_samples) * impurity_left + (n_right / n_samples) * impurity_right
            gain = parent_impurity - child_impurity

            # Strict comparison keeps the earliest candidate on ties.
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_idx
                best_threshold = t

    return best_feature, best_threshold, best_gain


In [17]:
class TreeNode:
    """One node of a binary decision tree.

    A node built with ``feature``/``threshold``/``left``/``right`` describes a
    split; a node built with only ``value`` is a leaf holding a class label.
    All constructor arguments are keyword-only and default to ``None``.
    """

    def __init__(self, *, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: index of the feature to test and the threshold value.
        self.feature, self.threshold = feature, threshold
        # Child subtrees (TreeNode instances) for the left and right branch.
        self.left, self.right = left, right
        # Class label if this node is a leaf.
        self.value = value


In [18]:
class MyDecisionTreeClassifier:
    """Binary decision tree classifier grown by greedy impurity-reducing splits.

    Parameters
    ----------
    criterion : {"entropy", "gini"}, default="entropy"
        Impurity measure handed to ``best_split``.
    max_depth : int or None, default=None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_split : int, default=2
        A node with fewer samples than this becomes a leaf.

    Attributes
    ----------
    root : TreeNode or None
        Root of the fitted tree; ``None`` until ``fit`` has been called.
    n_classes_ : int
        Number of distinct labels seen by ``fit``.
    """

    def __init__(self, criterion="entropy", max_depth=None, min_samples_split=2):
        # Explicit check instead of `assert`, which is removed under `python -O`.
        if criterion not in ("entropy", "gini"):
            raise ValueError(
                f"criterion must be 'entropy' or 'gini', got {criterion!r}"
            )
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """Grow the tree on training data and return ``self``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if ``X`` and ``y`` differ in length, or if
            the training set is empty.
        """
        # Tree building relies on boolean-mask indexing, so coerce lists etc.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if X.shape[0] == 0:
            raise ValueError("cannot fit a tree on an empty training set")

        self.n_classes_ = len(np.unique(y))
        self.root = self._build_tree(X, y, depth=0)
        return self  # allows `clf.fit(X, y).predict(...)` chaining

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)`` at ``depth``."""
        num_samples = X.shape[0]
        num_labels = len(np.unique(y))

        # Stopping conditions: depth limit hit, node already pure, or too few
        # samples to be allowed to split.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or num_labels == 1 or num_samples < self.min_samples_split:
            return TreeNode(value=self._most_common_label(y))

        feature, threshold, gain = best_split(X, y, criterion=self.criterion)

        if feature is None or gain == 0:
            # No split improves impurity, so stop here with a majority leaf.
            return TreeNode(value=self._most_common_label(y))

        # Partition the samples with the same `<=` rule used at predict time.
        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return TreeNode(feature=feature, threshold=threshold, left=left, right=right)

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (first encountered wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def _predict_single(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        # A node with a non-None value is a leaf; otherwise follow the split.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.root is None:
            raise RuntimeError(
                "This MyDecisionTreeClassifier instance is not fitted yet; "
                "call fit() before predict()."
            )
        # Iterate over rows even when X is a nested list or similar.
        X = np.asarray(X)
        return np.array([self._predict_single(x, self.root) for x in X])


In [19]:
# Hand-written decision trees, one per impurity criterion (depth capped at 5)
my_dt_entropy = MyDecisionTreeClassifier(max_depth=5, criterion="entropy")
my_dt_entropy.fit(X_train, y_train)

my_dt_gini = MyDecisionTreeClassifier(max_depth=5, criterion="gini")
my_dt_gini.fit(X_train, y_train)

y_pred_my_entropy = my_dt_entropy.predict(X_test)
y_pred_my_gini = my_dt_gini.predict(X_test)

# scikit-learn reference tree (entropy criterion, same depth cap, fixed seed)
sk_dt = SkDecisionTree(random_state=42, max_depth=5, criterion="entropy")
sk_dt.fit(X_train, y_train)
y_pred_sk = sk_dt.predict(X_test)

# Test-set accuracy of each tree
for label, predictions in (
    ("Accuracy (My DT - Entropy):", y_pred_my_entropy),
    ("Accuracy (My DT - Gini):   ", y_pred_my_gini),
    ("Accuracy (Sklearn DT):     ", y_pred_sk),
):
    print(label, accuracy_score(y_test, predictions))


Accuracy (My DT - Entropy): 0.9385964912280702
Accuracy (My DT - Gini):    0.9298245614035088
Accuracy (Sklearn DT):      0.9298245614035088


In [20]:
# Random Forest: 100 trees, each capped at depth 5, fixed seed
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf_train, y_pred_rf_test = rf.predict(X_train), rf.predict(X_test)

# XGBoost: 200 boosting rounds with row and column subsampling, fixed seed
xgb = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb.fit(X_train, y_train)
y_pred_xgb_train, y_pred_xgb_test = xgb.predict(X_train), xgb.predict(X_test)

# One (train label, train predictions, test label, test predictions) row per model
report_rows = (
    ("My DT (Entropy) - Train:", my_dt_entropy.predict(X_train),
     "My DT (Entropy) - Test :", y_pred_my_entropy),
    ("Sklearn DT      - Train:", sk_dt.predict(X_train),
     "Sklearn DT      - Test :", y_pred_sk),
    ("Random Forest   - Train:", y_pred_rf_train,
     "Random Forest   - Test :", y_pred_rf_test),
    ("XGBoost         - Train:", y_pred_xgb_train,
     "XGBoost         - Test :", y_pred_xgb_test),
)

print("\n=== Model Accuracies ===")
for train_label, train_pred, test_label, test_pred in report_rows:
    print(train_label, accuracy_score(y_train, train_pred))
    print(test_label, accuracy_score(y_test, test_pred))



=== Model Accuracies ===
My DT (Entropy) - Train: 0.9912087912087912
My DT (Entropy) - Test : 0.9385964912280702
Sklearn DT      - Train: 0.9912087912087912
Sklearn DT      - Test : 0.9298245614035088
Random Forest   - Train: 0.9934065934065934
Random Forest   - Test : 0.956140350877193
XGBoost         - Train: 1.0
XGBoost         - Test : 0.956140350877193


In [21]:
# Train / test accuracy summary for every model.
#
# `rf`, `xgb` and the `y_pred_*` arrays are the ones produced by the preceding
# cell. They are deliberately not re-created here: re-fitting with the same
# data, hyper-parameters and random_state would only repeat the training cost
# and yield the same predictions.
print("\n=== Model Accuracies ===")
print("My DT (Entropy) - Train:", accuracy_score(y_train, my_dt_entropy.predict(X_train)))
print("My DT (Entropy) - Test :", accuracy_score(y_test,  y_pred_my_entropy))

print("Sklearn DT      - Train:", accuracy_score(y_train, sk_dt.predict(X_train)))
print("Sklearn DT      - Test :", accuracy_score(y_test,  y_pred_sk))

print("Random Forest   - Train:", accuracy_score(y_train, y_pred_rf_train))
print("Random Forest   - Test :", accuracy_score(y_test,  y_pred_rf_test))

print("XGBoost         - Train:", accuracy_score(y_train, y_pred_xgb_train))
print("XGBoost         - Test :", accuracy_score(y_test,  y_pred_xgb_test))



=== Model Accuracies ===
My DT (Entropy) - Train: 0.9912087912087912
My DT (Entropy) - Test : 0.9385964912280702
Sklearn DT      - Train: 0.9912087912087912
Sklearn DT      - Test : 0.9298245614035088
Random Forest   - Train: 0.9934065934065934
Random Forest   - Test : 0.956140350877193
XGBoost         - Train: 1.0
XGBoost         - Test : 0.956140350877193


In [22]:
# Five highest-importance features reported by each ensemble, largest first
rf_importances = rf.feature_importances_
xgb_importances = xgb.feature_importances_

for header, importances in (
    ("\nTop 5 Feature Importances (Random Forest):", rf_importances),
    ("\nTop 5 Feature Importances (XGBoost):", xgb_importances),
):
    print(header)
    # Ascending argsort read backwards from the end gives the top five.
    for idx in np.argsort(importances)[:-6:-1]:
        print(f"{data.feature_names[idx]}: {importances[idx]:.4f}")



Top 5 Feature Importances (Random Forest):
worst area: 0.1383
worst concave points: 0.1330
worst radius: 0.1008
mean concave points: 0.0985
worst perimeter: 0.0722

Top 5 Feature Importances (XGBoost):
worst perimeter: 0.2547
worst radius: 0.1225
mean concave points: 0.1222
worst area: 0.0915
worst concave points: 0.0645
