In [38]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.datasets import load_iris

In [39]:
# Shape of the synthetic dataset generated in the cells below.
samples = 20      # number of rows to generate
attributes = 4    # number of feature columns (two categorical pools, two numeric ranges)
classes = 3       # class labels are drawn uniformly from 1..classes (inclusive)

# Seed the stdlib RNG once, up front, so the generated dataset -- and the
# decision tree built from it -- is the same on every Restart & Run All.
SEED = 42
random.seed(SEED)

In [40]:
# Value pools from which the two categorical attributes are sampled.
attribute_1 = list("ABC")
attribute_2 = list("XYZ")

# (low, high) bounds for the two numeric attributes; these tuples are
# star-unpacked into random.uniform when the dataset is generated.
attribute_3 = 1, 10
attribute_4 = 5, 15

In [41]:
# Generate the synthetic dataset one column at a time. The statement order
# matches the final column order and also fixes the order in which values
# are drawn from the `random` module.
data = {}

# Categorical features: one uniformly chosen pool entry per row.
data["Attribute_1"] = [random.choice(attribute_1) for _row in range(samples)]
data["Attribute_2"] = [random.choice(attribute_2) for _row in range(samples)]

# Numeric features: one uniform draw from the (low, high) range per row.
data["Attribute_3"] = [random.uniform(*attribute_3) for _row in range(samples)]
data["Attribute_4"] = [random.uniform(*attribute_4) for _row in range(samples)]

# Target label: an integer in 1..classes, both ends included.
data["Class"] = [random.randint(1, classes) for _row in range(samples)]

In [42]:
# Turn the generated columns into a DataFrame and preview its first rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,Attribute_1,Attribute_2,Attribute_3,Attribute_4,Class
0,A,Y,7.106113,6.300782,3
1,A,X,2.765403,7.854343,2
2,B,Y,7.796849,8.903308,2
3,C,X,6.283586,12.039409,1
4,C,Z,3.268272,8.273143,1


In [None]:
def entropy(df):
    """Return the Shannon entropy, in bits, of the ``Class`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Class`` column holding the labels.

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        class. The integer ``0`` is returned when no term contributes
        (for example, when ``df`` has no rows).

    Raises
    ------
    KeyError
        If ``df`` has no ``Class`` column.
    """
    class_probabilities = df["Class"].value_counts(normalize=True)

    total = 0
    for p in class_probabilities:
        # A categorical ``Class`` column lists unobserved categories with a
        # frequency of 0. ``math.log2(0)`` raises ValueError, and the term
        # p*log2(p) tends to 0 as p -> 0, so such entries are skipped.
        if p > 0:
            total -= p * math.log2(p)
    return total

In [44]:
def split_dataset(df, column_name):
    """Partition ``df`` by the distinct values found in ``column_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to partition.
    column_name : str
        Name of the column whose values define the partitions.

    Returns
    -------
    dict
        Maps each distinct value (in order of first appearance) to the rows
        of ``df`` whose ``column_name`` equals that value. Rows keep their
        original index and all columns. Selection uses ``==``, so a value
        that does not compare equal to itself (e.g. NaN) maps to an empty
        frame.
    """
    column = df[column_name]
    return {value: df[column == value] for value in column.unique()}

In [None]:
def IG(df, attribute):
    """Return the information gain obtained by splitting ``df`` on ``attribute``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``attribute`` and the ``Class`` column read by
        ``entropy``.
    attribute : str
        Column to split on; one partition is made per distinct value.

    Returns
    -------
    float or int
        Entropy of ``df`` minus the size-weighted sum of the entropies of
        the partitions produced by ``split_dataset``.
    """
    entropy_before = entropy(df)
    n_rows = len(df)

    # Entropy remaining after the split: each partition contributes its own
    # entropy, weighted by the share of rows that fall into it.
    entropy_after = sum(
        len(part) / n_rows * entropy(part)
        for part in split_dataset(df, attribute).values()
    )
    return entropy_before - entropy_after

In [None]:
def best_attribute(df):
    """Return the name of the feature column with the highest information gain.

    Every column of ``df`` other than ``Class`` is a candidate. When several
    candidates tie for the best gain, the one that appears first in
    ``df.columns`` is returned.

    Raises
    ------
    ValueError
        If ``df`` has no column other than ``Class`` (``max`` of an empty
        sequence).
    """
    candidates = [name for name in df.columns if name != "Class"]
    # max() keeps the first maximal element, so ties resolve by column order.
    return max(candidates, key=lambda name: IG(df, name))

In [47]:
def build_tree(df):
    """Recursively build a decision tree for ``df`` as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows: feature columns plus a ``Class`` label column.

    Returns
    -------
    dict or scalar
        A leaf is a single class label. An internal node is a dict of the
        form ``{attribute: {value: subtree, ...}}`` where ``attribute`` is
        the column chosen by ``best_attribute`` and each ``value`` is one of
        its distinct values in ``df``.
    """
    labels = df["Class"]

    # Pure node: every row carries the same label.
    if labels.nunique() == 1:
        return labels.iloc[0]

    # No features left to split on (only ``Class`` remains): majority vote.
    if len(df.columns) == 1:
        return labels.mode()[0]

    best_attr = best_attribute(df)
    branches = {}
    for value, subset in split_dataset(df, best_attr).items():
        # An empty partition falls back to this node's majority label;
        # otherwise recurse with the used attribute removed so it cannot be
        # selected again further down the same path.
        branches[value] = (
            labels.mode()[0]
            if subset.empty
            else build_tree(subset.drop(columns=[best_attr]))
        )
    return {best_attr: branches}

In [49]:
# Fit the tree on the synthetic dataset; the bare name on the last line makes
# the notebook display the resulting nested dict.
# NOTE(review): split_dataset creates one branch per distinct value, so a
# continuous column such as Attribute_3 yields one branch per unique float
# (see the output below). Consider binning the numeric columns or using
# threshold splits before relying on this tree.
decision_tree = build_tree(df)
decision_tree

{'Attribute_3': {np.float64(7.106112590471241): np.int64(3),
  np.float64(2.765402775615677): np.int64(2),
  np.float64(7.796849312454084): np.int64(2),
  np.float64(6.283585714329733): np.int64(1),
  np.float64(3.26827163646785): np.int64(1),
  np.float64(3.6202080223587467): np.int64(2),
  np.float64(2.4945574093647016): np.int64(2),
  np.float64(1.1170594303533368): np.int64(1),
  np.float64(2.575491525119035): np.int64(2),
  np.float64(4.533623574192272): np.int64(3),
  np.float64(6.266010028333149): np.int64(2),
  np.float64(3.776661207797743): np.int64(1),
  np.float64(8.948439078311043): np.int64(2),
  np.float64(5.935434112498065): np.int64(3),
  np.float64(7.587402123234559): np.int64(2),
  np.float64(4.808177284235077): np.int64(3),
  np.float64(4.666031279359474): np.int64(3),
  np.float64(6.679008382118673): np.int64(3),
  np.float64(4.4869073095889025): np.int64(3),
  np.float64(6.687481377382182): np.int64(2)}}

In [50]:
# Load the Iris feature matrix into a DataFrame with the dataset's own feature
# names as column labels. Only `iris.data` is used here; no target column is
# attached.
# NOTE: this rebinds `df`, replacing the synthetic dataset built earlier.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [51]:
# Settings for the bootstrap experiment below.
K = 1_000                # number of bootstrap resamples to draw
n_samples = df.shape[0]  # size of each resample: one draw per row of df

In [52]:
# Seeded generator so the estimate printed below is reproducible. A single
# generator is shared by all iterations so every bootstrap sample differs.
BOOTSTRAP_SEED = 42
bootstrap_rng = np.random.default_rng(BOOTSTRAP_SEED)

# Despite its name, this list holds the *fraction* of rows left out of each
# bootstrap sample (the name is kept so later cells that use it still work).
ignored_counts = []
for _ in range(K):
    # Draw n_samples rows with replacement; some rows repeat, others are missed.
    bootstrap_sample = df.sample(n=n_samples, replace=True, random_state=bootstrap_rng)
    # Index labels of df that never appear in this bootstrap sample.
    ignored_instances = df.index.difference(bootstrap_sample.index)
    ignored_counts.append(len(ignored_instances) / n_samples)

# For reference: a given row is missed with probability (1 - 1/n)**n,
# which approaches 1/e (about 36.8%) as n grows.
avg_ignored_fraction = np.mean(ignored_counts)
print(f"Average percentage of ignored instances over {K} bootstraps: {avg_ignored_fraction * 100:.2f}%")

Average percentage of ignored instances over 1000 bootstraps: 36.61%
