<a href="https://colab.research.google.com/github/hamagami/is2024/blob/main/07_bagging_scratch_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bagging scratch classification

In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Generate dataset
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=42
)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Scratch implementation of Bagging Classifier
class ScratchBaggingClassifier:
    def __init__(self, base_estimator, n_estimators=10, random_state=None):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.models = []
        self.random_state = random_state
        if random_state is not None:
            np.random.seed(random_state)

    def fit(self, X, y):
        self.models = []
        n_samples = X.shape[0]
        for i in range(self.n_estimators):
            # Bootstrap sampling
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_sample, y_sample = X[indices], y[indices]
            model = self.base_estimator()  # Instantiate the base model
            model.fit(X_sample, y_sample)
            self.models.append(model)

    def predict(self, X):
        # Aggregate predictions from all models using majority voting
        predictions = np.array([model.predict(X) for model in self.models])
        majority_vote = np.apply_along_axis(
            lambda x: np.bincount(x).argmax(), axis=0, arr=predictions
        )
        return majority_vote


In [7]:

# Evaluate accuracy with a single decision tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

# Evaluate accuracy with scratch implementation of Bagging
scratch_bagging = ScratchBaggingClassifier(
    base_estimator=DecisionTreeClassifier, n_estimators=50, random_state=42
)
scratch_bagging.fit(X_train, y_train)
y_pred_bagging = scratch_bagging.predict(X_test)
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)

# Display results
print("Accuracy with single decision tree:", accuracy_dt)
print("Accuracy with scratch implementation of Bagging:", accuracy_bagging)



Accuracy with single decision tree: 0.7966666666666666
Accuracy with scratch implementation of Bagging: 0.8833333333333333
