In [1]:
import time
import numpy as np
import pandas as pd

from sklearn.utils import resample
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
df = pd.read_csv("../Dry_Bean.csv")

In [3]:
df.head()

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166,SEKER


In [4]:
df.isnull().sum()

Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
Class              0
dtype: int64

In [5]:
le = LabelEncoder()

df['Class'] = le.fit_transform(df['Class'])

In [6]:
X = df.drop(columns=['Class'])
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10888, 16), (2723, 16), (10888,), (2723,))

In [7]:
class CustomBaggingClassifier(BaseEstimator):
    def __init__(self, base_estimator=DecisionTreeClassifier(), n_estimators=10):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators

    def fit(self, X, y):
        self.estimators = []
        n_classes = len(np.unique(y))
        random_accuracy = 1.0 / n_classes

        for i in range(self.n_estimators):
            X_resampled, y_resampled = resample(X, y, random_state=42 + i)
            estimator = clone(self.base_estimator)
            estimator.fit(X_resampled, y_resampled)

            y_pred = estimator.predict(X_resampled)
            tree_accuracy = np.mean(y_pred == y_resampled)

            if tree_accuracy > random_accuracy:
                self.estimators.append(estimator)

        return self

    def predict(self, X):
        if not self.estimators:
            raise ValueError("No valid estimators found during training")

        predictions = np.array([estimator.predict(X) for estimator in self.estimators])
        majority_votes = np.apply_along_axis(
            lambda x: np.bincount(x).argmax(), axis=0, arr=predictions
        )
        return majority_votes

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

In [8]:
cb_clf = CustomBaggingClassifier(n_estimators=10)

cb_clf.fit(X_train, y_train)

accuracy = cb_clf.score(X_test, y_test)
print(f"CBG accuracy: {accuracy:.4f}")

CBG accuracy: 0.9207


In [9]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
start_time = time.time()
cv_scores = cross_val_score(cb_clf, X, y, cv=kf)
training_time = time.time() - start_time

mean_cv_score = np.mean(cv_scores)

print(f"CBG cv: {mean_cv_score:.4f}")
print(f"{training_time:.4f} seconds")

CBG cv: 0.9194
8.4512 seconds


In [11]:
y_pred = cb_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy:.4f}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"precision: {precision:.4f}")

recall = recall_score(y_test, y_pred, average='weighted')
print(f"recall: {recall:.4f}")

accuracy: 0.9207
precision: 0.9210
recall: 0.9207


In [12]:
sb_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

start_time = time.time()
sb_clf.fit(X_train, y_train)
sklearn_training_time = time.time() - start_time

sklearn_accuracy = sb_clf.score(X_test, y_test)

sklearn_cv_scores = cross_val_score(sb_clf, X, y, cv=kf)
sklearn_mean_cv_score = np.mean(sklearn_cv_scores)

print(f"\nCBG accuracy: {accuracy:.4f}")
print(f"SBG accuracy: {sklearn_accuracy:.4f}")

print(f"\nCBG {training_time:.4f} seconds")
print(f"SBG {sklearn_training_time:.4f} seconds")

print(f"\nCBG cv accuracy: {mean_cv_score:.4f}")
print(f"SBG cv accuracy: {sklearn_mean_cv_score:.4f}")


CBG accuracy: 0.9207
SBG accuracy: 0.9210

CBG 8.4512 seconds
SBG 1.2484 seconds

CBG cv accuracy: 0.9194
SBG cv accuracy: 0.9183
