In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
data = pd.read_csv('breast-cancer.csv')
data = data.drop(['id'], axis=1)
X = data.drop(['diagnosis'], axis=1)
y = data['diagnosis']
y = np.where(y == 'M', 1, -1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

In [3]:
class AdaBoost:
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        self.models = []
        self.alphas = []

    def fit(self, X, y):
        n_samples = X.shape[0]
        weights = np.ones(n_samples) / n_samples
        
        for _ in range(self.n_estimators):
            model = DecisionTreeClassifier(max_depth=1)
            model.fit(X, y, sample_weight=weights)
            pred = model.predict(X)
            
            incorrect = (pred != y)
            error = np.dot(weights, incorrect)
            
            if error >= 0.5 - 1e-10:
                break
            if error < 1e-10:
                alpha = 1e5
            else:
                alpha = 0.5 * np.log((1 - error) / error)
            
            weights *= np.exp(alpha * incorrect)
            weights /= np.sum(weights)
            
            self.models.append(model)
            self.alphas.append(alpha)

    def predict(self, X):
        preds = np.array([model.predict(X) for model in self.models])
        weighted_sum = np.dot(self.alphas, preds)
        return np.sign(weighted_sum)

In [4]:
def cross_validate(model, X, y, k=5):
    np.random.seed(42)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    fold_size = len(indices) // k
    scores = []
    
    for i in range(k):
        test_start = i * fold_size
        test_end = (i + 1) * fold_size

        test_idx = indices[test_start:test_end]
        train_idx = np.concatenate([indices[:test_start], indices[test_end:]])
        
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = np.mean(y_pred == y_test)
        scores.append(accuracy)
    
    return scores

In [5]:
for n in [25, 50, 75, 100, 150, 200, 250]:
    adaboost = AdaBoost(n_estimators=n)
    start_time = datetime.datetime.now()
    scores = cross_validate(adaboost, X.values, y, k=5)
    end_time = datetime.datetime.now()
    print(f"Mean accuracy for n_estimators={n}: {np.mean(scores):.4f} (+-{np.std(scores):.4f}), time: {end_time - start_time}")

Mean accuracy for n_estimators=25: 0.9788 (+-0.0120), time: 0:00:00.354070
Mean accuracy for n_estimators=50: 0.9841 (+-0.0117), time: 0:00:00.750160
Mean accuracy for n_estimators=75: 0.9858 (+-0.0132), time: 0:00:01.122841
Mean accuracy for n_estimators=100: 0.9894 (+-0.0142), time: 0:00:01.438270
Mean accuracy for n_estimators=150: 0.9894 (+-0.0142), time: 0:00:02.075900
Mean accuracy for n_estimators=200: 0.9912 (+-0.0112), time: 0:00:02.856496
Mean accuracy for n_estimators=250: 0.9912 (+-0.0112), time: 0:00:03.476977


In [None]:
for n in [25, 50, 75, 100, 150, 200, 250]:
    adaboost = AdaBoostClassifier(n_estimators=n, algorithm='SAMME')
    start_time = datetime.datetime.now()
    scores = cross_val_score(adaboost, X, y, cv=5, scoring='accuracy')
    end_time = datetime.datetime.now()
    print(f"Mean accuracy for n_estimators={n}: {scores.mean():.4f} (+-{scores.std():.4f}), time: {end_time - start_time}")

Mean accuracy for n_estimators=25: 0.9596 (+-0.0142), time: 0:00:00.458288
Mean accuracy for n_estimators=50: 0.9684 (+-0.0131), time: 0:00:00.931951
Mean accuracy for n_estimators=75: 0.9719 (+-0.0102), time: 0:00:01.371355
Mean accuracy for n_estimators=100: 0.9772 (+-0.0105), time: 0:00:01.755123
Mean accuracy for n_estimators=150: 0.9754 (+-0.0102), time: 0:00:02.557938
Mean accuracy for n_estimators=200: 0.9772 (+-0.0089), time: 0:00:03.620275
Mean accuracy for n_estimators=250: 0.9772 (+-0.0105), time: 0:00:04.592839
