In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data = pd.read_csv('breast-cancer.csv')
data = data.drop(['id'], axis=1)
X = data.drop(['diagnosis'], axis=1)
y = data['diagnosis']
y = np.where(y == 'M', 1, 0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

In [3]:
class GradientBoosting:
    def __init__(self, n_estimators=100, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.models = []
        self.initial_pred = None

    def fit(self, X, y):
        pos = np.mean(y)
        epsilon = 1e-10
        
        if pos < epsilon:
            self.initial_pred = -1e10
        elif pos > 1 - epsilon:
            self.initial_pred = 1e10
        else:
            self.initial_pred = np.log(pos / (1 - pos))
        
        F = np.full(X.shape[0], self.initial_pred)
        
        for _ in range(self.n_estimators):
            p = 1 / (1 + np.exp(-F))
            
            residuals = y - p
            
            model = DecisionTreeRegressor(max_depth=1)
            model.fit(X, residuals)
            predictions = model.predict(X)
            
            F += self.learning_rate * predictions
            self.models.append(model)

    def predict(self, X):
        F = np.full(X.shape[0], self.initial_pred)
        
        for model in self.models:
            F += self.learning_rate * model.predict(X)
        
        proba = 1 / (1 + np.exp(-F))
        return proba >= 0.5

In [4]:
def cross_validate(model, X, y, k=5):
    np.random.seed(42)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    fold_size = len(indices) // k
    scores = []
    
    for i in range(k):
        test_start = i * fold_size
        test_end = (i + 1) * fold_size

        test_idx = indices[test_start:test_end]
        train_idx = np.concatenate([indices[:test_start], indices[test_end:]])
        
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = np.mean(y_pred == y_test)
        scores.append(accuracy)
    
    return scores

In [5]:
for n in [50, 100, 200, 500, 1000]:
    for lr in [0.1, 0.5, 2, 5]:
        gboost = GradientBoosting(n_estimators=n, learning_rate=lr)
        start_time = datetime.datetime.now()
        scores = cross_validate(gboost, X.values, y, k=5)
        end_time = datetime.datetime.now()
        print(f"Mean accuracy for n_estimators={n}, learning_rate={lr}: {np.mean(scores):.4f} (+-{np.std(scores):.4f}), time: {end_time - start_time}")

Mean accuracy for n_estimators=50, learning_rate=0.1: 0.9398 (+-0.0117), time: 0:00:00.624805
Mean accuracy for n_estimators=50, learning_rate=0.5: 0.9593 (+-0.0206), time: 0:00:00.571883
Mean accuracy for n_estimators=50, learning_rate=2: 0.9752 (+-0.0142), time: 0:00:00.602605
Mean accuracy for n_estimators=50, learning_rate=5: 0.9823 (+-0.0097), time: 0:00:00.617291
Mean accuracy for n_estimators=100, learning_rate=0.1: 0.9398 (+-0.0117), time: 0:00:01.157570
Mean accuracy for n_estimators=100, learning_rate=0.5: 0.9699 (+-0.0106), time: 0:00:01.138680
Mean accuracy for n_estimators=100, learning_rate=2: 0.9805 (+-0.0180), time: 0:00:01.197076
Mean accuracy for n_estimators=100, learning_rate=5: 0.9841 (+-0.0152), time: 0:00:01.112291
Mean accuracy for n_estimators=200, learning_rate=0.1: 0.9575 (+-0.0205), time: 0:00:02.316778
Mean accuracy for n_estimators=200, learning_rate=0.5: 0.9770 (+-0.0164), time: 0:00:02.350584
Mean accuracy for n_estimators=200, learning_rate=2: 0.9841 (+

In [7]:
for n in [50, 100, 200, 500, 1000]:
    for lr in [0.1, 0.5, 2, 5]:
        gboost = GradientBoostingClassifier(n_estimators=n, learning_rate=lr, max_depth=1)
        start_time = datetime.datetime.now()
        scores = cross_val_score(gboost, X, y, cv=5, scoring='accuracy')
        end_time = datetime.datetime.now()
        print(f"Mean accuracy for n_estimators={n}, learning_rate={lr}: {scores.mean():.4f} (+-{scores.std():.4f}), time: {end_time - start_time}")

Mean accuracy for n_estimators=50, learning_rate=0.1: 0.9490 (+-0.0179), time: 0:00:00.564475
Mean accuracy for n_estimators=50, learning_rate=0.5: 0.9649 (+-0.0175), time: 0:00:00.579181
Mean accuracy for n_estimators=50, learning_rate=2: 0.7665 (+-0.2056), time: 0:00:00.572921
Mean accuracy for n_estimators=50, learning_rate=5: 0.3726 (+-0.0039), time: 0:00:00.561660
Mean accuracy for n_estimators=100, learning_rate=0.1: 0.9649 (+-0.0166), time: 0:00:01.147919
Mean accuracy for n_estimators=100, learning_rate=0.5: 0.9631 (+-0.0195), time: 0:00:01.100490
Mean accuracy for n_estimators=100, learning_rate=2: 0.7665 (+-0.2056), time: 0:00:01.132785
Mean accuracy for n_estimators=100, learning_rate=5: 0.3726 (+-0.0039), time: 0:00:01.145645
Mean accuracy for n_estimators=200, learning_rate=0.1: 0.9666 (+-0.0140), time: 0:00:02.167629
Mean accuracy for n_estimators=200, learning_rate=0.5: 0.9701 (+-0.0197), time: 0:00:02.231389
Mean accuracy for n_estimators=200, learning_rate=2: 0.7665 (+