In [8]:
import time

# benchmark model
from xgboost import XGBClassifier

# automl modules
from autosklearn.classification import AutoSklearnClassifier
from tpot import TPOTClassifier

# sklearn functions
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

%matplotlib inline

## Load Data

In [7]:
X, y = datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (426, 30), y_train shape: (426,)
X_test shape: (143, 30), y_test shape: (143,)


## xgboost benchmark

In [12]:
time_start = time.time()
clf = XGBClassifier()
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
time_elapsed = time.time() - time_start
print(f"Accuracy score: {accuracy_score(y_test, y_hat):.4f}")
print(f"Elapsed time: {time_elapsed:.4f}s")

Accuracy score: 0.9650
Elapsed time: 0.0823s


## auto-sklearn

In [13]:
time_start = time.time()
clf = AutoSklearnClassifier(time_left_for_this_task=300)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
time_elapsed = time.time() - time_start
print(f"Accuracy score: {accuracy_score(y_test, y_hat):.4f}")
print(f"Elapsed time: {time_elapsed:.4f}s")

Time limit for a single run is higher than total time limit. Capping the limit for a single run to the total time given to SMAC (299.659966)
Accuracy score: 0.9510
Elapsed time: 294.4967s


## tpot

In [14]:
time_start = time.time()
clf = TPOTClassifier(generations=5, verbosity=2, max_time_mins=5)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)
time_elapsed = time.time() - time_start
print(f"Accuracy score: {accuracy_score(y_test, y_hat):.4f}")
print(f"Elapsed time: {time_elapsed:.4f}s")

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', style=ProgressStyle(description_wâ€¦

Generation 1 - Current best internal CV score: 0.9764132629796105
Generation 2 - Current best internal CV score: 0.9764412741840923
Generation 3 - Current best internal CV score: 0.9764419256074524

5.01 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingClassifier(RFE(StandardScaler(input_matrix), criterion=entropy, max_features=0.9500000000000001, n_estimators=100, step=0.25), learning_rate=0.1, max_depth=10, max_features=0.4, min_samples_leaf=13, min_samples_split=20, n_estimators=100, subsample=0.6000000000000001)
Accuracy score: 0.9650
Elapsed time: 302.0887s


## Summary

From above, we compare the performance of out-of-the-box **XGBoost** with **auto-sklearn** and **TPOT** with time capped at 5 minutes. XGBoost achieves performs best and runs fast. TPOT achieves the same performance after 5 minutes while auto-sklearn lagged behind.