# HyperclassifierSearch Examples

# Example 1: train multiple classifiers

In [1]:
# import of the package (after e.g. 'pip install HyperclassifierSearch')
from HyperclassifierSearch import HyperclassifierSearch

# usage dependent imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# example dataset
from sklearn import datasets
# features (X) and labels (y) that the examples below are trained on
breast_cancer = datasets.load_breast_cancer()
X, y = breast_cancer.data, breast_cancer.target

In [2]:
# define the classifiers and, for each of them, the hyperparameter values to search;
# the two dictionaries share their keys
models = {
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=10000),
    # fixed seed: the forest is built with randomness, so pin it for repeatable scores
    'RandomForestClassifier': RandomForestClassifier(random_state=42)
}
params = {
    'LogisticRegression': { 'C': [0.1, 1, 2] },
    'RandomForestClassifier': { 'n_estimators': [16, 32] }
}

# run search
# fixed seed: without it every "Restart & Run All" draws a different train/test
# split, so the scores shown below could never be reproduced
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(X_train, y_train, cv=2)

Search for LogisticRegression ...
Search for RandomForestClassifier ...
Search is done.


In [3]:
# tabulate the cross-validation scores of every estimator/parameter combination searched above
search.evaluate_model()

Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'C': 0.1},0.948598,0.957547,0.953052,0.004474
1,LogisticRegression,{'C': 2},0.939252,0.966981,0.953052,0.013864
2,LogisticRegression,{'C': 1},0.934579,0.966981,0.950704,0.016201
3,RandomForestClassifier,{'n_estimators': 16},0.953271,0.938679,0.946009,0.007296
4,RandomForestClassifier,{'n_estimators': 32},0.943925,0.929245,0.93662,0.00734


# Example 2: add multiple pipelines to example 1

In [4]:
# additional imports
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# define the models as pipelines: each classifier is preceded by a feature-scaling step
models = {
    'LogisticRegression': Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(solver='lbfgs', max_iter=200))
    ]),
    'RandomForestClassifier': Pipeline([
        ('scale', StandardScaler()),
        # fixed seed: the forest is built with randomness, so pin it for repeatable scores
        ('clf', RandomForestClassifier(random_state=42))
    ])
}
# parameters of a pipeline step are addressed as '<step name>__<parameter>'
params = {
    'LogisticRegression': { 'clf__C': [0.1, 1, 2] },
    'RandomForestClassifier': { 'clf__n_estimators': [16, 32] }
}

# fixed seed: keeps the train/test split (and therefore the scores) identical across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(X_train, y_train, cv=10)
search.evaluate_model()

Search for LogisticRegression ...
Search for RandomForestClassifier ...
Search is done.




Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'clf__C': 0.1},1.0,0.931818,0.977273,0.952381,0.97619,0.97619,1.0,0.952381,1.0,0.97619,0.974178,0.021944
1,LogisticRegression,{'clf__C': 1},1.0,0.954545,0.954545,0.97619,0.97619,0.952381,1.0,0.952381,1.0,0.97619,0.974178,0.019361
2,LogisticRegression,{'clf__C': 2},1.0,0.954545,0.931818,0.97619,0.97619,0.928571,1.0,0.952381,1.0,0.97619,0.969484,0.025623
3,RandomForestClassifier,{'clf__n_estimators': 16},0.954545,0.954545,0.954545,0.97619,0.97619,0.952381,0.97619,0.952381,0.97619,1.0,0.967136,0.015194
4,RandomForestClassifier,{'clf__n_estimators': 32},0.931818,0.931818,0.954545,0.97619,0.97619,0.952381,0.97619,0.97619,0.97619,0.952381,0.960094,0.017544


In [5]:
# best_model exposes the familiar search-result attributes (best_score_, best_estimator_):

top_score = round(best_model.best_score_, 4)
top_estimator = best_model.best_estimator_
print('highest score:', top_score,
      '\n\n... with this estimator: \n\n', top_estimator)

highest score: 0.9742 

... with this estimator: 

 Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=200,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


# Example 3: using RandomizedSearchCV for a more exhaustive search than in example 2

In [6]:
# additional imports:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score, balanced_accuracy_score, fbeta_score

# model and parameter definition in a function
def build_model():
    """Define the candidate pipelines, their search spaces and the scorer.

    Every model is a two-step pipeline: a ``StandardScaler`` named ``'scale'``
    followed by the classifier named ``'clf'``.

    Returns:
        tuple: ``(models, params, scorer)`` where

        * ``models`` (dict) maps a model name to its pipeline,
        * ``params`` (dict) maps the same names to the values to search,
          keyed as ``'clf__<parameter>'``,
        * ``scorer`` is a weighted F-beta (``beta=2``) scorer built with
          ``make_scorer``.
    """
    # one seed for all stochastic estimators so repeated runs give the same models
    random_state = 42

    models = {
        'LogisticRegression': Pipeline([
            ('scale', StandardScaler()),
            ('clf', LogisticRegression(solver='lbfgs'))
        ]),
        'RandomForestClassifier': Pipeline([
            ('scale', StandardScaler()),
            ('clf', RandomForestClassifier(random_state=random_state))
        ]),
        'AdaBoost': Pipeline([
            # step renamed from the misleading 'tfidf': it is a scaler, like in the other pipelines
            ('scale', StandardScaler()),
            ('clf', AdaBoostClassifier(random_state=random_state))
        ])
    }
    params = {
        # C = 0.1, 0.2, ..., 1.0
        'LogisticRegression': { 'clf__C': np.linspace(0.1, 1.0, num=10) },
        # n_estimators = 16, 17, ..., 32 (the upper bound of arange is exclusive, hence +1)
        'RandomForestClassifier': { 'clf__n_estimators': np.arange(16, 32 + 1) },
        'AdaBoost': { 'clf__n_estimators': np.arange(16, 32 + 1) }
    }
    scorer = make_scorer(fbeta_score, beta=2, average='weighted')
    return models, params, scorer

In [7]:
# build_model() supplies the pipelines, the parameter ranges and the scorer,
# so the randomized search itself only takes a few lines:
models, params, scorer = build_model()
skf = StratifiedKFold(n_splits=5)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(
    X_train, y_train,
    search='random', scoring=scorer, cv=skf,
)
search.evaluate_model()

Search for LogisticRegression ...
Search for RandomForestClassifier ...




Search for AdaBoost ...
Search is done.


Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'clf__C': 0.2},0.976535,0.976744,0.988353,0.975968,0.98804,0.98112,0.005775
1,LogisticRegression,{'clf__C': 0.7000000000000001},0.976535,0.98832,0.976667,0.975968,0.988076,0.981105,0.005787
2,LogisticRegression,{'clf__C': 1.0},0.976535,0.98832,0.965129,0.975968,0.988076,0.978775,0.008695
3,LogisticRegression,{'clf__C': 0.8},0.976535,0.98832,0.965129,0.975968,0.988076,0.978775,0.008695
4,LogisticRegression,{'clf__C': 0.9},0.976535,0.98832,0.965129,0.975968,0.988076,0.978775,0.008695
5,LogisticRegression,{'clf__C': 0.30000000000000004},0.976535,0.976744,0.976667,0.975968,0.988076,0.978768,0.004621
6,LogisticRegression,{'clf__C': 0.4},0.976535,0.976744,0.976667,0.975968,0.988076,0.978768,0.004621
7,LogisticRegression,{'clf__C': 0.5},0.976535,0.976744,0.976667,0.975968,0.988076,0.978768,0.004621
8,LogisticRegression,{'clf__C': 0.6},0.976535,0.976744,0.976667,0.975968,0.988076,0.978768,0.004621
9,LogisticRegression,{'clf__C': 0.1},0.964641,0.976744,0.988353,0.975968,0.98804,0.978719,0.008841


In [8]:
# use the best model for prediction, e.g.: best_model.predict(X_test)