# HyperclassifierSearch Examples

# Example 1: train multiple classifiers

In [1]:
# import of the package (after e.g. 'pip install HyperclassifierSearch')
from HyperclassifierSearch import HyperclassifierSearch

# usage dependent imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# example dataset
from sklearn import datasets
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

In [2]:
# define classifiers and parameters for hyperfitting
models = {
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=10000),
    'RandomForestClassifier': RandomForestClassifier()
}
params = { 
    'LogisticRegression': { 'C': [0.1, 1, 2] },
    'RandomForestClassifier': { 'n_estimators': [16, 32] }
}

# run search
X_train, X_test, y_train, y_test = train_test_split(X, y)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(X_train, y_train, cv=2)
search.evaluate_model()

Search for LogisticRegression ...
Search for RandomForestClassifier ...
Search is done.
results round for: LogisticRegression
results round for: RandomForestClassifier


Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,mean_test_score,std_test_score
0,RandomForestClassifier,{'n_estimators': 32},0.967136,0.962441,0.964789,0.002347
1,RandomForestClassifier,{'n_estimators': 16},0.957746,0.967136,0.962441,0.004695
2,LogisticRegression,{'C': 2},0.929577,0.957746,0.943662,0.014085
3,LogisticRegression,{'C': 1},0.924883,0.957746,0.941315,0.016432
4,LogisticRegression,{'C': 0.1},0.920188,0.957746,0.938967,0.018779


# Example 2: add multiple pipelines to example 1

In [3]:
# additional imports
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# define the model including pipelines
models = {
    'LogisticRegression': Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(solver='lbfgs', max_iter=200))
    ]),
    'RandomForestClassifier': Pipeline([
        ('scale', StandardScaler()),
        ('clf', RandomForestClassifier())
    ])
}
params = { 
    'LogisticRegression': { 'clf__C': [0.1, 1, 2] },
    'RandomForestClassifier': { 'clf__n_estimators': [16, 32] }
}

X_train, X_test, y_train, y_test = train_test_split(X, y)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(X_train, y_train, cv=10)
search.evaluate_model()

Search for LogisticRegression ...
Search for RandomForestClassifier ...
Search is done.
results round for: LogisticRegression
results round for: RandomForestClassifier


Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'clf__C': 1},0.953488,1.0,0.953488,0.976744,0.953488,1.0,0.97619,1.0,1.0,0.952381,0.976526,0.020931
1,LogisticRegression,{'clf__C': 0.1},0.953488,0.953488,0.953488,0.976744,0.953488,1.0,0.97619,1.0,1.0,0.952381,0.971831,0.020347
2,LogisticRegression,{'clf__C': 2},0.953488,1.0,0.953488,0.976744,0.930233,1.0,0.97619,0.97619,1.0,0.904762,0.967136,0.030144
3,RandomForestClassifier,{'clf__n_estimators': 32},0.906977,0.953488,0.976744,0.976744,0.976744,0.976744,0.928571,0.97619,0.952381,0.97619,0.960094,0.023581
4,RandomForestClassifier,{'clf__n_estimators': 16},0.930233,0.953488,0.906977,0.953488,0.976744,0.953488,0.928571,0.97619,0.952381,0.97619,0.950704,0.021972


In [4]:
# the usual parameters from GridSearchCV can be obtained:

print('highest score:', round(best_model.best_score_, 4), 
      '\n\n... with this estimator: \n\n', best_model.best_estimator_)

highest score: 0.9765 

... with this estimator: 

 Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=200,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


# Example 3: using RandomizedSearchCV and more exhaustive search compared to example 2

In [5]:
# additional imports:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score, balanced_accuracy_score, fbeta_score

# model and parameter definition in a function
def build_model():
    models = {
        
        'LogisticRegression': Pipeline([
            ('scale', StandardScaler()),
            ('clf', LogisticRegression(solver='lbfgs'))
        ]),
        'RandomForestClassifier': Pipeline([
            ('scale', StandardScaler()),
            ('clf', RandomForestClassifier())
        ]),
        'AdaBoost': Pipeline([
            ('tfidf', StandardScaler()),
            ('clf', AdaBoostClassifier())  
        ])
    }
    params = { 
        'LogisticRegression': { 'clf__C': np.linspace(0.1, 1.0, num=10) },
        'RandomForestClassifier': { 'clf__n_estimators': np.arange(16,32+1) },
        'AdaBoost': { 'clf__n_estimators': np.arange(16,32+1) }
    }
    scorer = make_scorer(fbeta_score, beta=2, average='weighted')
    return models, params, scorer

In [6]:
# with the model defined in build_model() above the search reduces to:
models, params, scorer = build_model()
search = HyperclassifierSearch(models, params)
skf = StratifiedKFold(n_splits=5)
best_model = search.train_model(X, y, search='random', scoring=scorer, cv=skf)
search.evaluate_model()

Search for LogisticRegression ...
Search for RandomForestClassifier ...
Search for AdaBoost ...
Search is done.
results round for: LogisticRegression
results round for: RandomForestClassifier
results round for: AdaBoost


Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'clf__C': 0.7000000000000001},0.982609,0.982497,0.973188,0.973405,0.991139,0.980582,0.006693
1,LogisticRegression,{'clf__C': 0.8},0.982609,0.982497,0.973188,0.973405,0.991139,0.980582,0.006693
2,LogisticRegression,{'clf__C': 0.9},0.982609,0.982497,0.973188,0.973405,0.991139,0.980582,0.006693
3,LogisticRegression,{'clf__C': 1.0},0.982609,0.973869,0.973188,0.973405,0.991139,0.978838,0.007079
4,LogisticRegression,{'clf__C': 0.5},0.973869,0.973661,0.982185,0.973405,0.991139,0.978816,0.006967
5,LogisticRegression,{'clf__C': 0.6},0.973869,0.982497,0.973188,0.973405,0.991139,0.978815,0.007067
6,LogisticRegression,{'clf__C': 0.30000000000000004},0.973869,0.973661,0.991122,0.96445,0.991139,0.978812,0.010561
7,LogisticRegression,{'clf__C': 0.4},0.973869,0.973661,0.982185,0.96445,0.991139,0.977037,0.008975
8,LogisticRegression,{'clf__C': 0.1},0.982497,0.973661,0.982185,0.96445,0.982301,0.977026,0.00711
9,LogisticRegression,{'clf__C': 0.2},0.973869,0.964766,0.982185,0.96445,0.982301,0.973485,0.007878


In [7]:
# use best model for prediction: best_model.predict(X_test)