In [1]:
# based on code from: https://github.com/davidsbatista/machine-learning-notebooks/blob/master/hyperparameter-across-models.ipynb
# my enhancements: 
#     support for pipelines, 
#     added RandomizedSearchCV, output best overall model, assert statements, documentation 


# imports
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV 
from sklearn.pipeline import Pipeline

class HyperclassifierSearch:
    """Train multiple classifiers/pipelines with GridSearchCV or RandomizedSearchCV.
    
    HyperclassifierTuning implements a "train_model" and "evaluate_model" method.
    
    "train_model" returns the optimal model according to the scoring metric.
    
    "evaluate_model" gives the results for all classifiers/pipelines.
    """
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.grid_results = {}
    
    def train_model(self, X_train, y_train, search='grid', **search_kwargs):
        """
        Optimizing over one or multiple classifiers or pipelines.
        
        Input: 
        X : array or dataframe with features; this should be a training dataset
        y : array or dataframe with label(s); this should be a training dataset

        Output: 
        returns the optimal model according to the scoring metric
        
        Parameters:
        search : str, default='grid'
            define the search
            ``grid`` performs GridSearchCV 
            ``random`` performs RandomizedSearchCV
            
        **search_kwargs : kwargs
            additional parameters passed to the search
        """
        grid_results = {}
        best_score = 0
        
        for key in self.models.keys():
            print('Search for {}'.format(key), '...')
            assert search in ('grid', 'random'), 'search parameter out of range'
            if search=='grid':
                grid = GridSearchCV(self.models[key], self.params[key], **search_kwargs)
            if search=='random':
                grid = RandomizedSearchCV(self.models[key], self.params[key], **search_kwargs)
            grid.fit(X_train, y_train)
            self.grid_results[key] = grid
        
            if grid.best_score_ > best_score: # to return best model
                best_score = grid.best_score_
                best_model = grid
                
        print('Search is done.')
        return best_model # allows to predict with the best model overall
        
    def evaluate_model(self, sort_by='mean_test_score', show_timing_info=False):
        """
        Provides sorted model results for multiple classifier or pipeline runs of
        GridSearchCV or RandomizedSearchCV.
        
        Input: Fitted search object (accesses cv_results_).  
        Output: Dataframe with a line for each training run including estimator name, parameters, etc.
        Parameters:
        sort_by: the metric to rank the model results
        """
        results = []
        for key, result in self.grid_results.items():
            print('results round for:', key) 
            # get rid of column which is estimator specific, 
            # i.e. use df for multiple estimators 
            # regex 'not in': https://stackoverflow.com/questions/1971738/regex-for-all-strings-not-containing-a-string#1971762
            result = pd.DataFrame(result.cv_results_).filter(regex='^(?!.*param_).*')
            if show_timing_info==False: # skip timing info
                result = result.filter(regex='^(?!.*time).*')
            # add column with the name of the estimator
            result = pd.concat((pd.DataFrame({'Estimator': [key] * result.shape[0] }), result), axis=1)
            results.append(result)
                
        # handle combined classifier results: 
        # sort by target metric and remove subset rank scores
        df_results = pd.concat(results).sort_values([sort_by], ascending=False).\
                        reset_index().drop(columns = ['index', 'rank_test_score'])
        return df_results

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn import datasets
breast_cancer = datasets.load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

In [3]:
# Example 1: train multiple classifiers

# Classifiers and parameters for hyperfitting
models = {
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=10000),
    'LinearSVC': LinearSVC(max_iter=100000)
}
params = { 
    'LogisticRegression': { 'C': [0.1, 1, 2] },
    'LinearSVC': { 'C': [1, 10, 100] }
}

X_train, X_test, y_train, y_test = train_test_split(X, y)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(X, y, cv=2, iid=False)
search.evaluate_model()

Search for LogisticRegression ...
Search for LinearSVC ...




Search is done.
results round for: LogisticRegression
results round for: LinearSVC




Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'C': 2},0.947368,0.943662,0.945515,0.001853
1,LogisticRegression,{'C': 1},0.94386,0.943662,0.943761,9.9e-05
2,LinearSVC,{'C': 1},0.929825,0.950704,0.940264,0.01044
3,LogisticRegression,{'C': 0.1},0.94386,0.93662,0.94024,0.00362
4,LinearSVC,{'C': 100},0.926316,0.897887,0.912102,0.014214
5,LinearSVC,{'C': 10},0.894737,0.880282,0.887509,0.007228


In [4]:
# Example 2: add multiple pipelines to example 1

# Defining the Pipeline
models = {
    'LogisticRegression': Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(solver='lbfgs', max_iter=200))
    ]),
    'LinearSVC': Pipeline([
        ('scale', StandardScaler()),
        ('clf', LinearSVC(max_iter=100000))
    ])
}
params = { 
    'LogisticRegression': { 'clf__C': [0.1, 1, 2] },
    'LinearSVC': { 'clf__C': [1, 10, 100] }
}

X_train, X_test, y_train, y_test = train_test_split(X, y)
search = HyperclassifierSearch(models, params)
best_model = search.train_model(X, y, cv=10, iid=False)
search.evaluate_model()

Search for LogisticRegression ...
Search for LinearSVC ...
Search is done.
results round for: LogisticRegression
results round for: LinearSVC


Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'clf__C': 1},0.982759,0.982759,0.982456,0.964912,0.982456,0.982456,0.947368,1.0,1.0,0.982143,0.980731,0.014577
1,LogisticRegression,{'clf__C': 2},0.982759,0.982759,0.982456,0.964912,0.982456,0.982456,0.947368,1.0,1.0,0.982143,0.980731,0.014577
2,LogisticRegression,{'clf__C': 0.1},1.0,0.948276,0.964912,0.964912,1.0,0.964912,0.929825,1.0,1.0,0.964286,0.973712,0.023788
3,LinearSVC,{'clf__C': 1},0.931034,0.982759,0.982456,0.947368,0.964912,0.982456,0.964912,0.982143,1.0,0.982143,0.972018,0.01926
4,LinearSVC,{'clf__C': 10},0.931034,0.982759,0.964912,0.912281,0.964912,0.982456,0.964912,0.982143,1.0,1.0,0.968541,0.02678
5,LinearSVC,{'clf__C': 100},0.931034,0.982759,0.947368,0.894737,0.964912,0.947368,0.947368,0.964286,1.0,1.0,0.957983,0.030491


In [5]:
# the usual parameters from GridSearchCV can be obtained:

print('reach highest score', round(best_model.best_score_, 4), 
      '\n\n...with best estimator: \n\n', best_model.best_estimator_)

reach highest score 0.9807 

...with best estimator: 

 Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=200,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)


In [6]:
# Example 3: using RandomizedSearchCV and more exhaustive search compared to example 2

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, make_scorer, accuracy_score, balanced_accuracy_score, fbeta_score

def build_model():
    models = {
        
        'LogisticRegression': Pipeline([
            ('scale', StandardScaler()),
            ('clf', LogisticRegression(solver='lbfgs'))
        ]),
        'LinearSVC': Pipeline([
            ('scale', StandardScaler()),
            ('clf', LinearSVC(max_iter=100000))
        ]),
        'AdaBoost': Pipeline([
            ('tfidf', StandardScaler()),
            ('clf', AdaBoostClassifier())  
        ])
    }
    params = { 
        'LogisticRegression': { 'clf__C': np.linspace(0.1, 1.0, num=10) },
        'LinearSVC': { 'clf__C': np.linspace(0.1, 100, num=100) },
        'AdaBoost': { 'clf__n_estimators': np.arange(16,32+1) }
    }
    scorer = make_scorer(fbeta_score, beta=2, average='weighted')
    return models, params, scorer

models, params, scorer = build_model()
search = HyperclassifierSearch(models, params)
skf = StratifiedKFold(n_splits=5)
best_model = search.train_model(X, y, search='random', scoring=scorer, cv=skf, iid=False)
search.evaluate_model()

Search for LogisticRegression ...
Search for LinearSVC ...
Search for AdaBoost ...
Search is done.
results round for: LogisticRegression
results round for: LinearSVC
results round for: AdaBoost


Unnamed: 0,Estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
0,LogisticRegression,{'clf__C': 0.7000000000000001},0.982609,0.982497,0.973188,0.973405,0.991139,0.980568,0.006714
1,LogisticRegression,{'clf__C': 0.8},0.982609,0.982497,0.973188,0.973405,0.991139,0.980568,0.006714
2,LogisticRegression,{'clf__C': 0.9},0.982609,0.982497,0.973188,0.973405,0.991139,0.980568,0.006714
3,LogisticRegression,{'clf__C': 0.5},0.973869,0.973661,0.982185,0.973405,0.991139,0.978852,0.006979
4,LogisticRegression,{'clf__C': 0.30000000000000004},0.973869,0.973661,0.991122,0.96445,0.991139,0.978848,0.01059
5,LogisticRegression,{'clf__C': 1.0},0.982609,0.973869,0.973188,0.973405,0.991139,0.978842,0.007095
6,LogisticRegression,{'clf__C': 0.6},0.973869,0.982497,0.973188,0.973405,0.991139,0.97882,0.007083
7,LogisticRegression,{'clf__C': 0.4},0.973869,0.973661,0.982185,0.96445,0.991139,0.977061,0.009003
8,LogisticRegression,{'clf__C': 0.1},0.982497,0.973661,0.982185,0.96445,0.982301,0.977019,0.007125
9,LogisticRegression,{'clf__C': 0.2},0.973869,0.964766,0.982185,0.96445,0.982301,0.973514,0.007889
