In [1]:
# Reference: http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

In [2]:
# import libraries
import copy
import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')
% matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV,LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix,accuracy_score, roc_curve,auc
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings("ignore")

In [3]:
# Render every float in pandas output with exactly three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

In [4]:
# Create class for grid search across multiple classifiers
class ModelSelection():
    """Grid-search several estimators and compare their cross-validation scores.

    Parameters
    ----------
    models : dict
        Maps a name to an estimator instance.
    params : dict
        Maps the same names to the parameter grid handed to ``GridSearchCV``
        for that estimator.

    Raises
    ------
    ValueError
        If ``models`` and ``params`` do not have exactly the same keys.
    """

    def __init__(self, models, params):
        if set(models.keys()) != set(params.keys()):
            raise ValueError("Unmatched key pairs.")
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Filled by fit(): name -> fitted GridSearchCV object.
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=0, scoring=None, refit=False):
        """Run one ``GridSearchCV`` per registered estimator.

        ``cv``, ``n_jobs``, ``verbose``, ``scoring`` and ``refit`` are forwarded
        unchanged to every ``GridSearchCV``. The fitted searches are stored in
        ``self.grid_searches`` under the estimator's name, and the total
        wall-clock time is printed at the end.
        """
        start = timeit.default_timer()
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            param = self.params[key]
            # Forward the caller's settings instead of hard-coded defaults.
            gs = GridSearchCV(model, param, cv=cv, n_jobs=n_jobs, verbose=verbose,
                              scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs
        stop = timeit.default_timer()
        print('Process Time: {:.2f}mins'.format((stop - start)/60) )

    def best_model(self, rank=1, estimator='all'):
        """Return the ``rank``-th best parameter setting as a configured estimator.

        Parameters
        ----------
        rank : int
            1-based position in the ranking by ``mean_test_score`` (descending).
        estimator : str
            ``'all'`` to rank across every estimator, or one estimator name to
            rank only that estimator's parameter settings.

        Returns
        -------
        dict
            ``{'name': <estimator name>, 'est_params': <estimator>}`` where the
            estimator is a copy of the registered model with the selected
            parameters applied. It is not fitted by this method.

        Raises
        ------
        ValueError
            If ``rank`` is not positive, or ``estimator`` matches no result row.
        IndexError
            If ``rank`` exceeds the number of candidate rows.
        """
        if rank <= 0:
            raise ValueError("Rank must be postive number.")
        score_summary = self.score_summary()
        if estimator != 'all':
            score_summary = score_summary[score_summary['estimator'] == estimator]

        # Filtering by an unknown name yields an empty frame (never None).
        if score_summary.empty:
            raise ValueError("Unknown estimator name: %r" % (estimator,))
        if rank > len(score_summary):
            raise IndexError("rank %d is out of range; only %d candidates available."
                             % (rank, len(score_summary)))

        ranked = score_summary.sort_values(['mean_test_score'], ascending=False)
        best_from_summary = ranked.iloc[rank - 1].dropna()
        name = best_from_summary['estimator']
        params = best_from_summary['params']
        # Configure a copy so the registered model (and any estimator returned
        # by an earlier call) is not mutated by set_params.
        best_model = copy.deepcopy(self.models[name])
        return {'name': str(name), 'est_params': best_model.set_params(**params)}

    def score_summary(self, sort_by='mean_test_score'):
        """Collect the ``cv_results_`` of every grid search into one DataFrame.

        Each row is one parameter setting of one estimator. The leading columns
        are ``estimator``, ``mean_test_score``, ``min_test_score``,
        ``max_test_score`` and ``std_test_score``; min/max are taken across the
        per-split test scores. The remaining columns are the individual
        parameters plus the full ``params`` dict. Rows are sorted by
        ``sort_by`` in descending order.

        Requires ``fit`` to have been called; otherwise the lookup in
        ``self.grid_searches`` raises ``KeyError``.
        """
        result_all = []
        for key in self.keys:
            cv_results = self.grid_searches[key].cv_results_

            # Gather split0_test_score, split1_test_score, ... until the first
            # missing key, without swallowing unrelated exceptions.
            split_test_scores = []
            cv_count = 0
            while 'split' + str(cv_count) + '_test_score' in cv_results:
                split_test_scores.append(cv_results['split' + str(cv_count) + '_test_score'])
                cv_count += 1

            # Rows are splits, columns are parameter settings.
            split_test_scores = np.vstack(split_test_scores)

            result = pd.DataFrame.from_dict(cv_results['params'])
            result['estimator'] = key
            result['mean_test_score'] = cv_results['mean_test_score']
            result['std_test_score'] = cv_results['std_test_score']
            result['min_test_score'] = np.min(split_test_scores, axis=0)
            result['max_test_score'] = np.max(split_test_scores, axis=0)
            result['params'] = cv_results['params']

            result_all.append(result)

        result_all = pd.concat(result_all).sort_values([sort_by], ascending=False)

        columns = ['estimator', 'mean_test_score', 'min_test_score', 'max_test_score', 'std_test_score']
        columns = columns + [c for c in result_all.columns if c not in columns]

        return result_all[columns]

In [5]:
# import data (using wine data as example)
wine_data_url = ('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases/wine/wine.data')
wine_columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
# The file is read without a header row, so the column names are supplied here.
df = pd.read_csv(wine_data_url, header=None, names=wine_columns)
df.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [6]:
# Summary statistics, transposed so each feature is a row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Class label,178.0,1.938,0.775,1.0,1.0,2.0,3.0,3.0
Alcohol,178.0,13.001,0.812,11.03,12.362,13.05,13.678,14.83
Malic acid,178.0,2.336,1.117,0.74,1.603,1.865,3.083,5.8
Ash,178.0,2.367,0.274,1.36,2.21,2.36,2.558,3.23
Alcalinity of ash,178.0,19.495,3.34,10.6,17.2,19.5,21.5,30.0
Magnesium,178.0,99.742,14.282,70.0,88.0,98.0,107.0,162.0
Total phenols,178.0,2.295,0.626,0.98,1.742,2.355,2.8,3.88
Flavanoids,178.0,2.029,0.999,0.34,1.205,2.135,2.875,5.08
Nonflavanoid phenols,178.0,0.362,0.124,0.13,0.27,0.34,0.438,0.66
Proanthocyanins,178.0,1.591,0.572,0.41,1.25,1.555,1.95,3.58


In [7]:
# Number of samples per class label
df.loc[:, 'Class label'].value_counts()

2    71
1    59
3    48
Name: Class label, dtype: int64

In [8]:
# Train Test Split
# Column 0 holds the class label; every remaining column is a feature.
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [9]:
# Standardize data
# Fit the scaler on the training split only, then apply it to both splits.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [10]:
## Fit model
np.random.seed(0)

# grids reused by more than one estimator
n_estimators_grid = [50, 100, 200]
svc_c_grid = [0.001, 0.01, 0.1, 1.0, 10.0]

# classifiers set (insertion order is the order in which they are searched)
models = {
    'KNN': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'XGBClassifier': XGBClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
}

# params set (one grid per classifier, keyed by the same names as `models`)
params = {
    'KNN': {
        'n_neighbors': [3, 4, 5, 6, 7],
    },
    'RandomForestClassifier': {
        'n_estimators': list(n_estimators_grid),
        'max_depth': [1, 2, 3, 4, 5, 6, 7],
    },
    'XGBClassifier': {
        'n_estimators': list(n_estimators_grid),
        'min_child_weight': [1, 5, 10],
        'gamma': [1, 2, 4],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
    },
    'AdaBoostClassifier': {
        'n_estimators': list(n_estimators_grid),
    },
    'GradientBoostingClassifier': {
        'n_estimators': list(n_estimators_grid),
        'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0],
    },
    'LogisticRegression': {
        'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    },
    # SVC gets two sub-grids because `gamma` is only searched for the rbf kernel
    'SVC': [
        {'kernel': ['linear'], 'C': list(svc_c_grid)},
        {'kernel': ['rbf'], 'C': list(svc_c_grid), 'gamma': [0.01, 0.1, 1.0, 10.0]},
    ],
}

ms = ModelSelection(models, params)
ms.fit(X_train_std, y_train, scoring='accuracy')
ms.score_summary().head(n=20)

Running GridSearchCV for KNN.
Running GridSearchCV for RandomForestClassifier.
Running GridSearchCV for XGBClassifier.
Running GridSearchCV for AdaBoostClassifier.
Running GridSearchCV for GradientBoostingClassifier.
Running GridSearchCV for LogisticRegression.
Running GridSearchCV for SVC.
Process Time: 1.21mins


Unnamed: 0,estimator,mean_test_score,min_test_score,max_test_score,std_test_score,C,colsample_bytree,gamma,kernel,learning_rate,max_depth,min_child_weight,n_estimators,n_neighbors,params,subsample
18,SVC,0.976,0.929,1.0,0.034,1.0,,0.1,rbf,,,,,,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",
6,LogisticRegression,0.976,0.95,1.0,0.02,100.0,,,,,,,,,{'C': 100.0},
22,SVC,0.976,0.952,1.0,0.02,10.0,,0.1,rbf,,,,,,"{'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}",
5,LogisticRegression,0.976,0.95,1.0,0.02,10.0,,,,,,,,,{'C': 10.0},
55,XGBClassifier,0.968,0.952,1.0,0.022,,0.6,1.0,,,5.0,1.0,50.0,,"{'colsample_bytree': 0.6, 'gamma': 1, 'max_dep...",0.8
28,XGBClassifier,0.968,0.952,1.0,0.022,,0.6,1.0,,,4.0,1.0,50.0,,"{'colsample_bytree': 0.6, 'gamma': 1, 'max_dep...",0.8
4,LogisticRegression,0.968,0.95,0.976,0.012,1.0,,,,,,,,,{'C': 1.0},
22,XGBClassifier,0.96,0.929,1.0,0.029,,0.6,1.0,,,3.0,10.0,100.0,,"{'colsample_bytree': 0.6, 'gamma': 1, 'max_dep...",0.8
12,RandomForestClassifier,0.96,0.929,1.0,0.029,,,,,,5.0,,50.0,,"{'max_depth': 5, 'n_estimators': 50}",
19,RandomForestClassifier,0.96,0.929,1.0,0.029,,,,,,7.0,,100.0,,"{'max_depth': 7, 'n_estimators': 100}",


In [12]:
# Performance on test set
# Take the top-ranked configuration, train it on the full training split,
# then score it on the held-out test split.
best = ms.best_model()
cls = best['est_params'].fit(X_train_std, y_train)
y_pred = cls.predict(X_test_std)
test_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('test accuracy: {:.2f}% '.format(test_accuracy_pct))

test accuracy: 100.00% 
