In [1]:
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

# Tuning the parameters of a Classifier with Sklearn in Python

In [2]:
'''
The grid search provided by GridSearchCV exhaustively generates candidates from a grid of parameter values specified with the param_grid parameter
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
'''

#sklearn imports
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, roc_auc_score, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

#other imports
from mlxtend.plotting import plot_decision_regions, plot_confusion_matrix
from matplotlib import pyplot as plt
import pandas as pd

ModuleNotFoundError: No module named 'mlxtend'

# IMPORT DATA

In [None]:
# Load the dataset from a CSV file in the working directory.
df = pd.read_csv('diabetes.txt')
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
# supported way to obtain the underlying NumPy array.
df = df.to_numpy()
# Every column except the last is used as a feature; the last column is the target.
X = df[:, 0:-1]
y = df[:, -1]

# DEFINE THE CLASSIFIER AND THE PARAMETERS GRID

In [None]:
# Support vector classifier with an RBF kernel
# (class_weight='balanced' is an option that is left disabled here).
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
classifier = SVC(kernel='rbf')

# Hyper-parameter grid to explore: every C value is combined with every gamma value.
parameters_SVC = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 'auto'],
}

# SPLIT DATA INTO TRAINING/TEST SET 80-20%

In [None]:
# Hold out 20% of the samples as the test set (the library default is a 75%-25% split).
# Shuffling is enabled by default; stratify=y requests a split stratified on the labels,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

# GRID SEARCH for the best parameters on the TRAINING set with 3-fold CV and scoring='roc_auc'

In [None]:
print('CLASSIFIER: ', str(type(classifier)))

#defining a gridsearch
gs = GridSearchCV(classifier, parameters_SVC, cv=3, scoring = 'roc_auc', verbose=3, n_jobs=-1, refit=True)

'''
Notes:
1) GS uses a Stratified K-Folds cross-validator: The folds are made by preserving the percentage of samples for each class.
2) if refit=True the model is retrained on the whole training set with the best found params
'''

#gridsearch the best params and retrain on all training set 
%time gs = gs.fit(X_train, y_train)
print('\n')

# summarize results
print('***GRIDSEARCH RESULTS***')
print("Best score: %f using %s" % (gs.best_score_, gs.best_params_))
means = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
params = gs.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
print('\n')

#test on test set
best_model = gs.best_estimator_
y_pred = best_model.predict(X_test)

#print out some metrics
print('***RESULTS ON TEST SET***')
print("accuracy_score",accuracy_score(y_test, y_pred))
print("f1_score", f1_score(y_test, y_pred))
print("roc_auc_score",roc_auc_score(y_test, y_pred))
print('\n')

print(classification_report(y_test, y_pred))
print('\n')
plot_confusion_matrix(confusion_matrix(y_test, y_pred))
plt.show() 