# How to use GridSearchCV?

# Support Vector Machines + GridSearchCV

In [None]:
#First, let us see what are the various arguments that are taken by GridSearchCV function:


sklearn.model_selection.GridSearchCV(estimator, param_grid,scoring=None,
          n_jobs=None, iid='deprecated', refit=True, cv=None, verbose=0, 
          pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False) 



1.estimator: Pass the model instance for which you want to check the hyperparameters.
2.param_grid: the dictionary object that holds the hyperparameters you want to try
3.scoring: evaluation metric that you want to use, you can simply pass a valid string/ object of evaluation metric
4.cv: the cross-validation splitting strategy, e.g. the number of folds to use when evaluating each selected set of hyperparameters
5.verbose: you can set it to 1 to get the detailed result print out while you fit the data to GridSearchCV
6.n_jobs: number of processes you wish to run in parallel for this task. If set to -1 it will use all available processors. 

#n_jobs: Specify the number of cores to use for key machine learning tasks. -1 means all cores
#If you want to use multiprocessing to train your model in parallel on all CPUs, then n_jobs will be helpful for you.
#It just tells how many parallel processes you want to configure for training. 
#n_jobs = -1 means you want to use all the available cores, 
#and if you specify a particular value, e.g. 4, then only that many cores will be used for training


#You should be aware, and will probably notice if you enable verbose > 0, that printing to the screen is generally a very slow process. 
#The algorithm may run an order of magnitude slower, or more, with verbose enabled
#You can also set verbose = True instead of a number >0
    

In [None]:
#Now, let us see how to use GridSearchCV to improve the accuracy of our model. 

#Here we are going to train the model twice, once without using GridsearchCV(using the default hyperparameters) and 
#the other time we will use GridSearchCV to find the optimal values of hyperparameters for the dataset at hand. 
#We use the famous Breast Cancer Wisconsin (Diagnostic) Data Set which we import from the Scikit-learn library.

# Support Vector Machines Model without GridSearchCV

In [1]:
#import all necessary libraries
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split 
 
# Load the breast cancer data and hold out 30% of the samples for testing;
# the fixed random_state makes the split reproducible.
dataset = load_breast_cancer()
X, Y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=101
)

# Baseline without GridSearchCV: an SVC with its default hyperparameters,
# fitted on the training split (fit() returns the fitted estimator itself).
model = SVC().fit(X_train, y_train)

# Predict the labels of the held-out samples.
predictions = model.predict(X_test)

# Print precision, recall, F1 score and support for each class.
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.95      0.85      0.90        66
           1       0.91      0.97      0.94       105

    accuracy                           0.92       171
   macro avg       0.93      0.91      0.92       171
weighted avg       0.93      0.92      0.92       171



In [None]:
#This classification report gives you a lot of information.

#You get the precision, recall, F1 score, and accuracy. 
#You can see that your precision for both classes is relatively close, 
#and you also see a small difference in terms of recall for the two classes. 
#The difference between F1 scores is also not sizable. This means the model performed well

# Support Vector Machines Model with GridSearchCV

In [15]:
# define parameter range
# Each key may appear only once in a dict literal: a repeated key silently
# keeps just the last value given for it, so 'gamma' is listed a single time.
# The grid is 4 values of C x 2 values of gamma x 1 kernel = 8 candidates.
# NOTE: gamma is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid'
# kernels, so with kernel='linear' both gamma candidates train the same model.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear'],
}

# refit=True retrains the best configuration on the whole training set so that
# `grid` can be used directly for prediction; n_jobs=-1 uses all processors.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=True, n_jobs=-1)

# fitting the model with Grid Search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)

# predict the test set with the refitted best estimator
grid_predictions = grid.predict(X_test)

# print classification report
print(classification_report(y_test, grid_predictions))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        63
           1       0.98      0.97      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



In [None]:
#Note the following about the hyperparameters in param_grid:



#C (or Cost hyperparameter)

#C (aka Cost hyperparameter) is the penalty parameter of the error term. 
#It controls the trade off between smooth decision boundary and classifying the training points correctly. 
#Increasing C values may lead to overfitting the training data.

#The C hyperparameter adds a penalty for each misclassified data point. 
#If C is small, the penalty for misclassified points is low so a decision boundary 
# with a large margin is chosen at the expense of a greater number of misclassifications

#A high value of C means a higher penalty and is focused on achieving better accuracy by avoiding errors or misclassification 
#and hence opts for a SVM hyperplane with a better classification accuracy resulting in 
#a smaller margin hyperplane also known as Hard Margin



#C: (Default = 1.0) Controls the tradeoff between smooth decision boundary and classifying training points correctly. 
#A large value of C pushes the model to classify more training points correctly, therefore leading to a more intricate boundary.

#The C parameter trades off correct classification of training examples against maximization of the decision function's margin


#gamma

#The gamma parameters can be seen as the inverse of the radius of influence of samples selected 
#by the model as support vectors. 

#gamma is a hyperparameter used with non-linear SVM. 
#One of the most commonly used non-linear kernels is the radial basis function (RBF). 
#Gamma parameter of RBF controls the distance of the influence of a single training point

#gamma: Defines how far the influence of a single training example reaches. 
#A high gamma value means only the points closest to the decision boundary carry weight, leading to a more intricate (less smooth) boundary
#gamma is a parameter for non linear hyperplanes. 
#The higher the gamma value, the more closely the model tries to fit the training data set

#The default value for gamma is 'scale', which uses 1 / (n_features * X.var()). 
#gamma is the kernel coefficient for 'rbf', 'poly', and 'sigmoid'. 
#If gamma is 'auto', then 1 / n_features will be used instead

#Kernel

#SVM algorithms use a group of mathematical functions that are known as kernels. 
#The function of a kernel is to take data as input and transform it into the desired form. 

#Different SVM algorithms use differing kinds of kernel functions. 
#These kernel functions are of different kinds. For example: linear, nonlinear, polynomial, radial basis function (RBF), and sigmoid


In [None]:
#This classification report gives you a lot of information.

#You get the precision, recall, F1 score, and accuracy. 
#You can see that your precision for both classes is relatively close, 
#and you also see a small difference in terms of recall for the two classes. 
#The difference between F1 scores is also not sizable. This means the model performed better than the model without GridSearch

In [None]:
#It might seem that {'C': 10, 'gamma': 'scale', 'kernel': 'linear'} are the best values for hyperparameters for an SVM model. 
#This is not the case, the above-mentioned hyperparameters may be the best for the dataset we are working on. 
#But for any other dataset, the SVM model can have different optimal values for hyperparameters that may improve its performance.

# Random Forest Model without Grid Search

In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.datasets import load_breast_cancer 
from sklearn.metrics import (precision_score,
                            accuracy_score)

warnings.filterwarnings('ignore')

# Load the breast cancer data and separate the inputs from the labels.
df = load_breast_cancer()
X, y = df.data, df.target

# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The usual workflow: instantiate a model, call .fit() to train it, then call
# .predict() to get predictions. Here the RandomForestClassifier keeps every
# default parameter value and is fitted without Grid Search.
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

# Make predictions for the test set
y_pred_test = rfc.predict(X_test)








In [10]:
# View accuracy score: the fraction of test samples whose predicted label
# matches the true label. As the last expression in the cell, its value is
# displayed by the notebook.
accuracy_score(y_test, y_pred_test)

0.9707602339181286

In [11]:
# View confusion matrix for test data and predictions
# (scikit-learn convention: rows are the true classes, columns the predicted
# classes). As the last expression in the cell, the matrix is displayed by the
# notebook.
confusion_matrix(y_test, y_pred_test)

array([[ 59,   4],
       [  1, 107]], dtype=int64)

In [12]:



# View the classification report for test data and predictions:
# per-class precision, recall, F1 score and support, plus the overall accuracy
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96        63
           1       0.96      0.99      0.98       108

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



# Random Forest Model with Grid Search

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings

warnings.filterwarnings('ignore')

# Load the breast cancer data and separate the inputs from the labels.
df = load_breast_cancer()
X = df.data
y = df.target

# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rfc = RandomForestClassifier()

# Hyperparameter grid: 5 depths x 13 feature counts = 65 candidates.
# An integer max_features must be at least 1, so the range starts at 1; a value
# of 0 is rejected by the estimator, and with warnings silenced above those
# failed fits would go unnoticed while still costing 10 folds each.
forest_params = [{'max_depth': list(range(10, 15)), 'max_features': list(range(1, 14))}]

# Fit model with Grid Search: every candidate is scored by accuracy using
# 10-fold cross-validation on the training split.
clf = GridSearchCV(rfc, forest_params, cv=10, scoring='accuracy')

clf.fit(X_train, y_train)

# best_params_ is the combination of hyperparameter values that gave the best
# cross-validated performance for the estimator
print(clf.best_params_)

# best_score_ is the mean score over all cv folds for that best combination of
# the parameters specified in forest_params
print(clf.best_score_)


{'max_depth': 10, 'max_features': 5}
0.9597435897435898


In [None]:
#Note for Random Forest

#max depth: 
#max_depth is the maximum depth of each decision tree, i.e. the longest chain of splits allowed from the root to a leaf. 
#If the depth is too low, the model underfits the data and if it is too high the model overfits. 
#Generally, we go with a max depth of 3, 5, or 7

#max features

#Random forest takes random subsets of features and tries to find the best split. 
#max_features helps to find the number of features to take into account in order to make the best split. 
#max_feature is the number of features to consider each time to make the split decision


# Logistic Regression + Random Search

In [None]:
#Below we will use hyperparameter optimization to discover a well-performing model configuration for the sonar dataset.

#The sonar dataset is a standard machine learning dataset comprising 208 rows of data with 60 numerical input variables and a target variable with two class values, e.g. binary classification.

#Using a test harness of repeated stratified 10-fold cross-validation with three repeats, a naive model can achieve an accuracy of about 53 percent. 
#A top-performing model can achieve accuracy on this same test harness of about 88 percent. 
#This provides the bounds of expected performance on this dataset.

#The dataset involves predicting whether sonar returns indicate a rock or simulated mine.

In [19]:
# Random Search logistic regression model on the sonar dataset
#Author - Jason Brownlee

from scipy.stats import loguniform
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

# load dataset (the CSV file has no header row)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)

# split into input and output elements:
# every column except the last is an input, the last column is the target
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]

# define model
model = LogisticRegression()

# define evaluation/validation strategy:
# stratified 10-fold cross-validation, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
#
# The search space is a dictionary whose keys are arguments to the model and
# whose values are either a list of discrete choices (solver, penalty) or a
# distribution from which to draw samples (C).
# C is drawn from a log-uniform distribution between 1e-5 and 100: log-uniform
# suits penalty values because we usually explore them at different orders of
# magnitude, at least as a first step.
#
# C is known as a "Cost hyperparameter."
# Parameters are the numbers that tell the model what to do with the
# characteristics, whereas hyperparameters instruct the model on how to choose
# those parameters. Regularization penalizes extreme parameter values, which
# tend to overfit the training data.
# Hyperparameters are critical for building robust and accurate models because
# they help us find the balance between bias and variance.
# In scikit-learn, C is the inverse of the regularization strength: a high C
# gives more weight to fitting the training data (it treats the training data
# as representative of real-world data), whereas a low C gives more weight to
# keeping the model simple at the cost of fitting the data.
space = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': loguniform(1e-5, 100),
}

# define search
#
# The number of configurations sampled from the search space must be set via
# the "n_iter" argument; here it is 500. n_jobs=-1 uses all available
# processors and random_state fixes which configurations are sampled.
search = RandomizedSearchCV(
    model,
    space,
    n_iter=500,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)

# execute search
result = search.fit(X, y)

# summarize (and display) results
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: 0.7897619047619049
Best Hyperparameters: {'C': 4.878363034905756, 'penalty': 'l2', 'solver': 'newton-cg'}


In [None]:
#At the end of the search, you can access all of the results via attributes on the class. 
#Perhaps the most important attributes are the best score observed and the hyperparameters that achieved the best score.

In [None]:
#Once you know the set of hyperparameters that achieve the best result, you can then define a new model, 
# set the values of each hyperparameter, then fit the model on all available data. 
#This model can then be used to make predictions on new data

# Logistic Regression + Grid Search

In [None]:
#Using the Grid Search is much like using the Random Search for classification.

#The main difference is that the search space must be a discrete grid to be searched. 
#This means that instead of using a log-uniform distribution for C, we can specify discrete values on a log scale.

#Additionally, the GridSearchCV class does not take a number of iterations, 
#as we are only evaluating combinations of hyperparameters in the grid.



In [None]:
# Grid Search logistic regression model on the sonar dataset
#Author - Jason Brownlee

from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# load dataset (the CSV file has no header row)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)

# split into input and output elements:
# every column except the last is an input, the last column is the target
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]

# define model
model = LogisticRegression()

# define evaluation:
# stratified 10-fold cross-validation, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
# A grid search needs discrete values for every hyperparameter, so C is given
# as explicit values on a log scale rather than as a distribution.
space = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
}

# define search
# Every combination in the grid is evaluated, so there is no iteration count;
# n_jobs=-1 uses all available processors.
search = GridSearchCV(
    model,
    space,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
)

# execute search
result = search.fit(X, y)

# summarize (and display) results
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
#Running the example above may take a moment. 
#It is fast because we are using a small search space and a fast model to fit and evaluate. 
#Again, you may see some warnings during the optimization for invalid configuration combinations. These can be safely ignored.

#At the end of the run, the best score and hyperparameter configuration that achieved the best performance are reported.

#Your specific results will vary given the stochastic nature of the optimization procedure. 
#Try running the example a few times.

#In this case, we can see that the best configuration achieved an accuracy of about 78.2% 
#which is also fair and the specific values for the solver, penalty and C hyperparameters used to achieve that score. 
#Interestingly, the results are very similar to those found via the random search.

# Regression + Random Search

In [None]:
#Hyperparameter Optimization for Regression

#The code below will use hyperparameter optimization to discover a top-performing 
#model configuration for the auto insurance dataset.

#The auto insurance dataset is a standard machine learning dataset 
# comprising 63 rows of data with 1 numerical input variable and a numerical target variable.

#Using a test harness of repeated 10-fold cross-validation with 3 repeats, 
#a naive model can achieve a mean absolute error (MAE) of about 66.
#A top performing model can achieve a MAE on this same test harness of about 28. 
#This provides the bounds of expected performance on this dataset.

#The dataset involves predicting the total amount in claims (thousands of Swedish Kronor) 
#given the number of claims for different geographical regions

In [13]:
#Random Search linear regression model on the auto insurance dataset
#Author - Jason Brownlee

#The main difference in regression compared to classification is the choice of the scoring method.

#For regression, performance is often measured using an error, which is minimized, 
#with zero representing a model with perfect skill. 

#The hyperparameter optimization procedures in scikit-learn assume a maximizing score. 
#Therefore a version of each error metric is provided that is made negative.
#This means that large positive errors become large negative errors, 
#good performance are small negative values close to zero and perfect skill is zero.

#The sign of the negative MAE can be ignored when interpreting the result.

#In this case we will use mean absolute error (MAE) and 
#a maximizing version of this error is available by setting the “scoring” argument to “neg_mean_absolute_error“.



from scipy.stats import loguniform
from pandas import read_csv
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV

#Running the example downloads the dataset and splits it into input and output elements. 
#As expected, we can see that there are 63 rows of data with 1 input variable.

# load dataset (the CSV file has no header row)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/auto-insurance.csv'
dataframe = read_csv(url, header=None)

# split into input and output elements:
# every column except the last is an input, the last column is the target
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]

# Next, hyperparameter optimization is used to find a good model configuration
# for the auto insurance dataset. To keep things simple we focus on a linear
# model, Ridge regression, and the hyperparameters commonly tuned for it.

# define model
model = Ridge()

# define evaluation: 10-fold cross-validation, repeated 3 times with a fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Random Search for Regression
# Configuring and using the random search procedure for regression is much like
# using it for classification.
#
# define search space
# The hyperparameters searched are solver, alpha, fit_intercept and normalize.
# All of them use a discrete list of values except "alpha", the penalty term,
# which is drawn from a log-uniform distribution, just as was done for the "C"
# argument of logistic regression.
# NOTE(review): the `normalize` argument of Ridge was deprecated and later
# removed in newer scikit-learn releases; verify it against the installed
# version before re-running this cell.
space = {
    'solver': ['svd', 'cholesky', 'lsqr', 'sag'],
    'alpha': loguniform(1e-5, 100),
    'fit_intercept': [True, False],
    'normalize': [True, False],
}

# define search
# 500 configurations are sampled; the score is the negated mean absolute
# error, so values closer to zero are better.
search = RandomizedSearchCV(
    model,
    space,
    n_iter=500,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    random_state=1,
)

# execute search
result = search.fit(X, y)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -29.12510803005527
Best Hyperparameters: {'alpha': 0.00011012287475478404, 'fit_intercept': True, 'normalize': True, 'solver': 'sag'}


In [None]:
#Running the example may take a moment. 
#It is fast because we are using a small search space and a fast model to fit and evaluate. 
#You may see some warnings during the optimization for invalid configuration combinations. These can be safely ignored.

#At the end of the run, the best score and hyperparameter configuration that achieved the best performance are reported.

#Your specific results will vary given the stochastic nature of the optimization procedure. 
#Try running the example a few times.

#In this case, we can see that the best configuration achieved a MAE of about 29.1, 
#which is very close to the best performance on the model. 
#We can then see the specific hyperparameter values that achieved this result.

# Regression + GridSearch

In [None]:
#As a grid search, we cannot define a distribution to sample and 
# instead must define a discrete grid of hyperparameter values. 
#As such, we will specify the “alpha” argument as a range of values on a log-10 scale.

#Grid search for regression requires that the “scoring” be specified, much as we did for random search.

#In this case, we will again use the negative MAE scoring function.

In [14]:
#Grid Search linear regression model on the auto insurance dataset
#Author - Jason Brownlee

from pandas import read_csv
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

# load dataset (the CSV file has no header row)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/auto-insurance.csv'
dataframe = read_csv(url, header=None)

# split into input and output elements:
# every column except the last is an input, the last column is the target
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]

# define model
model = Ridge()

# define evaluation: 10-fold cross-validation, repeated 3 times with a fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# define search space
# A grid search needs discrete values for every hyperparameter, so "alpha" is
# given as explicit values on a log-10 scale rather than as a distribution.
# NOTE(review): the `normalize` argument of Ridge was deprecated and later
# removed in newer scikit-learn releases; verify it against the installed
# version before re-running this cell.
space = {
    'solver': ['svd', 'cholesky', 'lsqr', 'sag'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
    'fit_intercept': [True, False],
    'normalize': [True, False],
}

# define search
# Every combination in the grid is evaluated; the score is the negated mean
# absolute error, so values closer to zero are better.
search = GridSearchCV(
    model,
    space,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
)

# execute search
result = search.fit(X, y)

# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

Best Score: -29.23817478943268
Best Hyperparameters: {'alpha': 1, 'fit_intercept': True, 'normalize': False, 'solver': 'sag'}


In [None]:
#Running the example may take a minute. 
#It is fast because we are using a small search space and a fast model to fit and evaluate. 
#Again, you may see some warnings during the optimization for invalid configuration combinations. 
#These can be safely ignored.

#At the end of the run, the best score and hyperparameter configuration that achieved the best performance are reported.

#Your specific results will vary given the stochastic nature of the optimization procedure. 
#Try running the example a few times.

#In this case, we can see that the best configuration achieved a MAE of about 29.2, 
#which is nearly identical to what we achieved with the random search in the previous section. 
#The selected solver and fit_intercept are also the same, although the chosen alpha and normalize values differ