### k-Fold Cross Validation

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')

# Select features and target by column name rather than by position, so the
# selection stays correct (or fails loudly with a KeyError) if the CSV's
# column order ever changes.
feature_columns = ['Age', 'EstimatedSalary']
target_column = 'Purchased'
X = dataset[feature_columns].values   # 2-D array: one row per user, one column per feature
y = dataset[target_column].values     # 1-D array of labels (0 / 1 in the rows previewed below)

In [14]:
# Preview the first five rows to check the column layout
dataset.head(5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [4]:
# Hold out 25% of the samples as a test set; the fixed seed makes the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = 0.25,
    random_state = 0
)

In [5]:
# Standardize the features. The scaler's statistics are learned from the
# training set only and then applied unchanged to both splits, so no
# information from the test set leaks into the preprocessing.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [6]:
# Train a support vector classifier with an RBF kernel on the scaled training data
from sklearn.svm import SVC
classifier = SVC(
    kernel = 'rbf',
    random_state = 0
)
# Last expression of the cell: fit() returns the estimator, so its repr is displayed
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [7]:
# Mean accuracy of the fitted classifier on the (scaled) training set
classifier.score(X = X_train, y = y_train)

0.91333333333333333

In [8]:
# Evaluate the classifier on the held-out test set
from sklearn.metrics import confusion_matrix

# Class labels predicted for the test samples
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)

In [9]:
# Display the confusion matrix of true test labels vs. predicted labels
cm

array([[64,  4],
       [ 3, 29]], dtype=int64)

In [10]:
# Mean accuracy of the fitted classifier on the (scaled) test set
classifier.score(X = X_test, y = y_test)

0.93000000000000005

In [11]:
# Estimate the classifier's accuracy with 10-fold cross-validation on the
# training set; the result holds one score per fold.
# NOTE: X_train was already standardized on the full training set, so each
# validation fold contributed to the scaler's statistics.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(
    estimator = classifier,
    X = X_train,
    y = y_train,
    cv = 10
)
accuracies

array([ 0.80645161,  0.96666667,  0.8       ,  0.93333333,  0.86666667,
        0.83333333,  0.93333333,  0.93333333,  0.96666667,  0.96551724])

In [12]:
# Average accuracy across the cross-validation folds
np.mean(accuracies)

0.90053021876158679

In [13]:
# Spread of the fold accuracies (population standard deviation, ddof = 0)
np.std(accuracies)

0.063889573566262847

### Grid search
#### How do you choose a model?
1. Determine whether the problem is a regression, classification, or clustering problem.

    i. Check whether the data has a dependent variable.
    
    ii. If there is no dependent variable, it is a clustering problem.
    
    iii. If it has a dependent variable, check whether the outcome is continuous.
    If it is, the problem is a regression problem; if the outcome is categorical, it is a classification problem.
    
2. Is the problem linear or non-linear? Grid search helps answer this: it compares linear and non-linear models and shows which one to choose.

In [15]:
# Grid search over SVC hyperparameters: every combination is scored by
# 10-fold cross-validated accuracy, using all available CPU cores.
from sklearn.model_selection import GridSearchCV
parameters = [
    # Linear kernel: only the penalty parameter C is tuned
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    # RBF kernel: C is tuned together with the kernel coefficient gamma
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.5, 0.1, 0.01, 0.001, 0.0001]},
]
# fit() returns the fitted search object, so construction and fitting can be chained
grid_search = GridSearchCV(
    estimator = classifier,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1
).fit(X_train, y_train)

In [19]:
# Mean cross-validated score (scoring = 'accuracy') of the best parameter combination
best_accuracy = grid_search.best_score_

In [20]:
# Display the best cross-validated accuracy found by the grid search
best_accuracy

0.90000000000000002

In [21]:
# Hyperparameter combination that achieved the best cross-validated score
best_parameters = grid_search.best_params_

In [23]:
# Display the winning hyperparameters
best_parameters

{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}

In [24]:
# Now let's search a finer gamma grid around the previous best value (0.5) to look for a better gamma

In [26]:
# Second grid search: same C values and kernels, but the RBF gamma candidates
# now cover 0.1 to 0.9 in steps of 0.1.
parameters = [
    # Linear kernel: only the penalty parameter C is tuned
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    # RBF kernel: C is tuned together with the finer gamma grid
    {'C': [1, 10, 100, 1000], 'kernel': ['rbf'],
     'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
]

# fit() returns the fitted search object, so construction and fitting can be chained
grid_search = GridSearchCV(
    estimator = classifier,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
    n_jobs = -1
).fit(X_train, y_train)

In [27]:
# Best hyperparameter combination found by the most recently fitted grid search
print(grid_search.best_params_)

{'C': 1, 'gamma': 0.7, 'kernel': 'rbf'}


In [28]:
# Mean cross-validated accuracy of that best combination
print(grid_search.best_score_)

0.903333333333
