In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt

In [2]:
# Load the Social Network Ads data (path is relative to the notebook's working
# directory) and preview the first rows to check the column layout.
dataset = pd.read_csv('Social_Network_Ads.csv')
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [3]:
# Features: customer age and estimated salary. Target: whether the customer purchased.
# Columns are selected by name rather than by position so the selection cannot
# silently shift to the wrong columns if the CSV gains, loses or reorders columns.
X = dataset[['Age', 'EstimatedSalary']].values
Y = dataset['Purchased'].values

In [4]:
# Split the data into training and test sets: 25% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)

In [5]:
# Standardize the features so both are on a comparable scale before fitting the SVM.
# The scaler is fitted on the training rows only and then applied to both sets,
# so no test-set statistics leak into the transformation.
# NOTE: this cell overwrites X_train / X_test; re-running it without re-running the
# split cell first would rescale data that has already been scaled.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [6]:
from sklearn.svm import SVC
# Support-vector classifier with the RBF (Gaussian) kernel -- not a linear kernel.
classifier = SVC(kernel='rbf', random_state=0)
# Keep fit() as the last expression so the cell echoes the fitted estimator.
classifier.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [7]:
# Predict labels for the held-out test set (X_test was already scaled with the
# training-set scaler); the bare name echoes the predictions.
y_pred = classifier.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1], dtype=int64)

In [8]:
# A confusion matrix on a single train/test split is a valid way to check the
# model's performance, but it is not the most reliable one: the result depends
# on which rows happened to land in the test set.

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
cm

array([[64,  4],
       [ 3, 29]], dtype=int64)

In [9]:
# A more reliable performance estimate: k-fold cross-validation with k = 10.
# The training data is split into 10 folds and the model is scored once per fold.
# NOTE(review): X_train was standardized on the whole training set before this
# step, so each validation fold influenced the scaler; wrapping scaler + SVC in a
# Pipeline would give a strictly leak-free estimate.

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(
    estimator=classifier,
    X=X_train,
    y=Y_train,
    cv=10,
    n_jobs=-1,  # -1 means: use all available CPU cores
)
accuracies

array([0.8       , 0.96666667, 0.8       , 0.96666667, 0.86666667,
       0.86666667, 0.9       , 0.93333333, 1.        , 0.93333333])

In [10]:
# Mean accuracy over the 10 cross-validation folds.
accuracies.mean()

0.9033333333333333

In [11]:
# Standard deviation of the fold accuracies: how much the score varies from fold to fold.
accuracies.std()

0.06574360974438671

In [12]:
# Up to this point we have seen how to measure the model's performance more reliably
# (k-fold cross-validation). Next we will try to improve the model itself by tuning
# its hyperparameters to their optimal values.

In [17]:
from sklearn.model_selection import GridSearchCV
# Search space for the grid search. Each dict is an independent sub-grid:
#   1st option: linear kernel, varying only the regularization strength C.
#   2nd option: RBF kernel, varying both C and the kernel width gamma.
parameters = [
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['linear'],
    },
    {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.5, 0.7, 0.001],
    },
]


In [18]:
# Exhaustive search over `parameters`, scoring each combination by its accuracy
# under 10-fold cross-validation.
# n_jobs=-1 uses all CPU cores. The previous value of -10 is read by joblib as
# "all cores but nine", which falls back to a single worker on machines with
# fewer than ten cores. The search results themselves do not depend on n_jobs.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

In [19]:
# Run the search on the (scaled) training data; the test set is not used here.
grid_search = grid_search.fit(X_train, Y_train)

In [21]:
# Best mean cross-validated accuracy found by the grid search.
best_accuracy = grid_search.best_score_
best_accuracy

0.9066666666666666

In [22]:
# Hyperparameter combination that achieved the best cross-validated score.
best_parameters = grid_search.best_params_
best_parameters

{'C': 1, 'gamma': 0.7, 'kernel': 'rbf'}

In [None]:
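# Grid search has selected the best combination of C, kernel and gamma from the grid:
# an RBF kernel with C=1 and gamma=0.7, giving a mean 10-fold cross-validated accuracy
# of about 90.7% -- only slightly above the ~90.3% obtained earlier with the default settings.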