In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
dataset=pd.read_csv('Social_Network_Ads.csv')

In [3]:
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [4]:
X=dataset.iloc[:,[2,3]].values # X pulled from all rows and col 2 (age) and 3 (salary)
Y=dataset.iloc[:,4].values

In [5]:
dataset.info() # 400 samples

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null float64
EstimatedSalary    400 non-null float64
Purchased          400 non-null int64
dtypes: float64(2), int64(2), object(1)
memory usage: 15.7+ KB


In [6]:
#split data into training and testing set

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.25, random_state=0)

In [7]:
# apply feature scaling

from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.transform(X_test)

In [8]:
# Logistic regression is a linear regression model - line separates 1 and 0
# Import from 'linear_model

from sklearn.svm import SVC


In [9]:
classifier=SVC(kernel='rbf',random_state=0) # default kernel, Gaussian
classifier.fit(X_train,Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [10]:
# predict test set results

Y_pred=classifier.predict(X_test)

In [11]:
# making the Confusion Matrix - correct and actual
# evaluate the performance of the classifier

# function to create Confusion Matrix

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(Y_test,Y_pred)

In [12]:
cm

# 64 and 29 are the correct predictions
# 4 and 3 are the incorrect predicitons

array([[64,  4],
       [ 3, 29]])

In [13]:
# applying k-fold cross validation

from sklearn.model_selection import cross_val_score

In [14]:
accuracies=cross_val_score(estimator=classifier,X=X_train,y=Y_train, cv=10)

In [15]:
accuracies

array([ 0.80645161,  0.96666667,  0.8       ,  0.93333333,  0.86666667,
        0.83333333,  0.93333333,  0.93333333,  0.96666667,  0.96551724])

In [16]:
accuracies.mean()
accuracies.std()

0.063889573566262847

In [17]:
# applying Grid Search to find best model and best parameters

from sklearn.model_selection import GridSearchCV

In [18]:
# parameters - list of dictionaries, from SVC kernel

parameters = [{'C':[1,10,100,1000],'kernel':['linear']},
             {'C':[1,10,100,1000],'kernel':['rbf'],'gamma':[0.5,0.1,0.01,0.001,
                                                            0.0001]}]
grid_search=GridSearchCV(estimator=classifier,param_grid=parameters,
                        scoring='accuracy', cv=10)

In [20]:
grid_search=grid_search.fit(X_train,Y_train)

In [21]:
best_accuracy = grid_search.best_score_
best_accuracy

0.90000000000000002

In [22]:
best_parameters=grid_search.best_params_
best_parameters

{'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}