In [26]:
# Social Network Advertisements Support Vector Machine

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


In [27]:
# Load the Social Network Ads CSV (read relative to the working directory) into a DataFrame
csv_path = 'Social_Network_Ads.csv'
dataset = pd.read_csv(csv_path)

In [28]:
# Peek at the first five rows (5 is also the default for head())
dataset.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [29]:
# See what the data looks like: for every column, print its name followed by
# the first five of its sorted distinct values
for column_name in dataset.columns:
    distinct_values = np.unique(dataset[column_name])
    print('{}   {}'.format(column_name, str(distinct_values[:5])))

User ID   [15566689 15569641 15570769 15570932 15571059]
Gender   ['Female' 'Male']
Age   [18 19 20 21 22]
EstimatedSalary   [15000 16000 17000 18000 19000]
Purchased   [0 1]


In [30]:
# Check for duplicated rows: show every row that repeats an earlier one
# (an empty frame means there are no duplicates)
duplicate_mask = dataset.duplicated()
dataset.loc[duplicate_mask]

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased


In [31]:
# Check for nulls: info() prints each column's non-null count and dtype,
# so a count lower than the number of entries reveals missing values
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
User ID            400 non-null int64
Gender             400 non-null object
Age                400 non-null int64
EstimatedSalary    400 non-null int64
Purchased          400 non-null int64
dtypes: int64(4), object(1)
memory usage: 15.7+ KB


In [32]:
# Convert the Gender column to integer labels.
# Gender has only two distinct values, so a plain label encoding (0/1) is enough.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(dataset['Gender'])

# Key for the categorical variable: original category -> integer code.
# The position of a label in le.classes_ is the code transform() assigns to it.
gender_key = {label: code for code, label in enumerate(le.classes_)}

# NOTE: this overwrites the Gender column in place. Re-running this cell without
# reloading the CSV would re-fit the encoder on the integer codes and replace
# gender_key with a meaningless {0: 0, 1: 1} mapping.
dataset['Gender'] = le.transform(dataset['Gender'])

# Display the key. Only the last expression of a cell is rendered, so it has to
# come last (placed mid-cell it is silently discarded).
gender_key

In [33]:
# Build the feature matrix X and the target vector y as NumPy arrays.
# Column positions in dataset: 0=User ID, 1=Gender, 2=Age, 3=EstimatedSalary, 4=Purchased
# User ID is left out of the features on purpose; Purchased is the target.
feature_positions = slice(1, 4)
target_position = 4
X = dataset.iloc[:, feature_positions].values
y = dataset.iloc[:, target_position].values

In [34]:
# Split the data into a training set and a test set:
# 20% of the rows are held out for testing, and the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split
split_options = dict(test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [35]:
# StandardScaler normalizes each INDEPENDENT feature so that its distribution has
# a mean of 0 and a standard deviation of 1; this keeps large-valued features
# from overwhelming small-valued ones.
# The scaler is fitted on the training rows only, then applied to both sets.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

In [36]:
# Fit a linear-kernel support vector classifier to the training set
from sklearn.svm import SVC
linear_svc_options = {'kernel': 'linear', 'random_state': 0}
classifier = SVC(**linear_svc_options)
# fit() returns the estimator, so leaving it as the last expression displays its settings
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

In [37]:
# Predicting the Test set results
# y_pred holds the label the fitted classifier predicts for each row of X_test
y_pred = classifier.predict(X_test)

In [38]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes
#   [[TN (actual False, predicted False), FP (actual False, predicted True)],
#    [FN (actual True,  predicted False), TP (actual True,  predicted True)]]
from sklearn.metrics import confusion_matrix  # functions are lowercase, classes are uppercase
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[57  1]
 [ 6 16]]


In [39]:
# Classification report, one row per class:
#   precision = tp / (tp + fp): ability of the classifier not to label as positive a sample that is negative
#   recall    = tp / (tp + fn): ability of the classifier to find all the positive samples
#   f1-score  = harmonic mean of precision and recall; closer to 1 is better

from sklearn.metrics import classification_report
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        58
           1       0.94      0.73      0.82        22

    accuracy                           0.91        80
   macro avg       0.92      0.86      0.88        80
weighted avg       0.91      0.91      0.91        80



## Try RBF kernel

In [40]:
# Fit an RBF-kernel support vector classifier to the training set.
# This rebinds `classifier`, so the earlier linear model is no longer reachable by that name.
from sklearn.svm import SVC
rbf_svc_options = {'kernel': 'rbf', 'random_state': 0}
classifier = SVC(**rbf_svc_options)
# fit() returns the estimator, so leaving it as the last expression displays its settings
classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)

In [41]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  # functions are lowercase, classes are uppercase

# Predict the test set results with the current classifier
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the actual classes, columns are the predicted classes
#   [[TN (actual False, predicted False), FP (actual False, predicted True)],
#    [FN (actual True,  predicted False), TP (actual True,  predicted True)]]
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Classification report, one row per class:
#   precision = tp / (tp + fp): ability of the classifier not to label as positive a sample that is negative
#   recall    = tp / (tp + fn): ability of the classifier to find all the positive samples
#   f1-score  = harmonic mean of precision and recall; closer to 1 is better
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

[[55  3]
 [ 1 21]]
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        58
           1       0.88      0.95      0.91        22

    accuracy                           0.95        80
   macro avg       0.93      0.95      0.94        80
weighted avg       0.95      0.95      0.95        80



### Different kernels simply produce different decision boundaries between the classes. Kernel functions are used to map the original dataset into a higher-dimensional space, where a separating hyperplane is then fitted.

### Evaluate performance more reliably with k-fold cross-validation

In [42]:
# Advanced Performance Evaluation Method
# Applying k-Fold Cross Validation
# Divides the data into k distinct subsets (folds); each fold is used once for validation while the other k-1 folds are used for training
from sklearn.model_selection import cross_val_score

In [46]:
# Run 10-fold cross-validation of the current classifier on the training set.
# The training set is split into 10 folds and one accuracy is returned per fold.
# 10 is the most common choice: 10 accuracies are enough to get a relevant idea
# of model performance.
n_folds = 10
accuracies = cross_val_score(classifier, X_train, y_train, cv=n_folds)




In [47]:
# Average of the 10 fold accuracies: a relevant single-number evaluation of model performance
np.mean(accuracies)


0.894021871945259

In [48]:
# Standard deviation of the fold accuracies: how much the score varies from fold to fold
np.std(accuracies)

0.050156852257787835