In [1]:
# import the lib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load the dataset
data = pd.read_csv('Social_Network_Ads.csv')
print('Data loaded')

Data loaded


In [9]:
data

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [5]:
data.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [6]:
# check how many purchased
data['Purchased'].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [7]:
# Gender using encoding
gender = pd.get_dummies(data['Gender'], drop_first=True)
gender

Unnamed: 0,Male
0,1
1,1
2,0
3,0
4,1
...,...
395,0
396,1
397,0
398,1


In [9]:
df = pd.concat([data, gender], axis=1)

In [10]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased,Male
0,15624510,Male,19,19000,0,1
1,15810944,Male,35,20000,0,1
2,15668575,Female,26,43000,0,0
3,15603246,Female,27,57000,0,0
4,15804002,Male,19,76000,0,1
...,...,...,...,...,...,...
395,15691863,Female,46,41000,1,0
396,15706071,Male,51,23000,1,1
397,15654296,Female,50,20000,1,0
398,15755018,Male,36,33000,0,1


In [11]:
# create independent & dependent variables
X = df[['Age', 'EstimatedSalary', 'Male']]
y = df['Purchased']

In [13]:
X 

Unnamed: 0,Age,EstimatedSalary,Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1
...,...,...,...
395,46,41000,0
396,51,23000,1
397,50,20000,0
398,36,33000,1


In [14]:
y

0      0
1      0
2      0
3      0
4      0
      ..
395    1
396    1
397    1
398    0
399    1
Name: Purchased, Length: 400, dtype: int64

In [16]:
# Apply scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [17]:
# split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# apply scaling
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [28]:
from sklearn import svm
ml = svm.SVC(kernel='rbf')
ml.fit(X_train, y_train)

In [29]:
pred = ml.predict(X_test)

In [30]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [31]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        48
           1       0.96      0.84      0.90        32

    accuracy                           0.93        80
   macro avg       0.93      0.91      0.92        80
weighted avg       0.93      0.93      0.92        80



In [32]:
from sklearn import svm
ml = svm.SVC(kernel='linear')
ml.fit(X_train, y_train)

In [33]:
pred = ml.predict(X_test)

In [34]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89        56
           1       0.71      0.83      0.77        24

    accuracy                           0.85        80
   macro avg       0.82      0.85      0.83        80
weighted avg       0.86      0.85      0.85        80



In [38]:
from sklearn.model_selection import GridSearchCV

ml1 = svm.SVC()

param_grid = {
 'C': [1, 10, 100, 1000, 10000],
 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
 'kernel': [ 'rbf', 'linear']
}

grid = GridSearchCV(ml1, param_grid, refit=True, verbose=10, cv=5, n_jobs=-1)

grid_search = grid.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 4/5; 40/50] END C=1000, gamma=0.0001, kernel=linear;, score=0.750 total time=   0.0s
[CV 5/5; 40/50] START C=1000, gamma=0.0001, kernel=linear.......................
[CV 5/5; 40/50] END C=1000, gamma=0.0001, kernel=linear;, score=0.812 total time=   0.0s
[CV 1/5; 41/50] START C=10000, gamma=1, kernel=rbf..............................
[CV 1/5; 41/50] END C=10000, gamma=1, kernel=rbf;, score=0.766 total time=   0.0s
[CV 2/5; 41/50] START C=10000, gamma=1, kernel=rbf..............................
[CV 2/5; 41/50] END C=10000, gamma=1, kernel=rbf;, score=0.875 total time=   0.0s
[CV 3/5; 41/50] START C=10000, gamma=1, kernel=rbf..............................
[CV 3/5; 41/50] END C=10000, gamma=1, kernel=rbf;, score=0.859 total time=   0.0s
[CV 4/5; 41/50] START C=10000, gamma=1, kernel=rbf..............................
[CV 4/5; 41/50] END C=10000, gamma=1, kernel=rbf;, score=0.797 total time=   0.1s
[CV 5/5; 41/50] START C=100

In [39]:
grid_search.best_params_

{'C': 1, 'gamma': 1, 'kernel': 'rbf'}

In [40]:
ml = svm.SVC(C=1, gamma=1, kernel='rbf')
ml.fit(X_train, y_train)

[CV 4/5; 25/50] START C=100, gamma=0.01, kernel=rbf.............................
[CV 4/5; 25/50] END C=100, gamma=0.01, kernel=rbf;, score=0.828 total time=   0.0s
[CV 2/5; 32/50] START C=1000, gamma=1, kernel=linear............................
[CV 2/5; 32/50] END C=1000, gamma=1, kernel=linear;, score=0.844 total time=   0.1s
[CV 3/5; 32/50] START C=1000, gamma=1, kernel=linear............................
[CV 3/5; 32/50] END C=1000, gamma=1, kernel=linear;, score=0.844 total time=   0.1s
[CV 4/5; 32/50] START C=1000, gamma=1, kernel=linear............................
[CV 4/5; 32/50] END C=1000, gamma=1, kernel=linear;, score=0.750 total time=   0.0s
[CV 5/5; 32/50] START C=1000, gamma=1, kernel=linear............................
[CV 5/5; 32/50] END C=1000, gamma=1, kernel=linear;, score=0.812 total time=   0.0s
[CV 1/5; 33/50] START C=1000, gamma=0.1, kernel=rbf.............................
[CV 1/5; 33/50] END C=1000, gamma=0.1, kernel=rbf;, score=0.891 total time=   0.0s
[CV 2/5; 33/

In [41]:
pred = ml.predict(X_test)

In [42]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94        48
           1       0.96      0.84      0.90        32

    accuracy                           0.93        80
   macro avg       0.93      0.91      0.92        80
weighted avg       0.93      0.93      0.92        80



In [43]:
pred_train = ml.predict(X_train)

In [44]:
print(classification_report(pred_train, y_train))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94       195
           1       0.93      0.86      0.89       125

    accuracy                           0.92       320
   macro avg       0.92      0.91      0.91       320
weighted avg       0.92      0.92      0.92       320

