## SVC (support vector classifier) with GridSearchCV

In [1]:
# read the dataset
import pandas as pd 
dataset = pd.read_csv("Social_Network_Ads.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [2]:
# convert categorical into numerical 
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset = dataset.drop(["User ID"],axis=1)
dataset.head()

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1


In [3]:
# split input and output 
independent = dataset[[ "Age",	"EstimatedSalary", "Gender_Male"]]
dependent = dataset[[ "Purchased" ]]

In [26]:
# Check dataset is balanced or Imbalanced
dataset["Purchased"].value_counts()    # imbalanced 

Purchased
0    257
1    143
Name: count, dtype: int64

In [4]:
# split train and test
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent,dependent, test_size=0.20,random_state=0)
print(x_train,x_test,y_train,y_test )

     Age  EstimatedSalary  Gender_Male
336   58           144000            1
64    59            83000            0
55    24            55000            0
106   26            35000            0
300   58            38000            0
..   ...              ...          ...
323   48            30000            0
192   29            43000            1
117   36            52000            1
47    27            54000            0
172   26           118000            0

[320 rows x 3 columns]      Age  EstimatedSalary  Gender_Male
132   30            87000            1
309   38            50000            0
341   35            75000            1
196   30            79000            0
246   35            50000            0
..   ...              ...          ...
14    18            82000            1
363   42            79000            0
304   40            60000            0
361   53            34000            0
329   47           107000            0

[80 rows x 3 columns]      Purchased
33

In [5]:
# standardization 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [6]:
y_train = y_train.values.ravel()

In [11]:
# model creation + grid cv
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
param_grid = {
"kernel": ["linear", "poly", "rbf", "sigmoid"],  #precomputed - need x to be a square matrix 
"gamma": ["scale", "auto"]
}
model = SVC(probability=True)

grid_model = GridSearchCV(model, param_grid,cv=5, refit=True , verbose=3, n_jobs =-1, scoring='f1_weighted') 
grid_model.fit(x_train,y_train)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [8]:
# model prediction + grid cv
grid_prediction = grid_model.predict(x_test)
grid_prediction

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1])

In [9]:
# model evaluation --- Performance on the unseen TEST dataset
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test,grid_prediction)
clf_report = classification_report(y_test, grid_prediction)
print(cm)
print(clf_report)

[[55  3]
 [ 1 21]]
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        58
           1       0.88      0.95      0.91        22

    accuracy                           0.95        80
   macro avg       0.93      0.95      0.94        80
weighted avg       0.95      0.95      0.95        80



## ROC AUC 

In [15]:
print(grid_model.predict_proba(x_test)[:,1])

[0.07029406 0.03906719 0.02564854 0.03140264 0.01670075 0.04796081
 0.0737412  0.96233663 0.04664184 0.65529522 0.00551429 0.00958219
 0.01360756 0.07995338 0.07289054 0.73694193 0.04979209 0.07430658
 0.88311828 0.10871585 0.01855376 0.88679859 0.10729043 0.94044062
 0.0372402  0.885482   0.06675877 0.02725193 0.02482152 0.06743978
 0.07981256 0.06113587 0.93423212 0.01411516 0.00923895 0.03269413
 0.0421352  0.01794277 0.01952296 0.88792645 0.07039268 0.06154068
 0.05186139 0.13317592 0.79456757 0.01063435 0.06415342 0.81606461
 0.02094393 0.84190689 0.88201692 0.06173477 0.01502986 0.88719441
 0.86668562 0.68006028 0.04636477 0.03705906 0.93348804 0.05641224
 0.08680132 0.95573222 0.04429956 0.54483447 0.07099631 0.93749367
 0.08271373 0.006072   0.05475005 0.12839333 0.91188796 0.03728496
 0.02224546 0.85014893 0.07983843 0.06884002 0.18092262 0.05988611
 0.94958622 0.80907444]


In [16]:
from sklearn.metrics import roc_auc_score     # key metric for binary classification 
roc_auc_score = roc_auc_score(y_test,grid_model.predict_proba(x_test)[:,1] )
roc_auc_score

np.float64(0.9686520376175549)

## f1 score 

In [17]:
from sklearn.metrics import f1_score
f1_score = f1_score(y_test,grid_prediction, average='weighted' )
f1_score

0.950648360030511

## GridSearchCV results 

In [18]:
grid_results = grid_model.cv_results_
print(grid_results)

{'mean_fit_time': array([0.01990581, 0.02556252, 0.02486176, 0.02177629, 0.02016129,
       0.01925211, 0.02381754, 0.02317977]), 'std_fit_time': array([0.00339684, 0.00746415, 0.00337763, 0.00141543, 0.00218797,
       0.0012783 , 0.0017346 , 0.0027346 ]), 'mean_score_time': array([0.0090241 , 0.00992146, 0.0108171 , 0.00905404, 0.00866418,
       0.0089282 , 0.00950603, 0.01159382]), 'std_score_time': array([0.00051244, 0.00201819, 0.00264846, 0.00037176, 0.0005417 ,
       0.0005302 , 0.00046459, 0.00426848]), 'param_gamma': masked_array(data=['scale', 'scale', 'scale', 'scale', 'auto', 'auto',
                   'auto', 'auto'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=np.str_('?'),
            dtype=object), 'param_kernel': masked_array(data=['linear', 'poly', 'rbf', 'sigmoid', 'linear', 'poly',
                   'rbf', 'sigmoid'],
             mask=[False, False, False, False, False, False, False, False],
       fill_value=np.

In [19]:
# table
table = pd.DataFrame.from_dict(grid_results)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.019906,0.003397,0.009024,0.000512,scale,linear,"{'gamma': 'scale', 'kernel': 'linear'}",0.796088,0.758892,0.740864,0.820367,0.902824,0.803807,0.056776,5
1,0.025563,0.007464,0.009921,0.002018,scale,poly,"{'gamma': 'scale', 'kernel': 'poly'}",0.794154,0.783708,0.769318,0.873807,0.919631,0.828124,0.058387,3
2,0.024862,0.003378,0.010817,0.002648,scale,rbf,"{'gamma': 'scale', 'kernel': 'rbf'}",0.860542,0.890137,0.85992,0.907389,0.96875,0.897348,0.040016,1
3,0.021776,0.001415,0.009054,0.000372,scale,sigmoid,"{'gamma': 'scale', 'kernel': 'sigmoid'}",0.766401,0.795968,0.720769,0.726744,0.842259,0.770428,0.045172,7
4,0.020161,0.002188,0.008664,0.000542,auto,linear,"{'gamma': 'auto', 'kernel': 'linear'}",0.796088,0.758892,0.740864,0.820367,0.902824,0.803807,0.056776,5
5,0.019252,0.001278,0.008928,0.00053,auto,poly,"{'gamma': 'auto', 'kernel': 'poly'}",0.794154,0.783708,0.769318,0.873807,0.919631,0.828124,0.058387,3
6,0.023818,0.001735,0.009506,0.000465,auto,rbf,"{'gamma': 'auto', 'kernel': 'rbf'}",0.860542,0.890137,0.85992,0.907389,0.96875,0.897348,0.040016,1
7,0.02318,0.002735,0.011594,0.004268,auto,sigmoid,"{'gamma': 'auto', 'kernel': 'sigmoid'}",0.766401,0.795968,0.720769,0.726744,0.842259,0.770428,0.045172,7


In [20]:
grid_model.best_params_

{'gamma': 'scale', 'kernel': 'rbf'}

In [21]:
grid_model.best_estimator_

In [22]:
""" Mean cross-validation score on the TRAINING data """
grid_model.best_score_

np.float64(0.8973475249246045)

## model prediction with real time data

In [20]:
## user input 
Gender=input("Enter your gender (0-female/1-male): ")
Age	= int(input("Enter your age: "))
EstimatedSalary = int(input("Enter your estimated salary: "))


""" test input data's
Male	19	19000	- 0
Female	46	41000	- 1
Female	50	20000	- 1
Male	36	33000	- 0
"""

Enter your gender (0-female/1-male):  1
Enter your age:  19
Enter your estimated salary:  19000


" test input data's\nMale\t19\t19000\t- 0\nFemale\t46\t41000\t- 1\nFemale\t50\t20000\t- 1\nMale\t36\t33000\t- 0\n"

In [25]:
## user input predction 
user_input_predction = grid_model.predict([[Gender,Age,EstimatedSalary ]])
if user_input_predction==1:
    print("user will purchase")
else:
    print("user will not purchase")

print(user_input_predction)

user will purchase
[1]
