In [9]:
# Kennedi Todd
# August 7, 2024
# Chapter 11: Support Vector Machines
# predict whether a user will click on an online advertisement

# libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# load the advertising dataset from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer = 'advertising.csv')

# (rows, columns) of the freshly loaded frame
df.shape

(1000, 10)

In [10]:
# preview the first five rows of the raw data
df.head(n = 5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [11]:
# remove variables
# 'Ad Topic Line' and 'Timestamp' are dropped before modelling.
# drop() returns a new frame instead of mutating in place, and errors = 'ignore'
# makes a second execution of this cell a no-op rather than a KeyError
# (the original `del df[...]` could only be run once per kernel session).
df = df.drop(columns = ['Ad Topic Line', 'Timestamp'], errors = 'ignore')
df.shape

(1000, 8)

In [12]:
# one-hot encoding
# expand the categorical 'Country' and 'City' columns into one indicator column per value
# (Country_<name>, City_<name>).
# Only the columns still present are encoded, so re-running this cell after df has
# already been encoded leaves df unchanged instead of raising a KeyError.
categorical_cols = [col for col in ['Country', 'City'] if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns = categorical_cols)
df.head()

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Male,Clicked on Ad,Country_Afghanistan,Country_Albania,Country_Algeria,Country_American Samoa,...,City_Wintersfort,City_Wongland,City_Wrightburgh,City_Wrightview,City_Yangside,City_Youngburgh,City_Youngfort,City_Yuton,City_Zacharystad,City_Zacharyton
0,68.95,35,61833.9,256.09,0,0,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
1,80.23,31,68441.85,193.77,1,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,69.47,26,59785.94,236.5,0,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,74.15,29,54806.18,245.89,1,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,68.37,35,73889.99,225.58,0,0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [13]:
# target label (y) and feature matrix (X = every column except the label)
y = df['Clicked on Ad']
X = df.drop(columns = ['Clicked on Ad'])

# hold out 30% of the rows for testing; the fixed random_state keeps the shuffled split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    shuffle = True,
    random_state = 10,
)

# baseline model: support vector classifier with default hyperparameters,
# trained on the training split (fit() returns the fitted estimator itself)
model = SVC().fit(X_train, y_train)

# predicted labels for the held-out test rows
model_predict = model.predict(X_test)

# confusion matrix layout (rows = actual class, columns = predicted class)
#   [0, 0] top left     = correct negative predictions
#   [0, 1] top right    = false-positives
#   [1, 0] bottom left  = false-negatives
#   [1, 1] bottom right = correct positive predictions
print(confusion_matrix(y_true = y_test, y_pred = model_predict))

[[124  22]
 [ 68  86]]


In [15]:
# per-class metrics for the baseline model on the test split
#   precision = correctly predicted positives / all predicted positives
#   recall    = correctly predicted positives / all actual positives
print(classification_report(y_true = y_test, y_pred = model_predict))

              precision    recall  f1-score   support

           0       0.65      0.85      0.73       146
           1       0.80      0.56      0.66       154

    accuracy                           0.70       300
   macro avg       0.72      0.70      0.70       300
weighted avg       0.72      0.70      0.69       300



In [16]:
# grid search over the two SVC hyperparameters with the biggest impact on accuracy
#   C     - cost of misclassification
#   gamma - parameter of the Gaussian radial basis function kernel; sets how far the
#           influence of a support vector reaches and balances the variance/bias tradeoff
hyperparameters = dict(
    C = [10, 25, 50],
    gamma = [0.001, 0.0001, 0.00001],
)

# cross-validated search of every C/gamma combination on a fresh SVC
grid = GridSearchCV(estimator = SVC(), param_grid = hyperparameters)

# run the search using the training split only
grid.fit(X_train, y_train)

# best-scoring combination found by the search
grid.best_params_

{'C': 50, 'gamma': 1e-05}

In [17]:
# predict the test split through the fitted grid search, which uses the model
# built with the best hyperparameters found above, then show its confusion matrix
grid_predict = grid.predict(X_test)
print(confusion_matrix(y_true = y_test, y_pred = grid_predict))

[[129  17]
 [ 15 139]]


In [18]:
# per-class precision / recall / f1 for the tuned model on the test split
print(classification_report(y_true = y_test, y_pred = grid_predict))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89       146
           1       0.89      0.90      0.90       154

    accuracy                           0.89       300
   macro avg       0.89      0.89      0.89       300
weighted avg       0.89      0.89      0.89       300

