## DATA 440 Technical Report
### Using SVMs to Predict Spam
Dataset: https://www.openml.org/d/44

In [2]:
# Importing needed libraries and modules
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix

### Data Ingestion and Cleaning

In [3]:
spambase = fetch_openml(name="spambase", as_frame=True)
df = spambase.frame

In [4]:
df.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total,class
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [5]:
df.shape

(4601, 58)

#### Exploratory Data Analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   word_freq_make              4601 non-null   float64 
 1   word_freq_address           4601 non-null   float64 
 2   word_freq_all               4601 non-null   float64 
 3   word_freq_3d                4601 non-null   float64 
 4   word_freq_our               4601 non-null   float64 
 5   word_freq_over              4601 non-null   float64 
 6   word_freq_remove            4601 non-null   float64 
 7   word_freq_internet          4601 non-null   float64 
 8   word_freq_order             4601 non-null   float64 
 9   word_freq_mail              4601 non-null   float64 
 10  word_freq_receive           4601 non-null   float64 
 11  word_freq_will              4601 non-null   float64 
 12  word_freq_people            4601 non-null   float64 
 13  word_freq_report  

In [7]:
df.describe()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total
count,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,...,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0,4601.0
mean,0.104553,0.213015,0.280656,0.065425,0.312223,0.095901,0.114208,0.105295,0.090067,0.239413,...,0.031869,0.038575,0.13903,0.016976,0.269071,0.075811,0.044238,5.191515,52.172789,283.289285
std,0.305358,1.290575,0.504143,1.395151,0.672513,0.273824,0.391441,0.401071,0.278616,0.644755,...,0.285735,0.243471,0.270355,0.109394,0.815672,0.245882,0.429342,31.729449,194.89131,606.347851
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.065,0.0,0.0,0.0,0.0,2.276,15.0,95.0
75%,0.0,0.0,0.42,0.0,0.38,0.0,0.0,0.0,0.0,0.16,...,0.0,0.0,0.188,0.0,0.315,0.052,0.0,3.706,43.0,266.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,10.0,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0


#### Checking and Replacing Null Values

In [8]:
df.isnull().sum()

word_freq_make                0
word_freq_address             0
word_freq_all                 0
word_freq_3d                  0
word_freq_our                 0
word_freq_over                0
word_freq_remove              0
word_freq_internet            0
word_freq_order               0
word_freq_mail                0
word_freq_receive             0
word_freq_will                0
word_freq_people              0
word_freq_report              0
word_freq_addresses           0
word_freq_free                0
word_freq_business            0
word_freq_email               0
word_freq_you                 0
word_freq_credit              0
word_freq_your                0
word_freq_font                0
word_freq_000                 0
word_freq_money               0
word_freq_hp                  0
word_freq_hpl                 0
word_freq_george              0
word_freq_650                 0
word_freq_lab                 0
word_freq_labs                0
word_freq_telnet              0
word_fre

#### Label Encoding
Class is stored as Category. Converting to Integer<br>
0 -> False (not spam)<br>
1 -> True (spam)

In [9]:
le = LabelEncoder()
df["class"] = le.fit_transform(df["class"])

df["class"].dtype

dtype('int64')

#### Data Preprocessing
Separating 'Class' from features

In [10]:
X = df.drop(columns=["class"])
y = df["class"]

X

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_%3B,char_freq_%28,char_freq_%5B,char_freq_%21,char_freq_%24,char_freq_%23,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.0,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.0,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.0,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.0,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.0,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78


In [11]:
y

0       1
1       1
2       1
3       1
4       1
       ..
4596    0
4597    0
4598    0
4599    0
4600    0
Name: class, Length: 4601, dtype: int64

#### Dividing data into training and test sets

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)
X_train.shape

(3220, 57)

In [28]:
X_test.shape

(1381, 57)

#### Model Fitting (One vs Rest + Kernel: Poly)

In [29]:
svm_ovr = svm.SVC(kernel="poly", decision_function_shape="ovr")
svm_ovr.fit(X_train, y_train)

# Training set prediction
predict_train = svm_ovr.predict(X_train)

# Test set prediction
predict_test = svm_ovr.predict(X_test)

#### Classification Report & Confusion Matrix

Confusion Matrix: Training

In [30]:
print(confusion_matrix(y_train, predict_train))

[[1903   25]
 [1068  224]]


Confusion Matrix: Testing

In [31]:
print(confusion_matrix(y_test, predict_test))

[[845  15]
 [440  81]]


Classification Report

In [32]:
print(classification_report(y_test, predict_test))

              precision    recall  f1-score   support

           0       0.66      0.98      0.79       860
           1       0.84      0.16      0.26       521

    accuracy                           0.67      1381
   macro avg       0.75      0.57      0.53      1381
weighted avg       0.73      0.67      0.59      1381




#### Model Fitting (One vs One)

In [33]:
svm_ovo = svm.SVC(decision_function_shape="ovo")
svm_ovo.fit(X_train, y_train)

# Training set prediction
predict_train_ovo = svm_ovo.predict(X_train)

# Test set prediction
predict_test_ovo = svm_ovo.predict(X_test)

#### Classification Report & Confusion Matrix

Confusion Matrix: Training

In [34]:
print(confusion_matrix(y_train, predict_train_ovo))

[[1672  256]
 [ 680  612]]


Confusion Matrix: Testing

In [35]:
print(confusion_matrix(y_test, predict_test_ovo))

[[737 123]
 [283 238]]


Classification Report

In [36]:
print(classification_report(y_test, predict_test_ovo))


              precision    recall  f1-score   support

           0       0.72      0.86      0.78       860
           1       0.66      0.46      0.54       521

    accuracy                           0.71      1381
   macro avg       0.69      0.66      0.66      1381
weighted avg       0.70      0.71      0.69      1381

