In [57]:
# Importing modules
import math

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [58]:
# Load the raw data; header=None means no row is treated as a header,
# so the columns receive integer labels
df = pd.read_csv(filepath_or_buffer='spambase.csv', header=None)
# Show how many rows were loaded
df.shape[0]

4601

In [59]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [60]:
# Assign the column labels: 48 word-frequency columns, 6 character-frequency
# columns, 3 capital-run-length columns, and finally the 'spam' target
word_freq_columns = [
    'word_freq_make', 'word_freq_address', 'word_freq_all',
    'word_freq_3d', 'word_freq_our', 'word_freq_over',
    'word_freq_remove', 'word_freq_internet', 'word_freq_order',
    'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses',
    'word_freq_free', 'word_freq_business', 'word_freq_email',
    'word_freq_you', 'word_freq_credit', 'word_freq_your',
    'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george',
    'word_freq_650', 'word_freq_lab', 'word_freq_labs',
    'word_freq_telnet', 'word_freq_857', 'word_freq_data',
    'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm',
    'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',
    'word_freq_original', 'word_freq_project', 'word_freq_re',
    'word_freq_edu', 'word_freq_table', 'word_freq_conference',
]
char_freq_columns = [
    'char_freq_;', 'char_freq_(', 'char_freq_[',
    'char_freq_!', 'char_freq_$', 'char_freq_#',
]
capital_run_columns = [
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
]

# The concatenation order must match the column order of the file
df.columns = word_freq_columns + char_freq_columns + capital_run_columns + ['spam']

In [61]:
# Inspect a random sample of 100 rows now that the columns are labelled.
# random_state pins the draw so that re-running the notebook shows the same rows.
df.sample(n=100, random_state=0)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
3754,0.33,0.00,0.00,0.0,0.00,0.0,0.0,0.0,0.33,0.00,...,0.000,0.448,0.00,0.056,0.000,0.000,1.788,6,93,0
4480,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.0,0.00,0.00,...,0.359,0.359,0.00,0.000,0.000,0.000,1.000,1,1,0
154,0.00,0.00,1.68,0.0,0.33,0.0,0.0,0.0,0.00,0.33,...,0.000,0.060,0.00,0.484,0.000,0.000,1.796,19,203,1
4253,0.95,0.00,0.00,0.0,0.00,0.0,0.0,0.0,0.00,0.00,...,0.000,0.350,0.00,0.000,0.000,0.000,2.608,14,60,0
3070,0.00,0.00,0.00,0.0,0.40,0.0,0.0,0.0,0.00,0.00,...,0.000,0.137,0.00,0.068,0.000,0.000,2.282,21,89,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3732,0.00,0.00,0.00,0.0,0.50,0.0,0.0,0.0,0.00,0.00,...,0.083,0.167,0.00,0.502,0.000,0.000,1.547,11,113,0
1892,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.0,0.00,0.00,...,0.000,0.438,0.00,0.000,0.000,0.000,2.448,37,120,0
946,0.00,0.00,0.00,0.0,0.20,0.0,0.0,0.0,0.00,0.00,...,0.000,0.204,0.00,0.034,0.000,0.000,2.588,15,277,1
1304,0.10,0.10,0.70,0.0,0.60,0.2,0.4,0.1,1.41,0.81,...,0.000,0.000,0.26,0.994,0.391,0.032,3.176,56,1042,1


In [62]:
# Force spam = 0 for every row whose "word_freq_george" or "word_freq_650" value is above 0.0
# Reason: the dataset documentation (spambase.DOCUMENTATION) is cited as the basis for this rule
# NOTE(review): this overwrites the original labels using two feature columns - confirm this is intended
george_or_650_present = df['word_freq_george'].gt(0) | df['word_freq_650'].gt(0)
df.loc[george_or_650_present, 'spam'] = 0

# Persist the relabelled frame, without the index column, to a new CSV file
df.to_csv(path_or_buf='modified_dataset.csv', index=False)


In [63]:
# Reload the relabelled data from disk; header=0 takes the column names from the first line
df_new = pd.read_csv(filepath_or_buffer='modified_dataset.csv', header=0)
df_new.head(n=5)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [64]:
# Keep only the two trigger columns and the label so the relabelling can be inspected
columns_to_display = ['word_freq_george', 'word_freq_650', 'spam']
df_subset = df_new.loc[:, columns_to_display]

# The truncated display cannot confirm the update on its own (the visible rows may
# all have 0.0 in both trigger columns), so check every affected row explicitly
trigger_rows = (df_subset['word_freq_george'] > 0) | (df_subset['word_freq_650'] > 0)
assert (df_subset.loc[trigger_rows, 'spam'] == 0).all(), \
    'rows with word_freq_george or word_freq_650 above 0 must have spam == 0'

df_subset

Unnamed: 0,word_freq_george,word_freq_650,spam
0,0.0,0.0,1
1,0.0,0.0,1
2,0.0,0.0,1
3,0.0,0.0,1
4,0.0,0.0,1
...,...,...,...
4596,0.0,0.0,0
4597,0.0,0.0,0
4598,0.0,0.0,0
4599,0.0,0.0,0


In conclusion, every row whose "word_freq_george" or "word_freq_650" value is greater than 0.0 now has its "spam" value set to 0. This relabelling was applied by the notebook itself, on the basis of the dataset documentation, which is cited as describing the words "george" and "650" as strong indicators that an email is not spam.

In [65]:
# Separate the 57 feature columns from the 'spam' target (the last column),
# then hold out 20% of the rows as the test split
X = df_new.drop(columns=['spam'])
y = df_new['spam']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [66]:
# Standardise the features: fit the scaler on the training split only,
# then apply that same fitted transformation to both splits
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [67]:
# Rule-of-thumb starting point for n_neighbors: the square root of the sample count
# (math is imported with the other modules at the top of the notebook)
# NOTE(review): this uses the size of the test split; the heuristic is usually
# stated in terms of the training-set size - confirm which one is intended
math.sqrt(len(y_test))

30.347981810987037

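For a two-class problem, n_neighbors should be an odd number so that the neighbours' vote cannot end in a tie. Rounding the square root above (≈ 30.35) down gives 30, which is even, so we subtract 1 and use n_neighbors = 29.
n_neighbors = int(math.sqrt(len(y_test))) - 1 = 30 - 1 = 29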

In [68]:
# Build and train the KNN model
# p=2 is the Minkowski power parameter that corresponds to Euclidean (L2) distance,
# consistent with metric='euclidean'; it is unrelated to the number of classes in "spam"
classifier = KNeighborsClassifier(metric='euclidean', p=2, n_neighbors=29)
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=29)

In [69]:
# Classify the held-out test rows with the fitted KNN model
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,

In [70]:
# Confusion matrix for the KNN predictions (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[509,  38],
       [ 67, 307]], dtype=int64)

In the above matrix, true positive value is 307, which means that the model correctly identified 307 spam emails.
The true negative (TN) value is 509, which means that the model correctly identified 509 non-spam emails.
The false positive (FP) value is 38, which means that the model incorrectly identified 38 non-spam emails as spam.
The false negative (FN) value is 67, which means that the model incorrectly identified 67 spam emails as non-spam.

In [71]:
# Per-class precision, recall and F1-score for the KNN model
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.91       547
           1       0.89      0.82      0.85       374

    accuracy                           0.89       921
   macro avg       0.89      0.88      0.88       921
weighted avg       0.89      0.89      0.89       921



## Using Decision Trees

In [72]:
# Create the decision tree; random_state pins the estimator's internal randomness
# so the fitted tree and the metrics derived from it are reproducible across runs
dTree = DecisionTreeClassifier(random_state=0)
dTree.fit(X_train, y_train)  # Training the model

DecisionTreeClassifier()

In [73]:
# Classify the held-out test rows with the fitted decision tree
predictions = dTree.predict(X=X_test)
predictions

array([1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,

In [74]:
# Confusion matrix for the decision-tree predictions (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[501,  46],
       [ 45, 329]], dtype=int64)

In the above matrix, the true positive (TP) value is 329, which means that the model correctly identified 329 spam emails.
The true negative (TN) value is 501, which means that the model correctly identified 501 non-spam emails.
The false positive (FP) value is 46, which means that the model incorrectly identified 46 non-spam emails as spam.
The false negative (FN) value is 45, which means that the model incorrectly identified 45 spam emails as non-spam.

In [75]:
# Per-class precision, recall and F1-score for the decision tree
print(classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       547
           1       0.88      0.88      0.88       374

    accuracy                           0.90       921
   macro avg       0.90      0.90      0.90       921
weighted avg       0.90      0.90      0.90       921

