In [20]:
# Importing modules
import math

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [21]:
# Load the raw file; header=None tells pandas the first line is data,
# so the columns get integer labels until names are assigned.
df = pd.read_csv('spambase.csv', header=None)
# Number of rows that were loaded
df.shape[0]

4601

In [22]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [23]:
# Assign the column labels, grouped by feature family.

# 48 word-frequency features
word_freq_columns = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
]

# 6 character-frequency features
char_freq_columns = [
    'char_freq_;', 'char_freq_(', 'char_freq_[',
    'char_freq_!', 'char_freq_$', 'char_freq_#',
]

# 3 capital-run-length features
capital_run_columns = [
    'capital_run_length_average',
    'capital_run_length_longest',
    'capital_run_length_total',
]

# The last column is the class label
df.columns = word_freq_columns + char_freq_columns + capital_run_columns + ['spam']

In [24]:
# Now let's look at a random sample of the labelled data.
# random_state pins the draw so that a Restart & Run All shows the same rows
# every time instead of a different sample on each execution.
df.sample(n=100, random_state=0)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
2925,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.000,0.000,0.000,0.000,2.000,3,8,0
209,0.22,0.22,0.22,0.0,1.77,0.22,0.44,0.44,0.22,2.88,...,0.000,0.000,0.000,0.563,0.150,0.000,86.650,1038,1733,1
4025,0.34,0.00,0.34,0.0,0.34,0.34,0.00,0.00,0.00,0.00,...,0.116,0.292,0.058,0.000,0.000,0.000,2.333,15,182,0
976,0.00,0.16,0.00,0.0,0.16,0.16,0.00,1.14,1.30,0.32,...,0.673,0.350,0.053,0.134,0.107,0.026,5.216,57,1038,1
3283,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.000,0.000,0.000,0.000,1.506,11,113,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4407,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.000,0.000,0.000,0.000,1.750,5,14,0
67,0.00,0.00,0.00,0.0,0.00,0.00,1.47,0.00,0.00,1.47,...,0.000,0.000,0.000,0.500,0.000,0.000,1.214,3,17,1
1074,0.00,0.55,0.55,0.0,1.10,0.55,2.20,0.00,0.00,0.55,...,0.000,0.165,0.000,0.496,0.000,0.082,16.782,148,386,1
2306,0.00,0.00,0.00,0.0,0.00,0.00,1.78,0.00,0.00,1.78,...,0.000,0.000,0.000,0.000,0.000,0.000,7.000,35,63,0


In [25]:
# Checking if there are duplicate rows in the dataset

# Boolean mask flagging the rows reported as duplicates
duplicated_rows = df.duplicated()

# Count the flagged rows with the vectorised Series.sum()
num_duplicate_rows = int(duplicated_rows.sum())

print(f"Number of duplicate rows: {num_duplicate_rows}")

Number of duplicate rows: 391


In [26]:
# Keep the first occurrence of each row and discard its exact repeats
df = df.drop_duplicates(keep='first')

In [27]:
# Force the label to 0 (non-spam) for every row where "word_freq_george" or
# "word_freq_650" is greater than 0.0.
# Stated reason: the dataset documentation (spambase.DOCUMENTATION) mentions these words.
# NOTE(review): this overwrites ground-truth labels using two feature columns, so a
# model trained afterwards can partly recover the label from those same features.
# Presumably the documentation only describes "george"/"650" as indicators of
# non-spam rather than prescribing a relabelling - TODO confirm this step is intended.
df.loc[(df['word_freq_george'] > 0) | (df['word_freq_650'] > 0), 'spam'] = 0

# Save the modified dataframe to a new csv file
# (index=False leaves the row index out of the file)
df.to_csv('modified_dataset.csv', index=False)


In [28]:
# Read the saved file back in; its first line holds the column names
df_new = pd.read_csv('modified_dataset.csv', header=0)
df_new.head(n=5)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [29]:
# Show only the two indicator columns next to the label to inspect the update
columns_to_display = ['word_freq_george', 'word_freq_650', 'spam']
df_subset = df_new[columns_to_display]
df_subset

Unnamed: 0,word_freq_george,word_freq_650,spam
0,0.0,0.0,1
1,0.0,0.0,1
2,0.0,0.0,1
3,0.0,0.0,1
4,0.0,0.0,1
...,...,...,...
4205,0.0,0.0,0
4206,0.0,0.0,0
4207,0.0,0.0,0
4208,0.0,0.0,0


In conclusion, wherever the value in the "word_freq_george" or "word_freq_650" column is greater than 0.0, the corresponding value in the "spam" column has been set to 0. This relabelling follows the dataset documentation, which describes the words "george" and "650" as indicators that an email is not spam. Note that this is a rule we applied to the labels ourselves, not a pattern learned from the data, so it should not be read as independent evidence.

In [30]:
# Separate the features from the label: the label ("spam") is the last of the
# 58 columns, and everything before it is a feature
X = df_new.iloc[:, :-1]
y = df_new.iloc[:, -1]
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [31]:
# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transform to both splits so the test data does not influence it
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [32]:
# Finding a starting value for n_neighbors: square root of the sample count.
# (`math` is imported with the other modules in the first cell.)
# NOTE(review): this uses the size of the test split; the rule of thumb is
# usually quoted for the number of training samples - confirm which is intended.
math.sqrt(len(y_test))

29.017236257093817

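For a two-class problem, n_neighbors should be an odd number so that the neighbours' vote cannot end in a tie. The square root above is about 29.02, and the nearest whole number, 29, is already odd, so n_neighbors = 29.
(If the rounded value had been even, we would subtract 1 to make it odd.)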

In [33]:
# Define the KNN model
# metric='euclidean' ranks neighbours by straight-line distance. The power
# parameter p belongs to the Minkowski metric (where p=2 means Euclidean) and is
# unrelated to the number of classes, so it is not passed here.
classifier = KNeighborsClassifier(n_neighbors=29, metric='euclidean')
classifier.fit(X_train, y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=29)

In [34]:
# Run the fitted KNN classifier on the held-out features
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,

In [35]:
# Confusion matrix for KNN: actual labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[471,  32],
       [ 68, 271]], dtype=int64)

In the above matrix (rows are the actual classes, columns are the predicted classes), the true positive (TP) value is 271, which means that the model correctly identified 271 spam emails.
The true negative (TN) value is 471, which means that the model correctly identified 471 non-spam emails.
The false positive (FP) value is 32, which means that the model incorrectly identified 32 non-spam emails as spam.
The false negative (FN) value is 68, which means that the model incorrectly identified 68 spam emails as non-spam.

In [36]:
# Per-class precision / recall / f1 for the KNN predictions
knn_report = classification_report(y_true=y_test, y_pred=y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.87      0.94      0.90       503
           1       0.89      0.80      0.84       339

    accuracy                           0.88       842
   macro avg       0.88      0.87      0.87       842
weighted avg       0.88      0.88      0.88       842



## Using Decision Trees

In [37]:
# Create the decision tree. random_state is pinned because the estimator's
# split search involves randomness, so an unseeded tree can give different
# results from one run to the next.
dTree = DecisionTreeClassifier(random_state=0)
# Train on the same (scaled) training split used for the KNN model
dTree.fit(X_train, y_train)

DecisionTreeClassifier()

In [38]:
# Run the fitted decision tree on the held-out features
predictions = dTree.predict(X=X_test)
predictions

array([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,

In [39]:
# Confusion matrix for the decision tree: actual labels against predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[465,  38],
       [ 40, 299]], dtype=int64)

In the above matrix (rows are the actual classes, columns are the predicted classes), the true positive (TP) value is 299, which means that the model correctly identified 299 spam emails.
The true negative (TN) value is 465, which means that the model correctly identified 465 non-spam emails.
The false positive (FP) value is 38, which means that the model incorrectly identified 38 non-spam emails as spam.
The false negative (FN) value is 40, which means that the model incorrectly identified 40 spam emails as non-spam.

In [40]:
# Per-class precision / recall / f1 for the decision tree predictions
tree_report = classification_report(y_true=y_test, y_pred=predictions)
print(tree_report)

              precision    recall  f1-score   support

           0       0.92      0.92      0.92       503
           1       0.89      0.88      0.88       339

    accuracy                           0.91       842
   macro avg       0.90      0.90      0.90       842
weighted avg       0.91      0.91      0.91       842



## Comparison of models
For the KNN model, we see that it has an accuracy of 0.88. For class 1 (spam) its recall (0.80) and f1-score (0.84) are noticeably lower than for class 0 (0.94 and 0.90), while its precision is slightly higher (0.89 vs 0.87). For the Decision Tree model, we see that it has an accuracy of 0.91, with similar precision, recall, and f1-score for both classes (0.92 for class 0 versus 0.88–0.89 for class 1).

In general, both models have good overall performance, but the Decision Tree model has slightly better accuracy and balanced precision and recall for both classes.