<a href="https://colab.research.google.com/github/j-ranasinghe/Spam-E-mail-Classification/blob/main/Spam_E_mail_Detection123.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the libraries

In [None]:
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Importing the dataset

In [None]:
# Column names in the order the values appear in spambase.data (as listed in
# the UCI spambase.names file): 48 word frequencies, 6 character frequencies,
# 3 capital-run-length statistics and finally the class label
# (1 = spam, 0 = not spam).
cols = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!',
    'char_freq_$', 'char_freq_#',
    'capital_run_length_average', 'capital_run_length_longest',
    'capital_run_length_total',
    'class',
]
# The data file has no header row, so the names are supplied explicitly.
df = pd.read_csv('spambase.data', names=cols)
print(df.head())

# Data Visualisation

In [None]:
# Check the data type of each attribute
column_types = df.dtypes
print(column_types)


In [None]:
# Check the shape of the dataset as (rows, columns)
n_rows, n_columns = df.shape
print((n_rows, n_columns))

In [None]:
# Check the distribution of spam vs not-spam.
# Spambase encodes the class column as 1 = spam and 0 = not spam, so the codes
# are mapped explicitly rather than relying on the sorted category order
# (a positional list would label 0 as the first name and 1 as the second).
new_class = pd.Categorical(df["class"])
new_class = new_class.rename_categories({0: "not_spam", 1: "spam"})
# Last expression of the cell: the notebook displays counts and frequencies.
new_class.describe()

In [None]:
# Boxplots to check for outliers.
# One DataFrame.plot call with subplots=True draws a separate box for every
# column of df, so no loop over the column names is needed.
df.plot(
    kind='box',
    subplots=True,
    sharey=False,      # each column gets its own y-scale
    figsize=(70, 25),
)
plt.subplots_adjust(wspace=0.9)  # widen the horizontal gap between subplots
plt.show()

# Data Cleaning

In [None]:
# Report missing values and fully duplicated rows, then drop the duplicates

# Total number of NaN cells across the whole frame
nan_total = df.isna().to_numpy().sum()
# Rows that repeat an earlier row in every column (keep='first' is the default)
temp_energy = df.duplicated().sum()

nan_message = 'There are {} missing values or NaNs in data.'
duplicate_message = 'There are {} duplicate rows in data based on all columns.'
print(nan_message.format(nan_total))
print(duplicate_message.format(temp_energy))

# Keep only the first occurrence of each duplicated row
df = df.drop_duplicates()

In [None]:
# Split the frame into the target column and the predictor columns
labels = df['class']
features = df.drop(columns=['class'])

In [None]:
# Standardise the features: learn the scaling parameters from the feature
# matrix, then apply them to the same matrix
scaler = StandardScaler()
scaler.fit(features)
X_scaled = scaler.transform(features)

In [None]:
# Handle outliers by clipping the lowest and highest 5% of values.
# With axis=None (the default) the limits are applied to the scaled matrix as
# a whole (all features pooled) and the original shape is kept.
# NOTE(review): confirm that pooled, rather than per-feature, clipping is intended.
lower_upper_limits = (0.05, 0.05)
X_winsorized = winsorize(X_scaled, limits=lower_upper_limits, axis=None)

# PCA

In [None]:
# Reduce the winsorized features to 20 principal components.
# random_state is fixed (same seed as the fold splitter and the decision tree)
# so the projection is reproducible when a randomized SVD solver is selected.
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X_winsorized)

In [None]:
# Print the array shape before PCA (X_winsorized) and after PCA (X_pca)
for matrix in (X_winsorized, X_pca):
    print(matrix.shape)

In [None]:
# Handle class imbalance by oversampling the minority class with SMOTE.
# random_state is fixed (same seed as the fold splitter and the decision tree)
# so the synthetic samples, and therefore every reported metric, are
# reproducible between runs.
# NOTE(review): the data is oversampled before the cross-validation split, so
# synthetic points derived from a test-fold sample can land in the training
# folds; consider resampling only the training part of each fold.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_pca, labels)



---



In [None]:
# Stratified K-fold cross-validation: 5 shuffled splits with a fixed seed
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

# KNN classification

In [None]:
# Accumulators for the per-fold KNN results
scores = []            # one accuracy value per fold
y_true = []            # true labels of every test fold, concatenated
y_pred = []            # predicted labels of every test fold, concatenated
conf_matrices = []
class_reports = []
execution_times = []

# loop to run the KNN classification through each fold
for train, test in kfold.split(X_resampled, y_resampled):
    # record start time before training so the measured time covers
    # fitting, prediction and scoring of this fold
    start_time = time.time()

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_resampled[train], y_resampled[train])
    y_pred_fold = knn.predict(X_resampled[test])
    y_true_fold = y_resampled[test]

    # keep the out-of-fold labels and predictions for the pooled evaluation
    y_true.extend(y_true_fold)
    y_pred.extend(y_pred_fold)

    # calculate and store the accuracy score (exactly once per fold)
    score = accuracy_score(y_true_fold, y_pred_fold)
    scores.append(score)

    # get the confusion matrix
    conf_matrix = confusion_matrix(y_true_fold, y_pred_fold)
    conf_matrices.append(conf_matrix)

    # get the classification report
    class_report = classification_report(y_true_fold, y_pred_fold)
    class_reports.append(class_report)

    # record end time and calculate execution time
    end_time = time.time()
    execution_time = end_time - start_time
    execution_times.append(execution_time)

    # print results for each fold
    print("Fold results:")
    print("Accuracy score:", score)
    print("Confusion matrix:")
    print(conf_matrix)
    print("\nClassification report:")
    print(class_report)
    print("Execution time: {:.2f} seconds".format(execution_time))
    print("="*80)
    print("")

# Evaluations

In [None]:
# Overall KNN results: mean fold accuracy, pooled report and confusion matrix
# over all test-fold predictions, and the mean recorded execution time
mean_accuracy = sum(scores) / len(scores)
pooled_report = classification_report(y_true, y_pred)
pooled_matrix = confusion_matrix(y_true, y_pred)
avg_execution_time = sum(execution_times) / len(execution_times)

print("Average accuracy score:", mean_accuracy)
print("Classification report:\n", pooled_report)
print("Confusion matrix:\n", pooled_matrix)
print("Average execution time: {:.2f} seconds".format(avg_execution_time))



---



# Decision Tree classification

In [None]:
# Accumulators for the per-fold decision-tree results
scores1 = []           # one accuracy value per fold
y_true1 = []           # true labels of every test fold, concatenated
y_pred1 = []           # predicted labels of every test fold, concatenated
conf_matrices1 = []
class_reports1 = []
execution_times1 = []

# loop to run the Decision tree classification through each fold
for train, test in kfold.split(X_resampled, y_resampled):
    # record start time before training so the measured time covers
    # fitting, prediction and scoring of this fold
    start_time = time.time()

    dtc = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=5, min_samples_split=7)
    dtc.fit(X_resampled[train], y_resampled[train])
    y_pred_fold = dtc.predict(X_resampled[test])
    y_true_fold = y_resampled[test]

    # keep the out-of-fold labels and predictions for the pooled evaluation
    y_true1.extend(y_true_fold)
    y_pred1.extend(y_pred_fold)

    # calculate and store the accuracy score (exactly once per fold)
    score = accuracy_score(y_true_fold, y_pred_fold)
    scores1.append(score)

    # get the confusion matrix
    conf_matrix = confusion_matrix(y_true_fold, y_pred_fold)
    conf_matrices1.append(conf_matrix)

    # get the classification report
    class_report = classification_report(y_true_fold, y_pred_fold)
    class_reports1.append(class_report)

    # record end time and calculate execution time
    end_time = time.time()
    execution_time = end_time - start_time
    execution_times1.append(execution_time)

    # print results for each fold
    print("Fold results:")
    print("Accuracy score:", score)
    print("Confusion matrix:")
    print(conf_matrix)
    print("\nClassification report:")
    print(class_report)
    print("Execution time: {:.2f} seconds".format(execution_time))
    print("="*80)
    print("")

# Evaluations

In [None]:
# Overall decision-tree results: mean fold accuracy, pooled report and
# confusion matrix over all test-fold predictions, and the mean recorded
# execution time
mean_accuracy1 = sum(scores1) / len(scores1)
pooled_report1 = classification_report(y_true1, y_pred1)
pooled_matrix1 = confusion_matrix(y_true1, y_pred1)
avg_execution_time = sum(execution_times1) / len(execution_times1)

print("Average accuracy score:", mean_accuracy1)
print("Classification report:\n", pooled_report1)
print("Confusion matrix:\n", pooled_matrix1)
print("Average execution time: {:.2f} seconds".format(avg_execution_time))