<a href="https://colab.research.google.com/github/j-ranasinghe/Spam-E-mail-Classification/blob/main/Spam_E_mail_Detection%20without%20ouputs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the libraries

In [None]:
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats.mstats import winsorize
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Importing the dataset

In [None]:
# Column names in the order documented in UCI's spambase.names: 48 word
# frequencies, 6 character frequencies, 3 capital-run-length statistics and
# finally the target. spambase.data has no header row, so the position in this
# list is the only thing that ties a name to a column.
cols = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#',
    'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total',
    'class',
]
df = pd.read_csv('spambase.data', names=cols)
print(df.head())

# Data Visualisation

In [None]:
# Show the dtype pandas assigned to every column of the loaded frame.
print(df.dtypes)


In [None]:
# Show the (rows, columns) shape of the loaded frame.
print(df.shape)

In [None]:
# Check the distribution of spam vs. not-spam.
# The Spambase target is 1 for spam and 0 for non-spam. The categories are
# renamed with an explicit mapping because a positional list is applied to the
# sorted categories [0, 1] and would label class 0 as "spam".
new_class = pd.Categorical(df["class"])
new_class = new_class.rename_categories({0: "not_spam", 1: "spam"})
new_class.describe()

In [None]:
# Box plots to look for outliers: one subplot per column of df, each with its
# own y-axis. A single DataFrame.plot call draws every column, so no loop over
# the column names is needed.
df.plot(
    kind='box',
    subplots=True,
    sharey=False,
    figsize=(70, 25),
)
# Widen the horizontal gap between subplots so the tick labels do not overlap.
plt.subplots_adjust(wspace=0.9)
plt.show()

# Data Cleaning

In [None]:
# Report missing values and fully duplicated rows, then drop the duplicates.

# Total number of NaN cells across the whole frame.
missing_count = df.isnull().values.sum()
print(f'There are {missing_count} missing values or NaNs in data.')

# Rows identical to an earlier row in every column (the first occurrence is not counted).
duplicate_count = df.duplicated(keep='first').sum()
print(f'There are {duplicate_count} duplicate rows in data based on all columns.')

# Keep the first occurrence of each duplicated row.
df = df.drop_duplicates()

In [None]:
# Split the frame into the target column and the predictor columns.
labels = df['class']                    # target
features = df.drop(columns=['class'])   # everything except the target

In [None]:
# Standardise the features: fit the scaler on them, then transform the same data.
scaler = StandardScaler()
scaler.fit(features)
X_scaled = scaler.transform(features)

In [None]:
# Handle outliers by clipping the lowest and highest 5% of values.
# axis=0 winsorizes each feature column against its own percentiles; without
# an axis, winsorize treats the whole 2-D array as one flattened sample, so a
# single pair of pooled limits would be applied to every feature.
X_winsorized = winsorize(X_scaled, limits=[0.05, 0.05], axis=0)

# PCA

In [None]:
# Project the winsorized features onto 20 principal components.
# random_state is fixed (same seed as the train/test split) so the projection
# is repeatable when PCA selects a randomized solver.
pca = PCA(n_components=20, random_state=0)
X_pca = pca.fit_transform(X_winsorized)

In [None]:
# Array shapes before (first line) and after (second line) applying PCA.
print(X_winsorized.shape, X_pca.shape, sep="\n")



---



In [None]:
# Handle class imbalance by oversampling with SMOTE.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples, and every metric computed downstream, are repeatable across runs.
# NOTE(review): the data is resampled here, before train_test_split, so
# synthetic points interpolated from rows that end up in the test split can
# appear in the training split; consider resampling the training split only.
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_pca, labels)

In [None]:
# Hold out 25% of the resampled data for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.25,
    random_state=0,
)

# KNN classification

In [None]:
# Fit a 3-nearest-neighbour classifier on the training split and predict the test split.
y_true = y_test
knn = KNeighborsClassifier(n_neighbors=3)
y_pred = knn.fit(X_train, y_train).predict(X_test)

# Evaluations

In [None]:
# Score the KNN predictions: confusion matrix, per-class report and accuracy.
conf_matrix = confusion_matrix(y_true, y_pred)
class_report = classification_report(y_true, y_pred)
score = accuracy_score(y_true, y_pred)

# Print the three results, each matrix/report on the line after its heading.
print("Accuracy score:", score)
print("Confusion matrix:", conf_matrix, sep="\n")
print("\nClassification report:", class_report, sep="\n")



---



# Decision Tree classification

In [None]:
# Fit a decision tree on the training split and predict the test split.
# random_state is fixed (same seed as the train/test split) so the fitted tree
# and its scores are repeatable across runs.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(X_train, y_train)
y_pred1 = dtc.predict(X_test)
y_true1 = y_test

# Evaluations

In [None]:
# Evaluate the decision tree against its own ground-truth labels (y_true1),
# so this cell does not depend on the KNN cell having defined y_true.

# calculate and store accuracy score
score1 = accuracy_score(y_true1, y_pred1)

# get the confusion matrix
conf_matrix1 = confusion_matrix(y_true1, y_pred1)

# get the classification report
class_report1 = classification_report(y_true1, y_pred1)

# print results
print("Accuracy score:", score1)
print("Confusion matrix:")
print(conf_matrix1)
print("\nClassification report:")
print(class_report1)