## Data Set Collection
### Dataset collected from (https://spamassassin.apache.org/old/publiccorpus/)

In [125]:
#librarie we use in our project
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to C:\Users\Mohit
[nltk_data]     Computers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mohit
[nltk_data]     Computers\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Mohit
[nltk_data]     Computers\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mohit Computers\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to C:\Users\Mohit
[nltk_data]     Computers\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Mohit
[nltk_data]     Computers\AppData\Roaming\nltk_data...
[n

True

Convert our dataset (spam and ham) into single csv file

In [112]:
def load_emails(folder):
    emails = []
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='latin1') as file:
            emails.append(file.read())
    return emails

spam_emails = load_emails('hard_ham') ## List of all spam email
ham_emails = load_emails('easy_ham') ## List of all ham email. (Ham mean un-spam email)

In [113]:
data = {
    'message': spam_emails + ham_emails,
    'label': ['spam'] * len(spam_emails) + ['ham'] * len(ham_emails)
}
df = pd.DataFrame(data)
df.to_csv('email_spam_dataset.csv', index=False)

## Preprocessing

In [114]:
spam_data=pd.read_csv("email_spam_dataset.csv")
spam_data.head()

Unnamed: 0,message,label
0,Return-Path: Fool@motleyfool.com\nDelivery-Dat...,spam
1,Return-Path: <malcolm-sweeps@mrichi.com>\nDeli...,spam
2,From nic@starflung.com Mon Jun 24 17:06:54 20...,spam
3,Received: from bran.mc.mpls.visi.com (bran.mc....,spam
4,Return-Path: <iso17799@securityrisk.co.uk>\nRe...,spam


In [115]:
spam_data.shape

(2752, 2)

In [116]:
def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

spam_data['message'] = spam_data['message'].apply(remove_html)
spam_data.head(10)


Unnamed: 0,message,label
0,Return-Path: Fool@motleyfool.com\nDelivery-Dat...,spam
1,Return-Path: \nDelivered-To: rod@arsecandle.or...,spam
2,From nic@starflung.com Mon Jun 24 17:06:54 20...,spam
3,Received: from bran.mc.mpls.visi.com (bran.mc....,spam
4,Return-Path: \nReceived: (qmail 9820 invoked b...,spam
5,Return-Path: \nReceived: from sunu422.rz.ruhr-...,spam
6,Return-Path: \nReceived: from sunu422.rz.ruhr-...,spam
7,Return-Path: \nReceived: from sunu422.rz.ruhr-...,spam
8,Return-Path: \nReceived: from sunu422.rz.ruhr-...,spam
9,Return-Path: \nReceived: from sunu422.rz.ruhr-...,spam


Covert text into lower case for consistent data and remove punctuation 

In [117]:
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

spam_data['message'] = spam_data['message'].apply(normalize_text)
spam_data.head()


Unnamed: 0,message,label
0,returnpath foolmotleyfoolcom\ndeliverydate wed...,spam
1,returnpath \ndeliveredto rodarsecandleorg\nrec...,spam
2,from nicstarflungcom mon jun 24 170654 2002\n...,spam
3,received from branmcmplsvisicom branmcmplsvisi...,spam
4,returnpath \nreceived qmail 9820 invoked by al...,spam


Split the text into word

In [119]:
def tokenize_text(text):
    return nltk.word_tokenize(text)

spam_data['message'] = spam_data['message'].apply(tokenize_text)


In [120]:
spam_data.head()
##List of words in message column

Unnamed: 0,message,label
0,"[returnpath, foolmotleyfoolcom, deliverydate, ...",spam
1,"[returnpath, deliveredto, rodarsecandleorg, re...",spam
2,"[from, nicstarflungcom, mon, jun, 24, 170654, ...",spam
3,"[received, from, branmcmplsvisicom, branmcmpls...",spam
4,"[returnpath, received, qmail, 9820, invoked, b...",spam


Removing Stop words (Common words in each email)

In [121]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

spam_data['message'] = spam_data['message'].apply(remove_stopwords)


In [122]:
spam_data

Unnamed: 0,message,label
0,"[returnpath, foolmotleyfoolcom, deliverydate, ...",spam
1,"[returnpath, deliveredto, rodarsecandleorg, re...",spam
2,"[nicstarflungcom, mon, jun, 24, 170654, 2002, ...",spam
3,"[received, branmcmplsvisicom, branmcmplsvisico...",spam
4,"[returnpath, received, qmail, 9820, invoked, a...",spam
...,...,...
2747,"[ilugadminlinuxie, wed, dec, 4, 115304, 2002, ...",ham
2748,"[ilugadminlinuxie, wed, dec, 4, 115308, 2002, ...",ham
2749,"[spambayesbouncespythonorg, wed, dec, 4, 11520...",ham
2750,"[ilugadminlinuxie, wed, dec, 4, 115315, 2002, ...",ham


In [123]:
spam_data["message"][1]

['returnpath',
 'deliveredto',
 'rodarsecandleorg',
 'received',
 'qmail',
 '16821',
 'invoked',
 'uid',
 '505',
 '7',
 'may',
 '2002',
 '143701',
 '0000',
 'received',
 'malcolmsweepsmrichicom',
 'blazingarsecandleorg',
 'uid',
 '500',
 'qmailscanner110',
 'fprot',
 '312',
 'clear0',
 'processed',
 '0260914',
 'secs',
 '07',
 'may',
 '2002',
 '143701',
 '0000',
 'deliveredto',
 'rod3dsarsecandleorg',
 'received',
 'qmail',
 '16811',
 'invoked',
 'uid',
 '505',
 '7',
 'may',
 '2002',
 '143700',
 '0000',
 'received',
 'malcolmsweepsmrichicom',
 'blazingarsecandleorg',
 'uid',
 '502',
 'qmailscanner110',
 'fprot',
 '312',
 'clear0',
 'processed',
 '0250416',
 'secs',
 '07',
 'may',
 '2002',
 '143700',
 '0000',
 'received',
 'bocellisiteprotectcom',
 '644112021',
 'h0090272a42dbneclient2attbicom',
 'smtp',
 '7',
 'may',
 '2002',
 '143659',
 '0000',
 'received',
 'mailmrichicom',
 '2083395187',
 'bocellisiteprotectcom',
 '893893',
 'smtp',
 'id',
 'jaa14328',
 'tue',
 '7',
 'may',
 '2002',

We use stemmer for converting words into there actual form.
This will make our data more generalize

In [127]:
stemmer = PorterStemmer()
def stem_text(text):
    return [stemmer.stem(word) for word in text]

spam_data['message'] = spam_data['message'].apply(stem_text)

In [128]:
spam_data["message"][1]


['returnpath',
 'deliveredto',
 'rodarsecandleorg',
 'receiv',
 'qmail',
 '16821',
 'invok',
 'uid',
 '505',
 '7',
 'may',
 '2002',
 '143701',
 '0000',
 'receiv',
 'malcolmsweepsmrichicom',
 'blazingarsecandleorg',
 'uid',
 '500',
 'qmailscanner110',
 'fprot',
 '312',
 'clear0',
 'process',
 '0260914',
 'sec',
 '07',
 'may',
 '2002',
 '143701',
 '0000',
 'deliveredto',
 'rod3dsarsecandleorg',
 'receiv',
 'qmail',
 '16811',
 'invok',
 'uid',
 '505',
 '7',
 'may',
 '2002',
 '143700',
 '0000',
 'receiv',
 'malcolmsweepsmrichicom',
 'blazingarsecandleorg',
 'uid',
 '502',
 'qmailscanner110',
 'fprot',
 '312',
 'clear0',
 'process',
 '0250416',
 'sec',
 '07',
 'may',
 '2002',
 '143700',
 '0000',
 'receiv',
 'bocellisiteprotectcom',
 '644112021',
 'h0090272a42dbneclient2attbicom',
 'smtp',
 '7',
 'may',
 '2002',
 '143659',
 '0000',
 'receiv',
 'mailmrichicom',
 '2083395187',
 'bocellisiteprotectcom',
 '893893',
 'smtp',
 'id',
 'jaa14328',
 'tue',
 '7',
 'may',
 '2002',
 '093701',
 '0500',
 

In [129]:
# Convert preprocessed text data to strings
spam_data['message'] = spam_data['message'].apply(lambda x: ' '.join(x))

## Feature Selection 

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Load data
df = pd.read_csv('spam_data.csv')
X = spam_data['message']
y = spam_data['label']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Vectorize text data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Apply Feature Selection
feature_selector = SequentialFeatureSelector(MultinomialNB(),
                                             k_features=1000,
                                             forward=True,
                                             floating=False,
                                             scoring='accuracy',
                                             cv=5)

X_train_selected = feature_selector.fit_transform(X_train_tfidf, y_train)
X_test_selected = feature_selector.transform(X_test_tfidf)

# Train classifier
classifier = MultinomialNB()
classifier.fit(X_train_selected, y_train)

# Predict and evaluate
y_pred = classifier.predict(X_test_selected)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Accuracy:")
print(accuracy_score(y_test, y_pred))


FileNotFoundError: [Errno 2] No such file or directory: 'spam_data.csv'

In [134]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [137]:
# Define pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('feature_selection', SequentialFeatureSelector(MultinomialNB(), 
                                                     k_features=500, 
                                                     forward=True, 
                                                     floating=False,
                                                     scoring='accuracy',
                                                     cv=5)),
    ('classifier', MultinomialNB())
])

In [138]:
# Fit the pipeline
pipeline.fit(X_train, y_train)

In [71]:
# Train and Evaluate Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(X_selected, spam_data['label'])
y_pred_gnb = gnb.predict(X_selected)
conf_matrix_gnb = confusion_matrix(spam_data['label'], y_pred_gnb)
print("Confusion Matrix for Gaussian Naive Bayes:\n", conf_matrix_gnb)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 507208 stored elements and shape (2752, 85418)>
  Coords	Values
  (0, 74821)	2
  (0, 46802)	1
  (0, 42206)	1
  (0, 83592)	5
  (0, 63260)	6
  (0, 4805)	2
  (0, 14690)	1
  (0, 21204)	8
  (0, 74034)	4
  (0, 52984)	2
  (0, 25904)	1
  (0, 46180)	1
  (0, 62568)	1
  (0, 83005)	1
  (0, 23141)	1
  (0, 45124)	3
  (0, 61029)	3
  (0, 21206)	1
  (0, 10940)	2
  (0, 7900)	3
  (0, 68267)	3
  (0, 25905)	1
  (0, 31201)	1
  (0, 63774)	1
  (0, 73030)	2
  :	:
  (2751, 6001)	1
  (2751, 6002)	1
  (2751, 6003)	1
  (2751, 6004)	1
  (2751, 6005)	1
  (2751, 6006)	1
  (2751, 6007)	1
  (2751, 6009)	1
  (2751, 6010)	1
  (2751, 6011)	1
  (2751, 6012)	1
  (2751, 6014)	1
  (2751, 6013)	1
  (2751, 6016)	1
  (2751, 6015)	1
  (2751, 6017)	1
  (2751, 6018)	1
  (2751, 6019)	1
  (2751, 6020)	1
  (2751, 6022)	1
  (2751, 6021)	1
  (2751, 6023)	1
  (2751, 6024)	1
  (2751, 6026)	1
  (2751, 6025)	1
0       1
1       1
2       1
3       1
4       1
       ..
2747    0
27

In [72]:
# Train and Evaluate Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_selected, spam_data['label'])
y_pred_mnb = mnb.predict(X_selected)
conf_matrix_mnb = confusion_matrix(spam_data['label'], y_pred_mnb)
print("Confusion Matrix for Multinomial Naive Bayes:\n", conf_matrix_mnb)

In [73]:
# Train and Evaluate Decision Tree (J48)
dtc = DecisionTreeClassifier(criterion='entropy')
dtc.fit(X_selected, spam_data['label'])
y_pred_dtc = dtc.predict(X_selected)
conf_matrix_dtc = confusion_matrix(spam_data['label'], y_pred_dtc)
print("Confusion Matrix for Decision Tree (J48):\n", conf_matrix_dtc)