Dataset Found at https://www.kaggle.com/datasets/venky73/spam-mails-dataset/data

Load Dataset

In [65]:
import pandas as pd
import numpy as np
import string


In [66]:
# Load the spam/ham e-mail CSV; the path is resolved relative to the current
# working directory.
DATASET_PATH = '../Datasets/spam_ham_dataset.csv'
data = pd.read_csv(DATASET_PATH)


In [67]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word lists used by the preprocessing step. The call's
# return value (True in the recorded output) is what the cell displays.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jamiroscreti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [68]:
# Preview the loaded frame; the rendered output elides the middle rows.
data


Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


Replace \r\n line breaks with spaces

In [69]:
# Flatten each e-mail onto a single line: every Windows line break ('\r\n')
# becomes one space. `.str.replace(..., regex=False)` performs a literal,
# vectorized substitution instead of invoking a Python lambda per row, and
# bracket assignment makes it explicit that an existing column is overwritten.
data['text'] = data['text'].str.replace('\r\n', ' ', regex=False)


In [70]:
# Inspect the text column after the line-break replacement.
data.text


0       Subject: enron methanol ; meter # : 988291 thi...
1       Subject: hpl nom for january 9 , 2001 ( see at...
2       Subject: neon retreat ho ho ho , we ' re aroun...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs this deal is to b...
                              ...                        
5166    Subject: put the 10 on the ft the transport vo...
5167    Subject: 3 / 4 / 2000 and following noms hpl c...
5168    Subject: calpine daily gas nomination > > juli...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert dear v...
Name: text, Length: 5171, dtype: object

In [71]:
# Summarize column dtypes and non-null counts (the recorded output shows every
# column fully populated).
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


Preprocessing

In [72]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation; built
# once here instead of once per e-mail.
PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize one raw e-mail into a space-joined string of stemmed tokens.

    Steps: lowercase, delete ``string.punctuation`` characters, split on
    whitespace, drop English stop words, Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw e-mail text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    tokens = text.lower().translate(PUNCT_TABLE).split()
    # Stop words are filtered before stemming, so the lookup sees the raw token.
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)


# One cleaned document per row of `data`, in row order.
corpus = [preprocess_text(text) for text in data.text]



In [73]:
# Show the first cleaned document followed by its raw source text.
display(corpus[0], data.text.iloc[0])


'subject enron methanol meter 988291 follow note gave monday 4 3 00 preliminari flow data provid daren pleas overrid pop daili volum present zero reflect daili activ obtain ga control chang need asap econom purpos'

"Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is needed asap for economics purposes ."

Vectorizing the text

In [74]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: token counts per document over the learned vocabulary.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split, and .toarray() materializes a dense matrix — confirm both
# are intended.
X = term_counts.toarray()
# Target labels come from the dataset's numeric label column.
y = data['label_num']


In [75]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [76]:
# Feature vector of the first document.
X[0]


array([1, 0, 0, ..., 0, 0, 0])

In [77]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, and the accuracy computed from it, is
# repeatable across runs — consistent with the seeded train/test split.
# n_jobs=-1 lets scikit-learn use all available cores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=42)
rfc.fit(X_train, y_train)


In [78]:
rfc.score(X_test, y_test) # mean accuracy on the held-out test split


0.9719806763285024

Pick a random email from the dataset to classify

In [79]:
from random import randint

# randint() includes BOTH endpoints, so the upper bound must be len(data) - 1;
# a bound of len(data) can yield an index one past the last row and raise
# IndexError on the lookup below.
num_mail = randint(0, len(data) - 1)
# NOTE(review): the e-mail is drawn from the full dataset, so it will usually be
# a row the model was trained on (80% of rows go to the training split).
email_to_predict = data.text.values[num_mail]
# Apply the same cleaning as the training corpus: lowercase, strip punctuation,
# split on whitespace, drop stop words, stem.
email_text = email_to_predict.lower().translate(str.maketrans('', '', string.punctuation)).split()
email_text = [stemmer.stem(word) for word in email_text if word not in stop_words]
email_text = ' '.join(email_text)
email_to_predict


'Subject: re : meter # : 1266 ; august 2000 / allocation exception conoco has nominated 5 . 0 / d at this meter . conoco transport can be allocated the entire meter flow for the days in which the meter flow exceed 5 . 0 mm because they are the only party doing business at the meter . hpl may also be able to extract a sell for this overage which is why i wanted you to be aware of the daily activity thus far . - - - - - - - - - - - - - - - - - - - - - - forwarded by robert e lloyd / hou / ect on 08 / 21 / 2000 01 : 57 pm - - - - - - - - - - - - - - - - - - - - - - - - - - - from : lee l papayoti on 08 / 21 / 2000 01 : 44 pm to : robert e lloyd / hou / ect @ ect cc : sherlyn schumack / hou / ect @ ect , anita luong / hou / ect @ ect , daren j farmer / hou / ect @ ect , gary a hanks / hou / ect @ ect , pat clynes / corp / enron @ enron subject : re : meter # : 1266 ; august 2000 / allocation exception why are these volumes flowing ? from : robert e lloyd 08 / 21 / 2000 01 : 36 pm to : lee 

In [80]:
# The vectorizer takes an iterable of documents, so the single cleaned e-mail
# is wrapped in a list before being mapped onto the fitted vocabulary.
email_corpus = [email_text]
X_email = vectorizer.transform(email_corpus)


In [90]:
# Reusing the enclosing quote character inside an f-string replacement field
# only parses on Python 3.12+ (PEP 701). Computing the labels first, with
# single-quoted literals, keeps this cell runnable on older interpreters; the
# printed text is unchanged.
predicted_label = 'spam' if rfc.predict(X_email)[0] == 1 else 'not spam'
actual_label = 'spam' if data.label_num.iloc[num_mail] == 1 else 'not spam'

print(f"mail number {num_mail} is categorized as {predicted_label}")
print(f"original mail number {num_mail} was categorized as {actual_label}")
print(f"original mail text: \n\n {data.text.iloc[num_mail]}")


mail number 5163 is categorized as not spam
original mail number 5163 was categorized as not spam
original mail text: 

 Subject: re : meter # : 1266 ; august 2000 / allocation exception conoco has nominated 5 . 0 / d at this meter . conoco transport can be allocated the entire meter flow for the days in which the meter flow exceed 5 . 0 mm because they are the only party doing business at the meter . hpl may also be able to extract a sell for this overage which is why i wanted you to be aware of the daily activity thus far . - - - - - - - - - - - - - - - - - - - - - - forwarded by robert e lloyd / hou / ect on 08 / 21 / 2000 01 : 57 pm - - - - - - - - - - - - - - - - - - - - - - - - - - - from : lee l papayoti on 08 / 21 / 2000 01 : 44 pm to : robert e lloyd / hou / ect @ ect cc : sherlyn schumack / hou / ect @ ect , anita luong / hou / ect @ ect , daren j farmer / hou / ect @ ect , gary a hanks / hou / ect @ ect , pat clynes / corp / enron @ enron subject : re : meter # : 1266 ; augu