# Implementation

In [295]:
#Import Liabraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load Dataset

In [296]:
data = pd.read_csv('../dataset/emails.csv')

print("Dataset loaded successfully")

Dataset loaded successfully


In [297]:
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


### Data Preprocessing

In [298]:
data.shape

(5728, 2)

In [299]:
# Identify duplicates
duplicates = data[data.duplicated()]
print("Duplicate data before drop:", len(duplicates))

# Drop duplicates
data = data.drop_duplicates()
print("Duplicated Data Drop Done")

# Check for duplicates after dropping
print("Duplicate data after drop:", data.duplicated().sum())

Duplicate data before drop: 33
Duplicated Data Drop Done
Duplicate data after drop: 0


In [300]:
#Check Data Null
data.isnull().sum()

text    0
spam    0
dtype: int64

### Text Preprocessing

In [301]:
import re
import string

Convert Uppercase to Lowercase

In [302]:
data["text"].head

<bound method NDFrame.head of 0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5695, dtype: object>

In [303]:
data["text"] = data["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [304]:
data["text"].head

<bound method NDFrame.head of 0       subject: naturally irresistible your corporate...
1       subject: the stock trading gunslinger fanny is...
2       subject: unbelievable new homes made easy im w...
3       subject: 4 color printing special request addi...
4       subject: do not have money , get software cds ...
                              ...                        
5723    subject: re : research and development charges...
5724    subject: re : receipts from visit jim , thanks...
5725    subject: re : enron case study update wow ! al...
5726    subject: re : interest david , please , call s...
5727    subject: news : aurora 5 . 2 update aurora ver...
Name: text, Length: 5695, dtype: object>

Remove Links 

In [305]:
data["text"] = data["text"].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

In [306]:
data["text"].head

<bound method NDFrame.head of 0       subject: naturally irresistible your corporate...
1       subject: the stock trading gunslinger fanny is...
2       subject: unbelievable new homes made easy im w...
3       subject: 4 color printing special request addi...
4       subject: do not have money , get software cds ...
                              ...                        
5723    subject: re : research and development charges...
5724    subject: re : receipts from visit jim , thanks...
5725    subject: re : enron case study update wow ! al...
5726    subject: re : interest david , please , call s...
5727    subject: news : aurora 5 . 2 update aurora ver...
Name: text, Length: 5695, dtype: object>

Remove Punctuations

In [307]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

data["text"] = data["text"].apply(remove_punctuations)

In [308]:
data["text"].head

<bound method NDFrame.head of 0       subject naturally irresistible your corporate ...
1       subject the stock trading gunslinger fanny is ...
2       subject unbelievable new homes made easy im wa...
3       subject 4 color printing special request addit...
4       subject do not have money  get software cds fr...
                              ...                        
5723    subject re  research and development charges t...
5724    subject re  receipts from visit jim  thanks ag...
5725    subject re  enron case study update wow  all o...
5726    subject re  interest david  please  call shirl...
5727    subject news  aurora 5  2 update aurora versio...
Name: text, Length: 5695, dtype: object>

Remove Numbers

In [309]:
data["text"] = data['text'].str.replace(r'\d+', '', regex=True)


In [310]:
data["text"].head

<bound method NDFrame.head of 0       subject naturally irresistible your corporate ...
1       subject the stock trading gunslinger fanny is ...
2       subject unbelievable new homes made easy im wa...
3       subject  color printing special request additi...
4       subject do not have money  get software cds fr...
                              ...                        
5723    subject re  research and development charges t...
5724    subject re  receipts from visit jim  thanks ag...
5725    subject re  enron case study update wow  all o...
5726    subject re  interest david  please  call shirl...
5727    subject news  aurora    update aurora version ...
Name: text, Length: 5695, dtype: object>

Remove Stopwords

In [311]:
import nltk
import os

#check the folder
download_dir = '../stopwordsMain'
os.makedirs(download_dir, exist_ok=True)

#download stopwords
nltk.download('stopwords', download_dir='../stopwordsMain')

#open Stopwords
with open('../stopwordsMain/corpora/stopwords/english', 'r') as file:
    sw = file.read().splitlines()
    
# print(sw)
    

[nltk_data] Downloading package stopwords to ../stopwordsMain...
[nltk_data]   Package stopwords is already up-to-date!


In [312]:
data["text"] = data["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [313]:
data["text"].head

<bound method NDFrame.head of 0       subject naturally irresistible corporate ident...
1       subject stock trading gunslinger fanny merrill...
2       subject unbelievable new homes made easy im wa...
3       subject color printing special request additio...
4       subject money get software cds software compat...
                              ...                        
5723    subject research development charges gpg forwa...
5724    subject receipts visit jim thanks invitation v...
5725    subject enron case study update wow day super ...
5726    subject interest david please call shirley cre...
5727    subject news aurora update aurora version fast...
Name: text, Length: 5695, dtype: object>

Remove Common words on Email 

In [314]:
data['text'] = data['text'].str.replace(r'\b(subject|cc|forward)\b', '', regex=True)


In [315]:
data["text"].head

<bound method NDFrame.head of 0        naturally irresistible corporate identity lt ...
1        stock trading gunslinger fanny merrill muzo c...
2        unbelievable new homes made easy im wanting s...
3        color printing special request additional inf...
4        money get software cds software compatibility...
                              ...                        
5723     research development charges gpg forwarded sh...
5724     receipts visit jim thanks invitation visit ls...
5725     enron case study update wow day super thank m...
5726     interest david please call shirley crenshaw a...
5727     news aurora update aurora version fastest mod...
Name: text, Length: 5695, dtype: object>

Stamming

In [316]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

data["text"] = data["text"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [317]:
data["text"].head

<bound method NDFrame.head of 0       natur irresist corpor ident lt realli hard rec...
1       stock trade gunsling fanni merril muzo colza a...
2       unbeliev new home made easi im want show homeo...
3       color print special request addit inform click...
4       money get softwar cd softwar compat great grow...
                              ...                        
5723    research develop charg gpg forward shirley cre...
5724    receipt visit jim thank invit visit lsu shirle...
5725    enron case studi updat wow day super thank muc...
5726    interest david pleas call shirley crenshaw ass...
5727    news aurora updat aurora version fastest model...
Name: text, Length: 5695, dtype: object>

Data after Text Preprocessing

In [318]:
data

Unnamed: 0,text,spam
0,natur irresist corpor ident lt realli hard rec...,1
1,stock trade gunsling fanni merril muzo colza a...,1
2,unbeliev new home made easi im want show homeo...,1
3,color print special request addit inform click...,1
4,money get softwar cd softwar compat great grow...,1
...,...,...
5723,research develop charg gpg forward shirley cre...,0
5724,receipt visit jim thank invit visit lsu shirle...,0
5725,enron case studi updat wow day super thank muc...,0
5726,interest david pleas call shirley crenshaw ass...,0


Buliding a Vocabulary

In [325]:
from collections import Counter
vocab = Counter()

for sentence in data['text']:
    vocab.update(sentence.split())
    
print("Length of vocabulary : " ,len(vocab))
print("Data shape : " , (data.shape))


Length of vocabulary :  25670
Data shape :  (5695, 2)
