#### Importing necessary libraries

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import random
random.seed(42)
import seaborn as sns
import matplotlib.pyplot as plt
import re

%matplotlib inline
import matplotlib.pyplot as plt

nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the e-mail dataset; the relative path is resolved against the
# notebook's current working directory.
data = pd.read_csv("emails.csv")

#### A first look at the dataset

In [3]:
# Preview the first rows (head() defaults to five) to see the columns.
data.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# Number of rows per value of the `spam` label column.
data['spam'].value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [5]:
# Class balance: share of rows labelled spam (1) and ham (0), in percent.
spam_n = len(data[data["spam"] == 1])
ham_n = len(data[data["spam"] == 0])
total_n = spam_n + ham_n

spam_pct = round(spam_n / total_n * 100, 2)
ham_pct = round(ham_n / total_n * 100, 2)

print(f"Percentage of spam emails = {spam_pct}%")
print(f"Percentage of ham emails = {ham_pct}%")

Percentage of spam emails = 23.88%
Percentage of ham emails = 76.12%


In [6]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(5728, 2)

In [7]:
# Summary statistics; describe() reports numeric columns by default, so only
# the `spam` label appears. print() emits the plain-text table.
print(data.describe())

              spam
count  5728.000000
mean      0.238827
std       0.426404
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max       1.000000


The following check verifies the structure of the messages, i.e., whether every message has the form

"Subject: \<text\>"

In [8]:
# Sanity check: does every message start with the literal prefix "Subject: "?
#
# The pattern is a raw string: "\s" inside a plain (or f-) string is an
# invalid escape sequence and triggers a warning on recent Python versions.
regex = r"^Subject:\s"
subject_prefix = re.compile(regex)

# Count matching messages; the text column is addressed by name rather than
# by position so the check does not depend on column order.
c = sum(1 for message in data["text"] if subject_prefix.search(message))

if c == data.shape[0]:
    print("All messages have the form 'Subject: <text>'")
else:
    print("The form breaks somewhere")

All messages have the form 'Subject: <text>'


#### Preprocessing

In [9]:
# Lazily filled cache so the NLTK stop-word list is read once, not per word.
_STOPWORDS_CACHE = {}


def preprocess_text(text, stop_words=None):
    """Normalise one e-mail message into a space-separated string of words.

    Steps, in order:
      1. lower-case the text;
      2. strip a leading "subject: " prefix if present;
      3. delete every character that is not a-z, a space or a dot;
      4. split on whitespace and keep only purely alphabetic tokens that
         are not stop words.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : container of str, optional
        Words to discard. Defaults to NLTK's English stop-word list, which
        is loaded once and cached as a frozenset.

    Returns
    -------
    str
        The surviving words joined by single spaces (may be empty).
    """
    if stop_words is None:
        if "english" not in _STOPWORDS_CACHE:
            _STOPWORDS_CACHE["english"] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE["english"]

    text = text.lower()

    # Raw string avoids the invalid "\s" escape of a plain string literal.
    # Note: "." does not match a newline, so when the prefix is found only
    # the remainder of the first line is kept.
    match = re.search(r"^subject:\s(.*)", text)
    if match:
        text = match.group(1)

    text = re.sub(r"[^a-z .]", "", text)

    # After the substitution only letters, spaces and dots remain, so
    # isalpha() rejects exactly the tokens that still contain a dot.
    return ' '.join(
        word
        for word in text.split()
        if word.isalpha() and word not in stop_words
    )


# Run preprocess_text over every message and store the result in a new
# `processed_text` column, printing a 40-character progress bar every 500 rows.
print("Begin text preprocessing:", end="\n\n")

n_rows = data.shape[0]
processed = []
for i, raw_text in enumerate(data["text"]):
    if i % 500 == 0 and i != 0:
        a = round(i / n_rows * 100)
        filled = a // 10 * 4  # four "+" per completed 10%
        print("+" * filled + "-" * (40 - filled) + " : " + str(a) + "% completed")
    processed.append(preprocess_text(raw_text))

if n_rows:
    print("+" * 40 + " : " + "100% completed", end="\n\n")

# Assign the whole column at once. Writing element by element through
# data["processed_text"][i] is chained assignment: pandas warns about it and,
# with Copy-on-Write enabled, the write never reaches `data`.
data["processed_text"] = processed
print("Preprocessing complete")

Begin text preprocessing:

---------------------------------------- : 9% completed
++++------------------------------------ : 17% completed
++++++++-------------------------------- : 26% completed
++++++++++++---------------------------- : 35% completed
++++++++++++++++------------------------ : 44% completed
++++++++++++++++++++-------------------- : 52% completed
++++++++++++++++++++++++---------------- : 61% completed
++++++++++++++++++++++++++++------------ : 70% completed
++++++++++++++++++++++++++++------------ : 79% completed
++++++++++++++++++++++++++++++++-------- : 87% completed
++++++++++++++++++++++++++++++++++++---- : 96% completed
++++++++++++++++++++++++++++++++++++++++ : 100% completed

Preprocessing complete


In [10]:
data.head(10)

Unnamed: 0,text,spam,processed_text
0,Subject: naturally irresistible your corporate...,1,naturally irresistible corporate identity lt r...
1,Subject: the stock trading gunslinger fanny i...,1,stock trading gunslinger fanny merrill muzo co...
2,Subject: unbelievable new homes made easy im ...,1,unbelievable new homes made easy im wanting sh...
3,Subject: 4 color printing special request add...,1,color printing special request additional info...
4,"Subject: do not have money , get software cds ...",1,money get software cds software compatibility ...
5,"Subject: great nnews hello , welcome to medzo...",1,great nnews hello welcome medzonline sh ground...
6,Subject: here ' s a hot play in motion homela...,1,hot play motion homeland security investments ...
7,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...
8,Subject: undeliverable : home based business f...,1,undeliverable home based business grownups mes...
9,Subject: save your money buy getting this thin...,1,save money buy getting thing tried cialls yet ...


Now we can split the dataset into train, validation and test sets, in proportions of 70%, 15% and 15% respectively.

In [11]:
# Two-stage split: hold out 30% of the rows first, then halve that hold-out,
# giving 70% train / 15% validation / 15% test. Both stages use a fixed
# random_state so the partition is reproducible.
model_frame = data[["processed_text", "spam"]]
train, val_test = train_test_split(
    model_frame,
    test_size=0.30,
    random_state=42,
)
val, test = train_test_split(
    val_test,
    test_size=0.50,
    random_state=42,
)

Save the three splits as CSV files.

In [12]:
# Write each split to its own CSV file, without the DataFrame index.
for csv_name, split_frame in (
    ("train.csv", train),
    ("validation.csv", val),
    ("test.csv", test),
):
    split_frame.to_csv(csv_name, index=False)