In [70]:
import numpy as np
import pandas as pd
import glob
import re
from sklearn.utils import shuffle
import torch
import torch.nn.functional as F
import torch.nn as nn
from sklearn.metrics import confusion_matrix, f1_score
import torch.optim as optim
from sklearn.model_selection import train_test_split


In [71]:
hamDirs = pd.Series(np.array(glob.glob('datasets/email/ham/*/*/*')))
spamDirs = pd.Series(np.array(glob.glob('datasets/email/spam/*/*/*')))

len(spamDirs), len(hamDirs)

(21103, 19088)

In [72]:
# For Header Filter

mailHeaders = ["received", 'mime-version', 'reply-to',
             "date", "content-transfer-encoding", "message-id", "x-from", "x-to", "x-cc", "x-bcc",
             "x-origin", "x-filename", "x-priority", "x-msmail-priority", "organization", "x-mailer"]

EmailHeaderRegex = "from:\s(?P<from>.*)?[^^]*subject:\s(?P<subject>.*)?[^^]*content-type:\s(?P<content_type>.*)?"

In [73]:
# Extract Mail

def getContent(filePath):
    
    with open(filePath, encoding='windows-1252') as f:
        contents = f.read()
        contents = contents.lower()
        contents = re.sub("<[^<>]+>", " __tag__ ", contents)
        contents = re.sub("[0-9]+", "number", contents)
        contents = re.sub("(http|https)://[^\s]*", "httpaddr", contents)
        contents = re.sub("[^\s]+@[^\s]+", "emailaddr", contents)
        contents = re.sub("[$]+", "dollar", contents)
        contents = re.sub("(" + ":|".join(mailHeaders)  + ")+", "header:", contents)
        
    emailHeaderParts = re.findall(EmailHeaderRegex, contents)
    (emailFrom, subject, content_type) = emailHeaderParts[0] if len(emailHeaderParts) else ("", "", "") 
    
    __SEPERATOR__ = " __SEPERATOR__ "
    bodyRegex = "(?:[^^]*)(?:header\s*:.*\n)([^^]*)"  
    body = re.findall(bodyRegex, contents)
    body = body[0] if len(body) else ""
    
    
    mailMessage = re.split("-----original message-----", contents)[0]
    content = emailFrom + __SEPERATOR__ + subject + __SEPERATOR__ + content_type + __SEPERATOR__ + "\n" + body
    
    contentSeries = pd.Series([filePath, content, body, emailFrom, subject, content_type], 
                              index=["filePath", "content", "body", "from", "subject", "content_type"]);
    
    return contentSeries

In [74]:
hamDf = hamDirs[:700].apply(getContent)
spamDf = spamDirs[:700].apply(getContent)
hamDf["label"] = 0
spamDf["label"] = 1

# Setup Mail Dataset
datasetDf = pd.concat([hamDf, spamDf], axis = 0, ignore_index = True)
datasetDf = shuffle(datasetDf, random_state= 0)
datasetDf.to_csv("datasets/email/email.csv", index = False)

In [75]:
# Genarate Vocab
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')
train_iter = datasetDf["content"].to_numpy()

def yield_tokens(data_iter):
    for content in data_iter:
        yield tokenizer(content)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [76]:
# Tokenization
def preprocessData(dfS):
    content = dfS["content"]
    tokenizer(content)
    tokensIdx = vocab(tokenizer(content))
    
    tokens = np.zeros(len(vocab), dtype="int64")
    
    tokens[tokensIdx] = 1
    
    tokens = tokens.tolist()
    data = pd.Series([dfS["filePath"], tokens], index = ["file", "tokens"])
    
    return data

In [77]:
dataset_tokens_df = datasetDf.apply(preprocessData, axis =1)
dataset_tokens_df["label"] = datasetDf["label"]
# dataset_tokens_df["tokens"] = dataset_tokens_df["tokens"].apply(lambda x: "|".join(map(str, x)))
# dataset_tokens_df.to_csv("datasets/email/emailTokens.csv", index = False)


In [78]:
X = np.array(dataset_tokens_df["tokens"].to_list())
Y = dataset_tokens_df["label"].values


X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.2, random_state=0)

In [79]:
model = nn.Sequential(
    nn.Linear(X.shape[1], 20), 
    
    nn.ReLU(),
    nn.Linear(20, 1),
    nn.Sigmoid()
)
optimizer = optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.BCELoss()

In [80]:

def train(X, Y, epoches):
    X = torch.FloatTensor(X)
    Y = torch.FloatTensor(Y)
    N = len(Y)
    model.train()
    
    sum_loss = 0
    for epoch in range(epoches):
        
        optimizer.zero_grad()
        output = model(X).squeeze()
        loss = loss_fn(output, Y)
        
        loss.backward()
        optimizer.step()

        sum_loss += float(loss)
    
    print("Epoch: {:4d}\tloss: {}".format(epoch, sum_loss / N))
    
    


In [81]:
def test(X, Y):
    X = torch.FloatTensor(X)
    Y = torch.FloatTensor(Y)
    N = len(Y)
    model.eval()
    sum_loss = 0
    
    with torch.no_grad():
        output = model(X).squeeze()
        
        loss = loss_fn(output, Y)
        
        sum_loss += float(loss)
    
    print("loss: {}".format(sum_loss / N))

In [82]:
def predict(X):
    y_pred = (model(torch.FloatTensor(X)).squeeze().detach().numpy() > 0.5).astype("int64")
    return y_pred

In [83]:
train(X_train, y_train, 800)

Epoch:  799	loss: 0.025565710051367724


In [84]:
test(X_test, y_test)

loss: 5.030113139322826e-05


In [85]:
y_pred = predict(X_test)
confusion_matrix(y_test, y_pred), f1_score(y_test, y_pred)

(array([[137,   0],
        [  2, 141]]),
 0.9929577464788732)

In [86]:
y_pred = predict(X_train)
confusion_matrix(y_train, y_pred)

array([[563,   0],
       [  0, 557]])

In [87]:
hamDf2 = hamDirs[700:900].apply(getContent)
spamDf2 = spamDirs[700:800].apply(getContent)
hamDf2["label"] = 0
spamDf2["label"] = 1

datasetDf2 = pd.concat([hamDf2, spamDf2], axis = 0, ignore_index = True)
datasetDf2 = shuffle(datasetDf2, random_state= 0)

dataset_tokens_df2 = datasetDf2.apply(preprocessData, axis =1)
dataset_tokens_df2["label"] = datasetDf2["label"]

In [88]:
X2 = np.array(dataset_tokens_df2["tokens"].to_list())
Y2 = dataset_tokens_df2["label"].values

X_train2, X_test2, y_train2, y_test2 = train_test_split( X2, Y2, test_size=0.2, random_state=0)

In [89]:
y_pred = predict(X_test2)
print(f1_score(y_test2, y_pred))

confusion_matrix(y_test2, y_pred)

1.0


array([[32,  0],
       [ 0, 28]])

In [90]:
y_pred = predict(X_train2)
print(f1_score(y_train2, y_pred))
confusion_matrix(y_train2, y_pred)

1.0


array([[168,   0],
       [  0,  72]])