In [65]:
import torch
torch.__version__
torch.cuda.is_available()

device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [66]:
def read_new_email(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    parts = text.split('"')  
    
    subject = parts[1].strip()
    body = parts[3].strip()
    label = parts[5].strip()
    
    new_email = {'Subject':subject, 'Body':body, 'Label':label}
    return new_email

In [67]:
confirmation_emails = []

for i in range(1, 93):
    file_path = f"Data/confirmation/conf{i}.txt"
    new_email = read_new_email(file_path)
    confirmation_emails.append(new_email)

print(len(confirmation_emails))

92


In [68]:
rejection_emails = []

for i in range(1, 105):
    file_path = f"Data/rejections/rejection{i}.txt"
    new_email = read_new_email(file_path)
    rejection_emails.append(new_email)

print(len(rejection_emails))

104


In [69]:
from sklearn.model_selection import train_test_split

all_emails = []


for i in range(0, len(confirmation_emails)):
    if isinstance(confirmation_emails[i], dict):
        all_emails.append(confirmation_emails[i])

for i in range(0, len(rejection_emails)):
    if isinstance(rejection_emails[i], dict):
        all_emails.append(rejection_emails[i])

messages = []
labels = []


for i in range(0, len(all_emails)):
    text = all_emails[i]['Subject'] + " " + all_emails[i]['Body']
    label = all_emails[i]['Label']

    messages.append(text)
    labels.append(label)


X_train, X_test, y_train, y_test = train_test_split(messages,
                                                    labels, 
                                                    test_size=0.2, 
                                                    train_size=0.8, 
                                                    random_state=42)

In [70]:
label_to_index = {"rejection": 0,
                  "confirmation": 1}

def tensor_format(arr):
    for i in range(0, len(arr)):
        if(arr[i] == 'rejection'):
            arr[i] = 0
        else:
            arr[i] = 1


tensor_format(y_train)    

tensor_format(y_test)


In [117]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def token_iterator():
    for text in X_train:
        yield tokenizer(text)

# Build the vocabulary from the iterator
vocab = build_vocab_from_iterator(token_iterator(), specials = ["<unk>"])
vocab.set_default_index(vocab["<unk>"])

print(len(vocab))

1785


In [116]:
max_length = max(len(tokenizer(text)) for text in X_train)


# def encode_text(text):
#     tokens = tokenizer(text)
#     return [vocab[token] for token in tokens]

def encode_text(text, max_length):
    tokens = tokenizer(text)
    encoded_tokens = [vocab[token] for token in tokens]
    # Pad or truncate sequences to max_length
    if len(encoded_tokens) < max_length:
        encoded_tokens += [vocab["<PAD>"]] * (max_length - len(encoded_tokens))
    else:
        encoded_tokens = encoded_tokens[:max_length]
    return encoded_tokens

y_train_tensors = torch.tensor(y_train, dtype=torch.float)
y_test_tensors = torch.tensor(y_test, dtype=torch.float)

X_train_tensors = torch.stack([torch.tensor(encode_text(text, max_length), dtype=torch.float) for text in X_train])
X_test_tensors = torch.stack([torch.tensor(encode_text(text, max_length), dtype=torch.float) for text in X_test])

print(max_length)


338


In [132]:
#Model
from torch import nn

class EmailModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(1785, embedding_dim=128) #vocab size
        self.fc1 = nn.Linear(338*128, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1) #flatten the tensor
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x
    
model_1 = EmailModel().to(device)
model_1


EmailModel(
  (embedding): Embedding(1785, 128)
  (fc1): Linear(in_features=43264, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [133]:
#Loss Function and Optimizer

from torch import nn

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model_1.parameters(), lr=0.01)

In [134]:
#Accuracy Function

def acc_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    accuracy = (correct/len(y_pred)) * 100
    return accuracy

In [135]:
#Training and Testing Loop
epochs = 10

X_test_tensors = X_test_tensors.long().to(device)
X_train_tensors = X_train_tensors.long().to(device)
y_train_tensors = y_train_tensors.float().to(device)
y_test_tensors = y_test_tensors.float().to(device)


for epoch in range(epochs):
    model_1.train()

    train_preds = model_1(X_train_tensors).squeeze()
    train_loss = loss_fn(train_preds, y_train_tensors)
    train_acc = acc_fn(y_train_tensors, train_preds)

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()


    model_1.eval()
    with torch.inference_mode():
        test_preds = model_1(X_test_tensors).squeeze()
        test_loss = loss_fn(test_preds, y_test_tensors)
        test_acc = acc_fn(y_test_tensors, test_preds)
        
    print(f"Epoch: {epoch} | Loss: {train_loss: .4f}, Acc: {train_acc:.2f}% | Test Loss: {test_loss: .4f}, Test Acc: {test_acc:.2f}%")




Epoch: 0 | Loss:  0.7026, Acc: 0.00% | Test Loss:  52.5000, Test Acc: 47.50%
Epoch: 1 | Loss:  50.7061, Acc: 46.79% | Test Loss:  43.1116, Test Acc: 30.00%
Epoch: 2 | Loss:  24.2509, Acc: 49.36% | Test Loss:  46.4719, Test Acc: 52.50%
Epoch: 3 | Loss:  43.5513, Acc: 54.49% | Test Loss:  47.5000, Test Acc: 52.50%
Epoch: 4 | Loss:  44.3584, Acc: 54.49% | Test Loss:  47.5000, Test Acc: 52.50%
Epoch: 5 | Loss:  44.2473, Acc: 55.13% | Test Loss:  47.5000, Test Acc: 52.50%
Epoch: 6 | Loss:  44.2308, Acc: 55.77% | Test Loss:  47.5000, Test Acc: 52.50%
Epoch: 7 | Loss:  44.2308, Acc: 55.77% | Test Loss:  47.5000, Test Acc: 52.50%
Epoch: 8 | Loss:  44.2308, Acc: 55.77% | Test Loss:  47.5000, Test Acc: 52.50%
Epoch: 9 | Loss:  44.2308, Acc: 55.77% | Test Loss:  47.5000, Test Acc: 52.50%
