In [49]:
import torch
# Bare expressions carried over from the original cell; a notebook only
# displays the value of a cell's final expression, so these show nothing.
torch.__version__
torch.cuda.is_available()

# Choose the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [50]:
def read_new_email(file_path):
    """Parse one email text file into a dict of subject, body and label.

    The file is read as UTF-8 and split on the double-quote character. The
    pieces at split positions 1, 3 and 5 (i.e. the first three quoted
    segments) become the subject, body and label, each stripped of
    surrounding whitespace.

    NOTE(review): a double quote embedded in the subject or body shifts the
    later fields and is not detected here -- confirm the data files never
    contain embedded quotes.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict with the keys 'Subject', 'Body' and 'Label' (all str).

    Raises:
        IndexError: if the file has too few double quotes to provide the
            three fields. The message names the offending file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    parts = text.split('"')

    # Positions 1, 3 and 5 are read below, so at least six pieces (five quote
    # characters) are required. Fail with the file name instead of an
    # anonymous "list index out of range".
    if len(parts) < 6:
        raise IndexError(
            f"{file_path}: expected three quoted fields (subject, body, label) "
            f"but found only {len(parts) - 1} double-quote character(s)"
        )

    subject = parts[1].strip()
    body = parts[3].strip()
    label = parts[5].strip()

    return {'Subject': subject, 'Body': body, 'Label': label}

In [51]:
# Parse Data/confirmation/conf1.txt through conf92.txt, one dict per file.
confirmation_emails = [
    read_new_email(f"Data/confirmation/conf{idx}.txt")
    for idx in range(1, 93)
]

print(len(confirmation_emails))

92


In [52]:
# Parse Data/rejections/rejection1.txt through rejection104.txt, one dict per file.
rejection_emails = [
    read_new_email(f"Data/rejections/rejection{idx}.txt")
    for idx in range(1, 105)
]

print(len(rejection_emails))

104


In [53]:
from torch import nn

class EmailModel(nn.Module):
    """Fully connected binary classifier: input_size -> 128 -> 64 -> 1.

    ReLU follows each of the first two linear layers; the single output unit
    is passed through a sigmoid, so ``forward`` returns one value per input
    row with shape (..., 1).
    """

    def __init__(self, input_size):
        """Create the three linear layers and the two activation modules.

        Args:
            input_size: Number of features in each input row.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> sigmoid."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.sigmoid(self.fc3(hidden))

# Instantiate the network for 338 input features, move it to the selected
# device, and display its layer summary as the cell output.
input_size = 338
model_1 = EmailModel(input_size)
model_1 = model_1.to(device)  # nn.Module.to returns the module itself
model_1


EmailModel(
  (fc1): Linear(in_features=338, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)

In [54]:
#Accuracy Function

def acc_fn(y_true, y_pred):
    """Return the percentage (0-100) of predictions that match the targets.

    Predictions are rounded with ``torch.round`` (so probabilities become
    0 or 1) and compared element-wise with the targets.

    Both tensors are flattened before the comparison. Without that, a (N, 1)
    prediction compared with a (N,) target broadcasts to (N, N) and inflates
    the match count, and a 0-dim prediction (a single sample after a bare
    ``squeeze()``) has no ``len()``.

    Args:
        y_true: Tensor of target labels.
        y_pred: Tensor of predicted probabilities or labels holding the same
            number of elements as ``y_true``.

    Returns:
        float: share of matching elements, as a percentage.

    Raises:
        ValueError: if the two tensors hold a different number of elements.
        ZeroDivisionError: if the tensors are empty.
    """
    y_true = y_true.reshape(-1)
    y_pred = torch.round(y_pred).reshape(-1)

    # Refuse silent broadcasting of mismatched inputs.
    if y_true.numel() != y_pred.numel():
        raise ValueError(
            f"y_true has {y_true.numel()} elements but y_pred has {y_pred.numel()}"
        )

    correct = torch.eq(y_true, y_pred).sum().item()
    accuracy = (correct / y_pred.numel()) * 100
    return accuracy

In [55]:
# Collapse each parsed email dict into a single "subject body" string, in place.
# Entries that are already strings are skipped, so running this cell a second
# time is a no-op instead of failing on subscripting a str with "Subject".
for i, email in enumerate(confirmation_emails):
    if isinstance(email, dict):
        confirmation_emails[i] = f"{email['Subject']} {email['Body']}"

confirmation_emails[0]

'thanks for your application to spotify! dear applicant, we just got your application for the 2024 summer internship, engineering (new york city) role! even though this is just an automated confirmation email, you should know that we’re truly excited you want to join the band. we’ll get back to you as soon as we can. we get a huge amount of applications, and we look at them all to give everyone fair consideration – so it may take a few weeks (or sometimes months for really popular roles). in the meantime, you can listen to the playlist we’ll play in the office as we read your resume. want to sneak a backstage peek? follow life at spotify on linkedin, instagram, twitter, and youtube. learn more about our culture through our band manifesto, or listen to the greenroom and spoton! podcasts. still curious? check out our hr blog, podcasting website, and engineering blog! all the best, the spotify recruiting team'

In [56]:
# Collapse each parsed email dict into a single "subject body" string, in place.
# Entries that are already strings are skipped, so running this cell a second
# time is a no-op instead of failing on subscripting a str with "Subject".
for i, email in enumerate(rejection_emails):
    if isinstance(email, dict):
        rejection_emails[i] = f"{email['Subject']} {email['Body']}"

rejection_emails[0]

"update on your skydio application - wireless software intern hi applicant, thank you for applying for the wireless software intern role at skydio! we know that there are lots of exciting companies out there, so we appreciate the time you took to apply to ours. after reviewing your application, we have unfortunately decided not to move forward at this time. while we greatly appreciate your interest in skydio, we ultimately decided to proceed with other candidates whose skills and experience align more closely with our needs for this specific role. if you submitted an application to other roles at skydio and haven't heard back, then your resume is still under review and we will be in touch soon with an update. our hiring priorities are constantly evolving, so we encourage you to keep an eye on our jobs page. if you see another role that seems like a good fit, please don't hesitate to apply. in the meantime, we'd like to thank you again for your interest and we wish you all the best in y

In [57]:
# One label per email: 1 for every confirmation, 0 for every rejection.
confirmation_labels = [1 for _email in confirmation_emails]
rejection_labels = [0 for _email in rejection_emails]

In [58]:
# Confirmations first, then rejections; labels are joined in the same order
# so that labels[k] belongs to emails[k].
emails = [*confirmation_emails, *rejection_emails]
labels = [*confirmation_labels, *rejection_labels]

In [59]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every email string into a TF-IDF feature vector: English stop words
# are dropped and the vocabulary is capped at 338 terms.
# NOTE(review): the vectorizer is fitted on all emails before the train/test
# split, so held-out emails influence the vocabulary and IDF weights --
# confirm this is acceptable for the evaluation.
vectorizer = TfidfVectorizer(stop_words='english', max_features=338)
X = vectorizer.fit_transform(emails)  # sparse document-term matrix
X = X.toarray()                       # dense array for tensor conversion
y = np.asarray(labels)

In [60]:
import torch
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Convert all four splits to float32 PyTorch tensors in one pass.
X_train_tensors, X_test_tensors, y_train_tensors, y_test_tensors = (
    torch.tensor(split, dtype=torch.float32)
    for split in (X_train, X_test, y_train, y_test)
)

In [61]:
from torch import nn, optim

# Seed torch so the weight initialisation is repeatable from run to run;
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Define the model, sized from the actual number of feature columns
input_size = X_train_tensors.shape[1]
model = EmailModel(input_size).to(device)

# Move every split to the same device as the model
X_test_tensors = X_test_tensors.to(device)
X_train_tensors = X_train_tensors.to(device)
y_train_tensors = y_train_tensors.to(device)
y_test_tensors = y_test_tensors.to(device)

# Loss function and optimizer
loss_fn = nn.BCELoss()  # takes probabilities; EmailModel ends in a sigmoid
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (full batch: one optimizer step per epoch)
epochs = 100

for epoch in range(epochs):
    model.train()

    # Forward pass. squeeze(-1) removes only the trailing size-1 output
    # dimension, so a split holding a single sample stays 1-D and keeps the
    # same shape as its targets (a bare squeeze() would make it 0-dim).
    train_preds = model(X_train_tensors).squeeze(-1)
    train_loss = loss_fn(train_preds, y_train_tensors)
    train_acc = acc_fn(y_train_tensors, train_preds)

    # Backward pass and optimization
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # Evaluation on the held-out split, without tracking gradients
    model.eval()
    with torch.no_grad():
        test_preds = model(X_test_tensors).squeeze(-1)
        test_loss = loss_fn(test_preds, y_test_tensors)
        test_acc = acc_fn(y_test_tensors, test_preds)

    print(f"Epoch: {epoch+1} | Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}% | Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")

Epoch: 1 | Train Loss: 0.6960, Train Acc: 46.79% | Test Loss: 0.6945, Test Acc: 47.50%
Epoch: 2 | Train Loss: 0.6947, Train Acc: 46.79% | Test Loss: 0.6937, Test Acc: 47.50%
Epoch: 3 | Train Loss: 0.6936, Train Acc: 46.79% | Test Loss: 0.6928, Test Acc: 47.50%
Epoch: 4 | Train Loss: 0.6923, Train Acc: 46.79% | Test Loss: 0.6919, Test Acc: 47.50%
Epoch: 5 | Train Loss: 0.6910, Train Acc: 46.79% | Test Loss: 0.6908, Test Acc: 47.50%
Epoch: 6 | Train Loss: 0.6895, Train Acc: 46.79% | Test Loss: 0.6897, Test Acc: 47.50%
Epoch: 7 | Train Loss: 0.6878, Train Acc: 46.79% | Test Loss: 0.6884, Test Acc: 47.50%
Epoch: 8 | Train Loss: 0.6860, Train Acc: 50.00% | Test Loss: 0.6869, Test Acc: 55.00%
Epoch: 9 | Train Loss: 0.6839, Train Acc: 64.74% | Test Loss: 0.6853, Test Acc: 70.00%
Epoch: 10 | Train Loss: 0.6817, Train Acc: 86.54% | Test Loss: 0.6835, Test Acc: 87.50%
Epoch: 11 | Train Loss: 0.6792, Train Acc: 94.87% | Test Loss: 0.6815, Test Acc: 87.50%
Epoch: 12 | Train Loss: 0.6765, Train Acc