In [None]:
# Imports
from IPython.display import clear_output
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import spacy
import re
import string
from collections import Counter
from sklearn.model_selection import train_test_split
import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive
# (Colab-only: `google.colab` is not available in a plain Jupyter kernel).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Downloading the Spam SMS Dataset
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

!unzip /content/smsspamcollection.zip
!rm /content/readme
!rm !rm /content/smsspamcollection.zip

clear_output()

In [None]:
# Downloading the GloVe embeddings database

!unzip /content/drive/MyDrive/glove.6B.txt.zip

clear_output()

In [None]:
# Function for loading GloVe embedding vectors
def load_GloVe_embeddings(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each usable line holds a word followed by its whitespace-separated vector
    components. Lines that are blank, or that carry a word but no components,
    are skipped instead of raising ``IndexError`` / storing an empty vector.

    Args:
        glove_file: Path to the UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line (e.g. a trailing newline at end of file) splits to [],
            # which would otherwise crash on values[0].
            if len(values) < 2:
                continue
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return word_embeddings

# Load GloVe embeddings into an in-memory {word: vector} dict.
# The "50d" file is used, which matches the embedding/input size of 50 used later on.
word_embeddings = load_GloVe_embeddings('/content/glove.6B.50d.txt')

In [None]:
text = []
label = []

# Each line of the raw file has the form "<label>\t<message>".
# The encoding is given explicitly so decoding does not depend on the platform
# default (the notebook was run on Colab, where that default is UTF-8).
with open("/content/SMSSpamCollection", encoding="utf-8") as f:
    for line in f:
        # Split on the first tab only, so a tab inside the message is kept
        # instead of silently truncating the text.
        line_parts = line.strip().split('\t', 1)
        # Skip blank or malformed lines (no tab / empty message) rather than
        # crashing with an IndexError on line_parts[1].
        if len(line_parts) < 2:
            continue
        raw_label, message = line_parts
        # Label spam messages as 1 and everything else (ham) as 0
        label.append(1 if raw_label == 'spam' else 0)
        text.append(message)

In [None]:
# Pair every message with its label and wrap the pairs in a two-column DataFrame
sms = pd.DataFrame(data=list(zip(text, label)), columns=["Text", "Label"])

In [None]:
spacy_tokenizer = spacy.load('en_core_web_sm')

# Function for tokenizing the message
def tokenize(text):
    """Split a raw message into lowercase word tokens.

    Non-ASCII runs and ASCII punctuation are replaced by spaces before
    tokenizing. Whitespace-only tokens (which spaCy emits for the runs of
    spaces those replacements create) are dropped, so they do not use up
    positions in the fixed-length window used downstream.

    Args:
        text: The raw message string.

    Returns:
        list of lowercase token strings, in order of appearance.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # remove non-ASCII characters
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)  # remove punctuation
    # make_doc runs only the tokenizer; the tagger/parser/NER components of the
    # loaded pipeline are not needed to obtain token texts.
    doc = spacy_tokenizer.make_doc(text)
    return [token.text.lower() for token in doc if not token.is_space]

In [None]:
# Run the tokenizer over every raw message and store the token lists next to the text
sms["Tokenized_Text"] = sms["Text"].map(tokenize)

In [None]:
# Function for converting sequence of tokens to sequence of embedding vectors (max_text_length to provide padding for batch processing)
def embed_text(tokenized_text, word_embeddings, max_text_length=25, embedding_size=50):
    """Turn a token list into a fixed-size matrix of embedding vectors.

    Only the first ``max_text_length`` tokens are used. Row ``i`` holds the
    vector of token ``i`` when that token is a key of ``word_embeddings``;
    rows for unknown tokens and for positions past the end of the token list
    stay all-zero.

    Returns:
        NumPy array of shape ``(max_text_length, embedding_size)``.
    """
    matrix = np.zeros((max_text_length, embedding_size))
    window = tokenized_text[:max_text_length]
    # zip stops at the shorter of the two, so rows beyond the last token are untouched
    for row, token in zip(matrix, window):
        if token in word_embeddings:
            row[:] = word_embeddings[token]
    return matrix

In [None]:
# Map each token list to its fixed-size matrix of embedding vectors;
# word_embeddings is forwarded to embed_text as its second positional argument
sms["Embedded_Text"] = sms["Tokenized_Text"].apply(embed_text, args=(word_embeddings,))

In [None]:
class load_dataset(Dataset):
    """Map-style dataset pairing one embedded message with its label.

    Indexing returns ``(features, label)`` where ``features`` is a
    ``float32`` tensor built from ``X[idx]`` and ``label`` is a ``long``
    tensor built from ``Y[idx]``.
    """

    def __init__(self, X, Y):
        """
        X: the embeddings of the sentence
        Y: ground truth of the sentence (0- ham, 1- spam)

        Raises:
            ValueError: if X and Y do not contain the same number of items.
        """
        # __len__ is driven by the labels only, so a mismatch would otherwise
        # silently drop samples or fail later with an IndexError.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.y = Y

    def __len__(self):
        """Number of (sample, label) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as ``(float32 tensor, long tensor)``."""
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.long)
        return features, target

# Load the data from the DataFrame
X = sms['Embedded_Text'].tolist()  # list of per-message embedding matrices (NumPy arrays)
y = sms['Label'].tolist()  # list of int labels (0 = ham, 1 = spam)

# Split the data into training and testing sets.
# stratify=y keeps the spam/ham ratio the same in both splits, so the test
# accuracy is not skewed by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create Dataset objects
train_dataset = load_dataset(X_train, y_train)
test_dataset = load_dataset(X_test, y_test)

# Create DataLoader objects (only the training data is shuffled)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class RNN(nn.Module):
    """Stacked Elman RNN followed by a linear classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in each RNN hidden state.
        num_layers: Number of stacked RNN layers.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, seq_len, input_size).

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        # No explicit h0: nn.RNN starts from an all-zero hidden state created
        # with the input's dtype and device, which also keeps the model usable
        # after .double()/.half() (a hand-built float32 h0 would mismatch).
        out, _ = self.rnn(x)

        # out[:, -1, :] selects the top layer's hidden state at the last time
        # step for each sequence, shape (batch_size, hidden_size); the linear
        # layer maps it to (batch_size, num_classes).
        return self.fc(out[:, -1, :])

In [None]:
def _model_device(model, device=None):
    """Pick the device batches should be moved to.

    An explicit ``device`` wins; otherwise the device of the model's first
    parameter is used, falling back to CPU for a parameter-free model.
    """
    if device is not None:
        return torch.device(device)
    first_param = next(model.parameters(), None)
    return torch.device('cpu') if first_param is None else first_param.device


def train_model(num_epochs, train_loader, model, criterion, optimizer, device=None):
    """Train ``model`` and print the loss and training accuracy per epoch.

    Args:
        num_epochs: Number of passes over ``train_loader``.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        model: The network to optimise (updated in place).
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Optional device for the batches. Defaults to the device the
            model's parameters live on, so no global ``device`` is required.

    Returns:
        None. The model is left in eval mode by the final accuracy check.
    """
    device = _model_device(model, device)
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss

            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{num_batches}], Loss: {batch_loss:.4f}')

        # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches
        epoch_loss = running_loss / max(num_batches, 1)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

        # Check accuracy on training set
        train_accuracy = check_accuracy(train_loader, model, device)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Training Accuracy: {train_accuracy:.2f}%')

def check_accuracy(loader, model, device=None):
    """Return the classification accuracy of ``model`` on ``loader`` in percent.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.
        model: The network to evaluate; it is switched to eval mode.
        device: Optional device for the batches. Defaults to the device the
            model's parameters live on.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` if ``loader`` yields no samples.
    """
    device = _model_device(model, device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Predicted class = index of the largest logit
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        return 0.0
    return 100 * correct / total

In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch so weight initialisation and DataLoader shuffling are repeatable
# across "Restart & Run All" (some GPU kernels may still be non-deterministic).
torch.manual_seed(42)

# Hyperparameters
input_size = 50  # Size of word embeddings
hidden_size = 128
num_layers = 2
num_classes = 2
num_epochs = 10
learning_rate = 0.001

# Initialize model, criterion, and optimizer
model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
train_model(num_epochs, train_loader, model, criterion, optimizer)

# Check accuracy on test set
test_accuracy = check_accuracy(test_loader, model)
print(f'Test Accuracy: {test_accuracy:.2f}%')

# Save the model weights (state_dict only) to the current working directory
torch.save(model.state_dict(), 'rnn_model.pth')

Epoch [1/10], Step [100/140], Loss: 0.1195
Epoch [1/10], Loss: 0.2111
Epoch [1/10], Training Accuracy: 94.39%
Epoch [2/10], Step [100/140], Loss: 0.2789
Epoch [2/10], Loss: 0.1390
Epoch [2/10], Training Accuracy: 96.03%
Epoch [3/10], Step [100/140], Loss: 0.1530
Epoch [3/10], Loss: 0.1157
Epoch [3/10], Training Accuracy: 96.99%
Epoch [4/10], Step [100/140], Loss: 0.0332
Epoch [4/10], Loss: 0.1033
Epoch [4/10], Training Accuracy: 96.59%
Epoch [5/10], Step [100/140], Loss: 0.2134
Epoch [5/10], Loss: 0.1061
Epoch [5/10], Training Accuracy: 95.94%
Epoch [6/10], Step [100/140], Loss: 0.0341
Epoch [6/10], Loss: 0.1357
Epoch [6/10], Training Accuracy: 96.84%
Epoch [7/10], Step [100/140], Loss: 0.0180
Epoch [7/10], Loss: 0.0891
Epoch [7/10], Training Accuracy: 97.56%
Epoch [8/10], Step [100/140], Loss: 0.0856
Epoch [8/10], Loss: 0.0797
Epoch [8/10], Training Accuracy: 97.65%
Epoch [9/10], Step [100/140], Loss: 0.0364
Epoch [9/10], Loss: 0.0764
Epoch [9/10], Training Accuracy: 98.03%
Epoch [10/