## Author's note

Based off wee zen's notebook but now there's attention

## Imports and setup

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd

torch.manual_seed(0)
np.random.seed(0)

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Hyperparameters

In [16]:
# Preprocessing parameters
MAX_LENGTH = 50
VOCAB_SIZE = 10000
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 0.001
TRAIN_RATIO, VAL_RATIO = 0.9, 0.1

## Datasets: Parquet to "df"

In [3]:
# Load AG News dataset from Parquet files
train_df = pd.read_parquet("ag_news_train.parquet")
test_df = pd.read_parquet("ag_news_test.parquet")

In [4]:
val_size = int(len(train_df) * VAL_RATIO)
val_size

12000

In [5]:
train_size = int(len(train_df) * TRAIN_RATIO)
val_size = int(len(train_df) * VAL_RATIO)
train_df, val_df = torch.utils.data.random_split(train_df, [train_size, val_size])

## Build vocab

In [6]:
# Tokenization and vocabulary building
counter = Counter()
for sample in train_df.dataset["text"]:
    tokens = word_tokenize(sample.lower())
    counter.update(tokens)

vocab = {word: i+1 for i, (word, _) in enumerate(counter.most_common(VOCAB_SIZE))}
vocab["<UNK>"] = VOCAB_SIZE + 1

def encode_text(text):
    tokens = word_tokenize(text.lower())
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens][:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokens))

In [8]:
len(vocab.values())

10001

## Datasets: "df" to DataLoader

In [7]:
# Custom PyTorch dataset class
class AGNewsDataset(Dataset):
    def __init__(self, dataframe):
        self.texts = [torch.tensor(encode_text(text), dtype=torch.long) for text in dataframe["text"]]
        self.labels = torch.tensor(dataframe["label"].values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [9]:
# Load data
train_dataset = AGNewsDataset(train_df.dataset.iloc[train_df.indices])
val_dataset = AGNewsDataset(val_df.dataset.iloc[val_df.indices])
test_dataset = AGNewsDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

## Model

In [10]:
class SelfAttention(nn.Module):
    def __init__(self, input_dim, dropout_rate=0.5):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        self.query = nn.Linear(input_dim, input_dim)  # Linear layer for queries
        self.key = nn.Linear(input_dim, input_dim)    # Linear layer for keys
        self.value = nn.Linear(input_dim, input_dim)  # Linear layer for values
        self.softmax = nn.Softmax(dim=-1)             # Softmax over the last dimension
        self.dropout = nn.Dropout(dropout_rate)       # Dropout layer

    def forward(self, x):  # x.shape (batch_size, seq_length, input_dim)
        batch_size, seq_length, _ = x.size()

        queries = self.query(x)  # Shape: (batch_size, seq_length, input_dim)
        keys = self.key(x)        # Shape: (batch_size, seq_length, input_dim)
        values = self.value(x)    # Shape: (batch_size, seq_length, input_dim)

        # Compute attention scores
        score = torch.bmm(queries, keys.transpose(1, 2)) / (self.input_dim ** 0.5)  # Shape: (batch_size, seq_length, seq_length)

        attention = self.softmax(score)  # Shape: (batch_size, seq_length, seq_length)
        attention = self.dropout(attention)  # Apply dropout to attention weights

        weighted = torch.bmm(attention, values)  # Shape: (batch_size, seq_length, input_dim)
        return weighted


In [11]:
# Model definition
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.attn = SelfAttention(embed_dim)
        self.dropout = nn.Dropout(0.5)
        self.lnorm = nn.LayerNorm(embed_dim)
        self.embedding = nn.Embedding(vocab_size + 2, embed_dim)
        self.lstm = nn.LSTM(embed_dim , hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.attn(x) + x #attn, add+norm
        x = self.lnorm(x)
        x = self.dropout(x) #some dropouts
        x, _ = self.lstm(x)
        x = self.dropout(x)
        return self.fc(x[:, -1, :])

# Initialize model
model = TextClassifier(VOCAB_SIZE, 128, 128, 4)
device = torch.device("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Train

In [12]:
# Training function
def train(model, loader):
    model.train()
    total_loss, correct = 0, 0
    for texts, labels in loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(loader), correct / len(train_dataset)

# Validation function
def validate(model, loader):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(loader), correct / len(val_dataset)

# Testing function
def test(model, loader):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(loader), correct / len(test_dataset)

In [17]:
# Training loop
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader)
    val_loss, val_acc = validate(model, val_loader)
    print(f"Epoch {epoch+1}: Train Loss {train_loss:.4f}, Train Acc {train_acc:.4f}, Val Loss {val_loss:.4f}, Val Acc {val_acc:.4f}")

Epoch 1: Train Loss 0.2191, Train Acc 0.9263, Val Loss 0.2674, Val Acc 0.9075
Epoch 2: Train Loss 0.2092, Train Acc 0.9296, Val Loss 0.2654, Val Acc 0.9100
Epoch 3: Train Loss 0.2031, Train Acc 0.9319, Val Loss 0.2680, Val Acc 0.9102
Epoch 4: Train Loss 0.1995, Train Acc 0.9335, Val Loss 0.2673, Val Acc 0.9107
Epoch 5: Train Loss 0.1930, Train Acc 0.9353, Val Loss 0.2783, Val Acc 0.9117


## Test

In [18]:
# Final test evaluation
test_loss, test_acc = test(model, test_loader)
print(f"Test Loss {test_loss:.4f}, Test Acc {test_acc:.4f}")

Test Loss 0.2941, Test Acc 0.9051


## Save model

In [None]:
# Save the model
torch.save(model.state_dict(), "text_classifier.pth")