## Author's note

This was entirely generated with ChatGPT 👻

This was made with no pretrained models at all. Instead, I used an LSTM.

## Imports and setup

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd

torch.manual_seed(0)
np.random.seed(0)

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/nwz1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/nwz1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Hyperparameters

In [2]:
# Preprocessing parameters
MAX_LENGTH = 50
VOCAB_SIZE = 10000
BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 0.001
TRAIN_RATIO, VAL_RATIO = 0.9, 0.1

## Datasets: Parquet to "df"

In [3]:
# Load AG News dataset from Parquet files
train_df = pd.read_parquet("data/ag_news_train.parquet")
test_df = pd.read_parquet("data/ag_news_test.parquet")

In [4]:
val_size = int(len(train_df) * VAL_RATIO)
val_size

12000

In [5]:
train_size = int(len(train_df) * TRAIN_RATIO)
val_size = int(len(train_df) * VAL_RATIO)
train_df, val_df = torch.utils.data.random_split(train_df, [train_size, val_size])

## Build vocab

In [6]:
# Tokenization and vocabulary building
counter = Counter()
for sample in train_df.dataset["text"]:
    tokens = word_tokenize(sample.lower())
    counter.update(tokens)

vocab = {word: i+1 for i, (word, _) in enumerate(counter.most_common(VOCAB_SIZE))}
vocab["<UNK>"] = VOCAB_SIZE + 1

def encode_text(text):
    tokens = word_tokenize(text.lower())
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens][:MAX_LENGTH] + [0] * (MAX_LENGTH - len(tokens))

## Datasets: "df" to DataLoader

In [7]:
# Custom PyTorch dataset class
class AGNewsDataset(Dataset):
    def __init__(self, dataframe):
        self.texts = [torch.tensor(encode_text(text), dtype=torch.long) for text in dataframe["text"]]
        self.labels = torch.tensor(dataframe["label"].values, dtype=torch.long)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [8]:
# Load data
train_dataset = AGNewsDataset(train_df.dataset.iloc[train_df.indices])
val_dataset = AGNewsDataset(val_df.dataset.iloc[val_df.indices])
test_dataset = AGNewsDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

## Model

In [9]:
# Model definition
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 2, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        return self.fc(hidden[-1])
    
# Initialize model
model = TextClassifier(VOCAB_SIZE, 128, 128, 4)
device = torch.device("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

## Train

In [10]:
# Training function
def train(model, loader):
    model.train()
    total_loss, correct = 0, 0
    for texts, labels in loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(loader), correct / len(train_dataset)

# Validation function
def validate(model, loader):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(loader), correct / len(val_dataset)

# Testing function
def test(model, loader):
    model.eval()
    total_loss, correct = 0, 0
    with torch.no_grad():
        for texts, labels in loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(loader), correct / len(test_dataset)

In [11]:
# Training loop
for epoch in range(EPOCHS):
    train_loss, train_acc = train(model, train_loader)
    val_loss, val_acc = validate(model, val_loader)
    print(f"Epoch {epoch+1}: Train Loss {train_loss:.4f}, Train Acc {train_acc:.4f}, Val Loss {val_loss:.4f}, Val Acc {val_acc:.4f}")

Epoch 1: Train Loss 0.5356, Train Acc 0.7920, Val Loss 0.3109, Val Acc 0.8928
Epoch 2: Train Loss 0.2559, Train Acc 0.9131, Val Loss 0.2593, Val Acc 0.9110
Epoch 3: Train Loss 0.1907, Train Acc 0.9352, Val Loss 0.2523, Val Acc 0.9116
Epoch 4: Train Loss 0.1441, Train Acc 0.9511, Val Loss 0.2597, Val Acc 0.9135
Epoch 5: Train Loss 0.1058, Train Acc 0.9643, Val Loss 0.2896, Val Acc 0.9123


## Test

In [12]:
# Final test evaluation
test_loss, test_acc = test(model, test_loader)
print(f"Test Loss {test_loss:.4f}, Test Acc {test_acc:.4f}")

Test Loss 0.2992, Test Acc 0.9093


## Save model

In [13]:
# Save the model
torch.save(model.state_dict(), "text_classifier.pth")