# Document Classification with PyTorch and AG_NEWS Dataset

This notebook demonstrates document classification using PyTorch with gradient descent on the AG_NEWS dataset. The AG_NEWS dataset contains news articles from 4 categories: World, Sports, Business, and Sci/Tech.

## 1. Import Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict

# Report the installed PyTorch version and whether a CUDA device is visible,
# so the training output can be read against the hardware it ran on.
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

## 2. Define Dataset Class

In [None]:
class DocumentDataset(Dataset):
    """Map-style dataset that turns raw documents into vocabulary-index tensors.

    Tokenization and index lookup happen lazily on every access; only the raw
    strings and one label tensor are stored.

    Args:
        texts: Raw document strings.
        labels: Integer class label for each document, parallel to ``texts``.
        vocab: Token-to-index mapping. It must contain an ``'<unk>'`` entry,
            which stands in for every token missing from the mapping.
        tokenizer: Callable that splits a string into a sequence of tokens.
    """

    def __init__(self, texts: List[str], labels: List[int], vocab: Dict[str, int], tokenizer):
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.texts = texts
        # Labels are converted once up front so __getitem__ can hand out
        # ready-made 0-dim tensors.
        self.labels = torch.LongTensor(labels)

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(token_indices, label)`` for the document at ``idx``."""
        lookup = self.vocab
        token_ids = []
        for token in self.tokenizer(self.texts[idx]):
            # Out-of-vocabulary tokens collapse onto the '<unk>' index
            token_ids.append(lookup.get(token, lookup['<unk>']))
        label = self.labels[idx]
        return torch.LongTensor(token_ids), label

## 3. Define Neural Network Model

In [None]:
class DocumentClassifier(nn.Module):
    """Bag-of-embeddings classifier: averaged token embeddings fed to an MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_size: Width of the first hidden layer; the second hidden layer
            has ``hidden_size // 2`` units.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied after each hidden layer.
        padding_idx: Token index that marks padding. Positions holding this
            index are excluded from the pooled average, so a document's
            logits do not depend on how much padding its batch added.
    """

    def __init__(self, vocab_size: int, embed_dim: int, hidden_size: int, num_classes: int,
                 dropout_rate: float = 0.3, padding_idx: int = 0):
        super().__init__()
        self.padding_idx = padding_idx
        # padding_idx keeps the pad row at zero and out of gradient updates
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc1 = nn.Linear(embed_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
        self.fc3 = nn.Linear(hidden_size // 2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of token indices laid out as
                ``(batch, sequence)``; shorter sequences are expected to be
                filled with ``padding_idx``.

        Returns:
            Tensor of unnormalized logits with one row per batch element and
            ``num_classes`` columns.
        """
        embedded = self.embedding(x)

        # Average over real tokens only. A plain mean over dim=1 would count
        # padding positions and shrink the representation of short documents
        # that happen to share a batch with long ones.
        mask = (x != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
        # clamp avoids 0/0 (NaN) for empty or all-padding sequences
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = (embedded * mask).sum(dim=1) / token_counts

        # Two hidden layers with ReLU + dropout, then the output projection
        hidden = self.dropout(self.relu(self.fc1(pooled)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

## 4. Utility Functions

In [None]:
def collate_batch(batch):
    """Merge ``(token_indices, label)`` samples into a single padded batch.

    Args:
        batch: Sequence of ``(token_indices, label)`` pairs, where
            ``token_indices`` is a 1-D tensor whose length may differ from
            sample to sample.

    Returns:
        ``(texts, labels)``: ``texts`` stacks the sequences batch-first,
        right-padded with 0 to the longest one; ``labels`` is an int64 tensor
        with one entry per sample.
    """
    targets = torch.tensor([label for _, label in batch], dtype=torch.int64)
    sequences = [text for text, _ in batch]
    # 0 is the fill value for positions past the end of a shorter sequence
    padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded, targets

## 5. Document Classification Pipeline

In [None]:
class DocumentClassificationPipeline:
    """End-to-end pipeline for classifying AG_NEWS articles.

    Bundles data loading, vocabulary construction, model creation, training,
    evaluation and prediction behind a single object.

    Args:
        vocab_size: Maximum vocabulary size, counting the reserved ``<pad>``
            and ``<unk>`` entries.
        embed_dim: Embedding size passed to the classifier.
        hidden_size: Hidden-layer width passed to the classifier.
        dropout_rate: Dropout probability passed to the classifier.
    """

    def __init__(self, vocab_size: int = 10000, embed_dim: int = 64, hidden_size: int = 256, dropout_rate: float = 0.3):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.tokenizer = get_tokenizer('basic_english')
        # vocab and model stay None until train() has run
        self.vocab = None
        self.model = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Keys follow the dataset's 1-based labels; the model uses 0-based ids
        self.label_names = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}

    def build_vocab(self, texts: List[str]) -> Dict[str, int]:
        """Build a token-to-index vocabulary from the given texts.

        Index 0 is reserved for ``<pad>`` and index 1 for ``<unk>``; the most
        frequent tokens fill the remaining ``vocab_size - 2`` slots in
        descending order of frequency.
        """
        counter = Counter()
        for text in texts:
            counter.update(self.tokenizer(text))

        vocab = {'<pad>': 0, '<unk>': 1}
        free_slots = max(self.vocab_size - len(vocab), 0)
        for index, (word, _) in enumerate(counter.most_common(free_slots), start=len(vocab)):
            vocab[word] = index

        return vocab

    @staticmethod
    def _read_split(split: str) -> Tuple[List[str], List[int]]:
        """Read one AG_NEWS split into parallel text and 0-based label lists."""
        texts, labels = [], []
        for label, text in AG_NEWS(split=split):
            texts.append(text)
            labels.append(label - 1)  # Convert to 0-indexed
        return texts, labels

    def load_ag_news_data(self) -> Tuple[List[str], List[int], List[str], List[int]]:
        """Load AG_NEWS and return ``(train_texts, train_labels, test_texts, test_labels)``.

        Labels are shifted to start at 0.
        """
        train_texts, train_labels = self._read_split('train')
        test_texts, test_labels = self._read_split('test')
        return train_texts, train_labels, test_texts, test_labels

    def create_model(self, num_classes: int):
        """Instantiate the classifier for ``num_classes`` outputs on ``self.device``.

        Requires ``self.vocab`` to be built, since its length sets the
        embedding table size.
        """
        self.model = DocumentClassifier(
            vocab_size=len(self.vocab),
            embed_dim=self.embed_dim,
            hidden_size=self.hidden_size,
            num_classes=num_classes,
            dropout_rate=self.dropout_rate
        ).to(self.device)

    def _train_one_epoch(self, loader, criterion, optimizer) -> Tuple[float, float]:
        """Run one optimization pass over ``loader``.

        Returns:
            ``(mean batch loss, training accuracy in percent)``; both are 0.0
            when the loader yields no batches.
        """
        self.model.train()
        total_loss = 0.0
        correct = 0
        seen = 0

        for batch_texts, batch_labels in loader:
            batch_texts = batch_texts.to(self.device)
            batch_labels = batch_labels.to(self.device)

            # Gradient descent step
            optimizer.zero_grad()
            outputs = self.model(batch_texts)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            seen += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

        num_batches = len(loader)
        avg_loss = total_loss / num_batches if num_batches else 0.0
        accuracy = 100 * correct / seen if seen else 0.0
        return avg_loss, accuracy

    def train(self, learning_rate: float = 0.001, epochs: int = 10, batch_size: int = 64):
        """Load AG_NEWS, build the vocabulary, and train the classifier.

        Args:
            learning_rate: Learning rate for the Adam optimizer.
            epochs: Number of passes over the training split.
            batch_size: Batch size for both the training and test loaders.

        Returns:
            Dict with per-epoch lists under ``'train_losses'``,
            ``'train_accuracies'`` and ``'test_accuracies'`` (accuracies in
            percent).
        """
        print("Loading AG_NEWS dataset...")
        train_texts, train_labels, test_texts, test_labels = self.load_ag_news_data()

        print(f"Dataset loaded: {len(train_texts)} train samples, {len(test_texts)} test samples")

        print("Building vocabulary...")
        # The vocabulary comes from the training split only; unseen test
        # tokens map to <unk>.
        self.vocab = self.build_vocab(train_texts)
        print(f"Vocabulary size: {len(self.vocab)}")

        train_dataset = DocumentDataset(train_texts, train_labels, self.vocab, self.tokenizer)
        test_dataset = DocumentDataset(test_texts, test_labels, self.vocab, self.tokenizer)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

        # One output per named category keeps the model and label_names in sync
        self.create_model(num_classes=len(self.label_names))

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

        history = {
            'train_losses': [],
            'train_accuracies': [],
            'test_accuracies': []
        }

        for epoch in range(epochs):
            avg_loss, train_accuracy = self._train_one_epoch(train_loader, criterion, optimizer)
            test_accuracy = self.evaluate(test_loader)

            history['train_losses'].append(avg_loss)
            history['train_accuracies'].append(train_accuracy)
            history['test_accuracies'].append(test_accuracy)

            print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, '
                  f'Train Acc: {train_accuracy:.2f}%, Test Acc: {test_accuracy:.2f}%')

        return history

    def evaluate(self, test_loader: DataLoader) -> float:
        """Return the model's accuracy in percent over ``test_loader``.

        The model is switched to eval mode and gradients are disabled.
        Returns 0.0 when the loader yields no samples.
        """
        self.model.eval()
        correct = 0
        total = 0

        with torch.no_grad():
            for batch_texts, batch_labels in test_loader:
                batch_texts = batch_texts.to(self.device)
                batch_labels = batch_labels.to(self.device)

                predicted = self.model(batch_texts).argmax(dim=1)
                total += batch_labels.size(0)
                correct += (predicted == batch_labels).sum().item()

        # Guard against an empty loader instead of raising ZeroDivisionError
        return 100 * correct / total if total else 0.0

    def predict(self, documents: List[str]) -> List[str]:
        """Predict a category name for each document.

        Args:
            documents: Raw text strings to classify.

        Returns:
            One entry of ``label_names`` per document, in input order.

        Raises:
            ValueError: If the model or vocabulary has not been set up yet.
        """
        if self.model is None or self.vocab is None:
            raise ValueError("Model not trained yet. Call train() first.")

        self.model.eval()
        unk_index = self.vocab['<unk>']
        predictions = []

        with torch.no_grad():
            for text in documents:
                indices = [self.vocab.get(token, unk_index) for token in self.tokenizer(text)]
                if not indices:
                    # A document with no tokens would produce a zero-length
                    # sequence and an undefined pooled vector; classify it as
                    # a single unknown token instead.
                    indices = [unk_index]
                text_tensor = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(self.device)

                # Shift the 0-based model output back to label_names' 1-based keys
                predicted_class = self.model(text_tensor).argmax(dim=1).item() + 1
                predictions.append(self.label_names[predicted_class])

        return predictions

## 6. Initialize and Train the Model

In [None]:
# Initialize the pipeline.
# vocab_size and embed_dim are raised above the class defaults (10000 and 64);
# hidden_size repeats the default and dropout_rate is left at its default (0.3).
# No data is loaded and no model is built here; that happens in train().
pipeline = DocumentClassificationPipeline(
    vocab_size=20000, 
    embed_dim=100, 
    hidden_size=256
)

print("Training document classifier with AG_NEWS dataset...")
print("This may take several minutes depending on your hardware.")

In [None]:
# Train the model.
# train() loads AG_NEWS, builds the vocabulary, fits the classifier and
# returns per-epoch metric lists under the keys 'train_losses',
# 'train_accuracies' and 'test_accuracies', which the plotting cell reads.
results = pipeline.train(
    learning_rate=0.001,
    epochs=5,
    batch_size=64
)

## 7. Visualize Training Progress

In [None]:
# Plot training results: loss in the left panel, accuracy in the right one
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: mean training loss recorded for each epoch
ax1.plot(results['train_losses'], color='blue', label='Training Loss')
ax1.set(title='Training Loss Over Epochs', xlabel='Epoch', ylabel='Loss')
ax1.grid(True)
ax1.legend()

# Right panel: training and test accuracy on a shared axis for comparison
ax2.plot(results['train_accuracies'], color='green', label='Training Accuracy')
ax2.plot(results['test_accuracies'], color='red', label='Test Accuracy')
ax2.set(title='Accuracy Over Epochs', xlabel='Epoch', ylabel='Accuracy (%)')
ax2.grid(True)
ax2.legend()

plt.tight_layout()
plt.show()

# Summarize the last epoch's metrics below the figure
print(
    f"Final Training Accuracy: {results['train_accuracies'][-1]:.2f}%",
    f"Final Test Accuracy: {results['test_accuracies'][-1]:.2f}%",
    sep="\n",
)

## 8. Test the Model with Sample Predictions

In [None]:
# Hand-written headlines used as a quick sanity check of the trained model
test_documents = [
    "Apple Inc. reported strong quarterly earnings beating analyst expectations.",
    "NASA discovers water on Mars in groundbreaking space mission.",
    "Lakers win NBA championship after defeating Celtics in game 7.",
    "New iPhone features advanced AI chip for faster processing.",
    "Stock market reaches new all-time high as investors remain optimistic.",
    "Scientists develop breakthrough gene therapy for treating cancer.",
    "World Cup final draws record television audience worldwide.",
    "Tech companies face new regulations on data privacy and security.",
]

# predict() returns one category name per document, in input order
predictions = pipeline.predict(test_documents)

print("Predictions on sample texts:", "=" * 60, sep="\n")
for i, (doc, pred) in enumerate(zip(test_documents, predictions), start=1):
    # The trailing empty argument leaves a blank line after each entry
    print(f"{i}. Text: {doc}", f"   Predicted class: {pred}", "", sep="\n")

## 9. Interactive Prediction

In [None]:
# Interactive cell for custom predictions
def predict_custom_text(text: str):
    """Classify one text with the notebook's ``pipeline`` and echo the result.

    Prints the input text and its predicted class, then returns the predicted
    class name.
    """
    # predict() works on a list, so wrap the text and unpack the lone result
    (prediction,) = pipeline.predict([text])
    print(f"Text: {text}", f"Predicted class: {prediction}", sep="\n")
    return prediction

# Example usage - modify the text below to test your own examples.
# predict_custom_text prints the text and its predicted class, and returns
# the predicted class name.
custom_text = "The company's stock price soared after announcing record profits."
predict_custom_text(custom_text)

## 10. Model Summary and Analysis

In [None]:
# Print the layer-by-layer model description
print("Model Architecture:", "=" * 40, pipeline.model, sep="\n")

# Count parameters: all of them, then only those that receive gradients
total_params = sum(map(lambda p: p.numel(), pipeline.model.parameters()))
trainable_params = sum(
    map(lambda p: p.numel(), filter(lambda p: p.requires_grad, pipeline.model.parameters()))
)

print(
    f"\nModel Statistics:",
    f"Total parameters: {total_params:,}",
    f"Trainable parameters: {trainable_params:,}",
    f"Vocabulary size: {len(pipeline.vocab):,}",
    f"Embedding dimension: {pipeline.embed_dim}",
    f"Hidden layer size: {pipeline.hidden_size}",
    sep="\n",
)

print(
    f"\nDataset Information:",
    f"Classes: {list(pipeline.label_names.values())}",
    f"Device used: {pipeline.device}",
    sep="\n",
)

## Conclusion

This notebook demonstrated:

1. **Data Loading**: Using torchtext to load the AG_NEWS dataset
2. **Preprocessing**: Building vocabulary and tokenizing text
3. **Model Architecture**: Neural network with embedding layer and fully connected layers
4. **Training**: Mini-batch, gradient-based minimization of the cross-entropy loss using the Adam optimizer
5. **Evaluation**: Monitoring training progress and test accuracy
6. **Prediction**: Making predictions on new text samples

The model is trained with Adam, an adaptive variant of stochastic gradient descent, to minimize cross-entropy loss and classify news articles into 4 categories. The embedding layer converts each token to a dense vector; these vectors are averaged over the document and passed through fully connected layers for classification.