# 04 - Neural Networks Basics for Text Classification

**Duration:** 2-3 hours | **Difficulty:** Intermediate

## 🎯 Learning Objectives
- Understand `nn.Module` and parameter management
- Build text classification models
- Implement complete training and evaluation pipelines
- Analyze model performance and predictions

## 📚 Contents
1. Understanding nn.Module
2. Data Preparation
3. Training Implementation
4. Model Evaluation and Analysis
5. Practical Exercise

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Import utilities
import sys
sys.path.append('../utils')
from text_utils import SimpleTokenizer, clean_text, pad_sequences
from model_helpers import get_device, count_parameters

# Fix the seed of PyTorch's default random number generator before any
# weights are initialized or data is shuffled.
torch.manual_seed(42)
# get_device comes from the local model_helpers module; presumably it selects
# an accelerator when one is available and falls back to CPU — TODO confirm.
device = get_device()
print(f"Using device: {device}")

## 1. Understanding nn.Module

`nn.Module` is the base class of every PyTorch neural network: it registers layers and parameters and defines the `forward` computation.

In [None]:
class SimpleTextClassifier(nn.Module):
    """
    Simple text classifier using embeddings and linear layers.

    Token embeddings are averaged over the sequence dimension (optionally
    restricted to the positions selected by an attention mask), then passed
    through dropout -> linear -> ReLU -> dropout -> linear to produce logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Width of the hidden linear layer.
        num_classes: Number of output logits.
        dropout: Dropout probability applied before and after the hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, dropout=0.1):
        super().__init__()

        # Define layers; index 0 is reserved for padding
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize model weights with Xavier-uniform values."""
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)

        # Re-initializing the whole embedding matrix overwrites the zero
        # vector nn.Embedding stores at padding_idx; restore it so padding
        # tokens contribute nothing when no attention mask is supplied.
        padding_idx = self.embedding.padding_idx
        if padding_idx is not None:
            with torch.no_grad():
                self.embedding.weight[padding_idx].zero_()

    def forward(self, input_ids, attention_mask=None):
        """
        Compute class logits for a batch of token-id sequences.

        Args:
            input_ids: Integer tensor of token ids, (batch_size, seq_len).
            attention_mask: Optional (batch_size, seq_len) tensor; non-zero
                entries mark positions included in the average. When omitted,
                all positions are averaged.

        Returns:
            Tensor of unnormalized logits, (batch_size, num_classes).
        """
        # Embed tokens
        embeddings = self.embedding(input_ids)  # (batch_size, seq_len, embedding_dim)

        # Apply attention mask if provided
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)
            embeddings = embeddings * mask
            seq_lengths = attention_mask.sum(dim=1, keepdim=True).to(embeddings.dtype)
            # A fully masked row would otherwise divide 0 by 0 and yield NaN
            # logits; clamping leaves such rows as an all-zero pooled vector.
            pooled = embeddings.sum(dim=1) / seq_lengths.clamp(min=1.0)
        else:
            pooled = embeddings.mean(dim=1)

        # Forward through layers
        pooled = self.dropout(pooled)
        hidden = F.relu(self.fc1(pooled))
        hidden = self.dropout(hidden)
        logits = self.fc2(hidden)

        return logits

# Build a small demo model from an explicit configuration and inspect it
demo_config = {
    "vocab_size": 1000,
    "embedding_dim": 64,
    "hidden_dim": 128,
    "num_classes": 5,
}
model = SimpleTextClassifier(**demo_config)

print("Model Architecture:")
print(model)
print(f"\nParameters: {count_parameters(model)}")

# Smoke-test the forward pass on a random batch with every position unmasked
demo_shape = (2, 10)
test_input = torch.randint(0, demo_config["vocab_size"], demo_shape)
test_mask = torch.ones(demo_shape)

with torch.no_grad():
    output = model(test_input, test_mask)

print(f"\nInput: {test_input.shape} -> Output: {output.shape}")

## 2. Data Preparation

Load and prepare text classification data.

In [None]:
# Load conversation data. JSON text is UTF-8, so decode it explicitly instead
# of relying on the platform's locale encoding.
with open('../data/conversations/simple_qa_pairs.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract texts and create intent mapping
texts = []
labels = []
intent_to_id = {}  # intent name -> integer id, assigned in first-seen order

for item in data:
    query = clean_text(item['query'], lowercase=True, remove_extra_whitespace=True)
    intent = item['intent']

    texts.append(query)

    # New intents receive the next unused id; known intents reuse theirs
    if intent not in intent_to_id:
        intent_to_id[intent] = len(intent_to_id)
    labels.append(intent_to_id[intent])

print(f"Dataset: {len(texts)} texts, {len(intent_to_id)} intents")
print(f"Intents: {list(intent_to_id.keys())}")
# Explicit integer dtype: bincount rejects the float tensor that
# torch.tensor([]) would produce for an empty dataset.
print(f"Label distribution: {torch.bincount(torch.tensor(labels, dtype=torch.long))}")

# Prepare tokenized data
def prepare_data(texts, labels, vocab_size=500, max_length=32):
    """
    Tokenize texts and package them as model-ready tensors.

    A SimpleTokenizer is fitted on ``texts``, every text is encoded with
    special tokens, and the encoded sequences are padded with the tokenizer's
    pad id. The attention mask is 1 wherever the padded ids differ from the
    pad id.

    Args:
        texts: Cleaned input strings.
        labels: Integer class id for each text.
        vocab_size: Vocabulary size handed to the tokenizer.
        max_length: Length limit passed to both encoding and padding.

    Returns:
        Tuple ``(input_ids, attention_mask, label_tensor, tokenizer)``.
    """
    # Fit the vocabulary on the provided corpus
    tokenizer = SimpleTokenizer(vocab_size=vocab_size)
    tokenizer.build_vocabulary(texts, min_freq=1)

    # Encode each text in order
    encoded_texts = []
    for text in texts:
        token_ids = tokenizer.encode(text, add_special_tokens=True, max_length=max_length)
        encoded_texts.append(token_ids)

    # Look up the id used for padding
    pad_token = tokenizer.special_tokens["pad_token"]
    pad_id = tokenizer.token_to_id[pad_token]

    # pad_sequences is assumed to return a tensor (it is compared and cast
    # with .long() below) — TODO confirm in text_utils
    input_ids = pad_sequences(encoded_texts, max_length=max_length, pad_value=pad_id)
    is_real_token = input_ids != pad_id
    attention_mask = is_real_token.long()
    label_tensor = torch.tensor(labels, dtype=torch.long)

    return input_ids, attention_mask, label_tensor, tokenizer

# Tokenize the full dataset once with the default vocabulary and length limits
prepared = prepare_data(texts, labels)
input_ids, attention_mask, label_tensor, tokenizer = prepared
print(f"\nPrepared data: {input_ids.shape}, vocab size: {tokenizer.get_vocab_size()}")

## 3. Training Implementation

Complete training loop with data splits.

In [None]:
# Shuffle the example indices once, then carve an 80/20 train/test split
train_size = int(0.8 * len(input_ids))
indices = torch.randperm(len(input_ids))
train_idx, test_idx = indices[:train_size], indices[train_size:]

train_input_ids = input_ids[train_idx]
train_attention_mask = attention_mask[train_idx]
train_labels = label_tensor[train_idx]

test_input_ids = input_ids[test_idx]
test_attention_mask = attention_mask[test_idx]
test_labels = label_tensor[test_idx]

# Wrap the splits in datasets / loaders; only the training loader shuffles
batch_size = 8
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train: {len(train_dataset)}, Test: {len(test_dataset)}")

# Model sized to the fitted vocabulary and the number of discovered intents
model = SimpleTextClassifier(
    vocab_size=tokenizer.get_vocab_size(),
    embedding_dim=64,
    hidden_dim=128,
    num_classes=len(intent_to_id),
    dropout=0.1,
)
model = model.to(device)

# Loss, optimizer and schedule length
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 15

# Per-epoch history, consumed by the plots in the next section
train_losses = []
train_accs = []
test_accs = []

for epoch in range(num_epochs):
    # ---- one optimization pass over the training split ----
    model.train()
    epoch_loss, correct, total = 0, 0, 0

    for batch in train_loader:
        batch_input_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        optimizer.zero_grad()
        logits = model(batch_input_ids, batch_attention_mask)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        predicted = torch.max(logits, 1)[1]
        correct += (predicted == batch_labels).sum().item()
        total += batch_labels.size(0)

    # Loss is averaged over batches, accuracy over individual examples
    train_loss = epoch_loss / len(train_loader)
    train_acc = correct / total

    # ---- accuracy on the held-out split, without gradient tracking ----
    model.eval()
    test_correct, test_total = 0, 0

    with torch.no_grad():
        for batch in test_loader:
            batch_input_ids, batch_attention_mask, batch_labels = (
                tensor.to(device) for tensor in batch
            )

            logits = model(batch_input_ids, batch_attention_mask)
            predicted = torch.max(logits, 1)[1]
            test_correct += (predicted == batch_labels).sum().item()
            test_total += batch_labels.size(0)

    test_acc = test_correct / test_total

    train_losses.append(train_loss)
    train_accs.append(train_acc)
    test_accs.append(test_acc)

    # Report every fifth epoch
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:2d}: Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

print(f"\nFinal Test Accuracy: {test_accs[-1]:.4f}")

## 4. Model Evaluation and Analysis

Detailed analysis of model performance.

In [None]:
# Plot the per-epoch history: loss on the left, accuracies on the right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(train_losses)
loss_ax.set_title('Training Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.grid(True)

acc_ax.plot(train_accs, label='Train Accuracy')
acc_ax.plot(test_accs, label='Test Accuracy')
acc_ax.set_title('Model Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
acc_ax.grid(True)

plt.tight_layout()
plt.show()

# Collect predictions and true labels for the whole test split
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch_input_ids, batch_attention_mask, batch_labels in test_loader:
        batch_input_ids = batch_input_ids.to(device)
        batch_attention_mask = batch_attention_mask.to(device)

        logits = model(batch_input_ids, batch_attention_mask)
        predicted = torch.argmax(logits, dim=1)

        # Labels were never moved off the CPU; predictions are brought back
        all_predictions.extend(predicted.cpu().tolist())
        all_labels.extend(batch_labels.tolist())

# Classification report
id_to_intent = {v: k for k, v in intent_to_id.items()}
class_ids = list(range(len(intent_to_id)))
target_names = [id_to_intent[i] for i in class_ids]

# Pass the full label set explicitly: a small random test split may not
# contain every intent, and without `labels` scikit-learn infers the classes
# from the data and rejects a target_names list of a different length.
print("Classification Report:")
print(classification_report(
    all_labels,
    all_predictions,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,  # classes absent from the split score 0 without warnings
))

# Confusion matrix; fixing `labels` keeps it square and aligned with the
# tick labels even when some class is missing from the test split
cm = confusion_matrix(all_labels, all_predictions, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 5. Practical Exercise: Test New Examples

Test the trained model on new text examples.

In [None]:
def predict_intent(text, max_length=32):
    """
    Predict intent for new text.

    Uses the module-level ``model``, ``tokenizer``, ``device`` and
    ``id_to_intent`` created in the earlier cells.

    Args:
        text: Raw input string; it is cleaned the same way as the training
            queries before encoding.
        max_length: Length limit passed to ``tokenizer.encode``. The default
            matches the default used when preparing the training data.

    Returns:
        Tuple ``(predicted_intent, confidence, probabilities)`` where
        ``confidence`` is the softmax probability of the predicted class and
        ``probabilities`` is a NumPy array with one entry per class.
    """
    model.eval()

    # Preprocess
    cleaned_text = clean_text(text, lowercase=True, remove_extra_whitespace=True)
    encoded = tokenizer.encode(cleaned_text, add_special_tokens=True, max_length=max_length)

    # Convert to a batch of one; the explicit integer dtype keeps the tensor
    # valid as embedding indices even if `encoded` comes back empty
    input_ids = torch.tensor([encoded], dtype=torch.long, device=device)
    pad_id = tokenizer.token_to_id[tokenizer.special_tokens["pad_token"]]
    attention_mask = (input_ids != pad_id).long()

    # Predict
    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = F.softmax(logits, dim=1)
        predicted_id = torch.argmax(logits, dim=1).item()

    predicted_intent = id_to_intent[predicted_id]
    confidence = probabilities[0][predicted_id].item()

    return predicted_intent, confidence, probabilities[0].cpu().numpy()

# Unseen sentences to run through the trained classifier
test_texts = [
    "Hi there, how are you?",
    "What's your name?",
    "Can you explain machine learning?",
    "Thank you so much!",
    "Goodbye, see you later",
]

print("Testing model on new examples:")
print("=" * 40)

for text in test_texts:
    intent, confidence, probs = predict_intent(text)
    print(f"Text: '{text}'")
    print(f"Predicted: {intent} (confidence: {confidence:.3f})")

    # Rank classes from most to least probable and keep the best two
    ranked = np.argsort(probs)[::-1]
    top_indices = ranked[:2]
    print("Top predictions:")
    for rank, idx in enumerate(top_indices, start=1):
        print(f"  {rank}. {target_names[idx]}: {probs[idx]:.3f}")
    print()

## 🎉 Congratulations!

You've successfully built and trained a neural network for text classification:

✅ **nn.Module**: Understanding PyTorch's neural network foundation  
✅ **Text Classification**: Complete pipeline from data to predictions  
✅ **Training Loop**: Implementing training with per-epoch evaluation on a held-out test split  
✅ **Model Evaluation**: Classification reports and confusion matrices  
✅ **Real-world Testing**: Predicting on new examples  

## 🚀 Next Steps

In the next notebook, we'll explore language modeling:
- Character and word-level models
- Text generation techniques
- Perplexity evaluation

**Ready for language modeling?** Continue to [`05_language_modeling.ipynb`](05_language_modeling.ipynb)!