# 08: Sequence Models for Chatbots

**Duration:** 3-4 hours | **Difficulty:** Intermediate-Advanced

## Learning Objectives
- RNN, LSTM, and GRU architectures
- Sequence-to-sequence modeling
- Teacher forcing strategies
- Conversational response generation

## Table of Contents
1. [Sequence Models Overview](#1-overview)
2. [LSTM Implementation](#2-lstm)
3. [Seq2Seq Architecture](#3-seq2seq)
4. [Training and Generation](#4-training)
5. [Practical Exercise](#5-exercise)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import numpy as np
import matplotlib.pyplot as plt
import json
import random
from typing import List, Tuple

# Import utilities
import sys
sys.path.append('../')
from utils.text_utils import SimpleTokenizer
from utils.model_helpers import get_device, count_parameters

# Resolve the compute device once; later cells move models and batches onto it.
device = get_device("auto")
print(f"Using device: {device}")

# Seed both RNGs this notebook draws from: torch (weight init, shuffling)
# and Python's random (teacher-forcing coin flips).
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)

## 1. Sequence Models Overview {#1-overview}

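**Sequence models** process sequential data, where the order of the elements matters:
- **RNN**: Basic recurrent connections; struggles to retain information across long sequences
- **LSTM**: Long Short-Term Memory; gated cell state that handles long-range dependencies
- **GRU**: Gated Recurrent Unit; a lighter gated alternative to the LSTM with fewer parameters
- **Seq2Seq**: Encoder-decoder architecture for input→output sequence transformation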

For chatbots: User message → Bot response

In [None]:
# Load conversation data.
# JSON is UTF-8 by specification; pass the encoding explicitly so the result
# does not depend on the platform's default locale encoding.
with open('../data/conversations/simple_qa_pairs.json', 'r', encoding='utf-8') as f:
    conversation_data = json.load(f)

# Each record is expected to carry a 'question' and an 'answer' key.
conversations = [(item['question'], item['answer']) for item in conversation_data]
print(f"Loaded {len(conversations)} conversation pairs")

# Sample conversations (slicing keeps this safe when fewer than three exist)
for question, answer in conversations[:3]:
    print(f"Q: {question}")
    print(f"A: {answer}\n")

# Prepare tokenizer: fit on questions and answers together so both sides of
# the model share a single vocabulary.
tokenizer = SimpleTokenizer(vocab_size=3000)
all_text = [text for pair in conversations for text in pair]

tokenizer.fit(all_text)
vocab_size = len(tokenizer.vocab)
print(f"Vocabulary size: {vocab_size}")

## 2. LSTM Implementation {#2-lstm}

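LSTM networks learn long-term dependencies through gating mechanisms (input, forget, and output gates) that control what the cell state stores, discards, and exposes at each step.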

In [None]:
class LSTMEncoder(nn.Module):
    """LSTM encoder: embeds token ids and runs them through a stacked LSTM.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # nn.LSTM applies dropout only *between* layers, so it is meaningless
        # (and triggers a UserWarning) for a single-layer network.
        inter_layer_dropout = 0.2 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)

    def forward(self, x):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; the LSTM is batch-first, so the
                layout is (batch, seq_len).

        Returns:
            ``(outputs, (hidden, cell))`` exactly as produced by ``nn.LSTM``:
            per-step outputs of shape (batch, seq_len, hidden_dim) and the
            final hidden/cell states of shape (num_layers, batch, hidden_dim).
        """
        embedded = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, (hidden, cell)

class LSTMDecoder(nn.Module):
    """LSTM decoder: maps input tokens plus a recurrent state to vocab logits.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # nn.LSTM applies dropout only *between* layers, so it is meaningless
        # (and triggers a UserWarning) for a single-layer network.
        inter_layer_dropout = 0.2 if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=inter_layer_dropout)
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Run the decoder over ``x`` starting from ``hidden``.

        Args:
            x: Integer tensor of token ids, batch-first: (batch, steps).
            hidden: ``(h, c)`` state tuple passed straight to ``nn.LSTM``.

        Returns:
            ``(prediction, hidden)`` where ``prediction`` holds unnormalized
            vocabulary scores of shape (batch, steps, vocab_size) and
            ``hidden`` is the updated ``(h, c)`` tuple.
        """
        embedded = self.embedding(x)
        output, hidden = self.lstm(embedded, hidden)
        prediction = self.output(output)
        return prediction, hidden

# Smoke-test the two components: build each with default sizes, place it on
# the selected device, and report how many parameters it holds.
encoder = LSTMEncoder(vocab_size)
decoder = LSTMDecoder(vocab_size)
encoder.to(device)
decoder.to(device)

encoder_total = count_parameters(encoder)['total']
print(f"Encoder params: {encoder_total:,}")
decoder_total = count_parameters(decoder)['total']
print(f"Decoder params: {decoder_total:,}")

## 3. Seq2Seq Architecture {#3-seq2seq}

Complete encoder-decoder model for conversation generation.

In [None]:
class Seq2SeqModel(nn.Module):
    """Sequence-to-sequence model for conversation.

    An ``LSTMEncoder`` reads the source sequence; its final ``(hidden, cell)``
    state initializes an ``LSTMDecoder`` that emits the response one token at
    a time.

    Args:
        vocab_size: Vocabulary size shared by encoder and decoder.
        embed_dim: Embedding size for both components.
        hidden_dim: LSTM state size for both components.
        num_layers: Number of stacked LSTM layers in both components.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.vocab_size = vocab_size

        self.encoder = LSTMEncoder(vocab_size, embed_dim, hidden_dim, num_layers)
        self.decoder = LSTMDecoder(vocab_size, embed_dim, hidden_dim, num_layers)

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """Training forward pass with teacher forcing.

        Args:
            src: Source token ids, passed unchanged to the encoder.
            tgt: Target token ids of shape (batch, tgt_len); column 0 is used
                as the first decoder input (the start-of-sequence token).
            teacher_forcing_ratio: Probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the model's own prediction as the next decoder input.

        Returns:
            Float tensor of shape (batch, tgt_len, vocab_size) holding the
            decoder scores for positions 1..tgt_len-1. Position 0 is never
            predicted and stays all zeros.
        """
        batch_size, tgt_len = tgt.shape

        # Encode source; only the final state is handed to the decoder.
        _, hidden = self.encoder(src)

        # Allocate on the target's device rather than a module-level global,
        # so the model works wherever it and its inputs have been moved.
        outputs = torch.zeros(batch_size, tgt_len, self.vocab_size,
                              device=tgt.device)
        decoder_input = tgt[:, 0:1]  # Start with SOS token

        for t in range(1, tgt_len):
            output, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t:t+1, :] = output

            # Teacher forcing decision
            if random.random() < teacher_forcing_ratio:
                decoder_input = tgt[:, t:t+1]
            else:
                decoder_input = output.argmax(dim=-1)

        return outputs

    def generate(self, src, max_length=30, sos_token=1, eos_token=2):
        """Greedily generate a response without teacher forcing.

        Args:
            src: Source token ids for a single sequence (batch size 1); the
                per-step ``.item()`` call requires exactly one prediction.
            max_length: Maximum number of tokens to emit.
            sos_token: Id fed to the decoder as the first input.
                NOTE(review): the default must match the tokenizer's
                start-of-sequence id; confirm against the tokenizer.
            eos_token: Id that stops generation once it is produced.
                NOTE(review): same caveat as ``sos_token``.

        Returns:
            List of generated token ids. If the end token is produced it is
            included as the last element.
        """
        # Generation needs eval mode (no dropout), but the caller's mode is
        # restored afterwards so a mid-training call does not silently leave
        # the model in eval mode.
        was_training = self.training
        self.eval()

        generated = []
        try:
            with torch.no_grad():
                # Encode
                _, hidden = self.encoder(src)

                # Generate tokens, starting from SOS on the source's device
                decoder_input = torch.tensor([[sos_token]], device=src.device)

                for _ in range(max_length):
                    output, hidden = self.decoder(decoder_input, hidden)
                    predicted = output.argmax(dim=-1)
                    token_id = predicted.item()
                    generated.append(token_id)

                    if token_id == eos_token:
                        break

                    decoder_input = predicted
        finally:
            self.train(was_training)

        return generated

# Build the full encoder-decoder model, place it on the selected device, and
# report its total parameter count.
model = Seq2SeqModel(vocab_size, embed_dim=128, hidden_dim=256)
model.to(device)
total_params = count_parameters(model)['total']
print(f"Total parameters: {total_params:,}")

## 4. Training and Generation {#4-training}

Train the model and implement text generation.

In [None]:
class ConversationDataset(Dataset):
    """In-memory dataset of tokenized (question, answer) tensor pairs.

    Every pair is encoded once at construction time with special tokens added
    and ``max_length`` forwarded to the tokenizer. Pairs for which either side
    encodes to an empty sequence are dropped.
    """

    def __init__(self, conversations, tokenizer, max_length=40):
        def encode(text):
            # Same encoding settings for both sides of a pair.
            return tokenizer.encode(text, add_special_tokens=True, max_length=max_length)

        self.data = []
        for question, answer in conversations:
            src_ids = encode(question)
            tgt_ids = encode(answer)

            # Skip pairs with nothing to learn from on either side.
            if len(src_ids) == 0 or len(tgt_ids) == 0:
                continue
            self.data.append((torch.tensor(src_ids), torch.tensor(tgt_ids)))

        print(f"Dataset size: {len(self.data)}")

    def __len__(self):
        """Number of retained pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(source_tensor, target_tensor)`` pair at ``idx``."""
        return self.data[idx]

def collate_fn(batch):
    """Collate (source, target) tensor pairs into two zero-padded batches.

    Returns a ``(src_padded, tgt_padded)`` tuple; each tensor is batch-first
    and padded with 0 up to the longest sequence on its own side.
    """
    def pad(sequences):
        return pad_sequence(sequences, batch_first=True, padding_value=0)

    sources, targets = zip(*batch)
    return pad(sources), pad(targets)

def train_model(model, dataloader, epochs=3, lr=0.001,
                teacher_forcing_ratio=0.7, max_grad_norm=1.0):
    """Train the sequence model.

    After every epoch a sample response is printed via ``test_generation``,
    using the notebook-level ``tokenizer``.

    Args:
        model: Module called as ``model(src, tgt, teacher_forcing_ratio=...)``
            and returning scores of shape (batch, tgt_len, vocab).
        dataloader: Iterable yielding ``(src, tgt)`` batches of token ids.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        teacher_forcing_ratio: Forwarded to the model on every batch.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        List with the average batch loss of each epoch.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Index 0 is the padding value inserted by collate_fn; exclude it from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Follow the model's own placement instead of a module-level global.
    model_device = next(model.parameters()).device

    losses = []

    for epoch in range(epochs):
        epoch_loss = 0.0
        num_batches = 0
        # Re-enter training mode each epoch: the sample generation at the end
        # of the previous epoch switches the model to eval mode.
        model.train()

        for batch_idx, (src, tgt) in enumerate(dataloader):
            src, tgt = src.to(model_device), tgt.to(model_device)

            optimizer.zero_grad()

            # Forward pass
            output = model(src, tgt, teacher_forcing_ratio=teacher_forcing_ratio)

            # Calculate loss (skip first token: position 0 is the decoder's
            # input, never a prediction). The class dimension is read from the
            # output itself rather than from a global vocabulary size.
            output_flat = output[:, 1:, :].reshape(-1, output.size(-1))
            target_flat = tgt[:, 1:].reshape(-1)

            loss = criterion(output_flat, target_flat)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

            if batch_idx % 5 == 0:
                print(f'Epoch {epoch+1}/{epochs}, Batch {batch_idx}, Loss: {loss.item():.4f}')

        if num_batches == 0:
            # Fail with a clear message instead of a bare ZeroDivisionError.
            raise ValueError("dataloader yielded no batches; nothing to train on")

        avg_loss = epoch_loss / num_batches
        losses.append(avg_loss)
        print(f'Epoch {epoch+1} Average Loss: {avg_loss:.4f}')

        # Test generation
        test_generation(model, tokenizer, "What is machine learning?")

    return losses

def test_generation(model, tokenizer, input_text):
    """Generate a response for ``input_text`` and print it.

    Args:
        model: Module exposing ``generate(src, max_length=...)`` that returns
            a list of token ids.
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            and ``decode(ids)``.
        input_text: The prompt to respond to.
    """
    model.eval()

    # Build the input on the model's own device instead of relying on a
    # module-level global; fall back to CPU for a parameter-free model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    tokens = tokenizer.encode(input_text, add_special_tokens=True)
    # Wrap in a list to add the batch dimension (batch size 1).
    src = torch.tensor([tokens], device=model_device)

    generated = model.generate(src, max_length=20)
    response = tokenizer.decode(generated)

    print(f"Input: {input_text}")
    print(f"Response: {response}\n")

# Create dataset and train
dataset = ConversationDataset(conversations, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

print("Training Seq2Seq model...")
losses = train_model(model, dataloader, epochs=3)

# Plot training progress.
# Number epochs from 1 so the x-axis matches the "Epoch N" lines printed
# during training, and pin the ticks to whole epochs.
epoch_numbers = list(range(1, len(losses) + 1))
plt.figure(figsize=(8, 4))
plt.plot(epoch_numbers, losses, 'b-', linewidth=2)
plt.xticks(epoch_numbers)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)
plt.show()

# Test final model
print("Final model testing:")
test_queries = [
    "What is deep learning?",
    "How do neural networks work?",
    "What is artificial intelligence?"
]

for query in test_queries:
    test_generation(model, tokenizer, query)

## 5. Practical Exercise {#5-exercise}

**Exercise**: Enhance the sequence-to-sequence model

### Tasks:
1. Add bidirectional encoder
2. Implement beam search decoding
3. Add attention mechanism (preview for next notebook)
4. Create conversational context handling

### Questions:
1. How does teacher forcing affect training?
2. What are the limitations of basic seq2seq?
3. How can we handle longer conversations?

### Extensions:
- Multi-turn conversation modeling
- Copy mechanism for handling rare words
- Scheduled sampling for training
- Evaluation metrics (BLEU, ROUGE)

In [None]:
# Exercise: Simple conversational bot
class SimpleSeq2SeqBot:
    """Simple conversational bot using seq2seq model.

    Wraps a model exposing ``generate(src, max_length=...)`` and a tokenizer
    exposing ``encode``/``decode``, and records every exchange in
    ``self.history`` as ``{'user': ..., 'bot': ...}`` dicts.
    """

    # Reply used whenever no usable response can be produced.
    FALLBACK_RESPONSE = "I don't understand."

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.history = []

    def respond(self, user_input):
        """Generate response to user input.

        Args:
            user_input: The user's message.

        Returns:
            The decoded reply with ``<SOS>``/``<EOS>`` markers removed, or the
            fallback reply when the input encodes to no tokens or the cleaned
            reply is empty.
        """
        tokens = self.tokenizer.encode(user_input, add_special_tokens=True)

        if len(tokens) == 0:
            # An empty id list would become an empty float tensor, which the
            # embedding lookup rejects; answer with the fallback instead.
            response = self.FALLBACK_RESPONSE
        else:
            # Build the input on the model's own device instead of relying on
            # a module-level global; fall back to CPU if it has no parameters.
            first_param = next(self.model.parameters(), None)
            model_device = (first_param.device if first_param is not None
                            else torch.device("cpu"))
            src = torch.tensor([tokens], device=model_device)

            generated = self.model.generate(src, max_length=25)
            response = self.tokenizer.decode(generated)

            # Clean response
            response = response.replace('<SOS>', '').replace('<EOS>', '').strip()
            if not response:
                response = self.FALLBACK_RESPONSE

        self.history.append({'user': user_input, 'bot': response})
        return response

# Wrap the trained model in the bot and run it over a few sample prompts.
bot = SimpleSeq2SeqBot(model, tokenizer)

print("=== Seq2Seq Chatbot Demo ===")
test_inputs = [
    "Hello",
    "What is AI?",
    "How do I learn programming?",
    "Tell me about neural networks"
]

for user_input in test_inputs:
    response = bot.respond(user_input)
    print(f"User: {user_input}")
    print(f"Bot: {response}\n")

# Closing summary, printed one line at a time.
summary_lines = (
    "=== Sequence Models Complete ===",
    "Key Concepts Learned:",
    "• LSTM architecture and memory mechanisms",
    "• Encoder-decoder sequence-to-sequence models",
    "• Teacher forcing training strategy",
    "• Text generation and decoding",
    "• Conversational response generation",
    "\nNext: Attention mechanisms for better context understanding!",
)
for summary_line in summary_lines:
    print(summary_line)