# RNN Implementation Methods

Different ways to implement RNNs in PyTorch with code examples.

---

## Contents
1. [Method 1: Manual RNN](#manual)
2. [Method 2: Built-in nn.RNN](#builtin)
3. [Method 3: LSTM & GRU](#lstm)
4. [Method 4: Bidirectional RNN](#bidirectional)
5. [Quick Comparison](#comparison)

In [None]:
import torch
import torch.nn as nn

# Dimensions shared by every example in this notebook
batch_size, seq_len = 32, 10
input_size, hidden_size, output_size = 50, 128, 10

# Random sample batch laid out as (batch_size, seq_len, input_size)
x = torch.randn(batch_size, seq_len, input_size)
print(f"Input shape: {x.shape}")

<a id='manual'></a>
# Method 1: Manual RNN

Build an RNN cell from scratch using linear layers. The cell processes one time step per call, so the caller loops over the sequence.

In [None]:
class ManualRNN(nn.Module):
    """Single-step recurrent cell assembled from two linear layers.

    Each call consumes one time step. The caller loops over the sequence and
    threads the returned hidden state into the next call.

    Args:
        input_size: Number of features in each input step.
        hidden_size: Number of features in the hidden state.
        output_size: Number of features in the per-step output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        # Both layers read the concatenation [input, previous hidden].
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)

    def forward(self, input, hidden):
        """Process ONE time step.

        Args:
            input: Tensor of shape (batch, input_size) for the current step.
            hidden: Tensor of shape (batch, hidden_size) from the previous step.

        Returns:
            Tuple ``(output, hidden)`` with shapes (batch, output_size) and
            (batch, hidden_size). The output is computed from the previous
            hidden state, not from the updated one.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = torch.tanh(self.i2h(combined))
        output = self.i2o(combined)
        return output, hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state of shape (batch_size, hidden_size).

        The state is created with the same dtype and device as the layer
        weights, so it stays compatible after the module has been moved with
        ``.to(device)`` or converted with ``.double()`` / ``.half()``.
        """
        reference = self.i2h.weight
        return torch.zeros(
            batch_size,
            self.hidden_size,
            dtype=reference.dtype,
            device=reference.device,
        )

# Usage
model = ManualRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
hidden = model.init_hidden(batch_size=batch_size)

# The cell handles a single step, so feed the sequence one slice at a time
for t in range(seq_len):
    output, hidden = model(x[:, t], hidden)

print(f"Output shape: {output.shape}")  # (batch, output_size)
print(f"Hidden shape: {hidden.shape}")  # (batch, hidden_size)

<a id='builtin'></a>
# Method 2: Built-in nn.RNN

Use PyTorch's optimized RNN layer.

In [None]:
class BuiltInRNN(nn.Module):
    """Many-to-one model: an ``nn.RNN`` followed by a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Process ENTIRE sequence at once and return one prediction per sample."""
        sequence_features, _ = self.rnn(x)  # (batch, seq, hidden)
        final_step = sequence_features[:, -1]  # features at the last time step
        return self.fc(final_step)

# Usage: a single call consumes the whole batch of sequences
model = BuiltInRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
output = model(x)

print(f"Output shape: {output.shape}")  # (batch, output_size)

<a id='lstm'></a>
# Method 3: LSTM & GRU

Advanced RNN variants that handle long-term dependencies better.

In [None]:
# LSTM
class LSTMModel(nn.Module):
    """Many-to-one model: an ``nn.LSTM`` followed by a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the LSTM over the full sequence and project its last time step."""
        # The (hidden, cell) state tuple is not needed for the prediction.
        sequence_features, _ = self.lstm(x)
        final_step = sequence_features[:, -1]
        return self.fc(final_step)

# GRU
class GRUModel(nn.Module):
    """Many-to-one model: an ``nn.GRU`` followed by a linear output layer."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the GRU over the full sequence and project its last time step."""
        sequence_features, _ = self.gru(x)
        final_step = sequence_features[:, -1]
        return self.fc(final_step)

# Usage: both variants share the constructor signature and call pattern
lstm_model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
gru_model = GRUModel(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

lstm_output, gru_output = lstm_model(x), gru_model(x)

print(f"LSTM output: {lstm_output.shape}")
print(f"GRU output: {gru_output.shape}")

<a id='bidirectional'></a>
# Method 4: Bidirectional RNN

Process the sequence in both the forward and backward directions.

In [None]:
class BiRNN(nn.Module):
    """Many-to-one model built on a bidirectional ``nn.RNN``.

    The sequence is summarised by concatenating the final hidden state of the
    forward pass with the final hidden state of the backward pass, then
    projecting the result with a linear layer.

    Args:
        input_size: Number of features in each input step.
        hidden_size: Hidden features per direction.
        output_size: Number of features in the prediction.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.rnn = nn.RNN(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True  # Key difference
        )
        # hidden_size * 2 because bidirectional
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Return one prediction per sample for a (batch, seq, input_size) batch.

        Slicing the sequence output at the last time step would pair the
        forward direction's final state with the backward direction's state
        after it has seen only a single token. The final hidden states are
        used instead so that both directions have read the whole sequence.
        """
        _, hidden = self.rnn(x)
        # hidden is (num_layers * 2, batch, hidden_size); the last two entries
        # are the top layer's forward and backward final states.
        forward_final, backward_final = hidden[-2], hidden[-1]
        summary = torch.cat((forward_final, backward_final), dim=-1)
        return self.fc(summary)

# Usage: same call pattern as the unidirectional models
model = BiRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
output = model(x)

print(f"Output shape: {output.shape}")

## Many-to-Many RNN

Output at every time step (e.g., sequence labeling).

In [None]:
class ManyToManyRNN(nn.Module):
    """Sequence-labelling model that emits a prediction at every time step."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map a (batch, seq, input_size) batch to (batch, seq, output_size)."""
        per_step_features = self.rnn(x)[0]  # (batch, seq, hidden)

        # The linear layer acts on the last dimension, so every step is projected
        return self.fc(per_step_features)  # (batch, seq, output_size)

# Usage: the output keeps the sequence dimension
model = ManyToManyRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
output = model(x)

print(f"Output shape: {output.shape}")  # (batch, seq, output_size)

## Encoder-Decoder (Seq2Seq)

Maps a variable-length input sequence to a variable-length output sequence; the two lengths do not have to match.

In [None]:
class Encoder(nn.Module):
    """Reads a source sequence and returns the RNN's final hidden state."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )

    def forward(self, x):
        """Return the final hidden state, which serves as the context vector."""
        # Per-step outputs are discarded; only the final state is passed on.
        context = self.rnn(x)[1]
        return context

class Decoder(nn.Module):
    """Unrolls an RNN from a given hidden state and projects every step."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden):
        """Return ``(output, hidden)``: per-step predictions and the final state."""
        step_features, final_hidden = self.rnn(x, hidden)
        return self.fc(step_features), final_hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder model: encode the source, then decode the target from it."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
        # The decoder is fed target-side vectors, so its input width is output_size
        self.decoder = Decoder(
            input_size=output_size,
            hidden_size=hidden_size,
            output_size=output_size,
        )

    def forward(self, src, tgt):
        """Return the decoder's per-step predictions for ``tgt`` given ``src``."""
        # The encoder's final hidden state seeds the decoder
        context = self.encoder(src)

        # The decoder's final hidden state is not needed here
        decoded, _ = self.decoder(tgt, context)
        return decoded

# Usage: the decoder receives the target sequence as its input
model = Seq2Seq(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
src = torch.randn(batch_size, seq_len, input_size)
tgt = torch.randn(batch_size, seq_len, output_size)
output = model(src=src, tgt=tgt)

print(f"Output shape: {output.shape}")

<a id='comparison'></a>
# Quick Comparison

## Architecture Comparison

| Method | Use Case | Pros | Cons |
|--------|----------|------|------|
| **Manual RNN** | Learning | Full control | Slow, complex |
| **nn.RNN** | Simple tasks | Fast, easy | Vanishing gradients |
| **LSTM** | Long sequences | Good memory | More parameters |
| **GRU** | Balance | Faster than LSTM | Less capacity |
| **Bidirectional** | Context matters | Both directions | 2x slower |
| **Seq2Seq** | Translation | Variable I/O | Complex training |

## When to Use What?

**RNN**: Short sequences, simple patterns
- Sentiment analysis (short reviews)
- Simple time series

**LSTM**: Long sequences, long-term dependencies
- Language modeling
- Speech recognition
- Long text classification

**GRU**: Similar to LSTM but faster
- When LSTM works but speed matters
- When less training data is available

**Bidirectional**: Context from both sides needed
- Named Entity Recognition
- Fill-in-the-blank tasks
- Not for real-time/streaming

**Seq2Seq**: Variable length input/output
- Machine translation
- Text summarization
- Chatbots

## Parameter Count Comparison

In [None]:
# Compare parameter counts: build each architecture with the shared sizes
models = {
    label: build(input_size, hidden_size, output_size)
    for label, build in (
        ('Manual RNN', ManualRNN),
        ('nn.RNN', BuiltInRNN),
        ('LSTM', LSTMModel),
        ('GRU', GRUModel),
        ('BiRNN', BiRNN),
    )
}

# Total number of scalar weights across all parameter tensors of each model
for name, model in models.items():
    params = sum(tensor.numel() for tensor in model.parameters())
    print(f"{name:15} {params:,} parameters")

## Training Tips

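### 1. Gradient Clipping
Clip after `loss.backward()` and before `optimizer.step()` so the update uses the clipped gradients.
```python
nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
```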

### 2. Learning Rate
- Start with 0.001 for Adam
- Use learning rate scheduling

### 3. Hidden Size
- Start with 128 or 256
- Increase if underfitting
- Decrease if overfitting

### 4. Num Layers
- 1-2 layers usually sufficient
- 3-4 for complex tasks
- More layers = harder to train

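### 5. Dropout
Dropout is applied between stacked recurrent layers (not after the last one), so it only has an effect when `num_layers > 1`.
```python
nn.RNN(input_size, hidden_size, num_layers, dropout=0.5)
```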