In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    """Map-style dataset pairing numeric sequences with integer class labels.

    Args:
        sequences: Rectangular nested sequence of numbers, one row per sample.
            Stored as a ``torch.float32`` tensor.
        labels: One integer class index per row of ``sequences``.
            Stored as a ``torch.long`` tensor.

    Raises:
        ValueError: If ``sequences`` and ``labels`` contain a different
            number of entries.
    """

    def __init__(self, sequences, labels):
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

        # Fail fast: without this check a mismatch only shows up later as an
        # IndexError inside the DataLoader (too few labels) or is silently
        # ignored (too many labels).
        if len(self.sequences) != len(self.labels):
            raise ValueError(
                f"sequences and labels must have the same length, "
                f"got {len(self.sequences)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples (rows of ``sequences``)."""
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the ``(sequence, label)`` tensor pair stored at ``index``."""
        return self.sequences[index], self.labels[index]


In [3]:
# Toy corpus: five integer sequences of length 4, one row per training sample.
sequences = [
    [1, 3, 2, 6],
    [4, 1, 5, 0],
    [2, 1, 3, 0],
    [1, 2, 3, 0],
    [1, 2, 4, 0],
]

# Each row's class is the parity of its sum (0=Even, 1=Odd);
# for the rows above this evaluates to [0, 0, 0, 0, 1].
labels = [sum(row) % 2 for row in sequences]


In [4]:
dataset = SequenceDataset(sequences, labels)

# shuffle=True draws a random permutation each epoch. Giving the loader its own
# seeded generator makes that batch order repeat under Restart & Run All.
dataloader = DataLoader(
    dataset,
    batch_size=2,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)


In [5]:
# Peek at one mini-batch (a [sequences, labels] pair) to sanity-check shapes and dtypes.
# The loader shuffles, so the rows shown are not necessarily the first two of `sequences`.
next(iter(dataloader))

[tensor([[1., 2., 3., 0.],
         [2., 1., 3., 0.]]),
 tensor([0, 0])]

In [10]:
class RNNClassifier(nn.Module):
    """Single-layer LSTM followed by a linear head, for sequence classification.

    Args:
        input_size: Number of features per time step. ``forward`` appends a
            trailing feature axis of size 1 to its input, so use
            ``input_size=1`` with it.
        hidden_size: Width of the LSTM hidden state.
        num_classes: Number of logits produced per sequence.
        verbose: When True (the default, matching the original behaviour),
            ``forward`` prints a step-by-step trace of tensor shapes.
            Pass False to run the model silently.
    """

    def __init__(self, input_size, hidden_size, num_classes, verbose=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.verbose = verbose
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def _trace(self, *args):
        """Print ``args`` like ``print`` does, but only when ``self.verbose`` is set."""
        if self.verbose:
            print(*args)

    def forward(self, x):
        """Classify a batch of scalar-token sequences.

        Args:
            x: Float tensor of shape ``(batch_size, seq_len)``.

        Returns:
            Logits of shape ``(batch_size, num_classes)``.
        """
        self._trace("‚û°Ô∏è FORWARD PASS STARTED")
        self._trace("Input x shape (batch_size, seq_len):", x.shape)
        self._trace("Input x:", x)

        # Add feature dimension: (batch, seq_len) -> (batch, seq_len, 1)
        self._trace("Unsqueezing input to add feature dimension")
        x = x.unsqueeze(-1)
        self._trace("After unsqueeze")
        self._trace("x shape (batch_size, seq_len, input_size):", x.shape)

        # nn.LSTM returns (output, (h_n, c_n)). `hidden_state` therefore holds
        # the whole (h_n, c_n) tuple, and hidden_state[0] below is h_n.
        rnn_output, hidden_state = self.rnn(x)

        self._trace("üîÅ RNN OUTPUT")
        self._trace("rnn_output shape (batch_size, seq_len, hidden_size):", rnn_output.shape)
        self._trace("hidden_state shape (num_layers, batch_size, hidden_size):", hidden_state[0].shape)

        # Last hidden state: h_n is (num_layers=1, batch, hidden); drop the
        # size-1 layer axis to get one summary vector per sequence.
        self._trace("Extracting last hidden state for classification")
        last_hidden = hidden_state[0].squeeze(0)
        self._trace("üß† Last hidden state")
        self._trace("last_hidden shape (batch_size, hidden_size):", last_hidden.shape)

        # Fully connected layer: (batch, hidden) -> (batch, num_classes)
        output = self.fc(last_hidden)
        self._trace("üì§ Output logits")
        self._trace("output shape (batch_size, num_classes):", output.shape)
        self._trace("output:", output)
        self._trace("‚û°Ô∏è FORWARD PASS ENDED")
        self._trace("=" * 50)

        return output

    

---

## 🔧 Understanding `unsqueeze(-1)`

### The Problem
Your sequences come in as raw numbers:
```python
x shape: (batch_size, seq_len) = (2, 4)
x = [[1, 3, 2, 6],
     [4, 1, 5, 0]]
```

Each token is just a **single number** with no explicit features.

### LSTM's Requirement
LSTM expects: `(batch_size, seq_len, input_features)`

It needs to know:
- How many sequences in the batch?
- How long is each sequence?
- How many **features** does each token have?

### What `unsqueeze(-1)` Does
Adds a new dimension at the **end** (index -1):

```python
Before: (2, 4)
          ↓
After:  (2, 4, 1)
          │  │  └─ 1 feature per token
          │  └──── 4 tokens per sequence
          └─────── 2 sequences in batch
```

### Visual Example
```python
# Before unsqueeze
x = [[1, 3, 2, 6],
     [4, 1, 5, 0]]
# Shape: (2, 4) - just 2D array

# After unsqueeze(-1)
x = [[[1], [3], [2], [6]],
     [[4], [1], [5], [0]]]
# Shape: (2, 4, 1) - each token now wrapped in a list
```

### Why?
- LSTM processes **sequences with multiple features** (like embeddings)
- Your raw tokens are treated as "1-dimensional features"
- The `input_size=1` parameter confirms LSTM expects 1 feature per token

### Common Pattern
```python
# If you had embeddings (256-d vectors per token):
# Shape would be (batch, seq_len, 256) - no unsqueeze needed

# But with raw tokens:
# Shape is (batch, seq_len) - need unsqueeze to get (batch, seq_len, 1)
```

---

## 🔄 LSTM Output: `rnn_output` vs `hidden_state`

When you call LSTM, it returns **two things**:
```python
rnn_output, (hidden_state, cell_state) = self.rnn(x)
```

### `rnn_output` - ALL Hidden States
```python
Shape: (batch_size, seq_len, hidden_size) = (2, 4, 16)
```

Hidden state at **every time step**:
```
Time 0: hidden_0 = [0.23, -0.45, ..., 0.12]  (16 values)
Time 1: hidden_1 = [0.55, -0.12, ..., 0.88]  (16 values)
Time 2: hidden_2 = [0.01, 0.34, ..., -0.56]  (16 values)
Time 3: hidden_3 = [-0.23, 0.67, ..., 0.42]  (16 values)

For each of 2 sequences in the batch!
```

**When to use:** You want intermediate representations at each step
- Machine translation (attention mechanism)
- Named entity recognition (classify each token)

### `hidden_state` - FINAL Hidden State
```python
Shape: (num_layers, batch_size, hidden_size) = (1, 2, 16)
```

Hidden state at the **last time step only**:
```
hidden_state[0] = [[-0.23, 0.67, ..., 0.42],    # Sequence 1's final state
                   [ 0.81, -0.15, ..., 0.33]]   # Sequence 2's final state
```

**When to use:** You want a single summary of the entire sequence
- Classification (summarize whole sentence)
- Seq2seq encoder (compress article → pass to decoder)

### `cell_state` - LSTM Memory
```python
Shape: (num_layers, batch_size, hidden_size) = (1, 2, 16)
```

The internal "memory" that LSTM maintains.

**When to use:**
- Seq2seq: Pass both `hidden_state` and `cell_state` from encoder → decoder
- Classification: Usually ignore it (not needed for final prediction)

### In Your Code
You use **`hidden_state[0].squeeze(0)`** for classification because:
- You want one summary vector per sequence
- In the model's `forward`, `hidden_state` holds the whole `(h_n, c_n)` tuple returned by the LSTM, so `hidden_state[0]` extracts `h_n`, the final hidden state, with shape `(1, 2, 16)`
- `squeeze(0)` removes the layer dimension → `(2, 16)`
- Pass to FC layer → predict class for each sequence

---

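## 🧠 Why `hidden_state[0]` specifically?

`nn.LSTM` returns `(output, (h_n, c_n))`. The model code stores the second element whole, so `hidden_state` is the tuple `(h_n, c_n)`; both `h_n` and `c_n` have shape `(num_layers, batch_size, hidden_size)`.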

**Your model:** 1 layer, unidirectional
```python
self.rnn = nn.LSTM(input_size=1, hidden_size=16, batch_first=True)
```

So `hidden_state[0]` (the `h_n` half of the `(h_n, c_n)` tuple that the model code stores in `hidden_state`) has shape `(1, 2, 16)`:
- Dimension 0: Which layer? Only 1 layer → only index 0
- Dimension 1: Which sequence in batch? 2 sequences → indices 0, 1
- Dimension 2: Which hidden value? 16 hidden units → indices 0-15

**`hidden_state[0]`** selects `h_n` from the `(h_n, c_n)` tuple; the layer dimension is still present:
```python
hidden_state[0]  →  (1, 2, 16)  # h_n: final hidden state, all batch sequences
```

Then **`squeeze(0)`** removes the size-1 layer dimension:
```python
hidden_state[0].squeeze(0)  →  (2, 16)  # Ready for FC layer!
```

**If you had 2 layers:**
```python
self.rnn = nn.LSTM(input_size=1, hidden_size=16, num_layers=2, batch_first=True)
```
- With `h_n = hidden_state[0]` now of shape `(2, 2, 16)`:
- `h_n[0]` = first layer's final state → `(2, 16)`
- `h_n[1]` = second layer's final state → `(2, 16)`
- **Common pattern:** Use last layer only → `h_n[-1]` (`squeeze(0)` no longer helps here, because dimension 0 has size 2)

---

## 📤 FC Layer - Classification Head

```python
self.fc = nn.Linear(in_features=16, out_features=2)  # hidden_size → num_classes
```

Takes the compressed sequence summary and outputs class scores:

```python
Input:  (batch_size, hidden_size) = (2, 16)
        [[0.23, -0.45, ..., 0.12],     # Sequence 1's summary
         [0.55, -0.12, ..., 0.88]]     # Sequence 2's summary

FC Layer (learned weights):
        Transforms 16 → 2

Output: (batch_size, num_classes) = (2, 2)
        [[0.234, -0.402],              # Logits for [class 0, class 1]
         [0.507, -0.579]]              # Logits for [class 0, class 1]

Interpretation:
        Seq 1: Class 0 more likely (0.234 > -0.402)
        Seq 2: Class 0 more likely (0.507 > -0.579)
```

**How it works:** Each output is a weighted sum:
```
output[0] = w[0,0]*hidden[0] + w[0,1]*hidden[1] + ... + b[0]
output[1] = w[1,0]*hidden[0] + w[1,1]*hidden[1] + ... + b[1]
```

These logits are then passed to `CrossEntropyLoss` which:
1. Applies softmax to convert to probabilities
2. Compares with ground truth labels
3. Computes loss

---

## 🎯 Complete Forward Pass Flow

```
Input Sequences:     (2, 4)
        ↓
unsqueeze(-1):       (2, 4, 1)
        ↓
LSTM Processing:     Process each token through LSTM, remembering previous states
        ↓
rnn_output:          (2, 4, 16)  ← All hidden states
hidden_state:        (h_n, c_n), each (1, 2, 16)  ← Final hidden state and cell state
        ↓
Extract final:       hidden_state[0] → h_n, (1, 2, 16)
        ↓
squeeze(0):          (2, 16)  ← Layer dimension removed; ready for classification
        ↓
FC Layer:            (2, 16) → (2, 2)
        ↓
Output Logits:       (2, 2)  ← Class predictions!
```

For classification:
- We only care about the **final summary** (hidden_state)
- We throw away **intermediate steps** (rnn_output)
- We compress **16 dimensions → 2 class scores**

In [7]:
# Seed PyTorch's global RNG first so the randomly initialised LSTM and linear
# weights are identical on every fresh Restart & Run All.
torch.manual_seed(0)
model = RNNClassifier(input_size=1, hidden_size=16, num_classes=2)

In [8]:
# Adam over every parameter of `model`, with a step size of 1e-2.
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)

# Cross-entropy between the model's raw logits and the integer class labels.
criterion = nn.CrossEntropyLoss()


In [9]:
# Train for a few passes over the data, printing every stage of each mini-batch update.
num_epochs = 2  # keep small because prints are heavy

for epoch in range(num_epochs):
    print(f"\n EPOCH {epoch+1}/{num_epochs}")
    print("=" * 60)

    # One optimisation step per mini-batch yielded by the DataLoader.
    for batch_index, (sequences_batch, labels_batch) in enumerate(dataloader):
        print(f"\n Batch {batch_index + 1}")
        print("Batch sequences shape:", sequences_batch.shape)
        print("Batch labels:", labels_batch)

        # Reset gradients from the previous step; PyTorch accumulates them otherwise.
        optimizer.zero_grad()

        # Forward pass: logits of shape (batch_size, num_classes).
        outputs = model(sequences_batch)

        # Scalar loss comparing the logits against the integer labels.
        loss = criterion(outputs, labels_batch)
        print(" Loss:", loss.item())

        # Backward pass: populate .grad on every model parameter.
        loss.backward()
        print(" Backpropagation done")

        # Apply the Adam update using the gradients just computed.
        optimizer.step()
        print(" Weights updated")


 EPOCH 1/2

 Batch 1
Batch sequences shape: torch.Size([2, 4])
Batch labels: tensor([0, 0])
‚û°Ô∏è FORWARD PASS STARTED
Input x shape (batch_size, seq_len): torch.Size([2, 4])
Input x: tensor([[4., 1., 5., 0.],
        [1., 2., 3., 0.]])
After unsqueeze
x shape (batch_size, seq_len, input_size): torch.Size([2, 4, 1])
üîÅ RNN OUTPUT
rnn_output shape (batch_size, seq_len, hidden_size): torch.Size([2, 4, 16])
hidden_state shape (num_layers, batch_size, hidden_size): torch.Size([1, 2, 16])
üß† Last hidden state
last_hidden shape (batch_size, hidden_size): torch.Size([2, 16])
üì§ Output logits
output shape (batch_size, num_classes): torch.Size([2, 2])
output: tensor([[-0.2404,  0.0512],
        [-0.2545,  0.0577]], grad_fn=<AddmmBackward0>)
‚û°Ô∏è FORWARD PASS ENDED
 Loss: 0.8554551601409912
 Backpropagation done
 Weights updated

 Batch 2
Batch sequences shape: torch.Size([2, 4])
Batch labels: tensor([0, 0])
‚û°Ô∏è FORWARD PASS STARTED
Input x shape (batch_size, seq_len): torch.Size([2