## 1. Import Libraries

In [None]:
import os
import glob
import pickle
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from PIL import Image, ImageFile
import albumentations as A

from sklearn import preprocessing
from tqdm import tqdm

# Let PIL decode image files that were truncated on disk instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# The check mark was stored as mis-decoded bytes; restored to the intended glyph.
print("✓ All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 2. Configuration

Set all paths and hyperparameters in one place.

In [None]:
class Config:
    """Paths and hyperparameters for the CAPTCHA experiment, kept in one place."""

    # Where the training / validation images live
    TRAIN_DIR, VAL_DIR = "Dataset/train", "Dataset/val"

    # Model input resolution in pixels
    IMAGE_HEIGHT, IMAGE_WIDTH = 40, 150

    # Optimisation settings
    BATCH_SIZE = 32
    NUM_WORKERS = 4
    EPOCHS = 10
    LEARNING_RATE = 1e-3

    # Prefer the GPU whenever PyTorch can see one
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


print(f"Device: {Config.DEVICE}")
print(f"Image size: {Config.IMAGE_HEIGHT} x {Config.IMAGE_WIDTH}")

## 3. Dataset Class

**What it does:** Loads images, applies preprocessing, and returns tensors.

**Preprocessing steps:**
1. Resize to 40x150 (height x width)
2. Normalize with ImageNet mean/std
3. Transpose to (Channels, Height, Width)
4. Convert to PyTorch tensor

**Example:**
```python
# Input: "8AE5T.jpg" file
# Output: 
#   - images: tensor of shape (3, 40, 150)
#   - targets: tensor like [9, 11, 15, 6, 30] (encoded characters, assuming a 0-9/A-Z alphabet)
```

In [None]:
class ClassificationDataset(Dataset):
    """Dataset of CAPTCHA images paired with integer-encoded label sequences.

    Each item is a dict with:
        ``images``      float tensor of shape (3, image_height, image_width),
        ``targets``     long tensor holding the encoded label sequence,
        ``targets_len`` long scalar tensor with the length of that sequence.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of encoded label sequences, one per image.
        image_height: Height images are resized to (default 40).
        image_width: Width images are resized to (default 150).
    """

    def __init__(self, image_paths, labels, image_height=40, image_width=150):
        self.image_paths = image_paths
        self.labels = labels
        # Build the preprocessing pipeline once rather than for every sample.
        # The deprecated ``always_apply`` flag is dropped: both transforms
        # already default to p=1, so they are still applied to every image.
        self._pipeline = A.Compose([
            A.Resize(image_height, image_width),
            A.Normalize(),  # library defaults are the ImageNet mean/std
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Open inside a context manager so the file handle is released
        # as soon as the pixels have been copied into the array.
        with Image.open(self.image_paths[idx]) as img:
            image = np.array(img.convert('RGB'))

        # Resize + normalize
        image = self.transform(image=image)['image']

        # Transpose for PyTorch: (H,W,C) -> (C,H,W)
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        label = self.labels[idx]

        return {
            'images': torch.tensor(image, dtype=torch.float),
            'targets': torch.tensor(label, dtype=torch.long),
            'targets_len': torch.tensor(len(label), dtype=torch.long)
        }

    def transform(self, image):
        """Apply resize + normalization; returns the albumentations result dict."""
        return self._pipeline(image=image)

# Check mark restored from mis-decoded bytes.
print("✓ Dataset class defined")

## 🖼️ Dataset Class Explanation

The `ClassificationDataset` loads and preprocesses images for the model.

**Preprocessing Pipeline:**
1. **Load Image**: Read CAPTCHA from disk as RGB
2. **Resize**: Scale to 40×150 (height×width) - landscape orientation
3. **Normalize**: Apply ImageNet statistics (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
4. **Transpose**: Convert (H,W,C) → (C,H,W) for PyTorch

**Example Data Flow:**
```
Input:  "8AE5T.jpg" (original size, maybe 200×80)
Resize: (40, 150, 3) numpy array
Normalize: Values scaled to ~[-2, 2] range
Transpose: (3, 40, 150) tensor ready for model
```

## 4. Model Architecture

**CNN + GRU with CTC Loss**

**Structure:**
- **Conv layers:** Extract visual features from images
- **GRU layers:** Process features as a sequence (left to right)
- **Classifier:** Predict character at each position
- **CTC Loss:** Handle variable-length sequences without alignment

**Dimensions transformation:**
```
Input:  (batch, 3, 40, 150)      # RGB image
Conv:   (batch, 64, 10, 37)      # After convolution + pooling
Permute:(batch, 37, 64, 10)      # Treat width as sequence
Flatten:(batch, 37, 640)         # Flatten channels × height into one feature vector
GRU:    (batch, 37, 64)          # Bidirectional GRU (32 forward + 32 backward)
Linear: (batch, 37, num_chars+1) # Per-position class scores
Output: (37, batch, num_chars+1) # CTC format

**Why this works:**
- CNN captures character shapes and edges
- GRU learns sequence patterns (common character combinations)
- CTC allows model to predict without knowing exact character positions

In [None]:
class CaptchaModel(nn.Module):
    """CNN + bidirectional GRU sequence model trained with CTC loss.

    Args:
        num_characters: Number of distinct characters; one extra output
            class (index 0) is added for the CTC blank.
        image_height: Height of the input images (default 40). It fixes the
            GRU input size, because the feature map's channels and height
            are flattened into one feature vector per horizontal position.
    """

    def __init__(self, num_characters, image_height=40):
        super().__init__()

        # CNN layers for feature extraction
        self.conv1 = nn.Conv2d(3, 128, kernel_size=(3, 3), padding=(1, 1))
        self.pool1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv2 = nn.Conv2d(128, 64, kernel_size=(3, 3), padding=(1, 1))
        self.pool2 = nn.MaxPool2d(kernel_size=(2, 2))

        # Two 2x2 poolings shrink the height by a factor of 4; with 64
        # channels that gives 64 * (H // 4) features per position (640 for
        # the default height of 40).
        gru_input_size = 64 * (image_height // 4)

        # Bidirectional GRU for sequence modeling
        self.gru = nn.GRU(gru_input_size, 32, num_layers=2, bidirectional=True,
                          dropout=0.25, batch_first=True)

        # Output layer (+1 for CTC blank token); 64 = 32 forward + 32 backward
        self.output = nn.Linear(64, num_characters + 1)

        # Built once instead of on every forward pass. It has no parameters
        # or buffers, so saved state dicts keep exactly the same keys.
        self.ctc_loss = nn.CTCLoss(blank=0)

    def forward(self, images, targets=None, target_lengths=None):
        """Run the network and optionally compute the CTC loss.

        Args:
            images: Float tensor of shape (batch, 3, height, width).
            targets: Optional long tensor (batch, target_len) of encoded
                labels, where 0 is reserved for the blank.
            target_lengths: Long tensor (batch,) with each target's length;
                required when ``targets`` is given.

        Returns:
            ``(logits, loss)``: ``logits`` has shape
            (seq_len, batch, num_characters + 1); ``loss`` is the CTC loss,
            or ``None`` when no targets were supplied.
        """
        bs = images.size(0)  # batch size

        # CNN feature extraction
        x = F.relu(self.conv1(images))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)

        # Reshape for sequence processing
        x = x.permute(0, 3, 1, 2)  # (batch, width, channels, height)
        x = x.reshape(bs, x.size(1), -1)  # (batch, sequence_len, features)

        # GRU sequence modeling
        x, _ = self.gru(x)

        # Character predictions
        x = self.output(x)
        x = x.permute(1, 0, 2)  # (sequence_len, batch, num_classes) for CTC

        if targets is None:
            return x, None

        # CTC expects log-probabilities and the (identical) length of every
        # sequence in the batch.
        log_probs = F.log_softmax(x, dim=2)
        input_lengths = torch.full((bs,), x.size(0), dtype=torch.long)
        loss = self.ctc_loss(log_probs, targets, input_lengths, target_lengths)
        return x, loss

# Check mark restored from mis-decoded bytes.
print("✓ Model architecture defined")

## 🧠 Model Architecture - Deep Dive

### Forward Pass: Complete Dimension Transformation

**Input:**
- `images`: (batch=32, channels=3, height=40, width=150)
- `targets`: (batch=32, seq_len=5) - e.g., [[9,11,15,6,30], [12,3,14,8,19], ...]
- `target_lengths`: (batch=32,) - all values = 5

**Step-by-Step Process:**

```
1. First Convolution Block
   Input:  (32, 3, 40, 150)
   ↓ Conv2d(3→128, kernel=3×3, padding=1)
   (32, 128, 40, 150)  # 128 feature maps, same spatial size
   ↓ ReLU + MaxPool2d(2×2)
   (32, 128, 20, 75)   # Halved: height/2, width/2

2. Second Convolution Block
   (32, 128, 20, 75)
   ↓ Conv2d(128→64, kernel=3×3, padding=1)
   (32, 64, 20, 75)
   ↓ ReLU + MaxPool2d(2×2)
   (32, 64, 10, 37)    # Halved again: 20/2=10, 75/2=37 (rounded down)

3. Reshape for Sequence Processing
   (32, 64, 10, 37)
   ↓ permute(0, 3, 1, 2) - Treat width as sequence
   (32, 37, 64, 10)    # 37 positions along width
   ↓ view(batch, seq_len, -1) - Flatten channel and height features
   (32, 37, 640)       # 640 = 64 channels × 10 height

4. Bidirectional GRU
   Input: (32, 37, 640)
   ↓ 2-layer BiGRU(hidden=32)
   Output: (32, 37, 64)  # 64 = 32 forward + 32 backward

   What happens:
   - Forward GRU: Reads sequence left→right (context from past)
   - Backward GRU: Reads sequence right→left (context from future)
   - Concatenates: Each position gets 64 features

5. Classification Layer
   (32, 37, 64)
   ↓ Linear(64 → num_chars+1)
   (32, 37, 37)        # 36 chars + 1 blank = 37 classes
   ↓ permute(1, 0, 2) - CTC expects seq_len first
   (37, 32, 37)        # Final output

6. CTC Loss Calculation
   Log probabilities: (37, 32, 37)
   Targets: (32, 5) - e.g., [[9,11,15,6,30], ...]

   CTC aligns 37 predictions with 5 target characters
   Allows for: "AA_BB_CC_DD_EE____" → "ABCDE"
   Where _ = blank token (index 0)
```

**Output:**
- `logits`: (37, 32, 37) - Raw predictions for each of 37 positions
- `loss`: Scalar - CTC loss value for optimization

**Why This Architecture?**
- **CNN**: Extracts visual features (edges, curves, character shapes)
- **Sequence length 37**: Sufficient positions for 5-character captchas with blanks
- **BiGRU**: Captures dependencies (e.g., "Q" often followed by "U")
- **CTC Loss**: No need to know exact character positions in image

## 5. Training Functions

**train_fn:** One epoch of training
- Forward pass: get predictions
- Calculate CTC loss
- Backward pass: compute gradients
- Update weights

**eval_fn:** Validation
- Forward pass only (no gradient computation)
- Calculate validation loss
- Return the average validation loss over all batches

In [None]:
def train_fn(model, data_loader, optimizer, device):
    """Run one training epoch and return the mean per-batch loss.

    Each batch is a dict with the keys ``images``, ``targets`` and
    ``targets_len``; all three tensors are moved to ``device`` before the
    forward pass.
    """
    model.train()
    running_loss = 0

    progress = tqdm(data_loader, desc="Training", leave=False)
    for batch in progress:
        images, targets, target_lengths = (
            batch[key].to(device) for key in ('images', 'targets', 'targets_len')
        )

        # Clear stale gradients, then forward -> backward -> parameter update
        optimizer.zero_grad()
        _, loss = model(images, targets, target_lengths)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(data_loader)


def eval_fn(model, data_loader, device):
    """Evaluate the model on ``data_loader`` and return the mean per-batch loss.

    Runs in eval mode with gradient tracking disabled; no weights change.
    """
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validating", leave=False):
            _, batch_loss = model(
                batch['images'].to(device),
                batch['targets'].to(device),
                batch['targets_len'].to(device),
            )
            total_loss += batch_loss.item()

    return total_loss / len(data_loader)

# Check mark restored from mis-decoded bytes.
print("✓ Training functions defined")

## 🔄 Training & Evaluation - Deep Dive

### `train_fn()` - One Training Epoch

**Input:**
- `model`: Neural network
- `data_loader`: Yields batches of (images, targets, lengths)
- `optimizer`: Adam optimizer (stores gradients and updates weights)
- `device`: 'cuda' or 'cpu'

**Process (Per Batch):**

```
1. Forward Pass
   images (32, 3, 40, 150) → model → logits (37, 32, 37)
   CTC Loss computes alignment between predictions and targets

2. optimizer.zero_grad()
   Reset all gradients to zero
   Why? PyTorch accumulates gradients by default
   Without this: gradients from previous batches interfere

3. loss.backward()
   Compute ∂loss/∂weight for EVERY parameter
   Uses chain rule through entire network:
   - Output layer → GRU → Conv2 → Conv1
   Magic of automatic differentiation!

4. optimizer.step()
   Update weights: w_new = w_old - lr × gradient
   Adam is smart: adjusts learning rate per parameter
   Frequently updated params → smaller steps
   Rarely updated params → larger steps
```

**Output:**
- Average loss across all batches
- Lower loss = better predictions

### `eval_fn()` - Validation

**Key Difference: `torch.no_grad()`**

```python
with torch.no_grad():
    # Forward pass only
```

**What this does:**
1. **Disables gradient computation**
   - Saves memory (no need to store intermediate activations)
   - Faster computation (skips gradient graph building)

2. **Evaluation mode (`model.eval()`)**
   - Dropout: OFF (use all neurons)
   - Batch Normalization: Use running stats (don't update)

**Why separate validation?**
- Measure generalization (how well model works on unseen data)
- Detect overfitting (train loss ↓, val loss ↑)
- Unbiased performance estimate

### The Training Loop Pattern

```
for epoch in range(50):
    # Training phase
    model.train()  # Enable dropout
    train_loss = train_fn(...)  # Update weights
    
    # Validation phase
    model.eval()  # Disable dropout
    val_loss = eval_fn(...)  # No weight updates
    
    # Learning rate adjustment
    scheduler.step(val_loss)  # Reduce LR if stuck
```

**Output Interpretation:**
```
Epoch 1: Train Loss: 3.2 | Val Loss: 3.5  ← High loss, random guessing
Epoch 10: Train Loss: 0.8 | Val Loss: 1.2  ← Getting better
Epoch 30: Train Loss: 0.1 | Val Loss: 0.3  ← Good! Close losses
Epoch 40: Train Loss: 0.01 | Val Loss: 1.5  ← Overfitting! Val loss increased
```

## 6. Main Training Pipeline

**Data Preparation:**
1. **Load images** - Get all .jpg files from train/val directories
2. **Extract labels** - Filename is the captcha text (e.g., "8AE5T.jpg" → "8AE5T")
3. **Split into characters** - "8AE5T" → ['8', 'A', 'E', '5', 'T']
4. **Encode characters** - Fit LabelEncoder on all unique characters
5. **Add +1 offset** - Reserve 0 for CTC blank token

**Example encoding:**
```
Original: "8AE5T"
Split: ['8', 'A', 'E', '5', 'T']
Encode: [8, 10, 14, 5, 29] (LabelEncoder indices for a 0-9/A-Z alphabet)
Add +1: [9, 11, 15, 6, 30] (reserve 0 for blank)
```

**Training Loop:**
- Train on all batches (forward + backward pass)
- Validate on the validation set (forward only)
- Reduce learning rate if validation loss plateaus
- Save best model checkpoint

In [None]:
from sklearn.preprocessing import LabelEncoder
def run_training():
    """Train the CAPTCHA model end to end.

    Reads ``Dataset/train/*.jpg`` and ``Dataset/val/*.jpg``, takes each label
    from the file name (the text before the first dot), fits a character-level
    ``LabelEncoder`` and trains ``CaptchaModel`` for up to 50 epochs, with
    learning-rate reduction on plateau and early stopping.

    Files written:
        ``label_encoder.pkl``      the fitted encoder (written before training),
        ``best_captcha_model.pth`` weights with the lowest validation loss.

    Returns:
        tuple: ``(model, lbl_enc)`` - the model as it is after the last epoch
        run, and the fitted label encoder.

    Raises:
        FileNotFoundError: If either directory contains no ``.jpg`` files.
    """
    num_epochs = 50
    early_stop_patience = 10

    # Load training and validation image paths
    train_images = glob.glob("Dataset/train/*.jpg")
    val_images = glob.glob("Dataset/val/*.jpg")
    if not train_images or not val_images:
        # Fail early with a clear message instead of an obscure sampler or
        # division-by-zero error further down.
        raise FileNotFoundError(
            "No .jpg images found in 'Dataset/train' and/or 'Dataset/val' "
            f"(train: {len(train_images)}, val: {len(val_images)})"
        )

    # Extract labels from filenames (e.g., "Dataset/train/8AE5T.jpg" -> "8AE5T")
    def label_of(path):
        return os.path.basename(path).split('.')[0]

    # Split strings into individual characters for CTC
    train_targets = [list(label_of(p)) for p in train_images]
    val_targets = [list(label_of(p)) for p in val_images]

    # Fit encoder on every character seen in either split
    lbl_enc = LabelEncoder()
    lbl_enc.fit([c for chars in train_targets + val_targets for c in chars])

    # Persist the encoder right away so a checkpoint saved by an interrupted
    # run always has its matching encoder on disk.
    with open('label_encoder.pkl', 'wb') as f:
        pickle.dump(lbl_enc, f)

    # Encode to integers and add +1 to reserve 0 for the CTC blank token;
    # .tolist() yields plain Python ints.
    train_encoded = [(lbl_enc.transform(chars) + 1).tolist() for chars in train_targets]
    val_encoded = [(lbl_enc.transform(chars) + 1).tolist() for chars in val_targets]

    # Create datasets
    train_dataset = ClassificationDataset(train_images, train_encoded)
    val_dataset = ClassificationDataset(val_images, val_encoded)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

    # Initialize model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = CaptchaModel(num_characters=len(lbl_enc.classes_))
    model.to(device)

    # Setup optimizer and scheduler. The scheduler's ``verbose`` flag is
    # deprecated in recent PyTorch, so LR reductions are logged explicitly below.
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.5, patience=5
    )

    # Training loop
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        train_loss = train_fn(model, train_loader, optimizer, device)
        val_loss = eval_fn(model, val_loader, device)

        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_loss)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after < lr_before:
            print(f"Reducing learning rate to {lr_after:.4e}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_captcha_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= early_stop_patience:
            print("Early stopping triggered!")
            break

    print("\nTraining complete!")
    return model, lbl_enc

## 📚 Training Function - Deep Dive

### Character Encoding Process (Most Critical Part!)

**Problem:** Model needs numbers, but we have characters like "8AE5T"

**Solution: Multi-Step Encoding**

#### Step 1: Load Image Paths
```python
train_images = glob.glob("Dataset/train/*.jpg")
# Result: ["Dataset/train/8AE5T.jpg", "Dataset/train/BC3X2.jpg", ...]
```

#### Step 2: Extract Labels from Filenames
```python
train_labels = [x.split(os.sep)[-1].split('.')[0] for x in train_images]
# Process:
#   "Dataset/train/8AE5T.jpg"
#   ↓ .split(os.sep) → ["Dataset", "train", "8AE5T.jpg"]
#   ↓ [-1] → "8AE5T.jpg"
#   ↓ .split('.') → ["8AE5T", "jpg"]
#   ↓ [0] → "8AE5T"
# Result: ["8AE5T", "BC3X2", "K7M9P", ...]
```

#### Step 3: Split into Individual Characters
```python
train_targets = [[c for c in x] for x in train_labels]
# Process:
#   "8AE5T" → ['8', 'A', 'E', '5', 'T']
#   "BC3X2" → ['B', 'C', '3', 'X', '2']
# Result: [['8','A','E','5','T'], ['B','C','3','X','2'], ...]
```

#### Step 4: Flatten All Characters (Critical for LabelEncoder)
```python
targets_flat = [c for clist in train_targets for c in clist]
# Process (nested list comprehension):
#   For each list: ['8','A','E','5','T']
#   For each character in list: '8', 'A', 'E', '5', 'T'
#   Collect all into single list
# Result: ['8','A','E','5','T','B','C','3','X','2','K','7','M','9','P',...]
```
**Why flatten?** LabelEncoder.fit() needs ALL unique characters to create mapping.

#### Step 5: Fit LabelEncoder
```python
lbl_enc.fit(targets_flat)
# Creates mapping:
# '0'→0, '1'→1, ..., '9'→9, 'A'→10, 'B'→11, ..., 'Z'→35
# Total: 36 classes (0-9, A-Z)
```

#### Step 6: Encode Each Character List
```python
train_encoded = [lbl_enc.transform(x) for x in train_targets]
# Process:
#   ['8','A','E','5','T'] → [8, 10, 14, 5, 29]
#   'A' maps to 10, 'E' to 14, 'T' to 29, etc.
# Result: [[8,10,14,5,29], [11,12,3,33,2], ...]
```

#### Step 7: Add +1 Offset (CTC Requirement)
```python
train_encoded = [[i + 1 for i in x] for x in train_encoded]
# Process:
#   [8, 10, 14, 5, 29] → [9, 11, 15, 6, 30]
# Why? Reserve index 0 for CTC blank token
# CTC uses 0 for alignment gaps: "A__B" → [1, 0, 0, 2]
```

**Final Encoding Example:**
```
Original string: "8AE5T"
Split:           ['8', 'A', 'E', '5', 'T']
Encode:          [8, 10, 14, 5, 29]
+1 Offset:       [9, 11, 15, 6, 30]  ← This goes to model!
```

### DataLoader Parameters

**`batch_size=32`:** Process 32 images simultaneously
- GPU parallelization: Faster than processing one-by-one
- Memory tradeoff: Larger batch = more GPU memory

**`shuffle=True` (training):** Randomize sample order each epoch
- Prevents model from learning order patterns
- Better generalization

**`shuffle=False` (validation):** Keep order consistent
- Reproducible results
- Order doesn't matter for evaluation

**`num_workers=4`:** 4 parallel processes load data
- Main process trains model
- Worker processes load and preprocess images
- Reduces data loading bottleneck

## 7. Prediction Functions

**CTC Decoding:** Convert model output to text

Model outputs: `[11, 11, 0, 12, 12, 12, 0, 13, 13, 0]`
- Index 0 is CTC blank (ignore)
- Remove consecutive duplicates: `[11, 12, 13]`
- Subtract 1 to get original indices: `[10, 11, 12]`
- Convert to characters: `['A', 'B', 'C']`
- Final text: `"ABC"`

**Functions:**
- `remove_duplicates()` - Remove consecutive same characters
- `decode_predictions()` - Full CTC decoding
- `predict_single_image()` - Predict one image
- `predict_batch()` - Predict multiple images

In [None]:
def remove_duplicates(x):
    """Collapse runs of identical consecutive characters in ``x``.

    Inputs shorter than two items are returned unchanged; otherwise a new
    string is built that keeps only the first character of each run.
    """
    if len(x) <= 1:
        return x

    collapsed = ""
    for symbol in x:
        # Skip a symbol that merely repeats the last character kept
        if collapsed and symbol == collapsed[-1]:
            continue
        collapsed = collapsed + symbol
    return collapsed


def decode_predictions(preds, encoder):
    """Greedy CTC decoding of raw model output into text.

    Args:
        preds: Tensor of shape (seq_len, batch, num_classes) with the
            per-position class scores. Class 0 is the CTC blank; class
            ``i + 1`` stands for the encoder's class ``i``.
        encoder: Fitted label encoder providing ``inverse_transform``.

    Returns:
        list[str]: One decoded string per batch element.
    """
    # Softmax is monotonic, so the argmax of the raw scores selects the same
    # class; the probabilities themselves are not needed.
    best_paths = torch.argmax(preds.detach(), dim=2).permute(1, 0).cpu().tolist()

    cap_preds = []
    for path in best_paths:
        kept = []
        previous = None
        for index in path:
            # CTC rule: collapse repeated symbols FIRST, then drop blanks.
            # A blank between two identical symbols therefore keeps a genuine
            # double letter ("A A _ A" -> "AA"); removing blanks before
            # collapsing would wrongly merge them into a single "A".
            if index != previous and index != 0:
                kept.append(index - 1)  # Reverse +1 offset
            previous = index

        # One inverse_transform call per sequence instead of one per symbol
        chars = encoder.inverse_transform(kept) if kept else []
        cap_preds.append("".join(chars))

    return cap_preds


def load_model(model_path, lbl_enc, device='cpu'):
    """Rebuild a ``CaptchaModel`` from saved weights, ready for inference.

    The output size is taken from the number of classes in ``lbl_enc``; the
    returned model lives on ``device`` and is in eval mode.
    """
    num_classes = len(lbl_enc.classes_)
    net = CaptchaModel(num_characters=num_classes)

    weights = torch.load(model_path, map_location=device)
    net.load_state_dict(weights)

    net.to(device)
    net.eval()
    return net


def predict_single_image(image_path, model, lbl_enc, device='cpu'):
    """Return the decoded text predicted for one image file.

    The image goes through the same preprocessing as training data by
    wrapping it in a one-item ``ClassificationDataset`` with a placeholder
    label that the prediction path never uses.
    """
    placeholder_labels = [[0, 0, 0, 0, 0]]
    sample = ClassificationDataset([image_path], placeholder_labels)[0]

    # Add the batch dimension the model expects
    batch = sample['images'].unsqueeze(0).to(device)

    with torch.no_grad():
        logits, _ = model(batch)

    decoded = decode_predictions(logits, lbl_enc)
    return decoded[0]


def predict_batch(image_paths, model, lbl_enc, device='cpu', batch_size=32):
    """Predict the text of many images, ``batch_size`` at a time.

    Returns a list of decoded strings in the same order as ``image_paths``.
    Placeholder labels are supplied only because the dataset requires them.
    """
    count = len(image_paths)
    placeholders = [[0, 0, 0, 0, 0] for _ in range(count)]
    loader = DataLoader(
        ClassificationDataset(image_paths, placeholders),
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    results = []
    for batch in tqdm(loader, desc="Predicting"):
        with torch.no_grad():
            logits, _ = model(batch['images'].to(device))
        results.extend(decode_predictions(logits, lbl_enc))

    return results


def load_label_encoder(load_path='label_encoder.pkl'):
    """Unpickle and return the object stored at ``load_path``.

    NOTE: unpickling executes code embedded in the file, so only load
    encoder files you created yourself.
    """
    with open(load_path, 'rb') as handle:
        encoder = pickle.load(handle)
    return encoder

# Check mark restored from mis-decoded bytes.
print("✓ Prediction functions defined")

## 🔮 CTC Decoding - Deep Dive

### The Challenge
Model outputs: `(37, batch, 37)` - 37 predictions per image, each with 37 probabilities
We need: Simple text like "ABC"

### `decode_predictions()` Function Breakdown

**Input:**
- `preds`: (seq_len=37, batch=32, num_classes=37) - Raw model output (logits)
- `encoder`: LabelEncoder with character mappings

**Process:**

#### Step 1: Rearrange Dimensions
```python
preds = preds.permute(1, 0, 2)
# (37, 32, 37) → (32, 37, 37)
# Why? Easier to loop through batch (32 images)
```

#### Step 2: Convert Logits to Probabilities
```python
preds = torch.softmax(preds, 2)
# Before: [2.3, -1.5, 0.8, ...] (logits, any value)
# After:  [0.65, 0.05, 0.15, ...] (probabilities, sum=1.0)
# Why? Makes prediction confidence interpretable
```

#### Step 3: Get Most Likely Character
```python
preds = torch.argmax(preds, 2)
# (32, 37, 37) → (32, 37)
# Each of 37 positions now has single index (0-36)
# Example row: [11, 11, 0, 12, 12, 12, 0, 13, 13, 0, ...]
```

#### Step 4: Reverse the +1 Offset
```python
k = k - 1
# Remember training? We added +1 to reserve 0 for blank
# Now reverse it:
#   Model output: 11 → 11-1 = 10 → maps to 'A'
#   Model output: 0 → 0-1 = -1 → blank token
```

#### Step 5: Handle Blank Tokens
```python
if k == -1:
    temp.append("§")  # Placeholder
else:
    temp.append(encoder.inverse_transform([k])[0])

# Example sequence:
#   [10, 10, -1, 11, 11, 11, -1, 12, 12, -1]
#   ↓
#   ['A', 'A', '§', 'B', 'B', 'B', '§', 'C', 'C', '§']
```

#### Step 6: Remove Blank Tokens
```python
tp = tp.replace("§", "")
# "A A § B B B § C C §" → "AABBBCC"
```

#### Step 7: Remove Consecutive Duplicates
```python
cap_preds.append(remove_duplicates(tp))
# "AABBBCC" → "ABC"
# Why? CTC can output same character multiple times
# Real example: "HHEELLOO" → "HELO"
```

**Complete Example:**

```
Raw Model Output (37 positions):
[11, 11, 11, 0, 0, 12, 12, 0, 13, 13, 13, 0, 0, 0, ...]

Step 1: Subtract 1
[10, 10, 10, -1, -1, 11, 11, -1, 12, 12, 12, -1, -1, -1, ...]

Step 2: Map to characters (using LabelEncoder)
['A', 'A', 'A', '§', '§', 'B', 'B', '§', 'C', 'C', 'C', '§', ...]

Step 3: Remove blanks (§)
['A', 'A', 'A', 'B', 'B', 'C', 'C', 'C']

Step 4: Remove consecutive duplicates
['A', 'B', 'C']

Step 5: Join to string
"ABC"
```

**Output:**
- List of decoded strings: `["ABC", "X7Y", "K2M9", ...]`
- One prediction per batch sample

### Why CTC Works This Way

**Problem:** We don't know exactly where each character appears in the image
- "ABC" might span pixels 10-50, 60-100, 120-150
- Different fonts/sizes cause different positions

**CTC Solution:** 
- Output predictions for EVERY position (37 times)
- Allow blanks and duplicates for alignment
- Collapse to final text during decoding

**Example Alignment:**
```
Image positions: |A|A|A|_|_|B|B|_|C|C|C|_|_|_|...
Decoded text:    "ABC"
```

This flexibility allows the model to handle:
- Variable character widths
- Different spacing
- Overlapping characters

## 8. Run Training

Execute the complete training pipeline.

In [None]:
# Train the model: run the full pipeline defined in run_training() and keep
# the (model, label encoder) pair it returns.
trained_model, label_encoder = run_training()

## 9. Test Predictions

Test the trained model on validation images.

In [None]:
# Load saved model and label encoder.
# Use the GPU only when one is available, so this cell also runs on
# CPU-only machines instead of failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lbl_enc = load_label_encoder('label_encoder.pkl')
model = load_model('best_captcha_model.pth', lbl_enc, device=device)

print("Model loaded successfully!")
print(f"Number of character classes: {len(lbl_enc.classes_)}")
print(f"Characters: {list(lbl_enc.classes_)}")

# Get test images
val_images = glob.glob("Dataset/val/*.jpg")
test_images = val_images[:5]

print(f"\nTesting on {len(test_images)} images...")

# Predict
predictions = predict_batch(test_images, model, lbl_enc, device=device, batch_size=5)

# Display results (rule and status glyphs restored from mis-decoded bytes)
print("\nPrediction Results:")
print("─" * 60)
for img_path, pred_text in zip(test_images, predictions):
    # The file name (without extension) is the ground-truth text
    file_name = img_path.split(os.sep)[-1]
    true_label = file_name.split('.')[0]
    status = "✓" if pred_text == true_label else "✗"
    print(f"{status} {file_name:15s} | True: {true_label} | Pred: {pred_text}")
print("─" * 60)

## 🧪 Testing the Model

Let's test our trained model on validation images and see the predictions!

## 10. Visualize Predictions

Display sample predictions with images.

In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Visualize predictions.
# The grid is sized from the number of test images (3 inches per image, so
# five images give the original 15x3 figure) rather than a fixed five axes;
# squeeze=False keeps `axes` two-dimensional even for a single image.
num_shown = len(test_images)
if num_shown:
    fig, axes = plt.subplots(1, num_shown, figsize=(3 * num_shown, 3), squeeze=False)

    for ax, img_path, pred_text in zip(axes[0], test_images, predictions):
        img = Image.open(img_path).convert('RGB')
        true_label = img_path.split(os.sep)[-1].split('.')[0]

        ax.imshow(img)
        color = 'green' if pred_text == true_label else 'red'
        ax.set_title(f"Pred: {pred_text}\nTrue: {true_label}", color=color, fontsize=10)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Calculate validation accuracy
print("\n" + "="*60)
print("VALIDATION SET ACCURACY")
print("="*60)

# Run on whatever device the model already lives on instead of assuming CUDA.
eval_device = next(model.parameters()).device
all_val_predictions = predict_batch(val_images, model, lbl_enc, device=eval_device, batch_size=32)

true_labels = [p.split(os.sep)[-1].split('.')[0] for p in val_images]

total = len(val_images)
correct = sum(pred == truth for pred, truth in zip(all_val_predictions, true_labels))

# Guard against an empty validation set (would otherwise divide by zero)
accuracy = (correct / total) * 100 if total else 0.0

print(f"Total Images:        {total}")
print(f"Correct Predictions: {correct}")
print(f"Wrong Predictions:   {total - correct}")
print(f"Accuracy:            {accuracy:.2f}%")
print("="*60)

# Character-level accuracy
print("\nCHARACTER-LEVEL ACCURACY")
print("="*60)

total_chars = 0
correct_chars = 0

for pred, truth in zip(all_val_predictions, true_labels):
    # Positions are compared index by index; any length mismatch counts as
    # errors, so each sample contributes max(len(pred), len(truth)) characters.
    total_chars += max(len(pred), len(truth))
    correct_chars += sum(p == t for p, t in zip(pred, truth))

char_accuracy = (correct_chars / total_chars) * 100 if total_chars else 0.0

print(f"Total Characters:    {total_chars}")
print(f"Correct Characters:  {correct_chars}")
print(f"Character Accuracy:  {char_accuracy:.2f}%")
print("="*60)

print("\n✓ Visualization and evaluation complete!")

## 📊 Visualization & Accuracy Metrics

Let's visualize our predictions and calculate overall accuracy on the validation set.

**Metrics:**
- **Image-level accuracy**: Entire CAPTCHA must be correct
- **Character-level accuracy**: Individual character accuracy (more granular)

## Summary

### What We Built:
1. **Dataset pipeline** - Load, preprocess, and batch images
2. **CNN + GRU model** - Feature extraction + sequence modeling
3. **CTC Loss training** - No character alignment needed
4. **Prediction system** - Decode model output to text

### Key Concepts:
- **Character-level encoding** - Model predicts individual characters
- **CTC Loss** - Handles variable-length sequences
- **+1 offset** - Reserve index 0 for CTC blank token
- **Bidirectional GRU** - Context from both directions

### Files Generated:
- `best_captcha_model.pth` - Trained model weights
- `label_encoder.pkl` - Character encoder for prediction

### Next Steps:
- Try data augmentation (rotation, noise)
- Experiment with different architectures
- Add beam search decoding for better accuracy
- Deploy as web API