# Import Dataset

In [28]:
# Import kagglehub to download the Flickr8k dataset
import kagglehub

# Download the Flickr8k dataset (8,000 images with 5 captions each)
# Returns the local path where the dataset is stored
path = kagglehub.dataset_download("adityajn105/flickr8k")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\LENOVO\.cache\kagglehub\datasets\adityajn105\flickr8k\versions\1


# Configure VGG16 Model

In [29]:
import torch
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision import models, transforms

# Load pre-trained VGG16 model with ImageNet weights
# VGG16 is a CNN trained on millions of images, good for feature extraction
model = models.vgg16(weights=models.VGG16_Weights.DEFAULT)

# Replace the last layer (classification layer) with Identity
# This removes the 1000-class classifier and keeps the 4096-dim feature vector
model.classifier[-1] = torch.nn.Identity()

# Set model to evaluation mode (disables dropout, batch norm updates)
model.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

**What This Code Does:**

We're setting up VGG16, a pre-trained image recognition model, to extract features from our images.

**Simple Explanation:**
1. **Load VGG16**: We use a model that's already trained to recognize images (trained on 14 million images)
2. **Remove Classification Layer**: VGG16 normally classifies images into 1000 categories (like "dog", "cat"). We don't need that, so we remove it
3. **Keep Feature Extractor**: Now VGG16 gives us a 4096-number summary of what's in each image
4. **Set to Evaluation Mode**: This turns off training features we don't need

**Why VGG16?**
- It's already trained, so we don't start from scratch
- It's good at understanding what's in images
- We use its "knowledge" to help generate captions

# Convert Images to Feature Vectors

In [30]:
import os
import pickle
import numpy as np
import tqdm

# List all files in the downloaded dataset directory
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['captions.txt', 'Images']


In [31]:
# Construct path to the Images folder containing all Flickr8k images
img_directory = os.path.join(path, "Images")
# Display first 10 image filenames as a sanity check
print(os.listdir(img_directory)[:10])

['1000268201_693b08cb0e.jpg', '1001773457_577c3a7d70.jpg', '1002674143_1b742ab4b8.jpg', '1003163366_44323f5815.jpg', '1007129816_e794419615.jpg', '1007320043_627395c3d8.jpg', '1009434119_febe49276a.jpg', '1012212859_01547e3f17.jpg', '1015118661_980735411b.jpg', '1015584366_dfcec3c85a.jpg']


**Code Explanation:**
- Constructs the path to the Images folder within the dataset
- Lists first 10 image filenames to verify dataset structure

In [32]:
from PIL import Image

# Define image preprocessing pipeline for VGG16
transform = transforms.Compose([
    # Resize images to 224x224 (VGG16's required input size)
    transforms.Resize((224, 224)),
    # Convert PIL Image to PyTorch tensor (values 0-1)
    transforms.ToTensor(),
    # Normalize using ImageNet mean and std (required for pre-trained VGG16)
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet RGB channel means
        std=[0.229, 0.224, 0.225]     # ImageNet RGB channel stds
    )
])

**Code Explanation (Line by Line):**

**Line 1: Import PIL**
- `from PIL import Image`: Python Imaging Library for loading/manipulating images

**Line 3-13: Create Transformation Pipeline**
- `transforms.Compose([...])`: Chains transformations into a single callable function
  - Takes list of transformations
  - Applies them sequentially to input image
  - Returns transformed tensor

**Line 5-6: Resize Transform**
- `transforms.Resize((224, 224))`: Resizes image to 224√ó224 pixels
  - **Why 224?** VGG16 was trained on 224√ó224 images (fixed input size)
  - Maintains aspect ratio may cause distortion, but ensures compatibility
  - Uses bilinear interpolation by default

**Line 7-8: Tensor Conversion**
- `transforms.ToTensor()`: Converts PIL Image to PyTorch tensor
  - **Input**: PIL Image with pixel values [0, 255] (uint8)
  - **Output**: Torch tensor with values [0.0, 1.0] (float32)
  - **Shape**: (H, W, C) ‚Üí (C, H, W) (channels-first format)
  - Divides by 255 automatically: `pixel / 255.0`

**Line 9-13: Normalization**
- `transforms.Normalize(mean=[...], std=[...])`: Standardizes pixel values
  - **Formula**: `output = (input - mean) / std` for each channel
  - **mean=[0.485, 0.456, 0.406]**: ImageNet dataset RGB channel means
  - **std=[0.229, 0.224, 0.225]**: ImageNet dataset RGB channel standard deviations
  - **Why?** VGG16 was trained on normalized ImageNet images
  - **Result**: Each channel has ~zero mean and ~unit variance
  - **Example**: Red channel pixel 0.8 ‚Üí `(0.8 - 0.485) / 0.229 ‚âà 1.375`

In [33]:
# # Dictionary to store image_id -> feature_vector mappings
# features = {}

# # Loop through all images in the dataset
# for img_name in tqdm.tqdm(os.listdir(img_directory)):
#     # Construct full path to image file
#     img_path = img_directory + "/" + img_name
    
#     # Load image and ensure it's in RGB format (not grayscale)
#     image = Image.open(img_path).convert("RGB")
    
#     # Apply preprocessing transformations (resize, normalize)
#     image = transform(image)
    
#     # Add batch dimension: (C, H, W) -> (1, C, H, W)
#     image = image.unsqueeze(0)
    
#     # Extract 4096-dimensional feature vector using VGG16
#     feature = model(image) 
    
#     # Extract image ID by removing file extension (e.g., "123.jpg" -> "123")
#     image_id = img_name.split(".")[0]
    
#     # Store feature vector as numpy array (remove batch dim, detach from graph)
#     features[image_id] = feature.squeeze(0).detach().numpy()

**Code Explanation (Line by Line):**

**Line 1-2: Initialize Storage**
- `features = {}`: Empty dictionary to store `{image_id: feature_vector}` pairs
  - Keys: Image IDs (strings like "1000268201_693b08cb0e")
  - Values: NumPy arrays of shape (4096,)

**Line 4-5: Loop Through Images**
- `for img_name in tqdm.tqdm(os.listdir(img_directory)):`: Iterates over all image files
  - `os.listdir(img_directory)`: Returns list of filenames (e.g., ["1234.jpg", "5678.jpg", ...])
  - `tqdm.tqdm(...)`: Wraps iterable to show progress bar with ETA
  - `img_name`: Current filename (e.g., "1000268201_693b08cb0e.jpg")

**Line 6-7: Construct Image Path**
- `img_path = img_directory + "/" + img_name`: Creates full path
  - Example: `"/path/to/Images/" + "1234.jpg"` ‚Üí `"/path/to/Images/1234.jpg"`

**Line 9-10: Load and Convert Image**
- `image = Image.open(img_path).convert("RGB")`:
  - `Image.open(img_path)`: Loads image from disk using PIL
  - `.convert("RGB")`: Ensures 3-channel RGB format
    - Some images might be grayscale (1 channel)
    - Some might have alpha channel (RGBA, 4 channels)
    - This standardizes all to RGB (3 channels)

**Line 12-13: Apply Transformations**
- `image = transform(image)`: Applies the preprocessing pipeline
  - Resizes to 224√ó224
  - Converts to tensor (values 0-1)
  - Normalizes using ImageNet stats
  - **Output Shape**: (3, 224, 224) tensor

**Line 15-16: Add Batch Dimension**
- `image = image.unsqueeze(0)`: Adds batch dimension at position 0
  - **Before**: Shape (3, 224, 224) - single image
  - **After**: Shape (1, 3, 224, 224) - batch of 1 image
  - **Why?** Neural networks expect batched inputs: (batch_size, channels, height, width)
  - VGG16 forward pass requires 4D tensor

**Line 18-19: Extract Features**
- `feature = model(image)`: Pass image through VGG16
  - Processes through conv layers ‚Üí extracts visual patterns
  - Passes through modified classifier ‚Üí outputs 4096-dim vector
  - **Output Shape**: (1, 4096) - batch of 1 feature vector
  - Contains high-level visual information (objects, textures, scenes)

**Line 21-22: Extract Image ID**
- `image_id = img_name.split(".")[0]`: Removes file extension
  - `img_name = "1000268201_693b08cb0e.jpg"`
  - `img_name.split(".")` ‚Üí `["1000268201_693b08cb0e", "jpg"]`
  - `[0]` ‚Üí `"1000268201_693b08cb0e"`
  - Used as key to match with captions later

**Line 24-25: Store Feature Vector**
- `features[image_id] = feature.squeeze(0).detach().numpy()`:
  - `.squeeze(0)`: Removes batch dimension (1, 4096) ‚Üí (4096,)
  - `.detach()`: Detaches tensor from computation graph (saves memory)
  - `.numpy()`: Converts PyTorch tensor to NumPy array
  - Stores in dictionary: `features["1000268201_693b08cb0e"] = array([0.23, 0.45, ...])`

**Overall Process:**
- Loops through 8,000 images
- Each image ‚Üí 4096-dim feature vector
- Takes ~10-15 minutes on CPU, ~2-3 minutes on GPU
- Result: Dictionary with 8,000 entries ready for training

In [34]:
import pickle

# Save the extracted features to disk to avoid re-computing every time
# This saves significant time - VGG16 feature extraction takes ~10 minutes
# with open("features.pkl", "wb") as f:
#     pickle.dump(features, f)

**Code Explanation (Line by Line):**

**Line 1: Import Pickle**
- `import pickle`: Python module for object serialization (saving Python objects to disk)
  - Converts Python objects ‚Üí byte streams ‚Üí files
  - Can reconstruct objects later without recomputation

**Line 3-5: Save Features**
- `with open("features.pkl", "wb") as f:`: Opens file in write-binary mode
  - `"features.pkl"`: Filename (.pkl extension by convention)
  - `"wb"`: Write mode + binary mode (pickle requires binary)
  - `as f`: File handle for writing
  - `with`: Context manager (automatically closes file when done)

**Line 5: Pickle Dump**
- `pickle.dump(features, f)`: Serializes dictionary to file
  - `features`: Dictionary with 8,000 entries {img_id: 4096-dim array}
  - `f`: File handle to write to
  - **File Size**: ~130 MB (8,000 √ó 4,096 floats √ó 4 bytes/float)

**Why Save?**
- **Time Saving**: VGG16 extraction takes 10-15 minutes
- **Reusability**: Load features in seconds instead of re-extracting
- **Consistency**: Same features across multiple training runs
- **Convenience**: Can share pre-computed features with others

In [None]:
with open("features.pkl", "rb") as f:
    features = pickle.load(f)

**Code Explanation:**
- Loads pre-computed features from disk (skip VGG16 if already extracted)
- Much faster than re-running feature extraction (seconds vs. minutes)

In [None]:
import os
with open(os.path.join(path,"captions.txt")) as f:
    next(f) # skip header line
    captions_data = f.read()
print("Number of caption lines:", len(captions_data.split("\n")))
print("Total characters in file:", len(captions_data))

Number of caption lines: 40456
Total characters in file: 3319280


**Code Explanation:**
- Opens `captions.txt` file (format: "image_id.jpg,caption text")
- `next(f)`: Skips header line ("image,caption")
- Reads all caption data into a single string for parsing

In [None]:
mapped_captions = {}

for line in tqdm.tqdm(captions_data.split("\n")):
    tokens = line.split(",")
    if len(tokens) < 2:
        continue
    img_id = tokens[0].split(".")[0]
    caption = tokens[1].strip().lower()
    
    if img_id not in mapped_captions:
        mapped_captions[img_id] = []
    
    # Always append the caption
    mapped_captions[img_id].append(caption)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 40456/40456 [00:00<00:00, 879277.67it/s]




**Final Structure:**
```python
{
  "1000268201_693b08cb0e": [
    "child in pink dress is climbing up set of stairs in an entry way",
    "girl going into wooden building",
    "little girl climbing into wooden playhouse",
    "little girl climbing the stairs to her playhouse",
    "little girl in pink dress going into wooden cabin"
  ],
  "1001773457_577c3a7d70": [...],  # 5 more captions
  ...  # 8,000 total images
}
```

**Result**: Dictionary with 8,000 images, each having exactly 5 captions (40,000 total caption-image pairs)

In [None]:
mapped_captions["1000268201_693b08cb0e"]

['<SOS> child in pink dress is climbing up set of stairs in an entry way <EOS>',
 '<SOS> girl going into wooden building <EOS>',
 '<SOS> little girl climbing into wooden playhouse <EOS>',
 '<SOS> little girl climbing the stairs to her playhouse <EOS>',
 '<SOS> little girl in pink dress going into wooden cabin <EOS>']

In [None]:
len(mapped_captions)

8091

In [None]:
import re

def clean_caption(mapped_captions):
    for key,caption_list in mapped_captions.items():
        for i in range(len(caption_list)):
            caption= caption_list[i]
            caption= re.sub(r"[^a-zA-Z]"," ",caption)
            caption= caption.split()
            caption= [word for word in caption if len(word)>1]
            caption= " ".join(caption)
            caption = "<SOS> " + caption + " <EOS>"
            caption_list[i]= caption

In [None]:
clean_caption(mapped_captions)
mapped_captions["1000268201_693b08cb0e"]

['<SOS> child in pink dress is climbing up set of stairs in an entry way <EOS>',
 '<SOS> girl going into wooden building <EOS>',
 '<SOS> little girl climbing into wooden playhouse <EOS>',
 '<SOS> little girl climbing the stairs to her playhouse <EOS>',
 '<SOS> little girl in pink dress going into wooden cabin <EOS>']

In [None]:
all_captions=[]
for caption_list in mapped_captions.values():
    all_captions.extend(caption_list)
    
len(all_captions)

40455

**Code Explanation:**
- Flattens all captions into a single list
- Used to calculate statistics like max caption length
- Total: ~40,000 captions (8,000 images √ó 5 captions each)

In [None]:
all_captions[:5]

['<SOS> child in pink dress is climbing up set of stairs in an entry way <EOS>',
 '<SOS> girl going into wooden building <EOS>',
 '<SOS> little girl climbing into wooden playhouse <EOS>',
 '<SOS> little girl climbing the stairs to her playhouse <EOS>',
 '<SOS> little girl in pink dress going into wooden cabin <EOS>']

# Initialize Tokenizer and Prepare Sequences

In [None]:
from transformers import AutoTokenizer

# Use pre-trained tokenizer with existing vocabulary
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Add custom special tokens for image captioning
special_tokens_dict = {'additional_special_tokens': ['<SOS>', '<EOS>']}
tokenizer.add_special_tokens(special_tokens_dict)

print(f"Tokenizer loaded with vocab size: {len(tokenizer)}")


Tokenizer loaded with vocab size: 30524


**What This Code Does:**

We set up a tokenizer (a tool that converts words to numbers) using BERT's vocabulary.

**Simple Explanation:**

- **Tokenizer**: Converts words into numbers that the computer can understand- These markers tell the model when a caption starts and ends

- **BERT Vocabulary**: Uses 30,000 common words that BERT already knows- **Special Tokens**: We add `<SOS>` (Start Of Sentence) and `<EOS>` (End Of Sentence) markers

In [None]:
max_length=max(len(caption.split()) for caption in all_captions)
max_length

31

**What This Code Does:**

Finds the longest caption in our dataset. We need this to make all captions the same length by adding padding (empty spaces).

# Split Data: Training and Validation

In [None]:
from sklearn.model_selection import train_test_split

image_ids=list(mapped_captions.keys())

train_ids, val_ids = train_test_split(image_ids, test_size=0.2, random_state=42)

**What This Code Does:**

Divides our images into two groups:
- **Training set (80%)**: ~6,400 images to teach the model

- **Validation set (20%)**: ~1,600 images to test how well it learnedThis way we can check if the model works on images it hasn't seen before.


In [None]:
len(tokenizer)

30524

In [None]:
def data_generator(img_ids, mapped_captions, features, tokenizer, max_length, batch_size):
    X_img, X_seq, y_seq = [], [], []

    while True:
        for img_id in img_ids: # the list of the image ids
            for caption in mapped_captions[img_id]:

                seq = tokenizer.encode(
                    caption,
                    add_special_tokens=False,
                    max_length=max_length,
                    truncation=True
                )

                for i in range(1, len(seq)):
                    in_seq = seq[:i]
                    out_seq = seq[i]

                    in_seq = in_seq + [tokenizer.pad_token_id] * (max_length - len(in_seq))

                    X_img.append(features[img_id])
                    X_seq.append(in_seq)
                    y_seq.append(out_seq)

                    if len(X_img) == batch_size:
                        yield (
                            np.array(X_img, dtype=np.float32),
                            np.array(X_seq, dtype=np.int64),
                            np.array(y_seq, dtype=np.int64),
                        )
                        X_img, X_seq, y_seq = [], [], []


**Code Explanation (Line by Line):**

**Line 1: Function Signature**
- `def data_generator(img_ids, mapped_captions, features, tokenizer, max_length, batch_size):`:
  - `img_ids`: List of image IDs for this split (train or val)
  - `mapped_captions`: Dictionary `{img_id: [captions]}`
  - `features`: Dictionary `{img_id: 4096-dim vector}`
  - `tokenizer`: BERT tokenizer for text ‚Üí token IDs
  - `max_length`: Maximum caption length for padding
  - `batch_size`: Number of samples per batch (e.g., 64)

**Line 2-3: Initialize Batch Lists**
- `X_img, X_seq, y_seq = [], [], []`: Three empty lists for batch data
  - `X_img`: Image features
  - `X_seq`: Input sequences (partial captions)
  - `y_seq`: Target words (one-hot vectors)

**Line 4: Get Vocabulary Size**
- `vocab_size = len(tokenizer)`: Total number of unique tokens
  - For BERT: ~30,522 tokens
  - Used to create one-hot target vectors

**Line 6: Infinite Loop**
- `while True:`: Generator runs forever (yields batches indefinitely)
  - Training calls `next(generator)` repeatedly
  - Generator cycles through data infinitely (re-starts after all images processed)

**Line 7: Loop Through Images**
- `for img_id in img_ids:`: Process each image in the split
  - For training: ~6,400 image IDs
  - For validation: ~1,600 image IDs

**Line 8: Get Captions for Image**
- `caption_list = mapped_captions[img_id]`: Retrieves all 5 captions
  - Each caption format: `"<SOS> words here <EOS>"`

**Line 9: Loop Through Captions**
- `for caption in caption_list:`: Process each of the 5 captions
  - Creates 5 training samples per image
  - Total: 8,000 images √ó 5 captions = 40,000 caption-image pairs

**Line 10-11: Tokenize Caption**
- `seq = tokenizer.encode(caption, add_special_tokens=False, max_length=max_length, truncation=True)`:
  - `tokenizer.encode(...)`: Converts text to token IDs
  - `add_special_tokens=False`: **CRITICAL** - Don't add BERT's [CLS]/[SEP] tokens
    - Captions already have `<SOS>`/`<EOS>`
    - Adding BERT tokens would contaminate training
  - `max_length=max_length`: Maximum length before truncation
  - `truncation=True`: Cut off if caption exceeds max_length
  - **Example**: `"<SOS> dog runs <EOS>"` ‚Üí `[101, 3899, 3216, 102]` (token IDs)

**Line 13: Create Autoregressive Samples**
- `for i in range(1, len(seq)):`: Loop from index 1 to end
  - **Autoregressive Training**: Model predicts next word given previous words
  - **Example**: Caption = `[SOS, dog, runs, park, EOS]` creates 4 samples:
    ```
    Input: [SOS]           ‚Üí Target: dog
    Input: [SOS, dog]      ‚Üí Target: runs  
    Input: [SOS, dog, runs] ‚Üí Target: park
    Input: [SOS, dog, runs, park] ‚Üí Target: EOS
    ```

**Line 14: Create Input Sequence**
- `in_seq = seq[:i]`: Slice from start to current position
  - `i=1`: `seq[:1]` = `[SOS]`
  - `i=2`: `seq[:2]` = `[SOS, dog]`
  - `i=3`: `seq[:3]` = `[SOS, dog, runs]`

**Line 15: Create Target**
- `out_seq = seq[i]`: Next word (single token ID)
  - `i=1`: `seq[1]` = `dog`
  - `i=2`: `seq[2]` = `runs`

**Line 16-17: Pad Input Sequence**
- `in_seq = in_seq + [tokenizer.pad_token_id] * (max_length - len(in_seq))`:
  - **Padding**: Extends short sequences to max_length
  - `tokenizer.pad_token_id`: Special padding token (usually 0)
  - **Example**: `[SOS, dog]` with max_length=5 ‚Üí `[SOS, dog, PAD, PAD, PAD]`
  - **Why?** Neural networks need fixed-size inputs for batching

**Line 19-20: Create One-Hot Target**
- `out_seq_categorical = np.zeros(vocab_size)`: Create zero vector of size vocab_size
  - Example: `[0, 0, 0, ..., 0]` (30,522 zeros)
- `out_seq_categorical[out_seq] = 1`: Set target word position to 1
  - If `out_seq = 3899` (dog), creates: `[0, 0, ..., 1, ..., 0]` (1 at index 3899)
  - **One-hot encoding**: Only one element is 1, rest are 0

**Line 22-24: Accumulate Batch Samples**
- `X_img.append(features[img_id])`: Add image features (4096-dim)
- `X_seq.append(in_seq)`: Add padded input sequence
- `y_seq.append(out_seq_categorical)`: Add one-hot target

**Line 26-28: Yield Batch When Full**
- `if len(X_img) == batch_size:`: Check if batch complete
  - Typically batch_size = 64 samples
- `yield np.array(X_img), np.array(X_seq), np.array(y_seq)`: Return batch
  - Converts lists to NumPy arrays
  - **Shapes**: X_img: (64, 4096), X_seq: (64, max_length), y_seq: (64, vocab_size)
- `X_img, X_seq, y_seq = [], [], []`: Reset lists for next batch

**Key Concepts:**
- **Autoregressive Training**: Model learns to predict next word given context
- **Teacher Forcing**: During training, model sees correct previous words (not its predictions)
- **Padding**: All sequences same length for efficient batching
- **One-Hot Targets**: CrossEntropyLoss expects this format

## üîç Deep Dive: How the Data Generator Works

The `data_generator` function is the **heart of the training pipeline**. It converts our caption data into autoregressive training samples.

### üéØ Main Goal
Transform each caption into multiple training samples where the model learns to predict **one word at a time** based on previous words.

### üìä Example Walkthrough

**Input Caption**: `"<SOS> dog runs fast <EOS>"`  
**Tokenized**: `[30522, 3899, 3216, 2698, 30523]`

**Generated Training Samples**:
```
Sample 1:  Input: [30522]                      ‚Üí Target: 3899 (dog)
Sample 2:  Input: [30522, 3899]                ‚Üí Target: 3216 (runs)
Sample 3:  Input: [30522, 3899, 3216]          ‚Üí Target: 2698 (fast)
Sample 4:  Input: [30522, 3899, 3216, 2698]    ‚Üí Target: 30523 (<EOS>)
```

Each input is **padded to max_length** (e.g., 40 tokens) with padding tokens (0):
```
Sample 1:  [30522, 0, 0, 0, ..., 0]  ‚Üí Target: 3899
Sample 2:  [30522, 3899, 0, 0, ..., 0]  ‚Üí Target: 3216
```

### üîÑ Training Flow

1. **Image Loop**: Process each of 6,400 training images
2. **Caption Loop**: Each image has 5 captions
3. **Autoregressive Loop**: Each caption generates 3-35 training samples (depends on length)
4. **Batch Accumulation**: Collect 64 samples before yielding

**Total Training Samples per Epoch**:
- 6,400 images √ó 5 captions √ó ~15 words avg = **~480,000 training samples**
- Organized into ~7,500 batches (480,000 √∑ 64)

### üß† Why This Approach?

**Teacher Forcing**: During training, the model always sees the **correct previous words**, not its own predictions. This:
- Stabilizes training
- Speeds up convergence
- Prevents error accumulation

**Autoregressive Learning**: The model learns the **sequential nature** of language:
- After seeing "dog", it learns "runs" is likely
- After "dog runs", it learns "fast" or "in" might follow
- Captures grammar, context, and image-text relationships

### ‚ö° Key Implementation Details

1. **No Special Tokens**: `add_special_tokens=False` prevents BERT's [CLS]/[SEP] from contaminating captions
2. **Padding Strategy**: All sequences padded to `max_length` for efficient GPU batching
3. **One-Hot Encoding**: Target words converted to one-hot vectors for CrossEntropyLoss
4. **Infinite Generator**: `while True` loop allows unlimited epochs without restarting
5. **Same Image Features**: All 5 captions for an image share the same VGG16 features (efficiency)

# Create the Caption Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ImageCaptionModel(nn.Module):
    def __init__(self, vocab_size, pad_idx=0):
        super().__init__()

        # Image ‚Üí initial LSTM state
        self.img_to_h = nn.Linear(4096, 256)
        self.img_to_c = nn.Linear(4096, 256)

        # Text embedding + LSTM
        self.embedding = nn.Embedding(vocab_size, 256, padding_idx=pad_idx)
        self.lstm = nn.LSTM(256, 256, batch_first=True)

        # Predict next word at EACH timestep
        self.fc = nn.Linear(256, vocab_size)

    def forward(self, img_features, captions):


        # Initialize LSTM hidden & cell from image
        h0 = torch.tanh(self.img_to_h(img_features)).unsqueeze(0)  # (1, B, 256)
        c0 = torch.tanh(self.img_to_c(img_features)).unsqueeze(0)  # (1, B, 256)

        # Embed caption tokens
        emb = self.embedding(captions)                             # (B, T, 256)

        # LSTM over sequence
        outputs, _ = self.lstm(emb, (h0, c0))                      # (B, T, 256)

        # Vocabulary prediction at each timestep
        logits = self.fc(outputs)                                  # (B, T, vocab_size)
        return logits


In [None]:
model=ImageCaptionModel(vocab_size=len(tokenizer))

In [None]:
from torchinfo import summary
summary(model, input_size=[(1, 4096), (1, max_length)], dtypes=[torch.float32, torch.long])

**Code Explanation:**
- Uses `torchinfo.summary()` to display model architecture
- Shows layer-by-layer parameters, shapes, and total parameter count
- Useful for debugging and understanding model structure

In [None]:
PAD_IDX = tokenizer.pad_token_id  # or 0 if you used 0
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX) # withouth ignore_index the model will mosttly  predict pad
optimizer= torch.optim.Adam(model.parameters(), lr=0.001)
epochs=10  # Increased for better learning
batch_size=64
steps_per_epoch = len(train_ids) // batch_size

In [None]:
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    generator = data_generator(
        train_ids, mapped_captions, features,
        tokenizer, max_length, batch_size
    )

    for step in tqdm.tqdm(range(steps_per_epoch)):
        img_features, seqs, targets = next(generator)

        optimizer.zero_grad()

        img_features_tensor = torch.tensor(img_features, dtype=torch.float32)
        seqs_tensor = torch.tensor(seqs, dtype=torch.long)
        targets_tensor = torch.tensor(targets, dtype=torch.long)

        outputs = model(img_features_tensor, seqs_tensor)  # (B, T, vocab_size)
        
        # Extract predictions at the last non-padding position for each sequence
        # Find the length of each sequence (number of non-pad tokens)
        lengths = (seqs_tensor != tokenizer.pad_token_id).sum(dim=1)  # (B,)
        
        # Get the prediction at the last valid position
        batch_indices = torch.arange(outputs.size(0))
        last_outputs = outputs[batch_indices, lengths - 1]  # (B, vocab_size)

        loss = criterion(last_outputs, targets_tensor)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/steps_per_epoch:.4f}")


## üîÑ Understanding `steps_per_epoch` and Yielding

### **The Calculation**
```python
steps_per_epoch = len(train_ids) // batch_size
# = 6,400 images √∑ 64 samples per batch = 100 steps
```

### **Why Do We Need `steps_per_epoch`?**

**Problem:** Our `data_generator` uses `while True:` - an **infinite loop** that never stops on its own.

```python
def data_generator(...):
    while True:  # ‚Üê Runs forever!
        for img_id in img_ids:
            # Generate batches...
            yield batch  # ‚Üê Returns batch but doesn't exit
```

**Without `steps_per_epoch`:** Training would run indefinitely, never advancing to the next epoch.

**With `steps_per_epoch`:** We manually control how many batches to process:

```python
for step in range(steps_per_epoch):  # ‚Üê Stops after exactly 100 iterations
    batch = next(generator)
```

---

### **What is `yield`?**

`yield` creates a **generator function** that:
1. **Pauses execution** and returns a value
2. **Remembers state** (variables, loop position)
3. **Resumes** from where it left off when called again

**Example:**
```python
# Generator yields batches one at a time
def data_generator():
    while True:
        # ... process data ...
        yield batch  # ‚Üê Pause here, return batch, wait for next call

# Training loop
for step in range(100):
    batch = next(generator)  # ‚Üê Resume generator, get next batch
```

**Memory Efficiency:** Instead of loading all 480,000 training samples into RAM, we generate batches **on-demand**.

---

### **How It Works Together**

| Step | What Happens |
|------|-------------|
| 1 | `generator = data_generator(...)` creates generator (doesn't execute yet) |
| 2 | `next(generator)` calls generator ‚Üí processes data ‚Üí hits `yield` ‚Üí returns batch |
| 3 | Generator **pauses** (remembers position in loops) |
| 4 | `next(generator)` again ‚Üí generator **resumes** ‚Üí processes more data ‚Üí yields next batch |
| 5 | Repeat 100 times (`steps_per_epoch`) |
| 6 | After 100 steps, loop exits ‚Üí **epoch complete** |
| 7 | Next epoch creates **new generator** (restarts from beginning) |

---

### **Visual Flow**

```
Epoch 1:
  Step 1: next(gen) ‚Üí yields batch 1 (64 samples)
  Step 2: next(gen) ‚Üí yields batch 2 (64 samples)
  ...
  Step 100: next(gen) ‚Üí yields batch 100 (64 samples)
  ‚úì Total: 6,400 samples processed (1 full pass through training data)

Epoch 2:
  Create new generator (restarts)
  Step 1-100: Process same 6,400 samples again
```

---

### **Key Insight**

**`steps_per_epoch`** acts as a **manual epoch boundary** for infinite generators. It ensures:
- ‚úÖ Each epoch processes the full training dataset exactly once
- ‚úÖ Training progresses through multiple epochs
- ‚úÖ Loss is calculated per epoch for monitoring
- ‚úÖ Training eventually completes after all epochs

Without it, the `while True` loop would **never allow the epoch loop to advance**, causing the model to train indefinitely on the first epoch!

In [None]:
torch.save(model.state_dict(), "image_caption_model.pth")

In [None]:
def idx_to_word(integer,tokenizer):
    return tokenizer.decode([integer],skip_special_tokens=True,clean_up_tokenization_spaces=True,)

**Code Explanation:**
- Helper function to convert token ID back to word using tokenizer
- `skip_special_tokens=True`: Removes `<SOS>`, `<EOS>`, `[PAD]` from output

In [None]:
def predict_caption(img_id, model, tokenizer, max_length, features, temperature=0.8):

    model.eval()

    img_feature = features[img_id]

    sos_id = tokenizer.encode("<SOS>", add_special_tokens=False)[0]
    eos_id = tokenizer.encode("<EOS>", add_special_tokens=False)[0]
    pad_id = tokenizer.pad_token_id

    caption = [sos_id]

    with torch.no_grad():
        for _ in range(max_length - 1):
            # Pad caption to max_length
            padded_caption = caption + [pad_id] * (max_length - len(caption))
            seq_tensor = torch.tensor([padded_caption], dtype=torch.long)
            img_tensor = torch.tensor([img_feature], dtype=torch.float32)

            # Forward pass - single prediction per step
            logits = model(img_tensor, seq_tensor)  # (1, vocab_size)
            
            # Apply temperature and sample
            probs = F.softmax(logits, dim=-1)  # (1, vocab_size)
            next_word = torch.multinomial(probs.squeeze(0), 1).item()

            if next_word in (eos_id, pad_id):
                break

            caption.append(next_word)

    return tokenizer.decode(caption, skip_special_tokens=True)


In [None]:
# Check if model weights exist
import os
if not os.path.exists("image_caption_model.pth"):
    print("ERROR: Model weights not found. Make sure to run the training cell first!")
else:
    # Load the trained model
    caption_model = ImageCaptionModel(vocab_size=len(tokenizer))
    caption_model.load_state_dict(torch.load("image_caption_model.pth"))
    caption_model.eval()
    
    # Test caption generation on validation samples
    print("Testing caption generation on validation samples:\n")
    
    for i in range(3):
        img_id = val_ids[i]
        
        # Generate caption using the trained caption model
        generated = predict_caption(img_id, caption_model, tokenizer, max_length, features)
        
        # Get actual captions
        actual_captions = mapped_captions[img_id]
        
        print(f"Image ID: {img_id}")
        print(f"Generated caption: {generated}")
        print(f"Actual captions:")
        for cap in actual_captions:
            print(f"  - {cap}")
        print("-" * 80)
