In [1]:
import torch
import subprocess
import time

def print_gpu_info():
    if torch.cuda.is_available():
        gpu_id = torch.cuda.current_device()
        print(f"\n[INFO] Using GPU: {torch.cuda.get_device_name(gpu_id)}")

        allocated = torch.cuda.memory_allocated(gpu_id) / (1024 ** 2)  # in MB
        reserved = torch.cuda.memory_reserved(gpu_id) / (1024 ** 2)    # in MB
        total_mem = torch.cuda.get_device_properties(gpu_id).total_memory / (1024 ** 2)  # in MB

        print(f"[MEMORY] Allocated: {allocated:.2f} MB")
        print(f"[MEMORY] Reserved : {reserved:.2f} MB")
        print(f"[MEMORY] Total    : {total_mem:.2f} MB")

        # Run nvidia-smi for detailed live stats (if available)
        print("\n[GPU STATUS - nvidia-smi]")
        try:
            subprocess.run(["nvidia-smi"], check=True)
        except Exception as e:
            print(f"Could not run nvidia-smi: {e}")
    else:
        print("[INFO] CUDA not available.")

# Example usage
if __name__ == "__main__":
    print_gpu_info()

    # Dummy PyTorch usage to simulate GPU load
    a = torch.rand((10000, 10000), device='cuda')
    b = torch.matmul(a, a)
    print_gpu_info()

    # Optional: Wait and monitor changes
    time.sleep(5)
    print_gpu_info()



[INFO] Using GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
[MEMORY] Allocated: 0.00 MB
[MEMORY] Reserved : 0.00 MB
[MEMORY] Total    : 6143.50 MB

[GPU STATUS - nvidia-smi]

[INFO] Using GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
[MEMORY] Allocated: 772.12 MB
[MEMORY] Reserved : 784.00 MB
[MEMORY] Total    : 6143.50 MB

[GPU STATUS - nvidia-smi]

[INFO] Using GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
[MEMORY] Allocated: 772.12 MB
[MEMORY] Reserved : 784.00 MB
[MEMORY] Total    : 6143.50 MB

[GPU STATUS - nvidia-smi]


In [None]:
from dotenv import load_dotenv
import os

load_dotenv()
token = os.getenv("HF_TOKEN")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

# Set environment variable to avoid potential conflicts
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

# Load a smaller dataset for summarization
dataset = load_dataset("cnn_dailymail","3.0.0")
print(dataset["train"][0])

# results = metric.compute(predictions=predictions, references=references)
# print(results)
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

PyTorch Version: 2.5.1+cu121
CUDA Available: True
GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU, Memory: 6.44 GB
Initial GPU Memory Allocated: 0.00 MB
CPU Usage: 20.7%, RAM Free: 4.59 GB
File exists: D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features\train_vit_features.pt, Size: 412.90 MB
File exists: D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features\val_vit_features.pt, Size: 51.78 MB
File exists: D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions.pkl, Size: 14.01 MB
Initializing datasets...
Loading feature file: D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features\train_vit_features.pt
Features shape: torch.Size([127132, 768]), Image names: 127132, Loaded in 0.60 seconds
Loading caption file: D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions.pkl
Captions loaded: 158914 captions in 0.37 seconds
Building voc

Training:   0%|          | 0/3973 [00:00<?, ?it/s]

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pickle
from collections import Counter
import os
import time
import random
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings
# Vocabulary class
class Vocabulary:
    def __init__(self, min_freq=2):
        self.itos = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        self.stoi = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.min_freq = min_freq

    def build_vocabulary(self, captions):
        words = [word for caption in captions for word in caption]
        word_counts = Counter(words)
        idx = len(self.itos)
        for word, count in word_counts.items():
            if count >= self.min_freq and word not in self.stoi:
                self.itos[idx] = word
                self.stoi[word] = idx
                idx += 1

    def numericalize(self, caption):
        return [self.stoi.get(word, self.stoi['<UNK>']) for word in caption]

# Dataset
class Flickr30kCaptionDataset(Dataset):
    def __init__(self, feature_file, caption_file):
        data = torch.load(feature_file, weights_only=False)
        self.features = data['features']
        self.captions = pickle.load(open(caption_file, 'rb'))
        self.image_names = data['image_names']
        self.vocab = Vocabulary(min_freq=2)
        self.vocab.build_vocabulary(self.captions)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        caption = self.captions[idx]
        numerical_caption = [self.vocab.stoi['<SOS>']] + self.vocab.numericalize(caption) + [self.vocab.stoi['<EOS>']]
        return feature, torch.tensor(numerical_caption, dtype=torch.long), self.image_names[idx]

# Custom collate function
def custom_collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return None, None, None
    features, captions, image_names = zip(*batch)
    features = torch.stack(features)
    max_len = max(len(c) for c in captions)
    padded_captions = torch.zeros(len(captions), max_len, dtype=torch.long)
    for i, cap in enumerate(captions):
        padded_captions[i, :len(cap)] = cap
    return features, padded_captions, image_names

In [2]:

# Bi-GRU Model
class BiGRUModel(nn.Module):
    def __init__(self, feature_dim, vocab_size, embed_dim=256, hidden_dim=512, num_layers=2, dropout=0.5):
        super(BiGRUModel, self).__init__()
        self.feature_fc = nn.Linear(feature_dim, hidden_dim)  # Maps 768 -> 512
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)  # Output from bidirectional GRU
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.init_weights()

    def init_weights(self):
        nn.init.xavier_uniform_(self.feature_fc.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.feature_fc.bias)
        nn.init.zeros_(self.fc.bias)

    def get_initial_hidden(self, features):
        """Initialize hidden state using transformed image features."""
        batch_size = features.size(0)
        # features is already [batch_size, hidden_dim] after feature_fc
        h_first_layer = features.unsqueeze(0).repeat(2, 1, 1)  # [2, batch_size, hidden_dim] for first layer (fwd + bwd)
        h_second_layer = torch.zeros(2, batch_size, self.hidden_dim).to(features.device)  # [2, batch_size, hidden_dim] for second layer
        hidden = torch.cat([h_first_layer, h_second_layer], dim=0)  # [4, batch_size, hidden_dim]
        return hidden

    def forward(self, features, captions=None, teacher_forcing_ratio=0.5):
        """Forward pass for training with teacher forcing."""
        # Transform features once
        features = self.dropout(torch.relu(self.feature_fc(features)))  # [batch_size, 512]
        batch_size = features.size(0)
        if captions is None:
            return self.inference(features)

        # Initialize hidden state
        hidden = self.get_initial_hidden(features)  # [4, batch_size, 512]
        embed = self.embedding(captions[:, :-1])  # Exclude <EOS> for input
        outputs = []
        input_token = captions[:, 0].unsqueeze(1)  # Start with <SOS>

        for t in range(embed.size(1)):
            embed_input = self.embedding(input_token).squeeze(1)  # [batch_size, embed_dim]
            _, hidden = self.gru(embed_input.unsqueeze(1), hidden)  # Update hidden state
            output = self.fc(hidden[-2:].transpose(0, 1).contiguous().view(batch_size, -1))  # [batch_size, vocab_size]
            outputs.append(output)
            # Teacher forcing
            if random.random() < teacher_forcing_ratio:
                input_token = captions[:, t + 1].unsqueeze(1)
            else:
                _, topi = output.topk(1)
                input_token = topi.detach()

        outputs = torch.stack(outputs, dim=1)  # [batch_size, seq_len, vocab_size]
        return outputs

    def inference(self, features, max_len=20):
        """Inference mode for generating captions."""
        batch_size = features.size(0)
        features = self.dropout(torch.relu(self.feature_fc(features)))  # [batch_size, 512]
        hidden = self.get_initial_hidden(features)  # [4, batch_size, 512]
        input_token = torch.full((batch_size, 1), 1, dtype=torch.long, device=features.device)  # <SOS>
        outputs = []

        for _ in range(max_len):
            embed = self.embedding(input_token).squeeze(1)  # [batch_size, embed_dim]
            _, hidden = self.gru(embed.unsqueeze(1), hidden)  # Update hidden state
            output = self.fc(hidden[-2:].transpose(0, 1).contiguous().view(batch_size, -1))  # [batch_size, vocab_size]
            _, next_token = output.max(1)  # Greedy decoding
            outputs.append(next_token)
            input_token = next_token.unsqueeze(1)
            if (next_token == 2).all():  # Stop at <EOS>
                break

        return torch.stack(outputs, dim=1)  # [batch_size, seq_len]
# Paths
feature_dir = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features'
caption_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions1.pkl'

# Datasets and DataLoaders
train_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'train_vit_features.pt'), caption_file)
val_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'val_vit_features.pt'), caption_file)
test_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'test_vit_features.pt'), caption_file)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)

# Save vocab for evaluation
with open(os.path.join(feature_dir, 'vocab.pkl'), 'wb') as f:
    pickle.dump(train_dataset.vocab, f)
print(f"Saved vocab.pkl with {len(train_dataset.vocab.itos)} tokens")

# Model
model = BiGRUModel(
    feature_dim=768,
    vocab_size=len(train_dataset.vocab.itos),
    embed_dim=256,
    hidden_dim=512,
    num_layers=2,
    dropout=0.5
).cuda()

# Training setup
criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.0005)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, min_lr=1e-6, verbose=True)
num_epochs = 100

# Training loop
def train_epoch(model, loader, criterion, optimizer, teacher_forcing_ratio=0.5):
    model.train()
    total_loss = 0
    for features, captions, _ in tqdm(loader, desc="Training"):
        if features is None:
            continue
        features, captions = features.cuda(), captions.cuda()
        outputs = model(features, captions, teacher_forcing_ratio)
        loss = criterion(outputs.view(-1, outputs.size(-1)), captions[:, 1:].contiguous().view(-1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Validation loop
def validate_epoch(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for features, captions, _ in tqdm(loader, desc="Validating"):
            if features is None:
                continue
            features, captions = features.cuda(), captions.cuda()
            outputs = model(features, captions, teacher_forcing_ratio=1.0)
            loss = criterion(outputs.view(-1, outputs.size(-1)), captions[:, 1:].contiguous().view(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

# Training with early stopping
best_val_loss = float('inf')
patience = 3
no_improve_count = 0
min_delta = 0.01
for epoch in range(num_epochs):
    start_time = time.time()
    train_loss = train_epoch(model, train_loader, criterion, optimizer, teacher_forcing_ratio=0.5)
    val_loss = validate_epoch(model, val_loader, criterion)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Time: {(time.time() - start_time) / 60:.2f} min, LR: {optimizer.param_groups[0]['lr']:.6f}")
    if val_loss < best_val_loss - min_delta:
        best_val_loss = val_loss
        no_improve_count = 0
        torch.save(model.state_dict(), os.path.join(feature_dir, 'best_bigru_model1.pt'))
        print(f"Saved best model with Val Loss: {best_val_loss:.4f}")
    else:
        no_improve_count += 1
        print(f"No improvement in Val Loss, count: {no_improve_count}/{patience}")
    scheduler.step(val_loss)  # Update learning rate based on val_loss
    if no_improve_count >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break


Saved vocab.pkl with 18 tokens


Training:   0%|          | 0/1987 [00:00<?, ?it/s]


KeyError: 124693

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pickle
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import os
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings

# Vocabulary class (unchanged)
class Vocabulary:
    def __init__(self, min_freq=2):
        self.itos = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        self.stoi = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.min_freq = min_freq

    def build_vocabulary(self, captions):
        words = [word for caption in captions for word in caption]
        word_counts = Counter(words)
        idx = len(self.itos)
        for word, count in word_counts.items():
            if count >= self.min_freq and word not in self.stoi:
                self.itos[idx] = word
                self.stoi[word] = idx
                idx += 1

    def numericalize(self, caption):
        return [self.stoi.get(word, self.stoi['<UNK>']) for word in caption]

# Dataset (unchanged)
class Flickr30kCaptionDataset(Dataset):
    def __init__(self, feature_file, caption_file, vocab=None):
        data = torch.load(feature_file, weights_only=False)
        self.features = data['features']
        self.captions = pickle.load(open(caption_file, 'rb'))
        self.image_names = data['image_names']

        # Use external vocab if provided
        if vocab is not None:
            self.vocab = vocab
        else:
            self.vocab = Vocabulary(min_freq=2)
            self.vocab.build_vocabulary(self.captions)


    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        image_name = self.image_names[idx]
        caption = self.captions[image_name]

    # Fix for multiple captions per image
        if isinstance(caption[0], list):
            caption = caption[0]  # Pick the first caption

        numerical_caption = [self.vocab.stoi['<SOS>']] + self.vocab.numericalize(caption) + [self.vocab.stoi['<EOS>']]
        return feature, torch.tensor(numerical_caption, dtype=torch.long), image_name



# Custom collate function (unchanged)
def custom_collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return None, None, None
    features, captions, image_names = zip(*batch)
    features = torch.stack(features)  # [batch_size, 768]
    max_len = max(len(c) for c in captions)
    padded_captions = torch.zeros(len(captions), max_len, dtype=torch.long)
    for i, cap in enumerate(captions):
        padded_captions[i, :len(cap)] = cap
    return features, padded_captions, image_names

# Bi-GRU Model (unchanged, for loading)
class BiGRUModel(nn.Module):
    def __init__(self, feature_dim, vocab_size, embed_dim=256, hidden_dim=512, num_layers=2, dropout=0.5):
        super(BiGRUModel, self).__init__()
        self.feature_fc = nn.Linear(feature_dim, hidden_dim)  # Maps 768 -> 512
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)  # Output from bidirectional GRU
        self.dropout = nn.Dropout(dropout)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.init_weights()

    def init_weights(self):
        nn.init.xavier_uniform_(self.feature_fc.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.feature_fc.bias)
        nn.init.zeros_(self.fc.bias)

    def get_initial_hidden(self, features):
        """Initialize hidden state using transformed image features."""
        batch_size = features.size(0)
        h_first_layer = features.unsqueeze(0).repeat(2, 1, 1)  # [2, batch_size, hidden_dim] for first layer (fwd + bwd)
        h_second_layer = torch.zeros(2, batch_size, self.hidden_dim).to(features.device)  # [2, batch_size, hidden_dim] for second layer
        hidden = torch.cat([h_first_layer, h_second_layer], dim=0)  # [4, batch_size, hidden_dim]
        return hidden

    def forward(self, features, captions=None, teacher_forcing_ratio=0.5):
        """Forward pass for training with teacher forcing."""
        features = self.dropout(torch.relu(self.feature_fc(features)))  # [batch_size, 512]
        batch_size = features.size(0)
        if captions is None:
            return self.inference(features)
        hidden = self.get_initial_hidden(features)  # [4, batch_size, 512]
        embed = self.embedding(captions[:, :-1])  # Exclude <EOS> for input
        outputs = []
        input_token = captions[:, 0].unsqueeze(1)  # Start with <SOS>
        for t in range(embed.size(1)):
            embed_input = self.embedding(input_token).squeeze(1)  # [batch_size, embed_dim]
            _, hidden = self.gru(embed_input.unsqueeze(1), hidden)  # Update hidden state
            output = self.fc(hidden[-2:].transpose(0, 1).contiguous().view(batch_size, -1))  # [batch_size, vocab_size]
            outputs.append(output)
            if random.random() < teacher_forcing_ratio:
                input_token = captions[:, t + 1].unsqueeze(1)
            else:
                _, topi = output.topk(1)
                input_token = topi.detach()
        outputs = torch.stack(outputs, dim=1)  # [batch_size, seq_len, vocab_size]
        return outputs

    def inference(self, features, max_len=20):
        """Inference mode for generating captions."""
        batch_size = features.size(0)
        features = self.dropout(torch.relu(self.feature_fc(features)))  # [batch_size, 512]
        hidden = self.get_initial_hidden(features)  # [4, batch_size, 512]
        input_token = torch.full((batch_size, 1), 1, dtype=torch.long, device=features.device)  # <SOS>
        outputs = []
        for _ in range(max_len):
            embed = self.embedding(input_token).squeeze(1)  # [batch_size, embed_dim]
            _, hidden = self.gru(embed.unsqueeze(1), hidden)  # Update hidden state
            output = self.fc(hidden[-2:].transpose(0, 1).contiguous().view(batch_size, -1))  # [batch_size, vocab_size]
            _, next_token = output.max(1)  # Greedy decoding
            outputs.append(next_token)
            input_token = next_token.unsqueeze(1)
            if (next_token == 2).all():  # Stop at <EOS>
                break
        return torch.stack(outputs, dim=1)  # [batch_size, seq_len]

# Paths (updated for Colab, assuming files are in Google Drive)
feature_dir = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features'
caption_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions1.pkl'

# Datasets and DataLoaders
test_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'test_vit_features.pt'), caption_file)
val_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'val_vit_features.pt'), caption_file)

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)

# Load vocabulary

# Load the correct vocab used during training
# Load training vocab
vocab_path = os.path.join(feature_dir, 'vocab.pkl')
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

# Pass same vocab to both datasets
test_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'test_vit_features.pt'), caption_file, vocab)
val_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'val_vit_features.pt'), caption_file, vocab)


# Initialize model
model = BiGRUModel(
    feature_dim=768,
    vocab_size=len(vocab.itos),
    embed_dim=256,
    hidden_dim=512,
    num_layers=2,
    dropout=0.5
).cuda()

# Load trained model weights
model_path = os.path.join(feature_dir, 'best_bigru_model1.pt')
model.load_state_dict(torch.load(model_path))
model.eval()

# Evaluation function
def evaluate_model(model, loader, vocab, max_len=20, num_samples=5):
    references, hypotheses = [], []
    sample_outputs = []

    with torch.no_grad():
        for batch_idx, (features, captions, image_names) in enumerate(tqdm(loader, desc="Evaluating")):
            if features is None:
                continue
            features, captions = features.cuda(), captions.cuda()
            outputs = model.inference(features, max_len=max_len)  # [batch_size, seq_len]

            for i in range(captions.size(0)):
                # Ground truth caption (remove <SOS>, <EOS>, <PAD>)
                ref = [vocab.itos[idx.item()] for idx in captions[i] if idx.item() not in [0, 1, 2]]
                # Generated caption (remove <SOS>, <EOS>, <PAD>)
                hyp = [vocab.itos[idx.item()] for idx in outputs[i] if idx.item() not in [0, 1, 2]]
                references.append([ref])  # BLEU expects list of reference lists
                hypotheses.append(hyp)

                # Store samples for qualitative inspection
                if len(sample_outputs) < num_samples and batch_idx * captions.size(0) + i < num_samples:
                    sample_outputs.append({
                        'image_name': image_names[i],
                        'ground_truth': ' '.join(ref),
                        'generated': ' '.join(hyp)
                    })

    # Compute BLEU-4 with smoothing
    smoothing = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)

    return bleu_score, sample_outputs

# Evaluate on validation and test sets
val_bleu, val_samples = evaluate_model(model, val_loader, vocab, max_len=20, num_samples=5)
test_bleu, test_samples = evaluate_model(model, test_loader, vocab, max_len=20, num_samples=5)

# Print results
print(f"\nValidation BLEU-4 Score: {val_bleu:.4f}")
print("\nValidation Sample Outputs:")
for sample in val_samples:
    print(f"Image: {sample['image_name']}")
    print(f"Ground Truth: {sample['ground_truth']}")
    print(f"Generated: {sample['generated']}\n")

print(f"\nTest BLEU-4 Score: {test_bleu:.4f}")
print("\nTest Sample Outputs:")
for sample in test_samples:
    print(f"Image: {sample['image_name']}")
    print(f"Ground Truth: {sample['ground_truth']}")
    print(f"Generated: {sample['generated']}\n")

Evaluating: 100%|██████████| 249/249 [00:25<00:00,  9.79it/s]
Evaluating: 100%|██████████| 249/249 [00:33<00:00,  7.54it/s]



Validation BLEU-4 Score: 0.0000

Validation Sample Outputs:
Image: 4420802292.jpg
Ground Truth: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated: a man in a a shirt and a a a a a

Image: 1579291454.jpg
Ground Truth: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated: a man in a a shirt a a a

Image: 3720366614.jpg
Ground Truth: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated: a man in a a shirt is a a a

Image: 2315113960.jpg
Ground Truth: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated: a man in a a shirt and a a a a

Image: 2785408815.jpg
Ground Truth: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Generated: a man in a a shirt is a a a a


Test BLEU-4 Score: 0.0000

Test Sample Outputs:
Image: 3692746368.jpg
G

In [1]:
import pandas as pd
import spacy
import pickle
import warnings
warnings.filterwarnings('ignore')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Preprocess captions
def preprocess_caption(captions, batch_size=1000):
    captions = [str(c).strip() for c in captions if not pd.isna(c)]
    results = []
    for i in range(0, len(captions), batch_size):
        batch = captions[i:i + batch_size]
        docs = nlp.pipe([caption.lower() for caption in batch], disable=['parser', 'ner'])
        for doc in docs:
            tokens = [token.text for token in doc if not token.is_punct and token.text.strip()]
            results.append(tokens)
    return results

# Paths
caption_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\captions.txt'
output_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions1.pkl'

# Read captions with UTF-8 and error handling, skip header
try:
    df = pd.read_csv(caption_file, sep=',', names=['image', 'caption'], skiprows=1, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(caption_file, sep=',', names=['image', 'caption'], skiprows=1, encoding='latin1')

# Preprocess captions and create image-caption dictionary
image_captions = {}
captions = df['caption'].tolist()
image_names = df['image'].tolist()
preprocessed_captions = preprocess_caption(captions)

for img, cap in zip(image_names, preprocessed_captions):
    if img not in image_captions:
        image_captions[img] = []
    image_captions[img].append(cap)

# Save preprocessed captions
pickle.dump(image_captions, open(output_file, 'wb'))
print(f"Saved preprocessed_captions.pkl with {len(image_captions)} images")

# Debug: Verify alignment
print("\nSample Preprocessed Captions:")
for i, (img, caps) in enumerate(list(image_captions.items())[:5]):
    print(f"Image {i+1}: {img}, Captions: {caps}")

Saved preprocessed_captions.pkl with 31783 images

Sample Preprocessed Captions:
Image 1: 1000092795.jpg, Captions: [['two', 'young', 'guys', 'with', 'shaggy', 'hair', 'look', 'at', 'their', 'hands', 'while', 'hanging', 'out', 'in', 'the', 'yard'], ['two', 'young', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes'], ['two', 'men', 'in', 'green', 'shirts', 'are', 'standing', 'in', 'a', 'yard'], ['a', 'man', 'in', 'a', 'blue', 'shirt', 'standing', 'in', 'a', 'garden'], ['two', 'friends', 'enjoy', 'time', 'spent', 'together']]
Image 2: 10002456.jpg, Captions: [['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system'], ['workers', 'look', 'down', 'from', 'up', 'above', 'on', 'a', 'piece', 'of', 'equipment'], ['two', 'men', 'working', 'on', 'a', 'machine', 'wearing', 'hard', 'hats'], ['four', 'men', 'on', 'top', 'of', 'a', 'tall', 'structure'], ['three', 'men', 'on', 'a', 'large', 'rig']]
Image 3: 1000268201.jpg, Captions: [['a', 'child', 'in'

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import os
import pickle
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
import time
torch.backends.cudnn.benchmark = True
# Vocabulary class
class Vocabulary:
    def __init__(self, min_freq=2):
        self.itos = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        self.stoi = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.min_freq = min_freq

    def build_vocabulary(self, captions):
        words = [word for caption_list in captions.values() for caption in caption_list for word in caption]
        word_counts = Counter(words)
        idx = len(self.itos)
        for word, count in word_counts.items():
            if count >= self.min_freq and word not in self.stoi:
                self.itos[idx] = word
                self.stoi[word] = idx
                idx += 1

    def numericalize(self, caption):
        return [self.stoi.get(word, self.stoi['<UNK>']) for word in caption]

# Dataset
class Flickr30kCaptionDataset(Dataset):
    def __init__(self, feature_file, caption_file, augment=False):
        data = torch.load(feature_file, weights_only=False)
        self.features = data['features']
        self.image_names = data['image_names']
        self.captions_dict = pickle.load(open(caption_file, 'rb'))
        self.augment = augment
        self.vocab = Vocabulary(min_freq=2)
        self.vocab.build_vocabulary(self.captions_dict)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        image_name = self.image_names[idx]
        captions = self.captions_dict.get(image_name, [[]])
        caption = captions[torch.randint(0, len(captions), (1,)).item()] if self.augment and len(captions) > 1 else captions[0]
        numerical_caption = [self.vocab.stoi['<SOS>']] + self.vocab.numericalize(caption) + [self.vocab.stoi['<EOS>']]
        return feature, torch.tensor(numerical_caption, dtype=torch.long), image_name

# Custom collate function
def custom_collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return None, None, None
    features, captions, image_names = zip(*batch)
    features = torch.stack(features)
    max_len = max(len(c) for c in captions)
    padded_captions = torch.zeros(len(captions), max_len, dtype=torch.long)
    for i, cap in enumerate(captions):
        padded_captions[i, :len(cap)] = cap
    return features, padded_captions, image_names

# Transformer Model with Beam Search
class ImageCaptionTransformer(nn.Module):
    def __init__(self, feature_dim, vocab_size, embed_dim=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, hidden_dim=2048, dropout=0.05, max_len=100):
        super(ImageCaptionTransformer, self).__init__()
        self.feature_fc = nn.Linear(feature_dim, embed_dim)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos_encoder = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.embed_dim = embed_dim
        self.max_len = max_len
        self.dropout = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        nn.init.xavier_uniform_(self.feature_fc.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.feature_fc.bias)
        nn.init.zeros_(self.fc.bias)

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask

    def forward(self, features, captions=None, teacher_forcing_ratio=0.5):
        batch_size = features.size(0)
        device = features.device
        features = self.dropout(torch.relu(self.feature_fc(features)))
        features = features.unsqueeze(1)
        memory = self.transformer_encoder(features)
        if captions is None:
            return self.inference(features)
        captions = captions[:, :-1]
        embed = self.embedding(captions) * (self.embed_dim ** 0.5)
        positions = torch.arange(0, captions.size(1), device=device).unsqueeze(0).repeat(batch_size, 1)
        pos_embed = self.pos_encoder[:, :captions.size(1), :]
        embed = embed + pos_embed
        tgt_mask = self.generate_square_subsequent_mask(captions.size(1)).to(device)
        output = self.transformer_decoder(embed, memory, tgt_mask=tgt_mask)
        output = self.fc(output)
        return output

    def inference(self, features, max_len=20, beam_size=3):
        batch_size = features.size(0)
        device = features.device
        features = self.dropout(torch.relu(self.feature_fc(features)))
        features = features.unsqueeze(1)
        memory = self.transformer_encoder(features)
        sequences = [[torch.full((1, 1), 1, dtype=torch.long, device=device), 0.0]] * batch_size
        for _ in range(max_len):
            all_candidates = []
            for i in range(batch_size):
                candidates = []
                for seq, score in sequences[i]:
                    if seq[0, -1].item() == 2:
                        candidates.append([seq, score])
                        continue
                    embed = self.embedding(seq) * (self.embed_dim ** 0.5)
                    positions = torch.arange(0, seq.size(1), device=device).unsqueeze(0)
                    pos_embed = self.pos_encoder[:, :seq.size(1), :]
                    embed = embed + pos_embed
                    tgt_mask = self.generate_square_subsequent_mask(seq.size(1)).to(device)
                    output = self.transformer_decoder(embed, memory[i:i+1], tgt_mask=tgt_mask)
                    output = self.fc(output[:, -1, :])
                    probs = torch.softmax(output, dim=-1)
                    top_probs, top_idx = probs.topk(beam_size, dim=-1)
                    for k in range(beam_size):
                        next_token = top_idx[0, k].unsqueeze(0).unsqueeze(0)
                        next_score = score - torch.log(top_probs[0, k]).item()
                        new_seq = torch.cat([seq, next_token], dim=1)
                        candidates.append([new_seq, next_score])
                candidates = sorted(candidates, key=lambda x: x[1])[:beam_size]
                all_candidates.append(candidates)
            sequences = all_candidates
        outputs = [sequences[i][0][0] for i in range(batch_size)]
        return torch.cat(outputs, dim=0)

# Load original captions
def load_original_captions(caption_txt_file):
    caption_dict = {}
    try:
        with open(caption_txt_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()[1:]  # Skip header
            for line in lines:
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    image_name, caption = parts
                    caption_dict[image_name] = caption.split()
    except UnicodeDecodeError:
        with open(caption_txt_file, 'r', encoding='latin1') as f:
            lines = f.readlines()[1:]  # Skip header
            for line in lines:
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    image_name, caption = parts
                    caption_dict[image_name] = caption.split()
    return caption_dict

# Evaluation function
def evaluate_model(model, loader, vocab, caption_dict, max_len=20, num_samples=5, beam_size=3):
    model.eval()
    references, hypotheses = [], []
    sample_outputs = []

    with torch.no_grad():
        for batch_idx, (features, captions, image_names) in enumerate(tqdm(loader, desc="Evaluating")):
            if features is None:
                continue
            features, captions = features.cuda(), captions.cuda()
            outputs = model.inference(features, max_len=max_len, beam_size=beam_size)
            for i in range(captions.size(0)):
                image_name = image_names[i]
                ref = caption_dict.get(image_name, [vocab.itos[idx.item()] for idx in captions[i] if idx.item() not in [0, 1, 2]])
                hyp = [vocab.itos[idx.item()] for idx in outputs[i] if idx.item() not in [0, 1, 2]]
                references.append([ref])
                hypotheses.append(hyp)
                if len(sample_outputs) < num_samples and batch_idx * captions.size(0) + i < num_samples:
                    sample_outputs.append({
                        'image_name': image_name,
                        'ground_truth': ' '.join(ref),
                        'generated': ' '.join(hyp)
                    })
    smoothing = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)
    return bleu_score, sample_outputs

# Paths
feature_dir = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features'
caption_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions1.pkl'
caption_txt_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\captions.txt'

# Datasets and DataLoaders
train_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'train_vit_features.pt'), caption_file, augment=True)
val_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'val_vit_features.pt'), caption_file, augment=False)
test_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'test_vit_features.pt'), caption_file, augment=False)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)

# Save vocab
with open(os.path.join(feature_dir, 'vocab.pkl'), 'wb') as f:
    pickle.dump(train_dataset.vocab, f)
print(f"Saved vocab.pkl with {len(train_dataset.vocab.itos)} tokens")

# Model
model = ImageCaptionTransformer(
    feature_dim=768,
    vocab_size=len(train_dataset.vocab.itos),
    embed_dim=512,
    num_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    hidden_dim=2048,
    dropout=0.05,
    max_len=100
).cuda()

# Training setup
criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=30, eta_min=1e-6)
num_epochs = 100

# Training loop
def train_epoch(model, loader, criterion, optimizer, teacher_forcing_ratio=0.5):
    model.train()
    total_loss = 0
    for features, captions, _ in tqdm(loader, desc="Training"):
        if features is None:
            continue
        features, captions = features.cuda(), captions.cuda()
        outputs = model(features, captions, teacher_forcing_ratio)
        loss = criterion(outputs.view(-1, outputs.size(-1)), captions[:, 1:].contiguous().view(-1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Validation loop
def validate_epoch(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for features, captions, _ in tqdm(loader, desc="Validating"):
            if features is None:
                continue
            features, captions = features.cuda(), captions.cuda()
            outputs = model(features, captions, teacher_forcing_ratio=1.0)
            loss = criterion(outputs.view(-1, outputs.size(-1)), captions[:, 1:].contiguous().view(-1))
            total_loss += loss.item()
    return total_loss / len(loader)

# Training with early stopping
best_val_loss = float('inf')
patience = 3
no_improve_count = 0
min_delta = 0.01
for epoch in range(num_epochs):
    start_time = time.time()
    train_loss = train_epoch(model, train_loader, criterion, optimizer, teacher_forcing_ratio=0.5)
    val_loss = validate_epoch(model, val_loader, criterion)
    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Time: {(time.time() - start_time) / 60:.2f} min, LR: {optimizer.param_groups[0]['lr']:.6f}")
    if val_loss < best_val_loss - min_delta:
        best_val_loss = val_loss
        no_improve_count = 0
        torch.save(model.state_dict(), os.path.join(feature_dir, 'best_transformer_model.pt'))
        print(f"Saved best model with Val Loss: {best_val_loss:.4f}")
    else:
        no_improve_count += 1
        print(f"No improvement in Val Loss, count: {no_improve_count}/{patience}")
    if no_improve_count >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

Saved vocab.pkl with 12097 tokens


Training: 100%|██████████| 3973/3973 [07:10<00:00,  9.23it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.10it/s]


Epoch 1/100, Train Loss: 4.6521, Val Loss: 4.2158, Time: 7.39 min, LR: 0.000100
Saved best model with Val Loss: 4.2158


Training: 100%|██████████| 3973/3973 [07:31<00:00,  8.81it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.85it/s]


Epoch 2/100, Train Loss: 4.1017, Val Loss: 3.9552, Time: 7.74 min, LR: 0.000099
Saved best model with Val Loss: 3.9552


Training: 100%|██████████| 3973/3973 [07:16<00:00,  9.11it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.29it/s]


Epoch 3/100, Train Loss: 3.9127, Val Loss: 3.8118, Time: 7.47 min, LR: 0.000098
Saved best model with Val Loss: 3.8118


Training: 100%|██████████| 3973/3973 [07:11<00:00,  9.21it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.60it/s]


Epoch 4/100, Train Loss: 3.7773, Val Loss: 3.6878, Time: 7.40 min, LR: 0.000096
Saved best model with Val Loss: 3.6878


Training: 100%|██████████| 3973/3973 [07:13<00:00,  9.16it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.62it/s]


Epoch 5/100, Train Loss: 3.6656, Val Loss: 3.5878, Time: 7.43 min, LR: 0.000093
Saved best model with Val Loss: 3.5878


Training: 100%|██████████| 3973/3973 [07:16<00:00,  9.11it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.21it/s]


Epoch 6/100, Train Loss: 3.5630, Val Loss: 3.5023, Time: 7.47 min, LR: 0.000091
Saved best model with Val Loss: 3.5023


Training: 100%|██████████| 3973/3973 [07:15<00:00,  9.12it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.39it/s]


Epoch 7/100, Train Loss: 3.4722, Val Loss: 3.4153, Time: 7.47 min, LR: 0.000087
Saved best model with Val Loss: 3.4153


Training: 100%|██████████| 3973/3973 [07:14<00:00,  9.14it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.99it/s]


Epoch 8/100, Train Loss: 3.3891, Val Loss: 3.3371, Time: 7.44 min, LR: 0.000084
Saved best model with Val Loss: 3.3371


Training: 100%|██████████| 3973/3973 [07:24<00:00,  8.94it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.62it/s]


Epoch 9/100, Train Loss: 3.3070, Val Loss: 3.2566, Time: 7.61 min, LR: 0.000080
Saved best model with Val Loss: 3.2566


Training: 100%|██████████| 3973/3973 [07:38<00:00,  8.66it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.90it/s]


Epoch 10/100, Train Loss: 3.2314, Val Loss: 3.1775, Time: 7.87 min, LR: 0.000075
Saved best model with Val Loss: 3.1775


Training: 100%|██████████| 3973/3973 [07:29<00:00,  8.84it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 36.18it/s]


Epoch 11/100, Train Loss: 3.1508, Val Loss: 3.1093, Time: 7.72 min, LR: 0.000071
Saved best model with Val Loss: 3.1093


Training: 100%|██████████| 3973/3973 [07:29<00:00,  8.84it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.32it/s]


Epoch 12/100, Train Loss: 3.0766, Val Loss: 3.0308, Time: 7.71 min, LR: 0.000066
Saved best model with Val Loss: 3.0308


Training: 100%|██████████| 3973/3973 [07:35<00:00,  8.73it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 36.75it/s]


Epoch 13/100, Train Loss: 3.0090, Val Loss: 2.9622, Time: 7.81 min, LR: 0.000061
Saved best model with Val Loss: 2.9622


Training: 100%|██████████| 3973/3973 [07:26<00:00,  8.91it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 38.01it/s]


Epoch 14/100, Train Loss: 2.9370, Val Loss: 2.8978, Time: 7.65 min, LR: 0.000056
Saved best model with Val Loss: 2.8978


Training: 100%|██████████| 3973/3973 [07:29<00:00,  8.84it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.55it/s]


Epoch 15/100, Train Loss: 2.8738, Val Loss: 2.8329, Time: 7.71 min, LR: 0.000051
Saved best model with Val Loss: 2.8329


Training: 100%|██████████| 3973/3973 [07:30<00:00,  8.82it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.71it/s]


Epoch 16/100, Train Loss: 2.8078, Val Loss: 2.7725, Time: 7.73 min, LR: 0.000045
Saved best model with Val Loss: 2.7725


Training: 100%|██████████| 3973/3973 [07:25<00:00,  8.92it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.55it/s]


Epoch 17/100, Train Loss: 2.7514, Val Loss: 2.7124, Time: 7.63 min, LR: 0.000040
Saved best model with Val Loss: 2.7124


Training: 100%|██████████| 3973/3973 [07:21<00:00,  9.01it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.90it/s]


Epoch 18/100, Train Loss: 2.6994, Val Loss: 2.6543, Time: 7.56 min, LR: 0.000035
Saved best model with Val Loss: 2.6543


Training: 100%|██████████| 3973/3973 [07:16<00:00,  9.11it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.85it/s]


Epoch 19/100, Train Loss: 2.6460, Val Loss: 2.6032, Time: 7.48 min, LR: 0.000030
Saved best model with Val Loss: 2.6032


Training: 100%|██████████| 3973/3973 [07:14<00:00,  9.14it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 35.54it/s]


Epoch 20/100, Train Loss: 2.6005, Val Loss: 2.5533, Time: 7.48 min, LR: 0.000026
Saved best model with Val Loss: 2.5533


Training: 100%|██████████| 3973/3973 [07:39<00:00,  8.65it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.99it/s]


Epoch 21/100, Train Loss: 2.5615, Val Loss: 2.5093, Time: 7.88 min, LR: 0.000021
Saved best model with Val Loss: 2.5093


Training: 100%|██████████| 3973/3973 [07:37<00:00,  8.68it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.43it/s]


Epoch 22/100, Train Loss: 2.5210, Val Loss: 2.4739, Time: 7.85 min, LR: 0.000017
Saved best model with Val Loss: 2.4739


Training: 100%|██████████| 3973/3973 [07:37<00:00,  8.69it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.65it/s]


Epoch 23/100, Train Loss: 2.4866, Val Loss: 2.4345, Time: 7.84 min, LR: 0.000014
Saved best model with Val Loss: 2.4345


Training: 100%|██████████| 3973/3973 [07:40<00:00,  8.64it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.51it/s]


Epoch 24/100, Train Loss: 2.4597, Val Loss: 2.4080, Time: 7.89 min, LR: 0.000010
Saved best model with Val Loss: 2.4080


Training: 100%|██████████| 3973/3973 [07:26<00:00,  8.90it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 38.75it/s]


Epoch 25/100, Train Loss: 2.4378, Val Loss: 2.3822, Time: 7.66 min, LR: 0.000008
Saved best model with Val Loss: 2.3822


Training: 100%|██████████| 3973/3973 [07:23<00:00,  8.96it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 38.91it/s]


Epoch 26/100, Train Loss: 2.4188, Val Loss: 2.3630, Time: 7.60 min, LR: 0.000005
Saved best model with Val Loss: 2.3630


Training: 100%|██████████| 3973/3973 [07:23<00:00,  8.96it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 38.59it/s]


Epoch 27/100, Train Loss: 2.4020, Val Loss: 2.3531, Time: 7.61 min, LR: 0.000003
No improvement in Val Loss, count: 1/3


Training: 100%|██████████| 3973/3973 [07:26<00:00,  8.90it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.14it/s]


Epoch 28/100, Train Loss: 2.3913, Val Loss: 2.3435, Time: 7.65 min, LR: 0.000002
Saved best model with Val Loss: 2.3435


Training: 100%|██████████| 3973/3973 [07:24<00:00,  8.94it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.48it/s]


Epoch 29/100, Train Loss: 2.3852, Val Loss: 2.3356, Time: 7.61 min, LR: 0.000001
No improvement in Val Loss, count: 1/3


Training: 100%|██████████| 3973/3973 [07:19<00:00,  9.04it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 40.34it/s]


Epoch 30/100, Train Loss: 2.3810, Val Loss: 2.3318, Time: 7.53 min, LR: 0.000001
Saved best model with Val Loss: 2.3318


Training: 100%|██████████| 3973/3973 [07:29<00:00,  8.83it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.10it/s]


Epoch 31/100, Train Loss: 2.3774, Val Loss: 2.3302, Time: 7.72 min, LR: 0.000001
No improvement in Val Loss, count: 1/3


Training: 100%|██████████| 3973/3973 [07:34<00:00,  8.74it/s]
Validating: 100%|██████████| 497/497 [00:13<00:00, 37.17it/s]


Epoch 32/100, Train Loss: 2.3763, Val Loss: 2.3272, Time: 7.80 min, LR: 0.000002
No improvement in Val Loss, count: 2/3


Training: 100%|██████████| 3973/3973 [07:30<00:00,  8.83it/s]
Validating: 100%|██████████| 497/497 [00:12<00:00, 39.87it/s]

Epoch 33/100, Train Loss: 2.3750, Val Loss: 2.3233, Time: 7.71 min, LR: 0.000003
No improvement in Val Loss, count: 3/3
Early stopping at epoch 33





In [4]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# Vocabulary class (unchanged)
class Vocabulary:
    def __init__(self, min_freq=2):
        self.itos = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        self.stoi = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.min_freq = min_freq

    def build_vocabulary(self, captions):
        words = [word for caption_list in captions.values() for caption in caption_list for word in caption]
        word_counts = Counter(words)
        idx = len(self.itos)
        for word, count in word_counts.items():
            if count >= self.min_freq and word not in self.stoi:
                self.itos[idx] = word
                self.stoi[word] = idx
                idx += 1

    def numericalize(self, caption):
        return [self.stoi.get(word, self.stoi['<UNK>']) for word in caption]

# Dataset (unchanged)
class Flickr30kCaptionDataset(Dataset):
    def __init__(self, feature_file, caption_file, augment=False):
        data = torch.load(feature_file, weights_only=False)
        self.features = data['features']
        self.image_names = data['image_names']
        self.captions_dict = pickle.load(open(caption_file, 'rb'))
        self.augment = augment
        self.vocab = Vocabulary(min_freq=2)
        self.vocab.build_vocabulary(self.captions_dict)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        image_name = self.image_names[idx]
        captions = self.captions_dict.get(image_name, [[]])
        caption = captions[torch.randint(0, len(captions), (1,)).item()] if self.augment and len(captions) > 1 else captions[0]
        numerical_caption = [self.vocab.stoi['<SOS>']] + self.vocab.numericalize(caption) + [self.vocab.stoi['<EOS>']]
        return feature, torch.tensor(numerical_caption, dtype=torch.long), image_name

# Custom collate function (unchanged)
def custom_collate_fn(batch):
    batch = [item for item in batch if item is not None]
    if not batch:
        return None, None, None
    features, captions, image_names = zip(*batch)
    features = torch.stack(features)
    max_len = max(len(c) for c in captions)
    padded_captions = torch.zeros(len(captions), max_len, dtype=torch.long)
    for i, cap in enumerate(captions):
        padded_captions[i, :len(cap)] = cap
    return features, padded_captions, image_names

# Transformer Model with Greedy Inference (to match best_transformer_model.pt)
class ImageCaptionTransformer(nn.Module):
    def __init__(self, feature_dim, vocab_size, embed_dim=512, num_heads=8, num_encoder_layers=6, num_decoder_layers=6, hidden_dim=2048, dropout=0.05, max_len=100):
        super(ImageCaptionTransformer, self).__init__()
        self.feature_fc = nn.Linear(feature_dim, embed_dim)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.pos_encoder = nn.Parameter(torch.zeros(1, max_len, embed_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_decoder_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)
        self.embed_dim = embed_dim
        self.max_len = max_len
        self.dropout = nn.Dropout(dropout)
        self.init_weights()

    def init_weights(self):
        nn.init.xavier_uniform_(self.feature_fc.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.feature_fc.bias)
        nn.init.zeros_(self.fc.bias)

    def generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz), diagonal=1).bool()
        return mask

    def forward(self, features, captions=None, teacher_forcing_ratio=0.5):
        batch_size = features.size(0)
        device = features.device
        features = self.dropout(torch.relu(self.feature_fc(features)))
        features = features.unsqueeze(1)
        memory = self.transformer_encoder(features)
        if captions is None:
            return self.inference(features)
        captions = captions[:, :-1]
        embed = self.embedding(captions) * (self.embed_dim ** 0.5)
        positions = torch.arange(0, captions.size(1), device=device).unsqueeze(0).repeat(batch_size, 1)
        pos_embed = self.pos_encoder[:, :captions.size(1), :]
        embed = embed + pos_embed
        tgt_mask = self.generate_square_subsequent_mask(captions.size(1)).to(device)
        output = self.transformer_decoder(embed, memory, tgt_mask=tgt_mask)
        output = self.fc(output)
        return output

    def inference(self, features, max_len=20):
        batch_size = features.size(0)
        device = features.device
        features = self.dropout(torch.relu(self.feature_fc(features)))
        features = features.unsqueeze(1)
        memory = self.transformer_encoder(features)
        outputs = []
        input_token = torch.full((batch_size, 1), 1, dtype=torch.long, device=device)
        for _ in range(max_len):
            embed = self.embedding(input_token) * (self.embed_dim ** 0.5)
            positions = torch.arange(0, input_token.size(1), device=device).unsqueeze(0).repeat(batch_size, 1)
            pos_embed = self.pos_encoder[:, :input_token.size(1), :]
            embed = embed + pos_embed
            tgt_mask = self.generate_square_subsequent_mask(input_token.size(1)).to(device)
            output = self.transformer_decoder(embed, memory, tgt_mask=tgt_mask)
            output = self.fc(output[:, -1, :])
            _, next_token = output.max(1)
            outputs.append(next_token)
            input_token = torch.cat([input_token, next_token.unsqueeze(1)], dim=1)
            if (next_token == 2).all():
                break
        return torch.stack(outputs, dim=1)

# Load original captions
def load_original_captions(caption_txt_file):
    caption_dict = {}
    try:
        with open(caption_txt_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()[1:]  # Skip header
            for line in lines:
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    image_name, caption = parts
                    caption_dict[image_name] = caption.split()
    except UnicodeDecodeError:
        with open(caption_txt_file, 'r', encoding='latin1') as f:
            lines = f.readlines()[1:]  # Skip header
            for line in lines:
                parts = line.strip().split(',', 1)
                if len(parts) == 2:
                    image_name, caption = parts
                    caption_dict[image_name] = caption.split()
    return caption_dict

# Evaluation function (updated for greedy decoding)
def evaluate_model(model, loader, vocab, caption_dict, max_len=20, num_samples=5):
    model.eval()
    references, hypotheses = [], []
    sample_outputs = []

    with torch.no_grad():
        for batch_idx, (features, captions, image_names) in enumerate(tqdm(loader, desc="Evaluating")):
            if features is None:
                continue
            features, captions = features.cuda(), captions.cuda()
            outputs = model.inference(features, max_len=max_len)  # Greedy decoding
            for i in range(captions.size(0)):
                image_name = image_names[i]
                ref = caption_dict.get(image_name, [vocab.itos[idx.item()] for idx in captions[i] if idx.item() not in [0, 1, 2]])
                hyp = [vocab.itos[idx.item()] for idx in outputs[i] if idx.item() not in [0, 1, 2]]
                references.append([ref])
                hypotheses.append(hyp)
                if len(sample_outputs) < num_samples and batch_idx * captions.size(0) + i < num_samples:
                    sample_outputs.append({
                        'image_name': image_name,
                        'ground_truth': ' '.join(ref),
                        'generated': ' '.join(hyp)
                    })
    smoothing = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing)
    return bleu_score, sample_outputs

# Paths
feature_dir = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\ViT_Features'
caption_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\preprocessed_captions1.pkl'
caption_txt_file = r'D:\Projects\AIniverse\Scene_Description_Generator\flickr30k\versions\1\captions.txt'

# Load original captions
caption_dict = load_original_captions(caption_txt_file)

# Datasets and DataLoaders
val_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'val_vit_features.pt'), caption_file, augment=False)
test_dataset = Flickr30kCaptionDataset(os.path.join(feature_dir, 'test_vit_features.pt'), caption_file, augment=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)

# Load vocabulary
vocab = pickle.load(open(os.path.join(feature_dir, 'vocab.pkl'), 'rb'))
print(f"Loaded vocab.pkl with {len(vocab.itos)} tokens")

# Initialize model
model = ImageCaptionTransformer(
    feature_dim=768,
    vocab_size=len(vocab.itos),
    embed_dim=512,
    num_heads=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    hidden_dim=2048,
    dropout=0.05,
    max_len=100
).cuda()

# Load trained model weights
model_path = os.path.join(feature_dir, 'best_transformer_model.pt')
model.load_state_dict(torch.load(model_path))

# Evaluate on validation and test sets
val_bleu, val_samples = evaluate_model(model, val_loader, vocab, caption_dict, max_len=20, num_samples=5)
test_bleu, test_samples = evaluate_model(model, test_loader, vocab, caption_dict, max_len=20, num_samples=5)

# Print results
print(f"\nValidation BLEU-4 Score: {val_bleu:.4f}")
print("\nValidation Sample Outputs:")
for sample in val_samples:
    print(f"Image: {sample['image_name']}")
    print(f"Ground Truth: {sample['ground_truth']}")
    print(f"Generated: {sample['generated']}\n")

print(f"\nTest BLEU-4 Score: {test_bleu:.4f}")
print("\nTest Sample Outputs:")
for sample in test_samples:
    print(f"Image: {sample['image_name']}")
    print(f"Ground Truth: {sample['ground_truth']}")
    print(f"Generated: {sample['generated']}\n")

# Debug: Verify caption alignment
print("\nSample Caption.txt Verification:")
for i, image_name in enumerate(list(caption_dict.keys())[:5]):
    print(f"Image {i+1}: {image_name}, Caption: {' '.join(caption_dict[image_name])}")

# Debug: Verify vocabulary
print(f"\nSample Vocab Tokens: {list(vocab.itos.items())[:10]}")

Loaded vocab.pkl with 12097 tokens


Evaluating: 100%|██████████| 497/497 [01:51<00:00,  4.44it/s]
Evaluating: 100%|██████████| 497/497 [01:52<00:00,  4.42it/s]



Validation BLEU-4 Score: 0.0862

Validation Sample Outputs:
Image: 4420802292.jpg
Ground Truth: A man concentrates to paint details .
Generated: a man with a green cap and a white long sleeve shirt is painting a picture of a building with

Image: 1579291454.jpg
Ground Truth: Priests reading prayers from note cards .
Generated: a man in a white robe is reading from a book to another man in a white robe

Image: 3720366614.jpg
Ground Truth: The dog is jumping up beside a red wall .
Generated: a dog is jumping up to catch a toy while another dog watches

Image: 2315113960.jpg
Ground Truth: A woman at an exhibit faces away from a camera .
Generated: a woman is looking at a photograph

Image: 2785408815.jpg
Ground Truth: a man smiling in a white coat .
Generated: a man in a lab coat is smiling with a large smile on his face


Test BLEU-4 Score: 0.0852

Test Sample Outputs:
Image: 3692746368.jpg
Ground Truth: Two people are in a pond pulling a life raft .
Generated: two people are in a muddy

In [8]:
data = torch.load(os.path.join(feature_dir, 'train_vit_features.pt'), weights_only=False)
features, image_names = data['features'], data['image_names']
unique_indices = {name: i for i, name in enumerate(image_names)}.values()
unique_features = features[list(unique_indices)]
unique_image_names = [image_names[i] for i in unique_indices]
torch.save({'features': unique_features, 'image_names': unique_image_names}, os.path.join(feature_dir, 'train_vit_features_unique.pt'))
print(f"Unique train features shape: {unique_features.shape}")

Unique train features shape: torch.Size([31774, 768])
