In [1]:
# === Cell 1: Imports ===
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingLR
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

from transformers import (
    BlipForImageTextRetrieval,
    BlipVisionModel,
    BlipConfig,
    BlipImageProcessor,
    AutoModel,
    AutoTokenizer,
    AutoConfig
)

from PIL import Image
import json
import os
import random
import numpy as np
import math
import time
import transformers
import gc

try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

print(f"PyTorch Version: {torch.__version__}")
print(f"Transformers Version: {transformers.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device Name: {torch.cuda.get_device_name(1)}")
    torch.cuda.set_device(1)

PyTorch Version: 2.6.0+cu124
Transformers Version: 4.50.0
CUDA Available: True
CUDA Device Name: NVIDIA GeForce RTX 4090


In [2]:
# === Cell 2: Configuration Class (CFG) for ViSigLIP Pretraining ===
class CFG:
    # --- Paths ---
    data_path = "./json_data/"
    image_base_path = "./data/OpenViVQA-dataset/"
    model_path = "./trained_models/ViSigLIP_uitopenviic"

    # --- Model Selection ---
    selected_vision_source = "Salesforce/blip-image-captioning-base"
    selected_text_model = "vinai/phobert-base"
    text_tokenizer_name = selected_text_model

    # --- Model parameters ---
    blip_vision_model_name = selected_vision_source
    blip_image_processor_name = selected_vision_source
    @property
    def text_embedding(self): return 768
    @property
    def vision_embedding(self): return 768
    projection_dim = 768

    # --- SigLIP specific ---
    learnable_temperature = True
    temperature_init = 10.0
    learnable_bias = True
    bias_init = -10.0

    # --- Training parameters ---
    seed = 42
    batch_size = 32   # Keep reduced batch size
    accumulation_steps = 64 # Keep high accumulation for large effective batch
    num_workers = 20

    learning_rate = 1e-4
    warmup_steps = 1000 # Adjust warmup relative to total steps? Maybe keep at 1k? Or reduce? Let's keep 1000 for now.
    weight_decay = 0.1

    scheduler_type = "cosine"
    rop_patience = 5
    rop_factor = 0.5

    epochs = 50

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    use_amp = True

    max_length = 200

    # --- Loss/Saving parameters ---
    save_best_only = True
    metric_to_track = "avg_acc"
    mode = "max"
    # Adjust intervals based on new total steps (calculated in Cell 9)
    save_interval_steps = 500 # Example: Save every 500 steps
    validation_interval_steps = 1000
    log_interval_steps = 50 # Log reasonably often

    early_stopping_patience = 5 # Patience in terms of validation checks
    early_stopping_min_delta = 0.001

# --- Instantiate Config and Create Output Dir ---
config = CFG()
os.makedirs(config.model_path, exist_ok=True)
print(f"Using device: {config.device}")
print(f"Effective Batch Size (per optimizer step): {config.batch_size * config.accumulation_steps}")
print(f"Model output path: {config.model_path}")
print(f"Selected Vision Source: {config.selected_vision_source}")
print(f"Selected Text Model: {config.selected_text_model}")
print(f"Image base path (for resolving paths in JSON): {os.path.abspath(config.image_base_path)}")
print(f"AMP Enabled: {config.use_amp}")

Using device: cuda
Effective Batch Size (per optimizer step): 2048
Model output path: ./trained_models/ViSigLIP_uitopenviic
Selected Vision Source: Salesforce/blip-image-captioning-base
Selected Text Model: vinai/phobert-base
Image base path (for resolving paths in JSON): /home/researcher/huypq69/TuningModels/data/OpenViVQA-dataset
AMP Enabled: True


In [3]:
# === Cell 3: Seeding ===
def set_seed(seed=config.seed):
    print(f"Setting seed: {seed}")
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

set_seed()

Setting seed: 42


In [4]:
# === Cell 4: Metric & AvgMeter Utilities ===
class AvgMeter:
    """Computes and stores the average and current value"""
    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        self.sum = 0
        self.count = 0
        self.avg = 0.0

    def update(self, val, count=1):
        if torch.is_tensor(val):
            val = val.item()
        if isinstance(val, (int, float)):
            self.sum += val * count
            self.count += count
            self.avg = float(self.sum) / self.count if self.count != 0 else 0.0
        else:
            print(f"Warning: Cannot update AvgMeter '{self.name}' with value type {type(val)}")

    def __repr__(self):
        return f"{self.name}: {self.avg:.4f}"

def compute_recall_at_k(similarity_matrix, k, dim):
    n = similarity_matrix.shape[1-dim]
    if n == 0: 
        return 0.0
    
    correct_count = 0
    actual_k = min(k, similarity_matrix.shape[dim])
    if actual_k == 0: 
        return 0.0

    top_k_indices = torch.topk(similarity_matrix, actual_k, dim=dim).indices
    ground_truth = torch.arange(n, device=similarity_matrix.device)

    if dim == 0:  # I2T
        for img_idx in range(n):
            if ground_truth[img_idx] in top_k_indices[:, img_idx]:
                correct_count += 1
    elif dim == 1:  # T2I
        for txt_idx in range(n):
            if ground_truth[txt_idx] in top_k_indices[txt_idx, :]:
                correct_count += 1
    else: 
        raise ValueError("dim must be 0 or 1")
        
    return float(correct_count) / n if n > 0 else 0.0

def compute_metrics(image_embeddings, text_embeddings):
    # Ensure embeddings are float32 for stable matmul
    sim_matrix = text_embeddings.float() @ image_embeddings.float().T
    n = sim_matrix.shape[0]
    
    if n == 0:
        # Return default zero metrics for empty batch
        return {
            "i2t_acc": 0.0, "t2i_acc": 0.0, "avg_acc": 0.0,
            "avg_cosine_sim": 0.0,
            "i2t_recall": {"R@1": 0.0, "R@5": 0.0, "R@10": 0.0},
            "t2i_recall": {"R@1": 0.0, "R@5": 0.0, "R@10": 0.0}
        }

    ground_truth = torch.arange(n, device=sim_matrix.device)
    i2t_preds = torch.argmax(sim_matrix, dim=0)
    t2i_preds = torch.argmax(sim_matrix, dim=1)
    
    i2t_acc = (i2t_preds == ground_truth).float().mean().item()
    t2i_acc = (t2i_preds == ground_truth).float().mean().item()
    avg_acc = (i2t_acc + t2i_acc) / 2.0
    avg_cosine_sim = torch.diag(sim_matrix).mean().item()

    i2t_recall = {}
    t2i_recall = {}
    recall_k_values = [k for k in [1, 5, 10] if k <= n]
    
    for k in recall_k_values:
        i2t_recall[f"R@{k}"] = compute_recall_at_k(sim_matrix, k, dim=0)
        t2i_recall[f"R@{k}"] = compute_recall_at_k(sim_matrix, k, dim=1)

    # Ensure all keys R@1, R@5, R@10 exist even if k>n
    for k in [1, 5, 10]:
        k_str = f"R@{k}"
        if k_str not in i2t_recall: i2t_recall[k_str] = 0.0
        if k_str not in t2i_recall: t2i_recall[k_str] = 0.0

    metrics = {
        "i2t_acc": i2t_acc, 
        "t2i_acc": t2i_acc, 
        "avg_acc": avg_acc,
        "avg_cosine_sim": avg_cosine_sim,
        "i2t_recall": i2t_recall, 
        "t2i_recall": t2i_recall
    }
    return metrics

print("Metric utilities defined.")

Metric utilities defined.


In [5]:
# === Cell 5: Dataset Class Definition ===

class CustomImageCaptionDataset(Dataset):
    """
    Loads image-caption pairs from JSON metadata.
    Handles both single JSON list format and JSON-per-line format.
    """
    def __init__(self, json_path_or_list, image_base_path, tokenizer, image_processor, max_length):
        super().__init__()
        self.data = []
        if isinstance(json_path_or_list, str) and os.path.isdir(json_path_or_list):
            json_files = [os.path.join(json_path_or_list, f) for f in os.listdir(json_path_or_list) if f.endswith('.json')]
            print(f"Found {len(json_files)} JSON files in {json_path_or_list}")
        elif isinstance(json_path_or_list, str) and os.path.isfile(json_path_or_list):
            json_files = [json_path_or_list]
        elif isinstance(json_path_or_list, list):
            json_files = json_path_or_list
        else:
            raise ValueError("json_path_or_list must be a directory, a single JSON file, or a list of JSON files.")

        print("Loading JSON metadata (this might take time for large datasets)...")
        total_loaded_count = 0
        for json_path in tqdm(json_files, desc="Loading JSONs"):
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    try:
                        # Attempt to load the entire file as a single JSON object
                        file_data = json.load(f)
                        if isinstance(file_data, list):
                            self.data.extend(file_data)
                            total_loaded_count += len(file_data)
                        else:
                            # Handle case where it's a single dict
                            self.data.append(file_data)
                            total_loaded_count += 1
                            print(f"  Warning: Loaded single JSON object from {json_path}, expected a list.")
                    except json.JSONDecodeError:
                        # Try line-by-line (JSON-per-line format)
                        print(f"  Info: Failed to load {json_path} as single JSON. Attempting JSON-per-line format...")
                        f.seek(0)  # Rewind file pointer
                        count_line_by_line = 0
                        for line in f:
                            line = line.strip()
                            if line:  # Skip empty lines
                                try:
                                    line_data = json.loads(line)
                                    self.data.append(line_data)
                                    count_line_by_line += 1
                                except json.JSONDecodeError as line_err:
                                    print(f"  ERROR parsing line in {json_path}: {line_err}. Line content (partial): {line[:100]}...")
                        total_loaded_count += count_line_by_line
                        if count_line_by_line > 0:
                            print(f"  Successfully loaded {count_line_by_line} items using JSON-per-line format from {json_path}.")
                        else:
                            print(f"  Failed to load any data using JSON-per-line format from {json_path} either.")
            except Exception as e:
                print(f"ERROR opening or processing file {json_path}: {e}")

        print(f"Loaded {total_loaded_count} samples total from {len(json_files)} file(s).")
        self.data = [item for item in self.data if item]  # Clean out any potential None entries
        print(f"Dataset size after potential cleaning: {len(self.data)}")

        if not self.data:
            print("WARNING: No data loaded! Training cannot proceed.")

        self.image_base_path = image_base_path
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.max_length = max_length
        try:
            if isinstance(image_processor.size, dict):
                proc_size = image_processor.size
                self.img_size = proc_size.get('height', proc_size.get('shortest_edge', 224))
            else:
                self.img_size = image_processor.size
                if isinstance(self.img_size, (tuple, list)): self.img_size = self.img_size[0]
        except AttributeError:
            print("Warning: Could not determine image size from processor, defaulting to 224.")
            self.img_size = 224
        print(f"Using image target size: {self.img_size}x{self.img_size}")
        if not os.path.isdir(self.image_base_path):
            print(f"WARNING: Image base path does not exist: {os.path.abspath(self.image_base_path)}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if idx >= len(self.data): raise IndexError("Index out of bounds")
        item = self.data[idx]

        relative_image_path = item.get('image_path', item.get('url', item.get('filename')))
        # Handle caption being list OR string
        caption_data = item.get('caption', item.get('text', item.get('title', '')))
        if isinstance(caption_data, list):
            caption = caption_data[0] if caption_data else ""
        elif isinstance(caption_data, str):
            caption = caption_data
        else:
            caption = ""

        if not relative_image_path or not caption:
            return self._get_dummy_item()

        # Load Image
        try:
            image_path = os.path.join(self.image_base_path, relative_image_path)
            image = Image.open(image_path).convert('RGB')
            image_inputs = self.image_processor(images=image, return_tensors="pt")
            pixel_values = image_inputs['pixel_values'].squeeze(0)
        except Exception:
            return self._get_dummy_item()

        # Process Text
        try:
            text_inputs = self.tokenizer(
                caption, padding='max_length', truncation=True,
                max_length=self.max_length, return_tensors='pt'
            )
            input_ids = text_inputs['input_ids'].squeeze(0)
            attention_mask = text_inputs['attention_mask'].squeeze(0)
        except Exception:
            return self._get_dummy_item()

        return {
            "pixel_values": pixel_values,
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }

    def _get_dummy_item(self):
        return {
            "pixel_values": torch.zeros((3, self.img_size, self.img_size), dtype=torch.float),
            "input_ids": torch.zeros(self.max_length, dtype=torch.long),
            "attention_mask": torch.zeros(self.max_length, dtype=torch.long)
        }

print("CustomImageCaptionDataset class defined.")

CustomImageCaptionDataset class defined.


In [6]:
# === Cell 6: Model Definition (ViSigLIP) ===
# Uses corrected vision loading V2

class ImageEncoder(nn.Module):
    """Encodes images using BLIP's Vision Model (Base)."""
    def __init__(self, config_train, pretrained=True):
        super().__init__()
        self.config_train = config_train
        print(f"Initializing BLIP Vision Encoder from: {config_train.blip_vision_model_name} by loading BlipForImageTextRetrieval first.")

        if pretrained:
            try:
                print("  Loading base BlipForImageTextRetrieval...")
                full_blip_model = BlipForImageTextRetrieval.from_pretrained(
                    config_train.blip_vision_model_name,
                    low_cpu_mem_usage=True,
                )
                print("  Extracting vision_model from BlipForImageTextRetrieval.")
                self.vision_model = full_blip_model.vision_model
                del full_blip_model
                print("  Vision model extracted successfully.")
            except Exception as e:
                print(f"  ERROR loading BlipForImageTextRetrieval or extracting vision model: {e}")
                print("  Falling back to initializing BlipVisionModel directly (might show warnings).")
                self.vision_model = BlipVisionModel.from_pretrained(config_train.blip_vision_model_name)
        else:
            print("  Initializing BlipVisionModel from scratch (as pretrained=False).")
            blip_vision_config = BlipConfig.from_pretrained(config_train.blip_vision_model_name).vision_config
            self.vision_model = BlipVisionModel(blip_vision_config)

        try:
            self.input_features = self.vision_model.config.hidden_size
        except AttributeError as e:
            print(f"  ERROR accessing vision_model.config.hidden_size: {e}. Attempting config_train value.")
            self.input_features = config_train.vision_embedding  # Fallback

        if hasattr(config_train, 'vision_embedding') and self.input_features != config_train.vision_embedding:
            print(f"  WARNING: Configured vision_embedding ({config_train.vision_embedding}) doesn't match loaded model hidden size ({self.input_features}). Using loaded size.")
        else:
            print(f"  Confirmed/Using vision model hidden size: {self.input_features}")

        self.projection = nn.Linear(self.input_features, config_train.projection_dim, bias=False)
        print(f"  Added projection head: {self.input_features} -> {config_train.projection_dim}")

    def forward(self, pixel_values):
        vision_outputs = self.vision_model(pixel_values=pixel_values, return_dict=True)
        image_features = vision_outputs.pooler_output
        projected_features = self.projection(image_features)
        return projected_features

class TextEncoder(nn.Module):
    """Encodes text using PhoBERT-Base."""
    def __init__(self, config_train, pretrained=True):
        super().__init__()
        self.config_train = config_train
        print(f"Initializing Text Encoder: {config_train.selected_text_model}")

        if pretrained:
            self.model = AutoModel.from_pretrained(config_train.selected_text_model)
        else:
            model_config = AutoConfig.from_pretrained(config_train.selected_text_model)
            self.model = AutoModel.from_config(model_config)

        try:
            self.input_features = self.model.config.hidden_size
        except AttributeError as e:
            print(f"  ERROR accessing model.config.hidden_size: {e}. Attempting config_train value.")
            self.input_features = config_train.text_embedding  # Fallback

        if hasattr(config_train, 'text_embedding') and self.input_features != config_train.text_embedding:
            print(f"  WARNING: Configured text_embedding ({config_train.text_embedding}) doesn't match loaded PhoBERT hidden size ({self.input_features}). Using actual size.")
        else:
            print(f"  Confirmed text model hidden size: {self.input_features}")

        self.projection = nn.Linear(self.input_features, config_train.projection_dim, bias=False)
        print(f"  Added projection head: {self.input_features} -> {config_train.projection_dim}")

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        text_features = outputs.last_hidden_state[:, 0, :]
        projected_features = self.projection(text_features)
        return projected_features

class ViSigLIPModel(nn.Module):
    """ViSigLIP Model: BLIP Vision + PhoBERT Text + Sigmoid Loss components"""
    def __init__(self, image_encoder, text_encoder, config_train):
        super().__init__()
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.config_train = config_train

        # Learnable Temperature
        if config_train.learnable_temperature:
            init_val_t = torch.tensor(config_train.temperature_init, dtype=torch.float)
            self.logit_scale = nn.Parameter(init_val_t)
            print(f"Using learnable temperature, initialized to {self.logit_scale.item():.4f}")
        else:
            temp_tensor = torch.tensor(config_train.temperature_init, dtype=torch.float)
            self.register_buffer('logit_scale', temp_tensor)
            print(f"Using fixed temperature: {self.logit_scale.item():.4f}")

        # Learnable Bias
        if config_train.learnable_bias:
            init_val_b = torch.tensor(config_train.bias_init, dtype=torch.float)
            self.logit_bias = nn.Parameter(init_val_b)
            print(f"Using learnable bias, initialized to {self.logit_bias.item():.4f}")
        else:
            bias_tensor = torch.tensor(config_train.bias_init, dtype=torch.float)
            self.register_buffer('logit_bias', bias_tensor)
            print(f"Using fixed bias: {self.logit_bias.item():.4f}")

    def forward(self, pixel_values, input_ids, attention_mask):
        # Move inputs to device
        pixel_values = pixel_values.to(self.config_train.device)
        input_ids = input_ids.to(self.config_train.device)
        attention_mask = attention_mask.to(self.config_train.device)

        # Get embeddings from encoders
        image_embed = self.image_encoder(pixel_values)
        text_embed = self.text_encoder(input_ids, attention_mask)

        # Normalize embeddings L2 norm
        image_features = F.normalize(image_embed, p=2, dim=-1)
        text_features = F.normalize(text_embed, p=2, dim=-1)

        # Return normalized features, temperature, and bias for loss calculation
        current_temp = self.logit_scale.to(image_features.device)
        current_bias = self.logit_bias.to(image_features.device)

        return image_features, text_features, current_temp, current_bias

print("ViSigLIP Model components defined.")

ViSigLIP Model components defined.


In [7]:
# === Cell 7: SigLIP Loss Function ===
def siglip_loss(image_features, text_features, logit_scale, logit_bias):
    """
    Computes the SigLIP loss.
    Based on https://arxiv.org/abs/2303.15343
    """
    # Ensure features are float32 for stability
    image_features = image_features.float()
    text_features = text_features.float()
    logit_scale = logit_scale.float()
    logit_bias = logit_bias.float()

    n = text_features.shape[0]
    if n == 0:
        return torch.tensor(0.0, device=image_features.device, requires_grad=True)

    # Calculate cosine similarity with temperature scaling and bias
    logits = image_features @ text_features.t() * logit_scale + logit_bias

    # Create labels: 1 for positive pairs (diagonal), 0 for negative pairs
    labels = torch.eye(n, device=logits.device, dtype=logits.dtype)

    # Binary cross-entropy with logits loss
    loss = F.binary_cross_entropy_with_logits(logits, labels, reduction='mean')

    return loss

print("SigLIP loss function defined.")

SigLIP loss function defined.


In [8]:
# === Cell 8: Setup - Tokenizer and Image Processor ===
from transformers import AutoTokenizer, BlipImageProcessor

tokenizer = None
image_processor = None

print(f"Loading Tokenizer: {config.text_tokenizer_name}")
try:
    tokenizer = AutoTokenizer.from_pretrained(config.text_tokenizer_name)
    print("PhoBERT Tokenizer loaded successfully.")
except Exception as e:
    print(f"ERROR loading tokenizer '{config.text_tokenizer_name}': {e}")

print(f"Loading Image Processor from: {config.blip_image_processor_name}")
try:
    image_processor = BlipImageProcessor.from_pretrained(config.blip_image_processor_name)
    print("BLIP Image Processor loaded successfully.")
except Exception as e:
    print(f"ERROR loading image processor '{config.blip_image_processor_name}': {e}")

Loading Tokenizer: vinai/phobert-base
PhoBERT Tokenizer loaded successfully.
Loading Image Processor from: Salesforce/blip-image-captioning-base
BLIP Image Processor loaded successfully.


In [9]:
# === Cell 9: Setup - Datasets and DataLoaders ===
# Uses the dataset class defined above.

train_loader = None
dev_loader = None

# Define paths
validation_json_path = os.path.join(config.data_path, "dev.json")
train_json_path = os.path.join(config.data_path, "train.json")

if tokenizer and image_processor:
    print("\nCreating datasets...")
    # --- Training Dataset ---
    try:
        print(f"Attempting to load training data from: {train_json_path}")
        train_dataset = CustomImageCaptionDataset(
            json_path_or_list=train_json_path,
            image_base_path=config.image_base_path,
            tokenizer=tokenizer, 
            image_processor=image_processor,
            max_length=config.max_length
        )
        if not train_dataset.data:
            print("\nERROR: Failed to load training data.")
    except Exception as e:
        print(f"ERROR creating training dataset: {e}")
        train_dataset = None

    # --- Validation Dataset ---
    if os.path.exists(validation_json_path):
        try:
            print(f"Attempting to load validation data from: {validation_json_path}")
            dev_dataset = CustomImageCaptionDataset(
                json_path_or_list=validation_json_path,
                image_base_path=config.image_base_path,
                tokenizer=tokenizer,
                image_processor=image_processor,
                max_length=config.max_length
            )
            if not dev_dataset.data:
                print("\nWARNING: Failed to load validation data.")
        except Exception as e:
            print(f"ERROR creating validation dataset: {e}")
            dev_dataset = None
    else:
        print("Validation JSON path not found, skipping validation set creation.")
        dev_dataset = None

    print("\nCreating dataloaders...")
    num_workers = min(config.num_workers, os.cpu_count() if os.cpu_count() else 1)
    print(f"Using {num_workers} workers for DataLoaders.")

    if train_dataset and train_dataset.data:
        # Check if persistent_workers is supported
        persist_workers = (num_workers > 0)
        try:
            _ = DataLoader(train_dataset, num_workers=num_workers, persistent_workers=persist_workers)
        except TypeError:
            persist_workers = False
            print("Note: `persistent_workers=True` not supported by this PyTorch version/DataLoader setup.")

        train_loader = DataLoader(
            train_dataset, 
            batch_size=config.batch_size, 
            shuffle=True,
            num_workers=num_workers,
            pin_memory=True if config.device == torch.device("cuda") else False,
            drop_last=True,
            persistent_workers=persist_workers
        )
        print(f"Train loader created with {len(train_loader)} batches.")
        # Calculate total training steps for Cosine Scheduler
        config.total_training_steps = len(train_loader) * config.epochs // config.accumulation_steps
        print(f"Total estimated training steps: {config.total_training_steps}")
    else: 
        print("Skipping train loader creation (no data).")

    if dev_dataset and dev_dataset.data:
        dev_loader = DataLoader(
            dev_dataset, 
            batch_size=config.batch_size * 2,
            shuffle=False,
            num_workers=num_workers,
            pin_memory=True if config.device == torch.device("cuda") else False,
            drop_last=False,
            persistent_workers=persist_workers if num_workers > 0 else False
        )
        print(f"Validation loader created with {len(dev_loader)} batches.")
    else: 
        print("Skipping validation loader creation.")

    if not train_loader: 
        print("\nERROR: Train loader could not be created.")
else:
    print("ERROR: Tokenizer or Image Processor not loaded. Skipping dataset/loader creation.")


Creating datasets...
Attempting to load training data from: ./json_data/train.json
Loading JSON metadata (this might take time for large datasets)...


Loading JSONs:   0%|          | 0/1 [00:00<?, ?it/s]

Loaded 41236 samples total from 1 file(s).
Dataset size after potential cleaning: 41236
Using image target size: 384x384
Attempting to load validation data from: ./json_data/dev.json
Loading JSON metadata (this might take time for large datasets)...


Loading JSONs:   0%|          | 0/1 [00:00<?, ?it/s]

Loaded 10002 samples total from 1 file(s).
Dataset size after potential cleaning: 10002
Using image target size: 384x384

Creating dataloaders...
Using 20 workers for DataLoaders.
Train loader created with 1288 batches.
Total estimated training steps: 1006
Validation loader created with 157 batches.


In [10]:
# === Cell 10: Setup - Model, Optimizer, Scheduler ===

model = None
optimizer = None
lr_scheduler = None
scaler = None # For AMP

print("\nInitializing ViSigLIP model components...")
try:
    # Instantiate the encoders and main model
    image_encoder = ImageEncoder(config).to(config.device)
    text_encoder = TextEncoder(config).to(config.device)
    model = ViSigLIPModel(image_encoder, text_encoder, config).to(config.device)

    print(f"\nViSigLIP Model initialized successfully on {config.device}.")
    num_params_total = sum(p.numel() for p in model.parameters())
    num_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {num_params_total / 1e6:.2f} M")
    print(f"Trainable parameters: {num_params_trainable / 1e6:.2f} M")

except Exception as e:
    print(f"ERROR initializing model components: {e}")
    traceback.print_exc()
    model = None

if model and train_loader:
    print("\nSetting up optimizer...")
    # --- Optimizer ---
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    print(f"Optimizer AdamW initialized with base LR: {config.learning_rate}, weight decay: {config.weight_decay}")

    # --- LR Scheduler ---
    if config.scheduler_type == "cosine":
        if hasattr(config, 'total_training_steps'):
            lr_scheduler = transformers.get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=config.warmup_steps,
                num_training_steps=config.total_training_steps
            )
            print(f"LR Scheduler: Cosine with Warmup ({config.warmup_steps} steps) initialized.")
        else:
            print("ERROR: total_training_steps not calculated. Cannot init Cosine scheduler.")
            lr_scheduler = None
    elif config.scheduler_type == "reduce_on_plateau":
        lr_scheduler = ReduceLROnPlateau(
            optimizer, mode=config.mode, factor=config.rop_factor, patience=config.rop_patience
        )
        print(f"LR Scheduler: ReduceLROnPlateau initialized (mode='{config.mode}', factor={config.rop_factor}, patience={config.rop_patience})")
    else:
        print("No LR Scheduler specified.")
        lr_scheduler = None

    # --- Automatic Mixed Precision (AMP) Scaler ---
    if config.use_amp:
        scaler = torch.amp.GradScaler('cuda')
        print("AMP GradScaler initialized.")
    else:
        scaler = None

    # Early stopping setup
    early_stopping_counter = 0
    early_stopping_patience = config.early_stopping_patience if hasattr(config, 'early_stopping_patience') else 5
    best_val_metric = -float('inf') if config.mode == "max" else float('inf')
    print(f"Early stopping enabled with patience: {early_stopping_patience}")

else:
    print("ERROR: Model not initialized or train_loader not available. Skipping optimizer/scheduler setup.")


Initializing ViSigLIP model components...
Initializing BLIP Vision Encoder from: Salesforce/blip-image-captioning-base by loading BlipForImageTextRetrieval first.
  Loading base BlipForImageTextRetrieval...


Some weights of BlipForImageTextRetrieval were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['itm_head.bias', 'itm_head.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.word_embeddings.weight', 'text_proj.bias', 'text_proj.weight', 'vision_proj.bias', 'vision_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Extracting vision_model from BlipForImageTextRetrieval.
  Vision model extracted successfully.
  Confirmed/Using vision model hidden size: 768
  Added projection head: 768 -> 768
Initializing Text Encoder: vinai/phobert-base
  Confirmed text model hidden size: 768
  Added projection head: 768 -> 768
Using learnable temperature, initialized to 10.0000
Using learnable bias, initialized to -10.0000

ViSigLIP Model initialized successfully on cuda.
Total parameters: 222.27 M
Trainable parameters: 222.27 M

Setting up optimizer...
Optimizer AdamW initialized with base LR: 0.0001, weight decay: 0.1
LR Scheduler: Cosine with Warmup (1000 steps) initialized.
AMP GradScaler initialized.
Early stopping enabled with patience: 5


In [11]:
# === Cell 11: Training and Validation Functions (Adapted for SigLIP & Steps) ===
import traceback

def train_step(model, batch, optimizer, scaler, device, use_amp):
    """ Performs a single training step with SigLIP loss and optional AMP """
    model.train() # Set model to training mode

    pixel_values = batch['pixel_values']#.to(device) # Moved device transfer to model forward
    input_ids = batch['input_ids']#.to(device)
    attention_mask = batch['attention_mask']#.to(device)

    with torch.cuda.amp.autocast(enabled=use_amp):
        # Get features, temp, bias from model
        image_features, text_features, temp, bias = model(pixel_values, input_ids, attention_mask)
        # Calculate SigLIP loss
        loss = siglip_loss(image_features, text_features, temp, bias)

    if use_amp:
        scaler.scale(loss).backward()
    else:
        loss.backward()

    return loss.item() # Return scalar loss

def validate_epoch(model, dataloader, device):
    """ Performs validation, returning metrics """
    model.eval() # Set model to evaluation mode
    all_image_embeddings = []
    all_text_embeddings = []

    progress_bar = tqdm(dataloader, desc=f"Validation", leave=False, unit="batch")

    with torch.no_grad():
        for batch in progress_bar:
            pixel_values = batch['pixel_values'] #.to(device) # Moved to model forward
            input_ids = batch['input_ids'] #.to(device)
            attention_mask = batch['attention_mask'] #.to(device)

            with torch.cuda.amp.autocast(enabled=config.use_amp): # Use AMP for validation inference too
                # Get features BUT ignore temp/bias for metric calculation
                # Metrics are calculated on raw (normalized) similarity
                image_embeds_norm, text_embeds_norm, _, _ = model(pixel_values, input_ids, attention_mask)

            # Collect normalized embeddings on CPU to save GPU memory
            all_image_embeddings.append(image_embeds_norm.cpu())
            all_text_embeddings.append(text_embeds_norm.cpu())

    if not all_image_embeddings or not all_text_embeddings:
         print("Warning: No embeddings collected during validation.")
         # Return default zero metrics, maybe fetch loss if calculated differently
         return { "loss": float('inf'), "avg_acc": 0.0, "avg_cosine_sim": 0.0,
                  "i2t recall R@1": 0.0, "i2t recall R@5": 0.0, "i2t recall R@10": 0.0,
                  "t2i recall R@1": 0.0, "t2i recall R@5": 0.0, "t2i recall R@10": 0.0 }

    # Concatenate all embeddings
    try:
        all_image_embeddings = torch.cat(all_image_embeddings, dim=0)
        all_text_embeddings = torch.cat(all_text_embeddings, dim=0)
    except Exception as e:
        print(f"Error concatenating embeddings: {e}")
        # Handle cases where validation might have yielded inconsistent batch sizes if drop_last=False
        return { "loss": float('inf'), "avg_acc": 0.0, "avg_cosine_sim": 0.0,
                  "i2t recall R@1": 0.0, "i2t recall R@5": 0.0, "i2t recall R@10": 0.0,
                  "t2i recall R@1": 0.0, "t2i recall R@5": 0.0, "t2i recall R@10": 0.0 }


    print(f"\\nComputing metrics over {all_image_embeddings.shape[0]} validation samples...")
    validation_metrics = compute_metrics(all_image_embeddings.to(device), all_text_embeddings.to(device))

    # Format results
    final_results = {}
    for k, v in validation_metrics.items():
        if isinstance(v, dict):
            for recall_k, recall_v in v.items(): final_results[f"{k.replace('_', ' ')} {recall_k}"] = recall_v
        else: final_results[k.replace('_', ' ')] = v

    # Clean up memory
    del all_image_embeddings
    del all_text_embeddings
    gc.collect()
    if torch.cuda.is_available(): torch.cuda.empty_cache()

    return final_results

print("Training step and validation epoch functions defined.")

Training step and validation epoch functions defined.


In [12]:
# === Cell 12: Pretraining Loop ===
import datetime

if model and train_loader and optimizer:  # Basic check
    print(f"\nStarting ViSigLIP pretraining for {config.epochs} epochs...")
    print(f"Target metric for saving best model: '{config.metric_to_track}' (mode: {config.mode})")

    best_val_metric = -float('inf') if config.mode == "max" else float('inf')
    global_step = 0
    total_loss_since_log = 0.0
    steps_since_log = 0
    start_train_time = time.time()
    
    # Store minimal history
    history = {'steps': [], 'train_loss': [], 'val_metrics': {}}

    model.train()  # Ensure model is in training mode initially

    for epoch in range(config.epochs):
        epoch_start_time = time.time()
        print(f"\n--- Epoch {epoch+1}/{config.epochs} ---")
        progress_bar = tqdm(train_loader, desc=f"Training E{epoch+1}", leave=True, unit="batch")

        for i, batch in enumerate(progress_bar):
            # Skip dummy batches
            if batch['pixel_values'].shape[0] < config.batch_size and torch.all(batch['pixel_values'] == 0):
                continue

            loss = train_step(model, batch, optimizer, scaler, config.device, config.use_amp)
            loss = loss / config.accumulation_steps  # Normalize loss for logging if accumulating

            total_loss_since_log += loss
            steps_since_log += 1

            # Gradient Accumulation & Optimizer Step
            if (global_step + 1) % config.accumulation_steps == 0:
                if config.use_amp:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()

                optimizer.zero_grad()

                # LR Scheduler Step
                if lr_scheduler:
                    lr_scheduler.step()

            global_step += 1

            # Logging
            if global_step % config.log_interval_steps == 0:
                avg_loss = total_loss_since_log / steps_since_log
                current_lr = optimizer.param_groups[0]['lr']
                progress_bar.set_postfix(loss=f"{avg_loss:.4f}", lr=f"{current_lr:.2e}", step=f"{global_step}")
                history['steps'].append(global_step)
                history['train_loss'].append(avg_loss)
                total_loss_since_log = 0.0
                steps_since_log = 0

            # Validation & Checkpointing
            if dev_loader and global_step % config.validation_interval_steps == 0 and global_step > 0:
                print(f"\nRunning validation at step {global_step}...")
                val_start_time = time.time()
                val_results = validate_epoch(model, dev_loader, config.device)
                val_end_time = time.time()
                print(f"Validation finished in {val_end_time - val_start_time:.2f}s")

                # Log validation metrics
                metric_log_str = f"  Validation Step {global_step}: "
                history['val_metrics'][global_step] = val_results
                sorted_keys = sorted(val_results.keys())
                for name in sorted_keys:
                    metric_log_str += f"{name}: {val_results[name]:.4f} | "
                print(metric_log_str.strip(" | "))

                # Save Checkpoint Logic
                current_val_metric = val_results.get(config.metric_to_track.replace('_', ' '), None)
                is_best = False
                save_path = None
                save_path_periodic = None
                
                if current_val_metric is not None:
                    if config.mode == "max":
                        is_best = current_val_metric > best_val_metric + config.early_stopping_min_delta
                    else:  # min mode
                        is_best = current_val_metric < best_val_metric - config.early_stopping_min_delta

                    if is_best:
                        print(f"  Metric '{config.metric_to_track}' improved from {best_val_metric:.4f} to {current_val_metric:.4f}. Saving best model.")
                        best_val_metric = current_val_metric
                        save_path = os.path.join(config.model_path, "visiglip_pretrain_best.pt")
                    else:
                        print(f"  Metric '{config.metric_to_track}' did not improve. Best: {best_val_metric:.4f}.")

                    # Save checkpoint periodically
                    if global_step % config.save_interval_steps == 0:
                        periodic_save_path = os.path.join(config.model_path, f"visiglip_pretrain_step_{global_step}.pt")
                        if save_path != periodic_save_path:  # Avoid saving twice if it's the best step
                            print(f"  Saving periodic checkpoint to {periodic_save_path}")
                            save_path_periodic = periodic_save_path
                    
                    # Prepare Save Dictionary
                    if save_path or save_path_periodic:
                        save_dict = {
                            'step': global_step,
                            'epoch': epoch + 1,
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict(),
                            'best_val_metric': best_val_metric,
                            'metric_tracked': config.metric_to_track,
                            'current_val_metrics': val_results,
                            'vision_model_name': config.blip_vision_model_name,
                            'text_model_name': config.selected_text_model,
                            'projection_dim': config.projection_dim,
                            'learnable_temperature': config.learnable_temperature,
                            'learnable_bias': config.learnable_bias,
                            'max_length': config.max_length,
                        }
                        if lr_scheduler:
                            save_dict['scheduler_state_dict'] = lr_scheduler.state_dict()
                        if scaler:
                            save_dict['scaler_state_dict'] = scaler.state_dict()

                        # Save best model if condition met
                        if save_path:
                            torch.save(save_dict, save_path)
                        # Save periodic checkpoint if condition met
                        if save_path_periodic:
                            torch.save(save_dict, save_path_periodic)
                else:
                    print(f"  Warning: Metric '{config.metric_to_track}' not found in validation results. Cannot determine best model.")

                # Reset model to train mode after validation
                model.train()

        # End of Epoch
        epoch_end_time = time.time()
        print(f"--- Epoch {epoch+1} Time: {datetime.timedelta(seconds=epoch_end_time - epoch_start_time)} ---")

    # End of Training
    end_train_time = time.time()
    total_duration = datetime.timedelta(seconds=end_train_time - start_train_time)
    print(f"=============== Pretraining Finished ================")
    print(f"Total Training Time: {total_duration}")

    # Save final model state
    final_model_path = os.path.join(config.model_path, 'visiglip_pretrain_final.pt')
    final_save_dict = {
        'step': global_step,
        'epoch': config.epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'best_val_metric': best_val_metric,
        'metric_tracked': config.metric_to_track,
        'vision_model_name': config.blip_vision_model_name,
        'text_model_name': config.selected_text_model,
        'projection_dim': config.projection_dim,
        'learnable_temperature': config.learnable_temperature,
        'learnable_bias': config.learnable_bias,
        'max_length': config.max_length,
    }
    if lr_scheduler:
        final_save_dict['scheduler_state_dict'] = lr_scheduler.state_dict()
    if scaler:
        final_save_dict['scaler_state_dict'] = scaler.state_dict()
    torch.save(final_save_dict, final_model_path)
    print(f"Final model state saved to {final_model_path}")

    best_model_file = os.path.join(config.model_path, "visiglip_pretrain_best.pt")
    if dev_loader and os.path.exists(best_model_file):
        print(f"Best model based on '{config.metric_to_track}' ({best_val_metric:.4f}) is saved at: {best_model_file}")
    elif dev_loader:
        print("Best model checkpoint file not found (or validation was skipped).")
    print(f"=================================================")

else:
    print("ERROR: Prerequisites for training (model, dataloader, optimizer) not met. Training loop skipped.")


Starting ViSigLIP pretraining for 50 epochs...
Target metric for saving best model: 'avg_acc' (mode: max)

--- Epoch 1/50 ---


Training E1:   0%|          | 0/1288 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(enabled=use_amp):



Running validation at step 1000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(enabled=config.use_amp): # Use AMP for validation inference too


\nComputing metrics over 10002 validation samples...
Validation finished in 30.91s
Validation Step 1000: avg acc: 0.0001 | avg cosine sim: 0.0703 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' improved from -inf to 0.0001. Saving best model.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_1000.pt
--- Epoch 1 Time: 0:06:27.937303 ---

--- Epoch 2/50 ---


Training E2:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 2000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.18s
Validation Step 2000: avg acc: 0.0001 | avg cosine sim: 0.4176 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_2000.pt
--- Epoch 2 Time: 0:06:23.407286 ---

--- Epoch 3/50 ---


Training E3:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 3000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.26s
Validation Step 3000: avg acc: 0.0001 | avg cosine sim: 0.7140 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_3000.pt
--- Epoch 3 Time: 0:06:23.708715 ---

--- Epoch 4/50 ---


Training E4:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 4000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.30s
Validation Step 4000: avg acc: 0.0001 | avg cosine sim: 0.7147 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_4000.pt

Running validation at step 5000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.07s
Validation Step 5000: avg acc: 0.0001 | avg cosine sim: 0.6975 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_5000.pt
--- Epoch 4 Time: 0:06:55.078366 ---

--- Epoch 5/50 ---


Training E5:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 6000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.38s
Validation Step 6000: avg acc: 0.0001 | avg cosine sim: 0.6999 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_6000.pt
--- Epoch 5 Time: 0:06:23.837935 ---

--- Epoch 6/50 ---


Training E6:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 7000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.15s
Validation Step 7000: avg acc: 0.0001 | avg cosine sim: 0.6958 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_7000.pt
--- Epoch 6 Time: 0:06:23.539023 ---

--- Epoch 7/50 ---


Training E7:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 8000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.23s
Validation Step 8000: avg acc: 0.0001 | avg cosine sim: 0.6935 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_8000.pt

Running validation at step 9000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.16s
Validation Step 9000: avg acc: 0.0001 | avg cosine sim: 0.6918 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_9000.pt
--- Epoch 7 Time: 0:06:55.265229 ---

--- Epoch 8/50 ---


Training E8:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 10000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.19s
Validation Step 10000: avg acc: 0.0001 | avg cosine sim: 0.6905 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_10000.pt
--- Epoch 8 Time: 0:06:23.542316 ---

--- Epoch 9/50 ---


Training E9:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 11000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.37s
Validation Step 11000: avg acc: 0.0001 | avg cosine sim: 0.6902 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_11000.pt
--- Epoch 9 Time: 0:06:23.984508 ---

--- Epoch 10/50 ---


Training E10:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 12000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.34s
Validation Step 12000: avg acc: 0.0001 | avg cosine sim: 0.6889 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_12000.pt
--- Epoch 10 Time: 0:06:23.788190 ---

--- Epoch 11/50 ---


Training E11:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 13000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.41s
Validation Step 13000: avg acc: 0.0001 | avg cosine sim: 0.6884 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_13000.pt

Running validation at step 14000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.57s
Validation Step 14000: avg acc: 0.0001 | avg cosine sim: 0.6895 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_14000.pt
--- Epoch 11 Time: 0:06:55.968926 ---

--- Epoch 12/50 ---


Training E12:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 15000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.30s
Validation Step 15000: avg acc: 0.0001 | avg cosine sim: 0.6867 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_15000.pt
--- Epoch 12 Time: 0:06:23.823886 ---

--- Epoch 13/50 ---


Training E13:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 16000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.20s
Validation Step 16000: avg acc: 0.0001 | avg cosine sim: 0.6856 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_16000.pt
--- Epoch 13 Time: 0:06:23.649000 ---

--- Epoch 14/50 ---


Training E14:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 17000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.17s
Validation Step 17000: avg acc: 0.0001 | avg cosine sim: 0.6863 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_17000.pt

Running validation at step 18000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.37s
Validation Step 18000: avg acc: 0.0001 | avg cosine sim: 0.6882 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_18000.pt
--- Epoch 14 Time: 0:06:55.405091 ---

--- Epoch 15/50 ---


Training E15:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 19000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 29.99s
Validation Step 19000: avg acc: 0.0001 | avg cosine sim: 0.6881 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_19000.pt
--- Epoch 15 Time: 0:06:23.496486 ---

--- Epoch 16/50 ---


Training E16:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 20000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.23s
Validation Step 20000: avg acc: 0.0001 | avg cosine sim: 0.6861 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_20000.pt
--- Epoch 16 Time: 0:06:23.625420 ---

--- Epoch 17/50 ---


Training E17:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 21000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.45s
Validation Step 21000: avg acc: 0.0001 | avg cosine sim: 0.6838 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_21000.pt
--- Epoch 17 Time: 0:06:24.017399 ---

--- Epoch 18/50 ---


Training E18:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 22000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.36s
Validation Step 22000: avg acc: 0.0001 | avg cosine sim: 0.6836 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_22000.pt

Running validation at step 23000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.10s
Validation Step 23000: avg acc: 0.0001 | avg cosine sim: 0.6819 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_23000.pt
--- Epoch 18 Time: 0:06:55.363130 ---

--- Epoch 19/50 ---


Training E19:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 24000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.13s
Validation Step 24000: avg acc: 0.0001 | avg cosine sim: 0.6855 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_24000.pt
--- Epoch 19 Time: 0:06:23.754922 ---

--- Epoch 20/50 ---


Training E20:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 25000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.08s
Validation Step 25000: avg acc: 0.0001 | avg cosine sim: 0.6860 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_25000.pt
--- Epoch 20 Time: 0:06:23.480805 ---

--- Epoch 21/50 ---


Training E21:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 26000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.38s
Validation Step 26000: avg acc: 0.0001 | avg cosine sim: 0.6806 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_26000.pt

Running validation at step 27000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.36s
Validation Step 27000: avg acc: 0.0001 | avg cosine sim: 0.6847 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_27000.pt
--- Epoch 21 Time: 0:06:55.574708 ---

--- Epoch 22/50 ---


Training E22:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 28000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.32s
Validation Step 28000: avg acc: 0.0001 | avg cosine sim: 0.6822 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_28000.pt
--- Epoch 22 Time: 0:06:23.896525 ---

--- Epoch 23/50 ---


Training E23:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 29000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.36s
Validation Step 29000: avg acc: 0.0001 | avg cosine sim: 0.6806 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_29000.pt
--- Epoch 23 Time: 0:06:23.876587 ---

--- Epoch 24/50 ---


Training E24:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 30000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.28s
Validation Step 30000: avg acc: 0.0001 | avg cosine sim: 0.6815 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_30000.pt
--- Epoch 24 Time: 0:06:23.633214 ---

--- Epoch 25/50 ---


Training E25:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 31000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.39s
Validation Step 31000: avg acc: 0.0001 | avg cosine sim: 0.6834 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_31000.pt

Running validation at step 32000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.37s
Validation Step 32000: avg acc: 0.0001 | avg cosine sim: 0.6810 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_32000.pt
--- Epoch 25 Time: 0:06:55.470729 ---

--- Epoch 26/50 ---


Training E26:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 33000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.43s
Validation Step 33000: avg acc: 0.0001 | avg cosine sim: 0.6807 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_33000.pt
--- Epoch 26 Time: 0:06:24.102100 ---

--- Epoch 27/50 ---


Training E27:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 34000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.31s
Validation Step 34000: avg acc: 0.0001 | avg cosine sim: 0.6807 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_34000.pt
--- Epoch 27 Time: 0:06:23.859852 ---

--- Epoch 28/50 ---


Training E28:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 35000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.25s
Validation Step 35000: avg acc: 0.0001 | avg cosine sim: 0.6802 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_35000.pt

Running validation at step 36000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.27s
Validation Step 36000: avg acc: 0.0001 | avg cosine sim: 0.6795 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_36000.pt
--- Epoch 28 Time: 0:06:55.281450 ---

--- Epoch 29/50 ---


Training E29:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 37000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.35s
Validation Step 37000: avg acc: 0.0001 | avg cosine sim: 0.6826 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_37000.pt
--- Epoch 29 Time: 0:06:23.893634 ---

--- Epoch 30/50 ---


Training E30:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 38000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.36s
Validation Step 38000: avg acc: 0.0001 | avg cosine sim: 0.6795 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_38000.pt
--- Epoch 30 Time: 0:06:23.889092 ---

--- Epoch 31/50 ---


Training E31:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 39000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.27s
Validation Step 39000: avg acc: 0.0001 | avg cosine sim: 0.6794 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_39000.pt
--- Epoch 31 Time: 0:06:23.670665 ---

--- Epoch 32/50 ---


Training E32:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 40000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.46s
Validation Step 40000: avg acc: 0.0001 | avg cosine sim: 0.6803 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_40000.pt

Running validation at step 41000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.18s
Validation Step 41000: avg acc: 0.0001 | avg cosine sim: 0.6806 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_41000.pt
--- Epoch 32 Time: 0:06:55.467594 ---

--- Epoch 33/50 ---


Training E33:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 42000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.19s
Validation Step 42000: avg acc: 0.0001 | avg cosine sim: 0.6766 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_42000.pt
--- Epoch 33 Time: 0:06:23.640095 ---

--- Epoch 34/50 ---


Training E34:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 43000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.39s
Validation Step 43000: avg acc: 0.0001 | avg cosine sim: 0.6795 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_43000.pt
--- Epoch 34 Time: 0:06:23.834829 ---

--- Epoch 35/50 ---


Training E35:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 44000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.35s
Validation Step 44000: avg acc: 0.0001 | avg cosine sim: 0.6797 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_44000.pt

Running validation at step 45000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.23s
Validation Step 45000: avg acc: 0.0001 | avg cosine sim: 0.6785 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_45000.pt
--- Epoch 35 Time: 0:06:55.406744 ---

--- Epoch 36/50 ---


Training E36:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 46000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.27s
Validation Step 46000: avg acc: 0.0001 | avg cosine sim: 0.6787 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_46000.pt
--- Epoch 36 Time: 0:06:23.697371 ---

--- Epoch 37/50 ---


Training E37:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 47000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.28s
Validation Step 47000: avg acc: 0.0001 | avg cosine sim: 0.6657 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_47000.pt
--- Epoch 37 Time: 0:06:23.582286 ---

--- Epoch 38/50 ---


Training E38:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 48000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.34s
Validation Step 48000: avg acc: 0.0001 | avg cosine sim: 0.6866 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_48000.pt
--- Epoch 38 Time: 0:06:23.883169 ---

--- Epoch 39/50 ---


Training E39:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 49000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.20s
Validation Step 49000: avg acc: 0.0001 | avg cosine sim: 0.6826 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_49000.pt

Running validation at step 50000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.04s
Validation Step 50000: avg acc: 0.0001 | avg cosine sim: 0.6709 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_50000.pt
--- Epoch 39 Time: 0:06:54.930866 ---

--- Epoch 40/50 ---


Training E40:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 51000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.24s
Validation Step 51000: avg acc: 0.0001 | avg cosine sim: 0.6726 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_51000.pt
--- Epoch 40 Time: 0:06:23.778524 ---

--- Epoch 41/50 ---


Training E41:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 52000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.24s
Validation Step 52000: avg acc: 0.0001 | avg cosine sim: 0.6722 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_52000.pt
--- Epoch 41 Time: 0:06:23.720734 ---

--- Epoch 42/50 ---


Training E42:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 53000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.27s
Validation Step 53000: avg acc: 0.0001 | avg cosine sim: 0.6723 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_53000.pt

Running validation at step 54000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.11s
Validation Step 54000: avg acc: 0.0001 | avg cosine sim: 0.6714 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_54000.pt
--- Epoch 42 Time: 0:06:55.139631 ---

--- Epoch 43/50 ---


Training E43:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 55000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.29s
Validation Step 55000: avg acc: 0.0001 | avg cosine sim: 0.6725 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_55000.pt
--- Epoch 43 Time: 0:06:23.758763 ---

--- Epoch 44/50 ---


Training E44:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 56000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.11s
Validation Step 56000: avg acc: 0.0001 | avg cosine sim: 0.6723 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_56000.pt
--- Epoch 44 Time: 0:06:23.439123 ---

--- Epoch 45/50 ---


Training E45:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 57000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.13s
Validation Step 57000: avg acc: 0.0001 | avg cosine sim: 0.6718 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_57000.pt
--- Epoch 45 Time: 0:06:23.599174 ---

--- Epoch 46/50 ---


Training E46:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 58000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.35s
Validation Step 58000: avg acc: 0.0001 | avg cosine sim: 0.6721 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_58000.pt

Running validation at step 59000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.45s
Validation Step 59000: avg acc: 0.0001 | avg cosine sim: 0.6714 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_59000.pt
--- Epoch 46 Time: 0:06:55.467820 ---

--- Epoch 47/50 ---


Training E47:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 60000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.17s
Validation Step 60000: avg acc: 0.0001 | avg cosine sim: 0.6716 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_60000.pt
--- Epoch 47 Time: 0:06:23.441057 ---

--- Epoch 48/50 ---


Training E48:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 61000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.35s
Validation Step 61000: avg acc: 0.0001 | avg cosine sim: 0.6718 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_61000.pt
--- Epoch 48 Time: 0:06:23.803608 ---

--- Epoch 49/50 ---


Training E49:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 62000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.38s
Validation Step 62000: avg acc: 0.0001 | avg cosine sim: 0.6718 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_62000.pt

Running validation at step 63000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.22s
Validation Step 63000: avg acc: 0.0001 | avg cosine sim: 0.6721 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_63000.pt
--- Epoch 49 Time: 0:06:55.296951 ---

--- Epoch 50/50 ---


Training E50:   0%|          | 0/1288 [00:00<?, ?batch/s]


Running validation at step 64000...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

\nComputing metrics over 10002 validation samples...
Validation finished in 30.35s
Validation Step 64000: avg acc: 0.0001 | avg cosine sim: 0.6715 | i2t acc: 0.0001 | i2t recall R@1: 0.0001 | i2t recall R@10: 0.0010 | i2t recall R@5: 0.0005 | t2i acc: 0.0001 | t2i recall R@1: 0.0001 | t2i recall R@10: 0.0010 | t2i recall R@5: 0.0005
  Metric 'avg_acc' did not improve. Best: 0.0001.
  Saving periodic checkpoint to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_step_64000.pt
--- Epoch 50 Time: 0:06:23.907305 ---
Total Training Time: 5:27:13.619622
Final model state saved to ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_final.pt
Best model based on 'avg_acc' (0.0001) is saved at: ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_best.pt


In [13]:
# === Cell 13: Final Evaluation on Test Set (Optional) ===
# Needs a separate test set definition (similar to validation setup in Cell 9)
# Uses the validation function for evaluation logic

# --- Define Test Set Paths ---
test_json_path = os.path.join(config.data_path, "test.json")
test_image_path = config.image_base_path

print("\\n=============== Starting Test Set Evaluation ===============")

test_loader = None
model_to_test = None

# 1. Check prerequisites & Create Test Loader
if os.path.exists(test_json_path) and 'tokenizer' in globals() and tokenizer and 'image_processor' in globals() and image_processor:
    print(f"Loading test data from: {test_json_path}")
    try:
        test_dataset = CustomImageCaptionDataset(
            json_path_or_list=test_json_path, image_base_path=test_image_path,
            tokenizer=tokenizer, image_processor=image_processor,
            max_length=config.max_length
        )
        if test_dataset.data:
            num_workers = min(config.num_workers, os.cpu_count() if os.cpu_count() else 1)
            persist_workers_test = (num_workers > 0) # Check support again
            try: _ = DataLoader(test_dataset, num_workers=num_workers, persistent_workers=persist_workers_test)
            except TypeError: persist_workers_test = False

            test_loader = DataLoader(
                test_dataset, batch_size=config.batch_size * 2, shuffle=False, # Larger batch, no shuffle
                num_workers=num_workers, pin_memory=True if config.device == torch.device("cuda") else False,
                drop_last=False, persistent_workers=persist_workers_test
            )
            print(f"Test loader created with {len(test_loader)} batches.")
        else: print("Test dataset loaded but is empty.")
    except Exception as e: print(f"Error creating test dataset/loader: {e}")
else: print("Skipping test evaluation: Test JSON, Tokenizer or Image Processor not found/loaded.")


# 2. Load Best Model for Testing
if test_loader:
    try:
        best_model_path = os.path.join(config.model_path, "visiglip_pretrain_best.pt")
        # Fallback to final if best doesn't exist
        final_model_path = os.path.join(config.model_path, "visiglip_pretrain_final.pt")
        load_path = None

        if os.path.exists(best_model_path):
             load_path = best_model_path
             print(f"\\nLoading best model: {load_path}")
        elif os.path.exists(final_model_path):
             load_path = final_model_path
             print(f"\\nLoading final model (best not found): {load_path}")
        else:
            print(f"\\nWARNING: No checkpoints found in {config.model_path} to evaluate.")

        if load_path:
            checkpoint = torch.load(load_path, map_location=config.device)
            print("Re-creating model structure for testing...")

            # --- Create a temporary config based on saved checkpoint ---
            # Use attribute names expected by the Encoder/Model __init__ methods
            from types import SimpleNamespace
            temp_config_dict = {
                'device': config.device, # Use current device
                'blip_vision_model_name': checkpoint.get('vision_model_name', config.selected_vision_source),
                'vision_embedding': config.vision_embedding, # Match the current CFG base size
                'selected_text_model': checkpoint.get('text_model_name', config.selected_text_model),
                'text_embedding': config.text_embedding, # Match the current CFG base size
                'projection_dim': checkpoint.get('projection_dim', config.projection_dim),
                'learnable_temperature': checkpoint.get('learnable_temperature', config.learnable_temperature),
                'temperature_init': config.temperature_init, # Use CFG init value
                'learnable_bias': checkpoint.get('learnable_bias', config.learnable_bias),
                'bias_init': config.bias_init, # Use CFG init value
            }
            temp_config = SimpleNamespace(**temp_config_dict)

            print(f"  Using Vision Source: {temp_config.blip_vision_model_name}")
            print(f"  Using Text Model: {temp_config.selected_text_model}")

            test_image_encoder = ImageEncoder(temp_config).to(config.device)
            test_text_encoder = TextEncoder(temp_config).to(config.device)
            model_to_test = ViSigLIPModel(test_image_encoder, test_text_encoder, temp_config).to(config.device)

            # Load state dict carefully
            state_dict = checkpoint['model_state_dict']
            # Handle potential 'module.' prefix if saved from DataParallel/DistributedDataParallel
            if all(k.startswith('module.') for k in state_dict.keys()):
                print("Detected 'module.' prefix, removing.")
                from collections import OrderedDict
                state_dict = OrderedDict((k[7:], v) for k, v in state_dict.items())

            # Load with strict=False initially to see mismatched keys
            load_result = model_to_test.load_state_dict(state_dict, strict=False)
            print(f"  State dict loading result: {load_result}")
            if load_result.missing_keys: print(f"  Warning: Missing keys: {load_result.missing_keys}")
            if load_result.unexpected_keys: print(f"  Warning: Unexpected keys: {load_result.unexpected_keys}")
            print(f"Model weights loaded successfully.")

            print("\\nRunning evaluation on test set...")
            # Use the validation function to compute metrics
            test_results = validate_epoch(model_to_test, test_loader, config.device)

            print("\\n--- Test Set Results ---")
            metric_log_str = ""
            sorted_keys = sorted(test_results.keys())
            for name in sorted_keys: metric_log_str += f"  {name}: {test_results[name]:.4f}\\n"
            print(metric_log_str.strip())
            print("------------------------")
        else:
            print("Evaluation skipped (no weights found).")
    except Exception as e:
        print(f"\\nERROR during test setup/evaluation: {e}")
        traceback.print_exc()

print("\\n================= Evaluation Finished ==================")

Loading test data from: ./json_data/test.json
Loading JSON metadata (this might take time for large datasets)...


Loading JSONs:   0%|          | 0/1 [00:00<?, ?it/s]

Loaded 10001 samples total from 1 file(s).
Dataset size after potential cleaning: 10001
Using image target size: 384x384
Test loader created with 157 batches.
\nLoading best model: ./trained_models/ViSigLIP_uitopenviic/visiglip_pretrain_best.pt
Re-creating model structure for testing...
  Using Vision Source: Salesforce/blip-image-captioning-base
  Using Text Model: vinai/phobert-base
Initializing BLIP Vision Encoder from: Salesforce/blip-image-captioning-base by loading BlipForImageTextRetrieval first.
  Loading base BlipForImageTextRetrieval...


Some weights of BlipForImageTextRetrieval were not initialized from the model checkpoint at Salesforce/blip-image-captioning-base and are newly initialized: ['itm_head.bias', 'itm_head.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.word_embeddings.weight', 'text_proj.bias', 'text_proj.weight', 'vision_proj.bias', 'vision_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Extracting vision_model from BlipForImageTextRetrieval.
  Vision model extracted successfully.
  Confirmed/Using vision model hidden size: 768
  Added projection head: 768 -> 768
Initializing Text Encoder: vinai/phobert-base
  Confirmed text model hidden size: 768
  Added projection head: 768 -> 768
Using learnable temperature, initialized to 10.0000
Using learnable bias, initialized to -10.0000
  State dict loading result: <All keys matched successfully>
Model weights loaded successfully.
\nRunning evaluation on test set...


Validation:   0%|          | 0/157 [00:00<?, ?batch/s]

  with torch.cuda.amp.autocast(enabled=config.use_amp): # Use AMP for validation inference too


\nComputing metrics over 10001 validation samples...
\n--- Test Set Results ---
avg acc: 0.0001\n  avg cosine sim: 0.0703\n  i2t acc: 0.0001\n  i2t recall R@1: 0.0001\n  i2t recall R@10: 0.0010\n  i2t recall R@5: 0.0005\n  t2i acc: 0.0001\n  t2i recall R@1: 0.0001\n  t2i recall R@10: 0.0010\n  t2i recall R@5: 0.0005\n
------------------------


In [14]:
# === Cell 14: Training Visualization (Adapted for Steps) ===
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # For easier plotting from history

def plot_training_metrics_steps(history, plot_dir):
    if not history or not history.get('steps') or not history.get('train_loss'):
        print("No/incomplete training history available for step-based plotting.")
        return

    os.makedirs(plot_dir, exist_ok=True)
    print(f"Step-based plot directory ensured at: {os.path.abspath(plot_dir)}")

    # --- Training Loss Plot ---
    plt.figure(figsize=(10, 6))
    plt.plot(history['steps'], history['train_loss'], 'b-', label='Training Loss (Avg per Log Interval)')
    plt.xlabel('Global Steps')
    plt.ylabel('Loss')
    plt.title('Training Loss over Steps')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    save_path_loss = os.path.join(plot_dir, 'training_loss_steps.png')
    plt.savefig(save_path_loss, dpi=300)
    print(f"Saved training loss plot to: {save_path_loss}")
    plt.close()

    # --- Validation Metrics Plot ---
    if history.get('val_metrics'):
        val_steps = sorted(history['val_metrics'].keys())
        if val_steps:
            metrics_to_plot = list(history['val_metrics'][val_steps[0]].keys()) # Get metric names from first entry

            num_plots = len(metrics_to_plot)
            if num_plots == 0: return
            # Adjust subplot layout based on number of metrics
            ncols = 2
            nrows = math.ceil(num_plots / ncols)
            fig, axes = plt.subplots(nrows, ncols, figsize=(8 * ncols, 6 * nrows), squeeze=False)
            axes = axes.flatten() # Flatten for easy indexing

            for i, metric_name in enumerate(metrics_to_plot):
                 metric_values = [history['val_metrics'][step].get(metric_name, float('nan')) for step in val_steps]
                 axes[i].plot(val_steps, metric_values, 'r-o', label=f'Validation {metric_name}')
                 axes[i].set_xlabel('Global Steps')
                 axes[i].set_ylabel(metric_name.capitalize())
                 axes[i].set_title(f'Validation {metric_name} over Steps')
                 axes[i].legend()
                 axes[i].grid(True)

            # Hide unused subplots
            for j in range(i + 1, len(axes)):
                 fig.delaxes(axes[j])

            fig.suptitle('Validation Metrics over Training Steps', fontsize=16, y=1.02)
            plt.tight_layout(rect=[0, 0, 1, 0.98])
            save_path_val = os.path.join(plot_dir, 'validation_metrics_steps.png')
            plt.savefig(save_path_val, dpi=300)
            print(f"Saved validation metrics plot to: {save_path_val}")
            plt.close()
        else:
            print("No validation steps found in history.")
    else:
        print("No validation metrics found in history to plot.")


# --- Plotting ---
plot_directory = "./train_plot/ViSigLIP_uitopenviic"
if 'history' in locals() and isinstance(history, dict):
    plot_training_metrics_steps(history, plot_directory)
else:
    print("No training history found. Run training first.")

# --- END OF SCRIPT ---

Step-based plot directory ensured at: /home/researcher/huypq69/TuningModels/train_plot/ViSigLIP_uitopenviic
Saved training loss plot to: ./train_plot/ViSigLIP_uitopenviic/training_loss_steps.png
Saved validation metrics plot to: ./train_plot/ViSigLIP_uitopenviic/validation_metrics_steps.png
