In [1]:
!pip install torch pandas numpy inflect transformers accelerate pillow tqdm scikit-learn bert-score rouge-score python-Levenshtein sentence-transformers


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires numpy<3.0,>=2.

In [4]:
import torch
import gc
import os
import pandas as pd
import re
import inflect
from pathlib import Path
from PIL import Image
from tqdm import tqdm
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from bert_score import score as bert_score
import evaluate
import warnings
import logging
from torch.cuda.amp import autocast
import time

# Suppress warnings
# warnings.filterWarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

# Set environment variable to avoid memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Clear GPU memory
torch.cuda.empty_cache()
gc.collect()

# Setup inflect for number normalization
inflect_engine = inflect.engine()

# Normalize answer for single-word output
def normalize_answer(ans):
    ans = ans.strip().lower()
    ans = re.sub(r'[^\w\s]', '', ans)  # Remove punctuation
    if ans.isdigit():
        ans = inflect_engine.number_to_words(ans)
    words = ans.split()
    return words[0] if words else ""  # Return first word

# Parse BLIP-2 output to extract meaningful answer
def parse_blip2_output(raw_pred, prompt):
    # Remove the prompt
    answer_part = raw_pred.replace(prompt, "").strip()
    # Strip common prefixes
    prefixes = [
        "it's a", "it is a", "this is a", "is a", "it's", "it is", "this is"
    ]
    for prefix in prefixes:
        if answer_part.lower().startswith(prefix):
            answer_part = answer_part[len(prefix):].strip()
            break
    # Return the answer or a placeholder if empty
    return answer_part if answer_part else "NO_ANSWER"

# Model and processor
model_id = "Salesforce/blip2-opt-2.7b"
processor = Blip2Processor.from_pretrained(model_id, trust_remote_code=True)
processor.tokenizer.padding_side = 'left'  # Set left-padding for decoder-only model

# Load model with float16
try:
    model = Blip2ForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    ).to("cuda:0")
    model.eval()
except Exception as e:
    print(f"Failed to load {model_id}: {e}")
    exit()

# Load and validate dataset
val_path = "/kaggle/input/val1-dataset/val1.csv"
df = pd.read_csv(val_path)
df['answer'] = df['answer'].astype(str).str.lower()
df = df.dropna(subset=['question', 'answer'])
df["full_image_path"] = df["full_image_path"].apply(lambda p: str(Path("/kaggle/input") / Path(p.replace("\\", "/"))))

# Validate image paths
def validate_dataset(df):
    valid_rows = []
    for idx, row in df.iterrows():
        image_path = Path(row["full_image_path"])
        if image_path.exists():
            try:
                Image.open(image_path).convert("RGB")
                valid_rows.append(row)
            except:
                print(f"Invalid image at index {idx}: {row['full_image_path']}")
        else:
            print(f"Image not found at index {idx}: {row['full_image_path']}")
    return pd.DataFrame(valid_rows)
df = validate_dataset(df)
print(f"Valid samples: {len(df)}")

# VQA Dataset
from torch.utils.data import Dataset, DataLoader
class VQADataset(Dataset):
    def __init__(self, df):
        self.df = df
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image = Image.open(row["full_image_path"]).convert("RGB")
        image = image.resize((384, 384))  # BLIP-2 recommended resolution
        return {
            "image": image,
            "question": row["question"],
            "answer": row["answer"],
            "image_path": row["full_image_path"]
        }

# Custom collate function to handle PIL.Image.Image
def custom_collate_fn(batch):
    images = [item["image"] for item in batch]
    questions = [item["question"] for item in batch]
    answers = [item["answer"] for item in batch]
    image_paths = [item["image_path"] for item in batch]
    return {
        "image": images,
        "question": questions,
        "answer": answers,
        "image_path": image_paths
    }

vqa_dataset = VQADataset(df)
dataloader = DataLoader(vqa_dataset, batch_size=4, shuffle=False, collate_fn=custom_collate_fn)

# Process samples
correct = 0
total = 0
all_preds = []
all_labels = []

for batch_idx, batch in enumerate(tqdm(dataloader, desc="Processing batches", ncols=100)):
    images = batch["image"]
    questions = batch["question"]
    labels = batch["answer"]
    image_paths = batch["image_path"]
    
    batch_start_time = time.time()
    valid_samples = 0
    
    try:
        # Format questions with simpler prompt
        formatted_questions = [f"Question: {q} Answer:" for q in questions]
        inputs = processor(images=images, text=formatted_questions, return_tensors="pt", padding=True)
        inputs = {k: v.to("cuda:0", dtype=torch.float16 if v.dtype in [torch.float, torch.float32] else v.dtype) for k, v in inputs.items()}
        
        with torch.no_grad(), autocast():
            generated_ids = model.generate(**inputs, max_new_tokens=5, do_sample=False, num_beams=3)
        outputs = processor.batch_decode(generated_ids, skip_special_tokens=True)
        
        # Debug raw token IDs for first 3 batches
        if batch_idx < 3:
            print(f"Batch {batch_idx + 1} Token IDs: {generated_ids.tolist()}")
        
        for pred_raw, label, question, prompt, img_path in zip(outputs, labels, questions, formatted_questions, image_paths):
            # Parse the output to extract the answer
            parsed_answer = parse_blip2_output(pred_raw, prompt)
            pred = normalize_answer(parsed_answer)
            label = normalize_answer(label)
            if pred and pred != "no_answer":  # Skip placeholder
                all_preds.append(pred)
                all_labels.append(label)
                if pred == label:
                    correct += 1
                total += 1
                valid_samples += 1
                if total <= 10:
                    print(f"Sample {total}: Question: '{question}', Pred: '{pred}', Label: '{label}', Raw Pred: '{pred_raw}', Parsed Answer: '{parsed_answer}', Image: '{img_path}'")
            else:
                print(f"Skipped sample in batch {batch_idx + 1}: Invalid pred ('{pred}') for question: '{question}', Raw Pred: '{pred_raw}', Parsed Answer: '{parsed_answer}'")
    
    except Exception as e:
        print(f"Error in batch {batch_idx + 1}: {e}")
        continue
    
    # Log batch summary
    if valid_samples < len(questions):
        print(f"Batch {batch_idx + 1}: Processed {valid_samples}/{len(questions)} valid samples")
    
    if (total // 100) > ((total - valid_samples) // 100):
        torch.cuda.empty_cache()
        gc.collect()
        print(f"Processed {total} samples, GPU Memory: {torch.cuda.memory_allocated() / 1024**3:.2f} GB, Batch Time: {time.time() - batch_start_time:.2f}s")

# Compute exact-match accuracy
accuracy = correct / total if total > 0 else 0
print(f"\nBLIP-2 Exact-match Accuracy: {accuracy:.3f} ({correct}/{total})")

# Compute BERTScore
if all_preds and all_labels:
    P, R, F1 = bert_score(all_preds, all_labels, lang="en", model_type="roberta-large")
    print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

# Compute METEOR and ROUGE
meteor = evaluate.load("meteor")
rouge = evaluate.load("rouge")
meteor_score = meteor.compute(predictions=all_preds, references=all_labels)
rouge_score = rouge.compute(predictions=all_preds, references=all_labels)
print(f"METEOR: {meteor_score['meteor']:.4f}")
print(f"ROUGE-L: {rouge_score['rougeL']:.4f}")

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Valid samples: 12000


  with torch.no_grad(), autocast():
Processing batches:   0%|                                        | 1/3000 [00:02<2:05:49,  2.52s/it]

Batch 1 Token IDs: [[50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 2, 45641, 35, 141, 171, 6907, 7716, 32, 1950, 24255, 196, 116, 31652, 35, 24, 7971, 15, 5, 14214], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 1, 1, 2, 45641, 35, 99, 3989, 16, 1065, 5, 346, 116, 31652, 35, 5, 346, 158, 50118, 1], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 1, 2, 45641, 35, 16, 5, 14610, 28094, 32039, 50, 41893, 116, 31652, 35, 24, 16, 28094, 41893, 50118], [50265,

Processing batches:   0%|                                        | 2/3000 [00:03<1:07:13,  1.35s/it]

Batch 2 Token IDs: [[50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 1, 2, 45641, 35, 99, 3989, 16, 24, 116, 31652, 35, 24, 18, 10, 24317, 50118], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 1, 2, 45641, 35, 99, 3989, 16, 24, 116, 31652, 35, 24, 16, 10, 45140, 50118], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 2, 45641, 35, 141, 171, 2280, 6538, 32, 7097, 116, 31652, 35, 24, 7971, 15, 5, 2280], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 

Processing batches:   0%|                                          | 3/3000 [00:03<48:28,  1.03it/s]

Batch 3 Token IDs: [[50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 1, 1, 1, 2, 45641, 35, 99, 16, 5, 41546, 3195, 116, 31652, 35, 1275, 6, 5718, 6, 2272], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 2, 45641, 35, 99, 3989, 16, 5, 14610, 23, 5, 2576, 116, 31652, 35, 24, 18, 10, 14610, 50118], [50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 50265, 1, 2, 45641, 35, 109, 4716, 1536, 50, 3607, 1719, 55, 443, 116, 31652, 35, 24, 7971, 15, 5, 1521], [50265, 50265, 502

Processing batches:   1%|▎                                        | 25/3000 [00:15<32:37,  1.52it/s]

Processed 100 samples, GPU Memory: 7.20 GB, Batch Time: 0.92s


Processing batches:   2%|▋                                        | 50/3000 [00:29<33:01,  1.49it/s]

Processed 200 samples, GPU Memory: 7.20 GB, Batch Time: 0.92s


Processing batches:   2%|█                                        | 75/3000 [00:44<34:17,  1.42it/s]

Processed 300 samples, GPU Memory: 7.20 GB, Batch Time: 0.96s


Processing batches:   3%|█▎                                      | 100/3000 [00:59<34:59,  1.38it/s]

Processed 400 samples, GPU Memory: 7.20 GB, Batch Time: 0.98s


Processing batches:   4%|█▋                                      | 125/3000 [01:15<35:54,  1.33it/s]

Processed 500 samples, GPU Memory: 7.20 GB, Batch Time: 1.01s


Processing batches:   4%|█▋                                      | 128/3000 [01:17<32:34,  1.47it/s]

Skipped sample in batch 128: Invalid pred ('no_answer') for question: 'what shape best describes the highlighted marker?', Raw Pred: 'Question: what shape best describes the highlighted marker? Answer:
', Parsed Answer: 'NO_ANSWER'
Batch 128: Processed 3/4 valid samples


Processing batches:   5%|██                                      | 151/3000 [01:33<37:48,  1.26it/s]

Processed 603 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:   6%|██▎                                     | 176/3000 [01:51<38:20,  1.23it/s]

Processed 703 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:   7%|██▋                                     | 201/3000 [02:08<36:18,  1.28it/s]

Processed 803 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:   8%|███                                     | 226/3000 [02:25<35:51,  1.29it/s]

Processed 903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:   8%|███▎                                    | 251/3000 [02:42<36:31,  1.25it/s]

Processed 1003 samples, GPU Memory: 7.20 GB, Batch Time: 1.09s


Processing batches:   9%|███▋                                    | 276/3000 [02:59<37:03,  1.22it/s]

Processed 1103 samples, GPU Memory: 7.20 GB, Batch Time: 1.11s


Processing batches:  10%|████                                    | 301/3000 [03:17<35:44,  1.26it/s]

Processed 1203 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  11%|████▎                                   | 326/3000 [03:34<35:00,  1.27it/s]

Processed 1303 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  12%|████▋                                   | 351/3000 [03:51<34:32,  1.28it/s]

Processed 1403 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  13%|█████                                   | 376/3000 [04:08<34:14,  1.28it/s]

Processed 1503 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  13%|█████▎                                  | 401/3000 [04:25<33:43,  1.28it/s]

Processed 1603 samples, GPU Memory: 7.20 GB, Batch Time: 1.02s


Processing batches:  14%|█████▋                                  | 426/3000 [04:42<33:24,  1.28it/s]

Processed 1703 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  15%|██████                                  | 451/3000 [04:59<33:30,  1.27it/s]

Processed 1803 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  16%|██████▎                                 | 476/3000 [05:17<33:08,  1.27it/s]

Processed 1903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  17%|██████▋                                 | 501/3000 [05:34<32:47,  1.27it/s]

Processed 2003 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  18%|███████                                 | 526/3000 [05:51<32:46,  1.26it/s]

Processed 2103 samples, GPU Memory: 7.20 GB, Batch Time: 1.07s


Processing batches:  18%|███████▎                                | 551/3000 [06:08<31:59,  1.28it/s]

Processed 2203 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  19%|███████▋                                | 576/3000 [06:25<31:58,  1.26it/s]

Processed 2303 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  20%|████████                                | 601/3000 [06:42<31:38,  1.26it/s]

Processed 2403 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  21%|████████▎                               | 626/3000 [07:00<31:10,  1.27it/s]

Processed 2503 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:  22%|████████▋                               | 651/3000 [07:17<30:47,  1.27it/s]

Processed 2603 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  23%|█████████                               | 676/3000 [07:34<30:17,  1.28it/s]

Processed 2703 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  23%|█████████▎                              | 701/3000 [07:51<29:54,  1.28it/s]

Processed 2803 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  24%|█████████▋                              | 726/3000 [08:08<29:47,  1.27it/s]

Processed 2903 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  25%|██████████                              | 751/3000 [08:25<29:34,  1.27it/s]

Processed 3003 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  26%|██████████▎                             | 776/3000 [08:43<29:17,  1.27it/s]

Processed 3103 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  27%|██████████▋                             | 801/3000 [09:00<29:00,  1.26it/s]

Processed 3203 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  28%|███████████                             | 826/3000 [09:17<28:38,  1.27it/s]

Processed 3303 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  28%|███████████▎                            | 851/3000 [09:34<28:19,  1.26it/s]

Processed 3403 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  29%|███████████▋                            | 876/3000 [09:52<27:50,  1.27it/s]

Processed 3503 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  30%|████████████                            | 901/3000 [10:09<27:34,  1.27it/s]

Processed 3603 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  31%|████████████▎                           | 926/3000 [10:26<27:02,  1.28it/s]

Processed 3703 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  32%|████████████▋                           | 951/3000 [10:43<26:55,  1.27it/s]

Processed 3803 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  33%|█████████████                           | 976/3000 [11:01<26:35,  1.27it/s]

Processed 3903 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  33%|█████████████                          | 1001/3000 [11:18<26:06,  1.28it/s]

Processed 4003 samples, GPU Memory: 7.20 GB, Batch Time: 1.02s


Processing batches:  34%|█████████████▎                         | 1026/3000 [11:35<26:07,  1.26it/s]

Processed 4103 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  35%|█████████████▋                         | 1051/3000 [11:52<25:42,  1.26it/s]

Processed 4203 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:  36%|█████████████▉                         | 1076/3000 [12:09<25:14,  1.27it/s]

Processed 4303 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  37%|██████████████▎                        | 1101/3000 [12:26<24:52,  1.27it/s]

Processed 4403 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  38%|██████████████▋                        | 1126/3000 [12:43<24:29,  1.28it/s]

Processed 4503 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  38%|██████████████▉                        | 1151/3000 [13:01<24:09,  1.28it/s]

Processed 4603 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  39%|███████████████▎                       | 1176/3000 [13:18<23:52,  1.27it/s]

Processed 4703 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  40%|███████████████▌                       | 1201/3000 [13:35<23:35,  1.27it/s]

Processed 4803 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  41%|███████████████▉                       | 1226/3000 [13:52<23:32,  1.26it/s]

Processed 4903 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:  42%|████████████████▎                      | 1251/3000 [14:09<23:17,  1.25it/s]

Processed 5003 samples, GPU Memory: 7.20 GB, Batch Time: 1.07s


Processing batches:  43%|████████████████▌                      | 1276/3000 [14:27<22:33,  1.27it/s]

Processed 5103 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  43%|████████████████▉                      | 1301/3000 [14:44<22:38,  1.25it/s]

Processed 5203 samples, GPU Memory: 7.20 GB, Batch Time: 1.07s


Processing batches:  44%|█████████████████▏                     | 1326/3000 [15:01<22:03,  1.27it/s]

Processed 5303 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  45%|█████████████████▌                     | 1351/3000 [15:18<21:41,  1.27it/s]

Processed 5403 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  46%|█████████████████▉                     | 1376/3000 [15:36<21:21,  1.27it/s]

Processed 5503 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  47%|██████████████████▏                    | 1401/3000 [15:53<21:04,  1.26it/s]

Processed 5603 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  48%|██████████████████▌                    | 1426/3000 [16:10<20:49,  1.26it/s]

Processed 5703 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  48%|██████████████████▊                    | 1451/3000 [16:27<20:09,  1.28it/s]

Processed 5803 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  49%|███████████████████▏                   | 1476/3000 [16:44<20:00,  1.27it/s]

Processed 5903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  50%|███████████████████▌                   | 1501/3000 [17:02<19:29,  1.28it/s]

Processed 6003 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  51%|███████████████████▊                   | 1526/3000 [17:19<19:24,  1.27it/s]

Processed 6103 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  52%|████████████████████▏                  | 1551/3000 [17:36<19:21,  1.25it/s]

Processed 6203 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  53%|████████████████████▍                  | 1576/3000 [17:53<18:39,  1.27it/s]

Processed 6303 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  53%|████████████████████▊                  | 1601/3000 [18:10<18:19,  1.27it/s]

Processed 6403 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  54%|█████████████████████▏                 | 1626/3000 [18:28<17:57,  1.27it/s]

Processed 6503 samples, GPU Memory: 7.20 GB, Batch Time: 1.02s


Processing batches:  55%|█████████████████████▍                 | 1651/3000 [18:45<17:41,  1.27it/s]

Processed 6603 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  56%|█████████████████████▊                 | 1676/3000 [19:02<17:32,  1.26it/s]

Processed 6703 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  57%|██████████████████████                 | 1701/3000 [19:19<17:11,  1.26it/s]

Processed 6803 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  58%|██████████████████████▍                | 1726/3000 [19:36<16:54,  1.26it/s]

Processed 6903 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  58%|██████████████████████▊                | 1751/3000 [19:54<16:23,  1.27it/s]

Processed 7003 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  59%|███████████████████████                | 1776/3000 [20:11<16:02,  1.27it/s]

Processed 7103 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  60%|███████████████████████▍               | 1801/3000 [20:28<15:41,  1.27it/s]

Processed 7203 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  61%|███████████████████████▋               | 1826/3000 [20:45<15:25,  1.27it/s]

Processed 7303 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  62%|████████████████████████               | 1851/3000 [21:03<15:11,  1.26it/s]

Processed 7403 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  63%|████████████████████████▍              | 1876/3000 [21:20<14:57,  1.25it/s]

Processed 7503 samples, GPU Memory: 7.20 GB, Batch Time: 1.07s


Processing batches:  63%|████████████████████████▋              | 1901/3000 [21:37<14:24,  1.27it/s]

Processed 7603 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  64%|█████████████████████████              | 1926/3000 [21:54<14:03,  1.27it/s]

Processed 7703 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  65%|█████████████████████████▎             | 1951/3000 [22:11<13:48,  1.27it/s]

Processed 7803 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  66%|█████████████████████████▋             | 1976/3000 [22:28<13:27,  1.27it/s]

Processed 7903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  67%|██████████████████████████             | 2001/3000 [22:46<13:08,  1.27it/s]

Processed 8003 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  68%|██████████████████████████▎            | 2026/3000 [23:03<12:47,  1.27it/s]

Processed 8103 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  68%|██████████████████████████▋            | 2051/3000 [23:21<12:27,  1.27it/s]

Processed 8203 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  69%|██████████████████████████▉            | 2076/3000 [23:38<12:31,  1.23it/s]

Processed 8303 samples, GPU Memory: 7.20 GB, Batch Time: 1.10s


Processing batches:  70%|███████████████████████████▎           | 2101/3000 [23:55<11:53,  1.26it/s]

Processed 8403 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  71%|███████████████████████████▋           | 2126/3000 [24:13<11:30,  1.27it/s]

Processed 8503 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  72%|███████████████████████████▉           | 2151/3000 [24:30<11:05,  1.28it/s]

Processed 8603 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  73%|████████████████████████████▎          | 2176/3000 [24:47<10:51,  1.26it/s]

Processed 8703 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  73%|████████████████████████████▌          | 2201/3000 [25:04<10:29,  1.27it/s]

Processed 8803 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  74%|████████████████████████████▉          | 2226/3000 [25:21<10:10,  1.27it/s]

Processed 8903 samples, GPU Memory: 7.20 GB, Batch Time: 1.07s


Processing batches:  75%|█████████████████████████████▎         | 2251/3000 [25:39<09:47,  1.28it/s]

Processed 9003 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  76%|█████████████████████████████▌         | 2276/3000 [25:56<09:30,  1.27it/s]

Processed 9103 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  77%|█████████████████████████████▉         | 2301/3000 [26:13<09:11,  1.27it/s]

Processed 9203 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  78%|██████████████████████████████▏        | 2326/3000 [26:30<08:53,  1.26it/s]

Processed 9303 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  78%|██████████████████████████████▌        | 2351/3000 [26:47<08:33,  1.26it/s]

Processed 9403 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  79%|██████████████████████████████▉        | 2376/3000 [27:05<08:13,  1.26it/s]

Processed 9503 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  80%|███████████████████████████████▏       | 2401/3000 [27:22<07:49,  1.28it/s]

Processed 9603 samples, GPU Memory: 7.20 GB, Batch Time: 1.02s


Processing batches:  81%|███████████████████████████████▌       | 2426/3000 [27:39<07:31,  1.27it/s]

Processed 9703 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  82%|███████████████████████████████▊       | 2451/3000 [27:56<07:11,  1.27it/s]

Processed 9803 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  83%|████████████████████████████████▏      | 2476/3000 [28:14<06:54,  1.26it/s]

Processed 9903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  83%|████████████████████████████████▌      | 2501/3000 [28:31<06:35,  1.26it/s]

Processed 10003 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  84%|████████████████████████████████▊      | 2526/3000 [28:48<06:13,  1.27it/s]

Processed 10103 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  85%|█████████████████████████████████▏     | 2551/3000 [29:05<05:57,  1.26it/s]

Processed 10203 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:  86%|█████████████████████████████████▍     | 2576/3000 [29:23<05:33,  1.27it/s]

Processed 10303 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  87%|█████████████████████████████████▊     | 2601/3000 [29:40<05:16,  1.26it/s]

Processed 10403 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:  88%|██████████████████████████████████▏    | 2626/3000 [29:57<04:53,  1.28it/s]

Processed 10503 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  88%|██████████████████████████████████▍    | 2651/3000 [30:14<04:32,  1.28it/s]

Processed 10603 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  89%|██████████████████████████████████▊    | 2676/3000 [30:31<04:16,  1.26it/s]

Processed 10703 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  90%|███████████████████████████████████    | 2701/3000 [30:48<03:58,  1.25it/s]

Processed 10803 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  91%|███████████████████████████████████▍   | 2726/3000 [31:06<03:36,  1.26it/s]

Processed 10903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches:  92%|███████████████████████████████████▊   | 2751/3000 [31:23<03:16,  1.27it/s]

Processed 11003 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  93%|████████████████████████████████████   | 2776/3000 [31:40<02:55,  1.28it/s]

Processed 11103 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  93%|████████████████████████████████████▍  | 2801/3000 [31:57<02:35,  1.28it/s]

Processed 11203 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  94%|████████████████████████████████████▋  | 2826/3000 [32:14<02:16,  1.28it/s]

Processed 11303 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  95%|█████████████████████████████████████  | 2851/3000 [32:32<01:58,  1.25it/s]

Processed 11403 samples, GPU Memory: 7.20 GB, Batch Time: 1.06s


Processing batches:  96%|█████████████████████████████████████▍ | 2876/3000 [32:49<01:37,  1.27it/s]

Processed 11503 samples, GPU Memory: 7.20 GB, Batch Time: 1.03s


Processing batches:  97%|█████████████████████████████████████▋ | 2901/3000 [33:06<01:17,  1.27it/s]

Processed 11603 samples, GPU Memory: 7.20 GB, Batch Time: 1.02s


Processing batches:  98%|██████████████████████████████████████ | 2926/3000 [33:23<00:59,  1.25it/s]

Processed 11703 samples, GPU Memory: 7.20 GB, Batch Time: 1.07s


Processing batches:  98%|██████████████████████████████████████▎| 2951/3000 [33:40<00:38,  1.26it/s]

Processed 11803 samples, GPU Memory: 7.20 GB, Batch Time: 1.05s


Processing batches:  99%|██████████████████████████████████████▋| 2976/3000 [33:58<00:18,  1.27it/s]

Processed 11903 samples, GPU Memory: 7.20 GB, Batch Time: 1.04s


Processing batches: 100%|███████████████████████████████████████| 3000/3000 [34:14<00:00,  1.46it/s]



BLIP-2 Exact-match Accuracy: 0.186 (2228/11999)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

BERTScore - Precision: 0.9770, Recall: 0.9784, F1: 0.9773


Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

METEOR: 0.1051
ROUGE-L: 0.1858
