In [1]:
!pip install peft accelerate transformers datasets bitsandbytes timeout_decorator
!pip uninstall -y pylibcudagraph-cu12 rmm-cu12
!pip install scikit-learn
!pip install timeout-decorator  # For timeout in dataset loading
!pip install bert_score rouge_score rapidfuzz sentence_transformers evaluate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting timeout_decorator
  Downloading timeout-decorator-0.5.0.tar.gz (4.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3

In [2]:
import os
import pandas as pd
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering, Trainer, TrainingArguments
import torch
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model
from transformers.data.data_collator import default_data_collator
from timeout_decorator import timeout, TimeoutError
from pathlib import Path
from bert_score import score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import evaluate
from bert_score import BERTScorer
from rouge_score import rouge_scorer
from rapidfuzz.distance import Levenshtein
from sentence_transformers import SentenceTransformer, util

2025-05-18 09:50:47.928430: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747561848.133389      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747561848.200653      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
accelerator = Accelerator()
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base", use_fast=True)
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

In [4]:
train_df = pd.read_csv('/kaggle/input/train1-dataset/train1.csv')
val_df = pd.read_csv('/kaggle/input/val1-dataset/val1.csv')
print(f"Loaded training dataset with {len(train_df)} entries")
print(f"Loaded validation dataset with {len(val_df)} entries")

Loaded training dataset with 48000 entries
Loaded validation dataset with 12000 entries


In [5]:
val_df["full_image_path"] = val_df["full_image_path"].apply(lambda p: p.replace("\\", "/"))
val_df["full_image_path"] = val_df["full_image_path"].apply(lambda p: str(Path("/kaggle/input") / p))
train_df["full_image_path"] = train_df["full_image_path"].apply(lambda p: p.replace("\\", "/"))
train_df["full_image_path"] = train_df["full_image_path"].apply(lambda p: str(Path("/kaggle/input") / p))

In [6]:
train_df['answer'] = train_df['answer'].fillna('unknown').astype(str)
val_df['answer'] = val_df['answer'].fillna('unknown').astype(str)

In [7]:
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df, processor):
        self.df = df
        self.processor = processor

    def __len__(self):
        return len(self.df)

    @timeout(10)  # Timeout after 10 seconds for image loading/processing
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = row['full_image_path']
        try:
            if not os.path.exists(image_path):
                print(f"Image not found: {image_path}")
                image = Image.new("RGB", (224, 224), (0, 0, 0))  # Fallback image
            else:
                image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            image = Image.new("RGB", (224, 224), (0, 0, 0))  # Fallback image
        
        question = row['question']
        answer = row['answer']

        # Process image and question with attention mask
        encoding = self.processor(
            images=image,
            text=question,
            padding="max_length",
            max_length=128,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True
        )
        
        # Tokenize answer as labels with fixed length
        labels = self.processor.tokenizer(
            answer,
            padding="max_length",
            truncation=True,
            max_length=32,
            return_tensors="pt"
        )["input_ids"]

        # Remove batch dimension from tensors
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = labels.squeeze(0)

        return encoding

In [8]:
train_dataset = VQADataset(train_df, processor)
test_dataset = VQADataset(val_df, processor)

In [9]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query", "value", "key"],
    lora_dropout=0.1,
    bias="none"
)
model = get_peft_model(model, lora_config)
print("LoRA applied to the model")

# Prepare model with Accelerator
model = accelerator.prepare(model)

LoRA applied to the model


In [10]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    run_name="blip_vqa_lora_finetune",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    remove_unused_columns=False,
    report_to="none"
)

# Create Trainer instance with default data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator,
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [11]:
if torch.cuda.is_available():
    print("GPU Memory Usage Before Training:")
    print(torch.cuda.memory_summary())

GPU Memory Usage Before Training:
|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   1483 MiB |   1483 MiB |   1483 MiB |      0 B   |
|       from large pool |   1468 MiB |   1468 MiB |   1468 MiB |      0 B   |
|       from small pool |     15 MiB |     15 MiB |     15 MiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |   1483 MiB |   1483 MiB |   1483 MiB |      0 B   |
|       from large pool |   1468 MiB |   1468 MiB |   1468 MiB |      0 B   |
|       from small pool |     15 MiB |     15 MiB |     15 MiB |      0 B   |
|-----------------------------

In [12]:
trainer.train()

# Evaluate the model on the test dataset
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save the fine-tuned model 
trainer.save_model("/kaggle/working/fine_tuned_blip_vqa_lora")
print("Model saved to '/kaggle/working/fine_tuned_blip_vqa_lora'")

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,10.204
20,10.0101
30,9.824
40,9.6321
50,9.4983
60,9.3653
70,9.2095
80,9.063
90,8.9652
100,8.8762




Evaluation Results: {'eval_runtime': 690.7807, 'eval_samples_per_second': 17.372, 'eval_steps_per_second': 1.086, 'epoch': 3.0}
Model saved to '/kaggle/working/fine_tuned_blip_vqa_lora'


In [13]:
skipped_entries = []

# Function to predict answer with timeout
@timeout(10)  # Set timeout to 10 seconds
def predict_answer(image_path, question):
    try:
        if not os.path.exists(image_path):
            print(f"Image not found: {image_path}")
            return ""
        image = Image.open(image_path).convert("RGB")

        # Prepare inputs for BLIP VQA
        inputs = processor(images=image, text=question, return_tensors="pt", padding=True).to(device)

        # Use generate for inference
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=20)
        predicted_answer = processor.decode(output_ids[0], skip_special_tokens=True).strip()

        # Extract one-word answer (post-process if needed)
        predicted_answer = predicted_answer.split()[-1] if predicted_answer else ""

        del inputs, output_ids
        gc.collect()
        torch.cuda.empty_cache()
        return predicted_answer
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return ""

In [14]:
import os

output_dir = '/kaggle/working/resultCSV'

# Create the directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)


In [15]:
from tqdm import tqdm

In [16]:
import os
import pandas as pd
import gc
import torch

# Define the file path
pred_path = "/kaggle/working/resultCSV/predictions.csv"

# Delete the file if it exists
if os.path.exists(pred_path):
    os.remove(pred_path)
    print(f"Deleted existing file: {pred_path}")
else:
    print(f"No file found at: {pred_path}")

# Create an empty DataFrame with the appropriate columns
columns = ["img_path", "question", "true_answer", "predicted_answer"]
empty_df = pd.DataFrame(columns=columns)

# Save the empty DataFrame to create a fresh CSV
empty_df.to_csv(pred_path, index=False)
print(f"Created new empty file: {pred_path}")


No file found at: /kaggle/working/resultCSV/predictions.csv
Created new empty file: /kaggle/working/resultCSV/predictions.csv


In [17]:
# Resume support with handling for empty CSV


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


start_idx = 0
pred_path = '/kaggle/working/resultCSV/predictions.csv'
if os.path.exists(pred_path):
    try:
        existing = pd.read_csv(pred_path)
        if not existing.empty:
            start_idx = len(existing)
            print(f"Resuming from index {start_idx}")
        else:
            print(f"Prediction file {pred_path} is empty, starting from index 0")
    except pd.errors.EmptyDataError:
        print(f"Prediction file {pred_path} is empty, starting from index 0")
else:
    print(f"No existing prediction file found, starting from index 0")

#Lists to store predictions and ground truth
df = val_df
predictions = []
ground_truths = df["answer"].tolist()
y_true = []
y_pred = []
results = []
all_preds = []
# Predict answers for each row
for idx, row in tqdm(df.reset_index(drop=True).iterrows(), total=len(df), desc="Processing"):
    if idx < start_idx:
        continue  # Skip already processed rows
    full_image_path = row["full_image_path"]
    question = row["question"]
    try:
        predicted = predict_answer(full_image_path, question)
    except TimeoutError:
        print(f"Timeout processing row {idx}: {full_image_path}")
        skipped_entries.append({"row": idx, "full_image_path": full_image_path, "question": question})
        predicted = ""
    predictions.append(predicted)

    y_true.append(str(ground_truths[idx]).lower())
    y_pred.append(str(predicted).lower())
    
    all_preds.append(str(predicted).lower())
    
    results.append({
        "img_path": full_image_path,
        "question": question,
        "true_answer": str(ground_truths[idx]).lower(),
        "predicted_answer": str(predicted).lower()
    })
    if (idx + 1) % 1000 == 0 or (idx + 1) == len(df):
        # Convert to DataFrame
        chunk_df = pd.DataFrame(results)

        # Save predictions
        if not os.path.exists(pred_path):
            chunk_df.to_csv(pred_path, mode='w', index=False, header=True)
        else:
            chunk_df.to_csv(pred_path, mode='a', index=False, header=False)

        # Compute metrics
        # accuracy = accuracy_score(y_true, y_pred) if y_true else 0
        # f1 = (2 * accuracy) / (1 + accuracy) if accuracy > 0 else 0
        # metrics_entry = pd.DataFrame([{"step": idx + 1, "accuracy": accuracy, "f1_score": f1}])

        # Save metrics
        # if not os.path.exists(metrics_path):
        #     metrics_entry.to_csv(metrics_path, mode='w', index=False, header=True)
        # else:
        #     metrics_entry.to_csv(metrics_path, mode='a', index=False, header=False)

        print(f"Checkpoint saved at index {idx + 1}")
        # Reset for next chunk
        results = []
        y_true = []
        y_pred = []

# Save final predictions
# results_df = pd.DataFrame({
#     "Image Path": df["full_image_path"],
#     "Question": df["question"],
#     "Ground Truth": ground_truths,
#     "Predicted": predictions
# })
# results_df = pd.DataFrame(results)

# results_df.to_csv("/kaggle/working/resultCSV/lora_blip_vqa_predictions.csv", index=False)

# # Compute final accuracy and F1 score
# correct = sum(1 for pred, truth in zip(predictions, ground_truths) if str(pred).lower() == str(truth).lower())
# total = len(predictions)
# accuracy = (correct / total) * 100 if total > 0 else 0
# f1 = (2 * accuracy) / (1 + accuracy) if accuracy > 0 else 0

# # Print results
# print(f"\nTotal Questions: {total}")
# print(f"Correct Predictions: {correct}")
# print(f"Accuracy: {accuracy:.2f}%")
# print(f"F1-Score: {f1:.2f}%")
# print(f"Skipped Entries: {len(skipped_entries)}")
# if skipped_entries:
#     print("\nSkipped Entries (due to timeout):")
#     skipped_df = pd.DataFrame(skipped_entries)
#     print(skipped_df)

# # Display a few examples
# print("\nSample Predictions:")
# print(results_df.head(10))

Prediction file /kaggle/working/resultCSV/predictions.csv is empty, starting from index 0


Processing:   8%|▊         | 1000/12000 [08:38<1:34:20,  1.94it/s]

Checkpoint saved at index 1000


Processing:  17%|█▋        | 2000/12000 [17:18<1:27:06,  1.91it/s]

Checkpoint saved at index 2000


Processing:  25%|██▌       | 3000/12000 [25:57<1:16:38,  1.96it/s]

Checkpoint saved at index 3000


Processing:  33%|███▎      | 4000/12000 [34:37<1:08:25,  1.95it/s]

Checkpoint saved at index 4000


Processing:  42%|████▏     | 5000/12000 [43:12<59:32,  1.96it/s]

Checkpoint saved at index 5000


Processing:  50%|█████     | 6000/12000 [51:49<51:44,  1.93it/s]

Checkpoint saved at index 6000


Processing:  58%|█████▊    | 7000/12000 [1:00:29<43:16,  1.93it/s]

Checkpoint saved at index 7000


Processing:  67%|██████▋   | 8000/12000 [1:09:11<34:36,  1.93it/s]

Checkpoint saved at index 8000


Processing:  75%|███████▌  | 9000/12000 [1:17:54<26:09,  1.91it/s]

Checkpoint saved at index 9000


Processing:  83%|████████▎ | 10000/12000 [1:26:37<17:42,  1.88it/s]

Checkpoint saved at index 10000


Processing:  92%|█████████▏| 11000/12000 [1:35:20<08:36,  1.94it/s]

Checkpoint saved at index 11000


Processing: 100%|██████████| 12000/12000 [1:43:58<00:00,  1.92it/s]

Checkpoint saved at index 12000





In [18]:
# val_df = val_df.iloc[1:].reset_index(drop=True)
all_actuals_BLIP = val_df['answer'].tolist()

In [None]:
# Normalize case
preds = pd.read_csv('/kaggle/working/resultCSV/predictions.csv')
preds = preds.dropna()
# preds = preds.iloc[1:].reset_index(drop=True)
all_preds_BLIP = preds['predicted_answer'].tolist()
all_actuals_BLIP = preds['true_answer'].tolist()
preds_l = [p.lower() for p in all_preds_BLIP]
refs_l = [r.lower() for r in all_actuals_BLIP]

In [9]:

# Compute exact-match binary metrics
y_pred_bin = [int(p == r) for p, r in zip(preds_l, refs_l)]
y_true_bin = [1] * len(all_actuals_BLIP)

acc = accuracy_score(y_true_bin, y_pred_bin)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true_bin, y_pred_bin, average="binary", zero_division=0
)

# Print metrics
print(f"Exact-match Accuracy: {acc:.3f}")
print(f"Exact-match Precision: {prec:.3f}")
print(f"Exact-match Recall:    {rec:.3f}")
print(f"Exact-match F1:        {f1:.3f}\n")

# Save predictions and ground truths to CSV
# pred_ref_df = pd.DataFrame({
#     "Prediction": predictions,
#     "Ground_Truth": refs
# })
# df.to_csv('prediction_output.csv', index=False)

Exact-match Accuracy: 0.596
Exact-match Precision: 1.000
Exact-match Recall:    0.596
Exact-match F1:        0.747



In [10]:
from bert_score import score

P, R, F1 = score(preds_l, refs_l, lang="en", verbose=True, rescale_with_baseline = True)
print(f"BERTScore - Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/18 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/188 [00:00<?, ?it/s]

done in 12.48 seconds, 961.83 sentences/sec
BERTScore - Precision: 0.9387, Recall: 0.9248, F1: 0.9307


In [None]:

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge1_scores = []
rougeL_scores = []

for pred, ref in zip(preds_l, refs_l):
    scores = scorer.score(ref, pred)
    rouge1_scores.append(scores["rouge1"].fmeasure)
    rougeL_scores.append(scores["rougeL"].fmeasure)

print(f"ROUGE Scores:")
print(f"ROUGE-1 F1: {np.mean(rouge1_scores):.4f}")
print(f"ROUGE-L F1: {np.mean(rougeL_scores):.4f}")

ROUGE Scores:
ROUGE-1 F1: 0.6119
ROUGE-L F1: 0.6119


In [13]:
# 5. Sentence Embedding Cosine Similarity (memory-efficient pairwise)
model = SentenceTransformer('all-MiniLM-L6-v2')
pred_embeds = model.encode(preds_l, convert_to_tensor=True, device='cuda')
ref_embeds = model.encode(refs_l, convert_to_tensor=True, device='cuda')

cos_sims = [
    util.cos_sim(pred_embeds[i], ref_embeds[i]).item()
    for i in range(len(preds_l))
]

print(f"Sentence-BERT Cosine Similarity:")
print(f"Average Cosine Similarity: {np.mean(cos_sims):.4f}")


Sentence-BERT Cosine Similarity:
Average Cosine Similarity: 0.8070


In [None]:
import shutil

# Compress each folder
shutil.make_archive("/kaggle/working/fine_tuned_blip_vqa_lora", 'zip', "/kaggle/working/fine_tuned_blip_vqa_lora")
shutil.make_archive("/kaggle/working/resultCSV", 'zip', "/kaggle/working/resultCSV")
shutil.make_archive("/kaggle/working/results", 'zip', "/kaggle/working/results")
