In [None]:
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import pandas as pd
import torch
import os
# Baseline evaluation: run BLIP VQA over every (image, question) pair in the
# dataset and score predictions by case-insensitive exact match.

# Load model and processor; .eval() puts the model in evaluation mode.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your dataset
df = pd.read_csv("/kaggle/input/merged-one/merged_vqa_dataset_output.csv")  # Update with your actual filename
# NOTE(review): read_csv turns empty cells and tokens such as "NA", "N/A" or
# "null" into NaN by default, and str(NaN) is "nan" — confirm the answer
# columns contain no such values, otherwise those rows can never match.

# Loop-invariant settings, defined once instead of on every iteration.
BASE_IMAGE_DIR = "/kaggle/input/abo-dataset/images/small"
QA_COLUMNS = (("q1", "a1"), ("q2", "a2"), ("q3", "a3"))
total_images = len(df)

all_results = []

# enumerate() gives the row position, so the progress report stays correct
# even if the DataFrame index is not 0..N-1.
for position, (_, row) in enumerate(df.iterrows()):
    full_image_path = os.path.join(BASE_IMAGE_DIR, str(row['path']))
    # Image.open() is lazy and keeps the file open; convert() returns an
    # independent in-memory image, so the handle can be released right away.
    with Image.open(full_image_path) as image_file:
        image = image_file.convert('RGB')

    # Each row carries three question/answer pairs about the same image.
    for q_col, a_col in QA_COLUMNS:
        question = row[q_col]
        gt_answer = row[a_col]
        inputs = processor(image, question, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model.generate(**inputs)
        pred_answer = processor.decode(out[0], skip_special_tokens=True)

        # Simple normalization for comparison: trim whitespace, ignore case.
        gt_answer_norm = str(gt_answer).strip().lower()
        pred_answer_norm = str(pred_answer).strip().lower()
        is_correct = (pred_answer_norm == gt_answer_norm)

        # Keep the raw (un-normalized) answers so mismatches can be inspected.
        all_results.append({
            "path": row['path'],
            "question": question,
            "gt_answer": gt_answer,
            "pred_answer": pred_answer,
            "is_correct": is_correct
        })

    if position % 100 == 0:
        print(f"Processed {position+1}/{total_images} images")

# Convert results to DataFrame (one row per question, not per image)
results_df = pd.DataFrame(all_results)

# Compute accuracy: the mean of the boolean column is the fraction correct
accuracy = results_df['is_correct'].mean()
print(f"Baseline Accuracy: {accuracy:.3f}")

# Optionally, save results
results_df.to_csv("vqa_baseline_results.csv", index=False)


In [None]:
# Spelled-out numbers mapped to their digit form, so that an answer such as
# "two" compares equal to "2".
number_map = {
    "zero": "0",
    "one": "1",
    "two": "2",
    "three": "3",
    "four": "4",
    "five": "5",
    "six": "6",
    "seven": "7",
    "eight": "8",
    "nine": "9",
    "ten": "10"
}

def normalize_answer(ans):
    """Return a canonical form of ``ans`` for exact-match answer comparison.

    Normalization steps, in order:
      1. convert to ``str``, lowercase, trim and collapse runs of whitespace;
      2. drop trailing sentence punctuation (``"yes."`` -> ``"yes"``);
      3. map a spelled-out number from ``number_map`` to digits
         (``"two"`` -> ``"2"``);
      4. reduce an integer-valued decimal to its integer part
         (``"3.0"`` -> ``"3"``), which is how pandas renders whole numbers
         stored in a float column.

    Args:
        ans: The raw answer; any object accepted by ``str()``.

    Returns:
        The normalized answer string. Answers that match none of the rules
        are returned lowercased and whitespace-normalized.
    """
    # str() lets non-string cells (ints, floats) go through the same path.
    text = " ".join(str(ans).lower().split())

    # Trailing punctuation carries no meaning for short answers.
    text = text.rstrip(".,;:!?").rstrip()

    # Convert number words to digits (whole-answer match only).
    if text in number_map:
        return number_map[text]

    # "3.0" / "3.00" -> "3"; genuine decimals such as "3.5" are left alone.
    whole, dot, fraction = text.partition(".")
    if dot and whole.isdigit() and fraction and fraction.strip("0") == "":
        return whole

    # Digit strings, yes/no and free-form answers need no further change.
    return text

In [None]:
# Evaluation with answer normalization: same BLIP VQA pass as the baseline,
# but ground truth and prediction are both passed through normalize_answer()
# (defined elsewhere in this notebook) before the exact-match comparison.

# Load model and processor; .eval() puts the model in evaluation mode.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your dataset
df = pd.read_csv("/kaggle/input/merged-one/merged_vqa_dataset_output.csv")
# NOTE(review): read_csv turns empty cells and tokens such as "NA", "N/A" or
# "null" into NaN by default — confirm the answer columns contain no such
# values, otherwise those rows are compared against the string "nan".

all_results = []

BASE_IMAGE_DIR = "/kaggle/input/abo-dataset/images/small"
# Question/answer column pairs, built once instead of re-zipped per image.
QA_COLUMNS = (("q1", "a1"), ("q2", "a2"), ("q3", "a3"))
total_images = len(df)

# enumerate() gives the row position, so the progress report stays correct
# even if the DataFrame index is not 0..N-1.
for position, (_, row) in enumerate(df.iterrows()):
    full_image_path = os.path.join(BASE_IMAGE_DIR, str(row['path']))
    # Image.open() is lazy and keeps the file open; convert() returns an
    # independent in-memory image, so the handle can be released right away.
    with Image.open(full_image_path) as image_file:
        image = image_file.convert('RGB')

    # Each row carries three question/answer pairs about the same image.
    for q_col, a_col in QA_COLUMNS:
        question = row[q_col]
        gt_answer = row[a_col]
        inputs = processor(image, question, return_tensors="pt").to(device)
        with torch.no_grad():
            out = model.generate(**inputs)
        pred_answer = processor.decode(out[0], skip_special_tokens=True)

        # Robust normalization for comparison
        gt_answer_norm = normalize_answer(gt_answer)
        pred_answer_norm = normalize_answer(pred_answer)
        is_correct = (pred_answer_norm == gt_answer_norm)

        # Keep the raw (un-normalized) answers so mismatches can be inspected.
        all_results.append({
            "path": row['path'],
            "question": question,
            "gt_answer": gt_answer,
            "pred_answer": pred_answer,
            "is_correct": is_correct
        })

    if position % 100 == 0:
        print(f"Processed {position+1}/{total_images} images")

# Convert results to DataFrame (one row per question, not per image)
results_df = pd.DataFrame(all_results)

# Compute accuracy: the mean of the boolean column is the fraction correct
accuracy = results_df['is_correct'].mean()
print(f"Baseline Accuracy: {accuracy:.3f}")

# Optionally, save results
results_df.to_csv("vqa_baseline_results_normalized.csv", index=False)