In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset archive
# from, and move the trained weights to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# -q keeps the per-file listing out of the cell output; -n skips files that
# already exist, so re-running the cell never stalls on an overwrite prompt.
!unzip -q -n /content/drive/MyDrive/VQA.zip

In [None]:
import os
import torch
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # uncomment to pin the run to GPU 0

# Credentials must never be committed inside a notebook. Reuse a key that is
# already exported in the environment; otherwise prompt for it without echoing
# it into the cell output. The key that used to be hardcoded here should be
# treated as leaked and rotated.
from getpass import getpass

if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

In [None]:
import torch
from torch.utils.data import Subset,DataLoader
from transformers import TrainingArguments, Trainer,BlipProcessor, BlipForQuestionAnswering
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset,load_dataset
from PIL import Image
import pandas as pd
from torchvision import transforms
from sklearn.model_selection import train_test_split
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')

In [None]:
from torch.utils.data import Dataset
# from datasets import Dataset
from PIL import Image
import os
class VQADataset(Dataset):
    """Map-style dataset that turns VQA rows into model-ready tensors.

    Every row supplies an image path, a question and a reference answer.
    ``__getitem__`` loads the image, runs image + question through the
    processor and tokenizes the answer into ``labels``.
    """

    def __init__(self, data, processor, max_question_length=64, max_answer_length=16):
        """
        Args:
            data (pandas.DataFrame): Rows with the columns 'image_path',
                'question' and 'response'. Rows are read positionally
                through ``.iloc``.
            processor: Callable accepting ``images=`` and ``text=`` that also
                exposes a ``tokenizer`` attribute (e.g. a ``BlipProcessor``).
            max_question_length (int): Length the processed question is
                padded/truncated to.
            max_answer_length (int): Length the tokenized answer is
                padded/truncated to.
        """
        self.data = data
        self.processor = processor
        self.max_question_length = max_question_length
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load and preprocess the sample at positional index ``idx``.

        Returns:
            dict: The processor outputs with the leading batch dimension
            squeezed away, plus ``'labels'`` holding the answer token ids.

        Raises:
            FileNotFoundError: The row's image path does not exist.
            KeyError: A required column is missing from the row.
            Exception: Any other failure (image decoding, processing) is
                logged and re-raised unchanged.
        """
        try:
            item = self.data.iloc[idx]

            image_path = str(item['image_path'])
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image not found: {image_path}")

            try:
                image = Image.open(image_path).convert("RGB")
            except Exception as img_err:
                # Logged here with the offending path; the re-raised error is
                # reported once more by the generic handler at the bottom.
                print(f"[ERROR] Unable to open image at index {idx}: {image_path}, Error: {str(img_err)}")
                raise

            question = str(item["question"])
            answer = str(item["response"])

            inputs = self.processor(
                images=image,
                text=question,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=self.max_question_length,
            )

            # The processor returns batched tensors; drop the batch dimension
            # so the collate function can build the batch itself.
            inputs = {k: v.squeeze(0) for k, v in inputs.items()}

            # Use the processor owned by this dataset, not a notebook-level
            # global, so the class works wherever it is instantiated.
            # Padding positions keep whatever id the tokenizer pads with;
            # no masking is applied here.
            answer_ids = self.processor.tokenizer(
                answer,
                return_tensors="pt",
                padding='max_length',
                max_length=self.max_answer_length,
                truncation=True,
            ).input_ids
            inputs["labels"] = answer_ids.squeeze(0)

            return inputs

        except FileNotFoundError as e:
            print(f"[ERROR] FileNotFoundError: {str(e)}")
            raise
        except KeyError as e:
            print(f"[ERROR] KeyError: Missing key in data at index {idx}: {str(e)}")
            raise
        except Exception as e:
            print(f"[ERROR] Unexpected error at index {idx}: {str(e)}")
            raise

In [None]:
# Read the train/validation tables from the working directory.
train_df = pd.read_csv("train_dataset.csv")
val_df = pd.read_csv("val.csv")

# Processor for the Salesforce/blip-vqa-base checkpoint, shared by both splits.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Training is restricted to the first 10,000 rows; validation uses every row.
train_dataset = VQADataset(train_df.iloc[:10000], processor)
val_dataset = VQADataset(val_df, processor)

In [None]:
# Sanity check: fetch one sample and inspect what the dataset produces.
sample_item = train_dataset[0]

# Expected keys: 'input_ids', 'attention_mask', 'pixel_values', 'labels'
print(sample_item.keys())

# Shape of each tensor, then the raw sample itself.
for field in ("input_ids", "labels", "pixel_values", "attention_mask"):
    print(sample_item[field].shape)
print(sample_item)

In [None]:
pip install -U bitsandbytes

In [None]:
from transformers import BlipForQuestionAnswering, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# Fail early when no GPU is present: the 8-bit path below needs CUDA.
assert torch.cuda.is_available(), "8-bit quantization requires CUDA (GPU)."

# --- Configuration -----------------------------------------------------------
# Load the checkpoint weights in 8-bit.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Rank-16 LoRA adapters on every module whose name matches "query" or "value";
# bias terms are not trained.
lora_config = LoraConfig(
    target_modules=["query", "value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# --- Model -------------------------------------------------------------------
# Quantized base model, placed on the available device(s) automatically.
model = BlipForQuestionAnswering.from_pretrained(
    "Salesforce/blip-vqa-base",
    quantization_config=bnb_config,
    device_map="auto",
)

# PEFT preparation step for k-bit training (gradient checkpointing left off),
# followed by wrapping the model with the LoRA adapters.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
model = get_peft_model(model, lora_config)


In [None]:
def collate_fn(batch):
    """Assemble a list of dataset samples into one batch of tensors.

    Args:
        batch (list[dict]): Samples carrying the tensors 'pixel_values',
            'input_ids', 'attention_mask' and 'labels'.

    Returns:
        dict: 'pixel_values', 'input_ids' and 'attention_mask' stacked along a
        new leading dimension (so they must already share a shape), and
        'labels' padded at the end with -100 up to the longest label sequence
        in the batch.
    """
    collated = {
        field: torch.stack([sample[field] for sample in batch])
        for field in ("pixel_values", "input_ids", "attention_mask")
    }
    # Labels are the only field allowed to differ in length across samples.
    collated["labels"] = torch.nn.utils.rnn.pad_sequence(
        [sample["labels"] for sample in batch],
        batch_first=True,
        padding_value=-100,
    )
    return collated

In [None]:

from transformers import DefaultDataCollator,PrinterCallback,EarlyStoppingCallback,ProgressCallback

# Evaluate, checkpoint and log once per epoch so that `load_best_model_at_end`
# can restore the checkpoint with the lowest evaluation loss.
args = TrainingArguments(
    # NOTE(review): Kaggle-style path although this notebook mounts Google
    # Drive (Colab); confirm this is the intended checkpoint location.
    output_dir="/kaggle/working/output_dir",
    eval_strategy="epoch",
    # Trailing whitespace removed from the run name.
    # NOTE(review): the name says "40k" but the dataset cell keeps 10,000 rows; confirm.
    run_name="blipvqabase-40k-16-rankqlora",
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    fp16=True,
    label_names=["labels"],
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    load_best_model_at_end=True,
    warmup_ratio=0.1,
    save_strategy="epoch",
    logging_strategy="epoch",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    lr_scheduler_type="cosine",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn,
    # Trainer installs its own progress/printing callback, so passing
    # PrinterCallback/ProgressCallback again only duplicates the output.
    # Early stopping: halt after 2 evaluations without an eval-loss
    # improvement of at least 0.001.
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.001),
    ],
)

In [None]:
# Run fine-tuning with the Trainer configured above; as the last expression of
# the cell, the value returned by train() is displayed as the cell output.
print("Training Started")
trainer.train()

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# After training: directory (relative to the working directory) that receives
# the artifacts needed to reload the fine-tuned model.
output_dir = "weights-qlora"

# 1) Save the model. `model` is the PEFT-wrapped model created earlier, so this
#    presumably writes the LoRA adapter weights and adapter config rather than
#    the full base model. NOTE(review): confirm against the saved files.
model.save_pretrained(output_dir)

# 2) Save the processor next to the model so inference can rebuild the same
#    image/text preprocessing from the same directory.
processor.save_pretrained(output_dir)

In [None]:
# Bundle the saved weights directory recursively into a single archive.
!zip -r weights.zip "weights-qlora"

In [None]:
# Move the archive into the mounted Google Drive folder.
!mv weights.zip /content/drive/MyDrive/
