In [None]:
!pip install -q transformers==4.43.3 accelerate datasets peft bitsandbytes pillow evaluate trl==0.9.4

In [None]:
!pip install evaluate -q

In [None]:
!pip install "trl==0.9.4" -q

In [None]:
import pandas as pd
from datasets import Dataset, Image

# Load the caption data from the Kaggle parquet export.
parquet_path = "/kaggle/input/appron-prompt-dataset/dataset.parquet"
df = pd.read_parquet(parquet_path)

# Convert to a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Cast the 'image' column to the datasets Image feature (decode=True).
dataset = dataset.cast_column("image", Image(decode=True))

# Hold out 10% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts so eval numbers are comparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [None]:
def format_data(sample):
    """Wrap one dataset row in a three-turn chat conversation.

    Reads ``sample["image"]`` and ``sample["caption"]`` and returns a dict whose
    single ``"messages"`` key holds, in order, a system turn, a user turn that
    carries the image, and an assistant turn that carries the caption.
    """
    system_turn = {
        "role": "system",
        "content": "You are an assistant that describes images.",
    }
    user_turn = {
        "role": "user",
        "content": [{"type": "image", "image": sample["image"]}],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["caption"]}],
    }
    return {"messages": [system_turn, user_turn, assistant_turn]}

In [None]:
# Materialise each split as a plain Python list of {"messages": [...]} dicts.
# NOTE(review): after this cell train_dataset/test_dataset are lists, not
# datasets.Dataset objects. The output of a later cell in this notebook shows
# SFTTrainer raising "'list' object has no attribute 'column_names'" for a list
# built the same way — confirm the trainer used here accepts lists.
train_dataset = [format_data(sample) for sample in train_dataset]
test_dataset = [format_data(sample) for sample in test_dataset]

In [None]:
from transformers import AutoProcessor

# NOTE: `model` holds the checkpoint id string here, not a loaded model object.
model = "Intel/llava-gemma-2b"
# Processor for the checkpoint; later code reads its tokenizer via `processor.tokenizer`.
processor = AutoProcessor.from_pretrained(model)

def preprocess(sample):
    """Tokenise one chat sample with the module-level ``processor``.

    Args:
        sample: mapping with a ``"messages"`` list in chat format.

    Returns:
        dict mapping each key of the templated output (e.g. ``input_ids``)
        to its first row, i.e. with the leading batch dimension of 1 removed.
    """
    proc = processor.apply_chat_template(
        sample["messages"],
        tokenize=True,
        add_generation_prompt=False,
        return_tensors="pt",
        # Without return_dict=True the tokenised template comes back as a bare
        # tensor of token ids, which has no .items() to iterate below.
        return_dict=True,
    )
    return {k: v[0] for k, v in proc.items()}

# train_dataset = train_dataset.map(preprocess, remove_columns=train_dataset.column_names)
# test_dataset = test_dataset.map(preprocess, remove_columns=test_dataset.column_names)

In [None]:
from transformers import TrainingArguments

# Trainer hyper-parameters for the first fine-tuning attempt.
# Batch size 1 with 8 accumulation steps gives an effective per-device batch of 8.
# Logs every 10 steps, evaluates every 200, checkpoints every 500 and keeps
# at most 2 checkpoints on disk.
# NOTE(review): bf16=True needs bfloat16-capable hardware — confirm for the target GPU.
training_args = TrainingArguments(
    output_dir="./llava-gemma-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=500,
    save_total_limit=2,
    bf16=True,
    report_to="none",   # no wandb
    push_to_hub=False
)


In [None]:
import evaluate
import math

bleu = evaluate.load("bleu")

def compute_metrics(eval_pred):
    """Compute token-level perplexity and BLEU for an evaluation pass.

    Args:
        eval_pred: ``(logits, labels)`` pair. Assumes logits are shaped
            (batch, seq, vocab) and labels (batch, seq) with ``-100`` on
            positions to ignore — TODO confirm against the trainer in use.

    Returns:
        dict with ``"perplexity"`` (exp of the mean negative log-likelihood over
        non-ignored label tokens; ``nan`` when it cannot be computed, ``inf``
        when the mean NLL is >= 20) and ``"bleu"``.
    """
    import numpy as np  # local import: this cell only imports evaluate and math

    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # -100 is not a valid token id; swap it for a real id before decoding.
    pad_id = processor.tokenizer.pad_token_id
    if pad_id is None:
        pad_id = processor.tokenizer.eos_token_id

    perplexity = float("nan")
    pred_ids = logits.argmax(-1)

    if logits.shape[:-1] == labels.shape and labels.shape[-1] > 1:
        # Causal LM: the logits at position t score the token at position t + 1.
        shift_logits = logits[..., :-1, :].astype(np.float32, copy=False)
        shift_labels = labels[..., 1:]
        mask = shift_labels != -100

        if mask.any():
            # Numerically stable log-softmax, evaluated only at the target token.
            peak = shift_logits.max(-1, keepdims=True)
            log_z = peak.squeeze(-1) + np.log(np.exp(shift_logits - peak).sum(-1))
            safe_targets = np.where(mask, shift_labels, 0)
            target_logit = np.take_along_axis(
                shift_logits, safe_targets[..., None], axis=-1
            ).squeeze(-1)
            nll = float((log_z - target_logit)[mask].mean())
            perplexity = math.exp(nll) if nll < 20 else float("inf")

        # Compare predictions and references on the same (non-ignored) positions.
        pred_ids = np.where(mask, pred_ids[..., :-1], pad_id)
        label_ids = np.where(mask, shift_labels, pad_id)
    else:
        # Shapes do not line up (NOTE(review): may happen if the model expands
        # image placeholders — confirm); skip perplexity, still sanitise labels.
        label_ids = np.where(labels == -100, pad_id, labels)

    # Decode predictions & labels
    predictions = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    references = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    bleu_score = bleu.compute(predictions=predictions, references=[[r] for r in references])
    return {"perplexity": perplexity, "bleu": bleu_score["bleu"]}


In [None]:
from transformers import AutoModelForVision2Seq, TrainingArguments
from trl import SFTTrainer
# Build the SFT trainer from the objects defined in earlier cells.
# NOTE(review): on a top-to-bottom run `model` is the checkpoint id string
# assigned next to the processor, and train/test datasets are Python lists —
# confirm SFTTrainer accepts both in the pinned TRL version.
# NOTE(review): this cell only constructs the trainer; it does not call trainer.train().
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    packing=False
)


In [None]:
import transformers, trl
# Report the installed library versions so runs can be reproduced.
print(f"Transformers: {transformers.__version__}")
print(f"TRL: {trl.__version__}")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Convert the trainer's log history to a DataFrame (one row per logged event).
logs = pd.DataFrame(trainer.state.log_history)

# Training loss. Guarded like the eval curve below: the "loss" column only
# exists once at least one training step has been logged, and indexing a
# missing column would raise KeyError.
if "loss" in logs.columns:
    train_loss = logs[logs["loss"].notnull()]
    plt.plot(train_loss["step"], train_loss["loss"], label="Training Loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.title("Training Loss Curve")
    plt.legend()
    plt.show()
else:
    print("No training loss has been logged yet; run trainer.train() before plotting.")

# Evaluation metrics
if "eval_loss" in logs.columns:
    eval_logs = logs[logs["eval_loss"].notnull()]
    plt.plot(eval_logs["step"], eval_logs["eval_loss"], label="Eval Loss")
    plt.xlabel("Step")
    plt.ylabel("Eval Loss")
    plt.title("Evaluation Loss Curve")
    plt.legend()
    plt.show()


In [None]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torchvision import transforms
from tqdm.notebook import tqdm
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM, get_scheduler
import numpy as np

# --------------------------
# CONFIG
# --------------------------
MODEL_ID = "Intel/llava-gemma-2b"
BATCH_SIZE = 2
NUM_EPOCHS = 2
LR = 2e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LENGTH = 1024  # token budget per sample, passed to the collator

# --------------------------
# LOAD MODEL + PROCESSOR
# --------------------------
# The checkpoint is a LLaVA vision-language model (a later cell of this notebook
# loads it with LlavaForConditionalGeneration), so it must be instantiated with
# the LLaVA class rather than a text-only causal-LM auto class; otherwise the
# vision tower and projector that consume `pixel_values` are not built.
from transformers import LlavaForConditionalGeneration

processor = AutoProcessor.from_pretrained(MODEL_ID)
model = LlavaForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)

# --------------------------
# DUMMY DATASET (replace with your JSON/CSV)
# Each sample has: {"image": <path>, "messages": [{"role":"user","content":"..."} , {"role":"assistant","content":"..."}]}
# --------------------------
class MyDataset(Dataset):
    """Map-style dataset that serves samples from an in-memory sequence."""

    def __init__(self, samples):
        # Only a reference is kept; nothing is copied or validated here.
        self.samples = samples

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        stored = self.samples
        return stored[idx]

    def __len__(self):
        """Return how many samples are stored."""
        stored = self.samples
        return len(stored)

# --------------------------
# COLLATE FN
# --------------------------
class Collator:
    """Turn a list of ``{"image", "messages"}`` samples into one padded batch.

    For every sample the chat template is tokenised, the image is converted to
    ``pixel_values``, and ``labels`` are a copy of ``input_ids`` in which
    everything except the final assistant turn is set to ``-100``.

    Args:
        processor: object exposing ``apply_chat_template``, ``tokenizer`` and
            (preferably) ``image_processor``.
        max_length: per-sample cap applied to input ids, attention mask and labels.
    """

    def __init__(self, processor, max_length=1024):
        self.processor = processor
        self.max_length = max_length

    def _tokenize(self, messages, add_generation_prompt=False):
        """Apply the chat template and return the dict of batched tensors."""
        return self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=add_generation_prompt,
            return_tensors="pt",
            # Needed to get input_ids AND attention_mask; the default returns
            # only a tensor of token ids, which cannot be indexed by key.
            return_dict=True,
        )

    def __call__(self, batch):
        """Collate ``batch`` into input_ids / attention_mask / labels / pixel_values."""
        input_ids_list, attn_list, pixel_values_list, labels_list = [], [], [], []

        # Use the image processor directly when available: calling the full
        # processor without text would also invoke the tokenizer on no input.
        image_processor = getattr(self.processor, "image_processor", self.processor)

        for sample in batch:
            # Load image; close the file handle once the RGB copy exists.
            img = sample["image"]
            if isinstance(img, str):   # path
                with Image.open(img) as handle:
                    img = handle.convert("RGB")

            # Apply chat template
            messages = sample["messages"]
            proc = self._tokenize(messages)
            input_ids = proc["input_ids"][0]
            attention_mask = proc["attention_mask"][0]

            # Get image features
            img_out = image_processor(images=img, return_tensors="pt")
            pixel_values = img_out["pixel_values"][0]

            # Labels = copy of input_ids with every non-assistant token masked.
            labels = input_ids.clone()
            if messages[-1]["role"] == "assistant":
                if len(messages) > 1:
                    # Mask the prompt prefix: everything the template emits for
                    # the earlier turns plus the assistant header. Measuring the
                    # prefix avoids miscounting BOS/template tokens, which
                    # templating the assistant turn alone would include.
                    prompt = self._tokenize(messages[:-1], add_generation_prompt=True)
                    prompt_len = min(prompt["input_ids"].shape[1], labels.shape[0])
                    labels[:prompt_len] = -100
            else:
                labels[:] = -100

            # Truncate
            input_ids = input_ids[:self.max_length]
            attention_mask = attention_mask[:self.max_length]
            labels = labels[:self.max_length]

            input_ids_list.append(input_ids)
            attn_list.append(attention_mask)
            labels_list.append(labels)
            pixel_values_list.append(pixel_values)

        # Pad text with the tokenizer of the processor this collator was built
        # with, not whatever global `processor` happens to exist.
        input_ids_padded = torch.nn.utils.rnn.pad_sequence(
            input_ids_list, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id
        )
        attn_padded = torch.nn.utils.rnn.pad_sequence(
            attn_list, batch_first=True, padding_value=0
        )
        labels_padded = torch.nn.utils.rnn.pad_sequence(
            labels_list, batch_first=True, padding_value=-100
        )

        pixel_values = torch.stack(pixel_values_list)  # one stacked tensor for the batch

        return {
            "input_ids": input_ids_padded,
            "attention_mask": attn_padded,
            "labels": labels_padded,
            "pixel_values": pixel_values,
        }

# --------------------------
# TRAINING LOOP
# --------------------------
def train_fn(train_dataset):
    """Fine-tune the module-level ``model`` on ``train_dataset`` and save its weights.

    Uses the module-level ``processor``, ``BATCH_SIZE``, ``NUM_EPOCHS``, ``LR``,
    ``DEVICE`` and ``MAX_LENGTH``. Runs AdamW with a linear schedule, stepping the
    optimizer after every batch, and writes the final ``state_dict`` to
    ``llava_gemma2b_finetuned.pt`` in the working directory.

    Args:
        train_dataset: dataset whose items the ``Collator`` can batch.
    """
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=Collator(processor, max_length=MAX_LENGTH),
    )

    optimizer = AdamW(model.parameters(), lr=LR)
    num_training_steps = NUM_EPOCHS * len(train_loader)
    # Cap warmup at 10% of the run (and at 100 steps). With a fixed 100-step
    # warmup, any run shorter than 100 steps never leaves the warmup ramp and
    # therefore never trains at more than a small fraction of LR.
    num_warmup_steps = min(100, num_training_steps // 10)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
    )

    model.train()
    for epoch in range(NUM_EPOCHS):
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
        for step, batch in enumerate(pbar):
            # Every collated value is a tensor, so the whole batch can be moved.
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            pbar.set_postfix({"loss": loss.item()})

    torch.save(model.state_dict(), "llava_gemma2b_finetuned.pt")

# --------------------------
# EXAMPLE USAGE
# --------------------------
# Two placeholder samples in the shape the Collator reads: an "image" entry and
# a "messages" list ending with the assistant turn.
# NOTE: the image paths are placeholders; the Collator opens string paths with
# PIL, so they must be replaced with real files before this cell is run.
samples = [
    {
        "image": "your_image1.jpg",
        "messages": [
            {"role": "user", "content": "What is in this picture?"},
            {"role": "assistant", "content": "A cat sitting on a sofa."},
        ],
    },
    {
        "image": "your_image2.jpg",
        "messages": [
            {"role": "user", "content": "Describe the object."},
            {"role": "assistant", "content": "A red car on the street."},
        ],
    },
]

# Wrap the samples and start training immediately when the cell executes.
dataset = MyDataset(samples)
train_fn(dataset)


In [None]:
!pip install torch tensorboard pillow transformers datasets accelerate evaluate bitsandbytes trl peft

In [None]:
!pip install -q transformers==4.43.3 accelerate datasets peft bitsandbytes pillow evaluate trl==0.9.4

In [4]:
# Fine-Tuning LLaVA-Gemma-2B on an Image-Caption Dataset using Hugging Face TRL and PEFT
# Updated to fix RuntimeError: only Tensors of floating point dtype can require gradients
# by dequantizing the multi_modal_projector before applying PEFT

# Step 1: Install required libraries
# Run in your environment:
# pip install torch tensorboard pillow transformers datasets accelerate evaluate bitsandbytes trl peft

import torch
from huggingface_hub import login
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import BitsAndBytesConfig, LlavaForConditionalGeneration, AutoProcessor, TrainingArguments
from trl import SFTTrainer

# Step 2: Log in to Hugging Face (if pushing model)
# login(token="your_hf_token_here")  # Replace with your Hugging Face token

# Step 3: Define the model ID
model_id = "Intel/llava-gemma-2b"

# Step 4: Load the processor
processor = AutoProcessor.from_pretrained(model_id)

# Step 5: Prepare the dataset
# The image-caption data is read from a local parquet file that provides an
# 'image' column and a 'caption' column. Replace the path with your own dataset.
import pandas as pd
from datasets import Dataset, Image

# Load the parquet file into a pandas DataFrame
parquet_path = "/kaggle/input/appron-prompt-dataset/dataset.parquet"
df = pd.read_parquet(parquet_path)

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Cast the 'image' column as Image feature (decode=True)
# NOTE: no train/test split is made in this cell; the full dataset is used below.
dataset = dataset.cast_column("image", Image(decode=True))

# Format the dataset into conversational format for TRL
system_message = "You are a helpful image captioning assistant."
user_prompt = "Describe the image in detail."

def format_example(sample):
    """Build the system / user / assistant conversation for one dataset row.

    Args:
        sample: mapping with an ``"image"`` entry and a ``"caption"`` string.

    Returns:
        dict with a single ``"messages"`` key: a system turn carrying
        ``system_message``, a user turn carrying the image followed by
        ``user_prompt``, and an assistant turn carrying this row's caption.
    """
    return {
        "messages": [
            {"role": "system", "content": [{"type": "text", "text": system_message}]},
            {"role": "user", "content": [
                {"type": "image", "image": sample["image"]},
                {"type": "text", "text": user_prompt}
            ]},
            # Use this row's caption; indexing the whole dataset here would put
            # the entire caption column into every example.
            {"role": "assistant", "content": [{"type": "text", "text": sample["caption"]}]}
        ]
    }

# Materialise every row as a conversation dict. The result is a plain Python
# list, not a datasets.Dataset.
# NOTE(review): this cell's output ends with "'list' object has no attribute
# 'column_names'" — presumably raised when this list reaches SFTTrainer; confirm.
formatted_dataset = [format_example(sample) for sample in dataset]

# Step 6: Load the model with 4-bit quantization for efficiency
# NF4 quantization with double quantization enabled; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# device_map="auto" leaves device placement of the weights to the library.
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# Step 6.5: Dequantize the multi_modal_projector to avoid gradient error
# Since multi_modal_projector will be in modules_to_save, its parameters need to be float dtype
from bitsandbytes.functional import dequantize_4bit

def dequantize_module(module):
    """Return a plain bfloat16 ``torch.nn.Linear`` equivalent of a 4-bit linear layer.

    Args:
        module: layer to convert. It is returned unchanged when it has no
            ``weight`` or when its weight carries no ``quant_state`` (i.e. it
            was not 4-bit quantized), so the function is safe to call on
            layers that were skipped by quantization.

    Returns:
        A new ``torch.nn.Linear`` on the same device holding the dequantized
        weight (and the original bias, cast to the same dtype), or ``module``
        itself when there is nothing to dequantize.
    """
    weight = getattr(module, "weight", None)
    quant_state = getattr(weight, "quant_state", None)
    if weight is None or quant_state is None:
        return module

    dequant_weight = dequantize_4bit(weight.data, quant_state).to(torch.bfloat16)
    bias = module.bias
    # Get device from parameters
    device = next(module.parameters()).device
    new_module = torch.nn.Linear(
        module.in_features,
        module.out_features,
        bias=bias is not None,
        dtype=dequant_weight.dtype,
        device=device
    )
    new_module.weight.data = dequant_weight
    if bias is not None:
        # Keep bias dtype/device aligned with the weight so the layer does not
        # mix dtypes in its forward pass.
        new_module.bias.data = bias.data.to(dtype=dequant_weight.dtype, device=device)
    return new_module

# Apply to both linear layers in the projector, replacing them in place on the model
model.multi_modal_projector.linear_1 = dequantize_module(model.multi_modal_projector.linear_1)
model.multi_modal_projector.linear_2 = dequantize_module(model.multi_modal_projector.linear_2)

# Step 7: Prepare for LoRA fine-tuning
# Rank-32 adapters on the attention projections; the projector is listed in
# modules_to_save, which is why it was dequantized above.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],  # Target LLM modules
    modules_to_save=["multi_modal_projector"]  # Save the vision-language connector
)

# Order matters: the quantized model is prepared first, then wrapped with the adapters.
# NOTE(review): the target module names may also match projections inside the
# vision tower — confirm which submodules receive adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Step 8: Set up training arguments using TrainingArguments
# Batch size 2 with 4 accumulation steps gives an effective per-device batch of 8.
# NOTE(review): evaluation_strategy="steps" and load_best_model_at_end=True both
# need an evaluation set, but the SFTTrainer call below passes no eval_dataset — confirm.
# NOTE(review): fp16=True here, while the quantization compute dtype and the
# dequantized projector are bfloat16 — confirm the intended precision.
training_args = TrainingArguments(
    output_dir="./llava-gemma-2b-finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=2,  # Adjust based on GPU memory (Gemma-2B is small)
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.1,
    max_grad_norm=1.0,
    optim="paged_adamw_8bit",
    fp16=True,
    remove_unused_columns=False,
    report_to="tensorboard",
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    push_to_hub=False  # Set to True to push to HF
)

# Step 9: Initialize the SFTTrainer, passing SFT-specific args here
# NOTE(review): train_dataset is a plain list and this cell's recorded output
# ends with "'list' object has no attribute 'column_names'" — the trainer
# presumably needs a datasets.Dataset or a setting that skips dataset preparation; confirm.
# NOTE(review): dataset_text_field points at "messages", which holds a list of
# turn dicts rather than a text string; no data_collator is supplied for the
# image inputs — confirm how images are expected to reach the model.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,
    peft_config=lora_config,  # Already applied, but for reference
    tokenizer=processor.tokenizer,  # Use the tokenizer from processor
    dataset_text_field="messages",
    packing=False,  # No packing for VLM
    max_seq_length=512  # Adjust as needed
)

# Step 10: Start fine-tuning
trainer.train()

# Step 11: Save the fine-tuned model
# The processor is saved to the same directory as the model.
trainer.save_model("./llava-gemma-2b-finetuned")
processor.save_pretrained("./llava-gemma-2b-finetuned")

# Explanation of Fix:
# The TypeError occurs because in older versions of TRL (e.g., <= v0.8.x), SFTConfig does not support parameters like max_seq_length, dataset_text_field, or packing.
# These are SFTTrainer-specific arguments.
# By switching to transformers.TrainingArguments for the core training params and passing the SFT-specific args directly to SFTTrainer, the code becomes compatible with both old and new TRL versions.
# If you update TRL to the latest version (pip install -U trl), you co

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


AttributeError: 'list' object has no attribute 'column_names'