In [None]:
import json
import os

from datasets import load_dataset, Dataset
from PIL import Image
import pandas as pd
# NOTE(review): unsloth stays ahead of torch/transformers, as in the original order — confirm before reordering.
from unsloth import FastVisionModel
from unsloth.trainer import UnslothVisionDataCollator
import torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import TrainingArguments, Trainer

# Load and Format Dataset

In [None]:
# Read the JSON-lines label file into pandas, then wrap it as a `datasets.Dataset`.
labels_path = "./data/train_labels.jsonl"
df = pd.read_json(labels_path, lines=True)
dataset = Dataset.from_pandas(df)

# Lift pandas' row/column display limits only for this print, so the whole table is shown.
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    print(df)

In [None]:
# Function to load images
def load_image(example, raw_data_dir: str = './data'):
    """Attach the decoded RGB image for one dataset record.

    Args:
        example: Mapping with a "path" entry giving the image location
            relative to ``raw_data_dir``.
        raw_data_dir: Directory that the record's "path" is joined onto.

    Returns:
        The same ``example`` mapping, with an added "image" entry holding the
        image converted to RGB.
    """
    image_path = os.path.join(raw_data_dir, example["path"])
    # Use a context manager so the underlying file is closed deterministically;
    # `convert` produces a separate in-memory image that outlives the `with` block.
    with Image.open(image_path) as source_image:
        example["image"] = source_image.convert("RGB")
    return example

In [None]:
# Run `load_image` over every record so each one gains an "image" entry.
dataset = dataset.map(function=load_image)

# Select Vision Language Model

In [None]:
# Checkpoint identifier and sequence-length setting.
# NOTE(review): max_seq_length is defined here but not passed to any call in this cell.
model_name = "unsloth/Llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit"
max_seq_length = 512

# Load the model (4-bit) together with its tokenizer; the returned `tokenizer`
# is later called with both an image and text.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name, load_in_4bit=True, use_gradient_checkpointing="unsloth"
)

# Adapter hyper-parameters, kept together so they are easy to tune.
lora_settings = dict(r=8, lora_alpha=16, lora_dropout=0, bias="none", random_state=42)

# Attach PEFT adapters: vision layers are left untouched, while the language
# layers, attention modules and MLP modules are marked for fine-tuning.
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=False,
    finetune_language_layers=True,
    finetune_attention_modules=True,
    finetune_mlp_modules=True,
    **lora_settings,
)

In [None]:
instruct_prompt = 'You are an expert at inspecting power grid infrastructure. Specifically, you analyze images and determine if there is or is not damage from a woodpecker to the wooden utility/power pole(s). Your output must be a valid Json object, with only one key, "has_woodpecker_damage", mapping to a boolean true or false.'

def generate_conversation(sample):
    """Build one user/assistant chat example from a labelled record.

    Args:
        sample: Mapping with an "image" entry and a boolean-like
            "has_woodpecker_damage" entry (assumed to be a bool or 0/1 —
            string labels would need explicit parsing first).

    Returns:
        ``{"messages": [...]}`` where the user turn carries the instruction
        text plus the image, and the assistant turn carries the expected
        answer as a JSON object string.
    """
    # Serialize with json.dumps so the target is valid JSON (`true`/`false`);
    # formatting the Python bool directly would emit `True`/`False`, which
    # contradicts the "valid Json object" requirement in the prompt.
    answer_json = json.dumps(
        {"has_woodpecker_damage": bool(sample["has_woodpecker_damage"])}
    )
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruct_prompt},
            {"type" : "image", "image" : sample["image"]} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type" : "text",  "text"  : answer_json} ]
        },
    ]
    return { "messages" : conversation }

In [None]:
# Convert every record into the chat-message format, preserving dataset order.
conversational_dataset = list(map(generate_conversation, dataset))

In [None]:
# Display the first generated conversation record as a quick sanity check.
conversational_dataset[0]

In [None]:
from transformers import TextStreamer

# Put the model into inference mode before generating.
FastVisionModel.for_inference(model)

# Use the first dataset image as the probe input.
image = dataset[0]["image"]

# Single user turn: the instruction text followed by an image placeholder.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": instruct_prompt},
            {"type": "image"},
        ],
    }
]

# Render the chat template to a prompt string, then encode image + prompt together.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

# Stream the generated tokens to stdout as they are produced, omitting the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
generation_options = dict(
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)
_ = model.generate(**inputs, **generation_options)

In [None]:
from trl import SFTTrainer, SFTConfig

In [None]:
class VisionLanguageDataset(torch.utils.data.Dataset):
    """Map-style dataset that encodes each (image, text) pair for training.

    Args:
        dataset: Indexable collection whose items expose "image" and "text".
            NOTE(review): the visible cells only add "image" to the records;
            confirm that a "text" field exists before using this class.
        tokenizer: Callable invoked as ``tokenizer(image, text, ...)`` — the
            same positional convention the inference cell uses — returning a
            mapping of tensors with a leading batch dimension of 1.
        max_length: Length every sequence is padded/truncated to. Without an
            explicit value, ``padding="max_length"`` falls back to the
            tokenizer's own maximum.
    """

    def __init__(self, dataset, tokenizer, max_length: int = 512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of underlying samples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode sample ``idx`` and return per-sample tensors plus labels.

        Returns:
            Dict holding every tensor the tokenizer produced (batch dimension
            squeezed away) and a "labels" tensor copied from "input_ids" with
            padded positions set to -100 so they are ignored by the loss.
        """
        sample = self.dataset[idx]

        # Encode the image together with the text so the image is preprocessed
        # by the same object instead of being returned as a raw PIL image.
        encoded = self.tokenizer(
            sample["image"],
            sample["text"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # Drop the batch dimension of 1 so a collator can stack the samples.
        item = {key: value.squeeze(0) for key, value in encoded.items()}

        # Supervised targets: predict the input tokens, skipping padding.
        labels = item["input_ids"].clone()
        labels[item["attention_mask"] == 0] = -100
        item["labels"] = labels
        return item

In [None]:
# Wrap the records in the torch dataset and build a shuffling loader over it.
# NOTE(review): train_dataloader is not referenced by any other visible cell.
train_dataset = VisionLanguageDataset(dataset, tokenizer=tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)

In [None]:
# An earlier cell called FastVisionModel.for_inference(model) for the sample
# generation, so switch the model back to training mode before fine-tuning.
FastVisionModel.for_training(model)

# Choose the mixed-precision dtype from what the GPU supports instead of forcing fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Define training arguments
# No evaluation strategy is set because no evaluation dataset is provided.
training_args = SFTConfig(
    output_dir="./output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    save_steps=100,
    save_total_limit=2,
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,
    push_to_hub=False,
    # The records are chat messages with embedded images; keep every column and
    # skip trl's text-only dataset preparation so the vision collator sees them intact.
    remove_unused_columns=False,
    dataset_text_field="",
    dataset_kwargs={"skip_prepare_dataset": True},
)

# Define Trainer
# conversational_dataset already holds the {"messages": [...]} records; the
# vision data collator turns each batch of them into model inputs and labels.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=conversational_dataset,
    data_collator=UnslothVisionDataCollator(model, tokenizer),
    tokenizer=tokenizer,
)

# Start training
trainer.train()


In [None]:
# Save the model and then the tokenizer into the same output directory.
save_dir = "llama-3.2-11B-Vision-Instruct-unsloth-bnb-4bit-woodpecker"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)