# Prepare Dataset

In [None]:
def convert_to_conversation(sample):
    """Wrap one dataset sample into a two-turn chat record.

    The user turn carries a fixed instruction text followed by
    ``sample["image"]``; the assistant turn carries ``sample["caption"]``
    as the target text.

    Args:
        sample: Mapping providing the ``"image"`` and ``"caption"`` entries.

    Returns:
        A dict with a single ``"messages"`` key holding the user turn and the
        assistant turn, in that order.
    """
    prompt = "You are an expert radiographer. Describe accurately what you see in this image."

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image", "image": sample["image"]},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["caption"]}],
    }
    return {"messages": [user_turn, assistant_turn]}

In [None]:
def datasplit(train_num, test_num):
    """Load ``unsloth/Radiology_mini`` and keep the leading rows of each split.

    Args:
        train_num: Number of rows taken from the start of the "train" split.
        test_num: Number of rows taken from the start of the "test" split.

    Returns:
        A tuple ``(converted_dataset, test_data)``: the train subset as a list
        of chat records built by ``convert_to_conversation``, and the test
        subset left unconverted.
    """
    from datasets import load_dataset

    splits = load_dataset("unsloth/Radiology_mini")
    test_data = splits["test"].select(range(test_num))
    train_subset = splits["train"].select(range(train_num))

    # Print both subsets, test first.
    for subset in (test_data, train_subset):
        print(subset)

    return list(map(convert_to_conversation, train_subset)), test_data

# Prepare model

In [None]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
def load_model():
    """Load the Qwen2-VL-2B-Instruct vision model and attach LoRA adapters.

    Returns:
        A tuple ``(model, tokenizer)`` where ``model`` is the value returned by
        ``FastVisionModel.get_peft_model`` and ``tokenizer`` comes from
        ``FastVisionModel.from_pretrained``.
    """
    base_model, tokenizer = FastVisionModel.from_pretrained(
        "unsloth/Qwen2-VL-2B-Instruct",
        load_in_4bit=False,  # Use 4bit to reduce memory use. False for 16bit LoRA.
        use_gradient_checkpointing="unsloth",  # True or "unsloth" for long context
    )

    # Which parts of the network are fine-tuned.
    finetune_scope = {
        "finetune_vision_layers": False,  # False if not finetuning vision layers
        "finetune_language_layers": True,  # False if not finetuning language layers
        "finetune_attention_modules": True,  # False if not finetuning attention layers
        "finetune_mlp_modules": True,  # False if not finetuning MLP layers
    }

    # LoRA hyperparameters.
    lora_settings = {
        "r": 16,  # The larger, the higher the accuracy, but might overfit
        "lora_alpha": 16,  # Recommended alpha == r at least
        "lora_dropout": 0,
        "bias": "none",
        "random_state": 3407,
        "use_rslora": False,  # We support rank stabilized LoRA
        "loftq_config": None,  # And LoftQ
        # "target_modules": "all-linear",  # Optional now! Can specify a list if needed
    }

    peft_model = FastVisionModel.get_peft_model(
        base_model,
        **finetune_scope,
        **lora_settings,
    )
    return peft_model, tokenizer

# Create Trainer object

In [None]:
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

def prep_train(model, tokenizer, converted_dataset, num_step, num_epoch):
    """Build an ``SFTTrainer`` for vision fine-tuning on ``converted_dataset``.

    Args:
        model: Model to train; switched to training mode via
            ``FastVisionModel.for_training`` before the trainer is built.
        tokenizer: Tokenizer/processor handed to the trainer and the collator.
        converted_dataset: Training records used as ``train_dataset``.
        num_step: Steps per epoch; multiplied by ``num_epoch`` to get
            ``max_steps``.
        num_epoch: Number of epochs' worth of steps to run.

    Returns:
        The configured ``SFTTrainer`` (training is not started here).
    """
    FastVisionModel.for_training(model)  # Enable for training!

    vision_collator = UnslothVisionDataCollator(model, tokenizer)  # Must use!

    # Prefer bf16 when the hardware supports it, otherwise fall back to fp16.
    bf16_available = is_bf16_supported()

    training_args = SFTConfig(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=num_step * num_epoch,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate=2e-4,
        fp16=not bf16_available,
        bf16=bf16_available,
        logging_steps=30,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # For Weights and Biases

        # You MUST put the below items for vision finetuning:
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=4,
        max_seq_length=2048,
    )

    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=vision_collator,
        train_dataset=converted_dataset,
        args=training_args,
    )

# Record Baseline GPU Memory

In [None]:
def init_mem():
    """Return the peak reserved GPU memory so far, in GiB.

    The value is ``torch.cuda.max_memory_reserved()`` converted from bytes to
    GiB and rounded to three decimals. It is meant to be captured before
    training and later passed to ``get_mem`` as ``start_gpu_memory``.

    Returns:
        Peak reserved memory in GiB as a float.
    """
    # The original also fetched device properties for GPU 0 into an unused
    # local; that lookup had no effect on the result and has been dropped.
    bytes_per_gib = 1024 ** 3
    return round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)

# Get Memory Status

In [None]:
def get_mem(trainer_stats, start_gpu_memory):
    """Summarise training time and GPU memory use after a training run.

    Args:
        trainer_stats: Object whose ``metrics`` mapping contains
            ``'train_runtime'`` (divided by 60 here to report minutes).
        start_gpu_memory: Baseline memory in GiB, subtracted from the current
            peak to estimate what training added.

    Returns:
        A tuple ``(min_time, used_memory, used_memory_for_lora)``: runtime in
        minutes (2 decimals), peak reserved memory in GiB (3 decimals), and
        the difference between that peak and the baseline (3 decimals).
    """
    bytes_per_gib = 1024 ** 3
    peak_gib = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
    training_delta_gib = round(peak_gib - start_gpu_memory, 3)
    runtime_minutes = round(trainer_stats.metrics["train_runtime"] / 60, 2)
    return runtime_minutes, peak_gib, training_delta_gib

# Get Response

In [None]:
def get_res(model, tokenizer, test_data):
    """Generate a description for every image in ``test_data``.

    The same instruction prompt is used for every sample; only the image
    changes. Generated tokens are also streamed to stdout as they are
    produced.

    Args:
        model: Vision model; switched to inference mode via
            ``FastVisionModel.for_inference``.
        tokenizer: Processor used to render the chat template, encode the
            image/text pair, and decode the output.
        test_data: Iterable of samples, each providing an ``"image"`` entry.

    Returns:
        A dict mapping each sample's position in ``test_data`` to the decoded
        text of ``output_ids[0]`` with special tokens skipped.
        NOTE(review): the decoded text appears to include the prompt as well as
        the answer (``evaluate`` strips leading lines) -- confirm.
    """
    from transformers import TextStreamer

    FastVisionModel.for_inference(model)  # Enable for inference!

    instruction = "You are an expert radiographer. Describe accurately what you see in this image."
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]
    # The prompt does not depend on the sample, so render the template once
    # instead of once per image.
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

    response = {}
    for i, sample in enumerate(test_data):
        inputs = tokenizer(
            sample["image"],
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to("cuda")

        # A fresh streamer per sample keeps streaming state from leaking
        # between generations.
        text_streamer = TextStreamer(tokenizer, skip_prompt=True)
        output_ids = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=128,
            use_cache=True,
            temperature=1.5,
            min_p=0.1,
        )
        response[i] = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

# Evaluate BERTScore

In [None]:
import numpy as np
from bert_score import score as bert_score

def evaluate(response, test_data):
    """Average BERTScore precision/recall/F1 of responses against captions.

    Args:
        response: Mapping from sample index (``0 .. len(response) - 1``) to the
            generated text for that sample.
        test_data: Indexable collection whose ``i``-th item provides the
            reference text under ``"caption"``.

    Returns:
        A dict with keys ``"BERT_Precision"``, ``"BERT_Recall"`` and
        ``"BERT_F1"`` holding the mean score over all samples. When
        ``response`` is empty every value is NaN.
    """
    hypotheses, references = [], []
    for i in range(len(response)):
        references.append(test_data[i]["caption"])
        # Only the text after the first four lines of each response is scored.
        # NOTE(review): presumably those lines are the echoed prompt; depending
        # on the chat template a leading "assistant" line may remain -- confirm
        # against the actual output of get_res.
        hypotheses.append("\n".join(response[i].splitlines()[4:]))

    if not hypotheses:
        # Nothing to score: report NaN explicitly rather than calling the
        # scorer (or taking the mean) on empty input.
        nan = float("nan")
        return {"BERT_Precision": nan, "BERT_Recall": nan, "BERT_F1": nan}

    # One batched call scores every pair at once, instead of repeating
    # bert_score's per-call setup for each sample.
    P, R, F1 = bert_score(hypotheses, references, lang="en", verbose=False)

    return {
        "BERT_Precision": np.mean(P.tolist()),
        "BERT_Recall": np.mean(R.tolist()),
        "BERT_F1": np.mean(F1.tolist()),
    }
