## Unsloth Installation: *ONLY FOR COLAB*

In [None]:
%%capture
# Install Unsloth and its companion packages; %%capture hides the pip output.
# The branch is chosen by looking for any environment variable whose name
# contains "COLAB_".
import os
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_* variable found: a plain install that lets pip resolve dependencies.
    !pip install unsloth
else:
    # Do this only in Colab and Kaggle notebooks! Otherwise use pip install unsloth
    # NOTE(review): the check above only matches "COLAB_" variables - confirm
    # that Kaggle runtimes actually reach this branch.
    # --no-deps stops pip from pulling in (or replacing) dependencies of these
    # packages; presumably this protects the runtime's preinstalled stack.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# Import Libraries

In [None]:
from unsloth import FastVisionModel, is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTConfig, SFTTrainer
import torch
from datasets import load_dataset, Dataset
from transformers import Qwen2VLImageProcessor
import pandas as pd

#### Expandable Memory Segments

In [None]:
import os

# CUDA-related environment settings for this session:
#   PYTORCH_CUDA_ALLOC_CONF - asks the PyTorch caching allocator to use
#                             expandable segments.
#   CUDA_LAUNCH_BLOCKING    - "1" makes kernel launches synchronous.
# NOTE(review): these are set after torch/unsloth were imported in the cell
# above; confirm CUDA has not been initialised yet, otherwise they may have
# no effect.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# Make the Model

## Define Models

In [None]:
# Registry of the Qwen2-VL variants this notebook can fine-tune.
# For each size key:
#   'model' - Hub id of the 4-bit base checkpoint to load
#   'repo'  - Hub repository the fine-tuned result is pushed to
qwen_models = {
    "2B": {
        'model': "unsloth/Qwen2-VL-2B-Instruct-unsloth-bnb-4bit",
        'repo': "hamzamooraj99/MedQA-Qwen-2B-LoRA16",
    },
    "7B": {
        # The id previously read "...-bnb=4bit"; the checkpoint name uses a
        # hyphen, matching the 2B entry.
        'model': "unsloth/Qwen2-VL-7B-Instruct-unsloth-bnb-4bit",
        'repo': "hamzamooraj99/MedQA-Qwen-7B-LoRA16",
    },
}

In [None]:
# Choose the variant to fine-tune: change the '2B' key in both lookups to
# switch to another entry of qwen_models.
model_name, save_repo = qwen_models['2B']['model'], qwen_models['2B']['repo']

## Model Prep

In [None]:
# Load the selected base checkpoint in 4-bit, with Unsloth's gradient
# checkpointing mode.
load_options = {
    "load_in_4bit": True,
    "use_gradient_checkpointing": "unsloth",
}
model, tokenizer = FastVisionModel.from_pretrained(model_name, **load_options)

# Wrap the model with LoRA adapters. The finetune_* flags select which parts
# of the network receive adapters; the remaining entries are the LoRA
# hyper-parameters.
lora_options = {
    # Which parts of the model to adapt
    "finetune_vision_layers": True,
    "finetune_language_layers": True,
    "finetune_attention_modules": True,
    "finetune_mlp_modules": True,
    # LoRA hyper-parameters
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": 'none',
    "random_state": 3407,
    "use_rslora": False,
    "loftq_config": None,
}
model = FastVisionModel.get_peft_model(model, **lora_options)

# Swap in an image processor that resizes inputs and bounds the pixel count
# between 224*224 and 512*512.
pixel_bounds = {"min_pixels": 224 * 224, "max_pixels": 512 * 512}
tokenizer.image_processor = Qwen2VLImageProcessor(do_resize=True, **pixel_bounds)

# Dataset

## Load the Data & Preview

In [None]:
# Both splits are pulled from the same Hub dataset repository.
dataset_repo = "hamzamooraj99/PMC-VQA-1"
train_set = load_dataset(dataset_repo, split='train')
val_set = load_dataset(dataset_repo, split='validation')

## Format the Dataset

In [None]:
def convert_to_conversation(sample):
    """Turn one dataset row into a two-turn chat example.

    The user turn carries a fixed text instruction followed by the row's
    image; the assistant turn carries the target text built from the row's
    'crop' and 'disease' fields.

    Args:
        sample: Mapping that provides the keys 'image', 'crop' and 'disease'.

    Returns:
        A dict of the form {"messages": [user_turn, assistant_turn]}.

    Raises:
        KeyError: If the row lacks one of the keys read above.
    """
    # NOTE(review): the prompt and the 'crop'/'disease' fields look like a
    # plant-disease schema, while this notebook loads a PMC-VQA dataset -
    # confirm the dataset really exposes these columns.
    user_turn = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "You are a botanist expert and have to identify and describe the crop and disease (if any) present in the image provided.",
            },
            {"type": "image", "image": sample['image']},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {
                "type": "text",
                "text": f"Class: {sample['crop']}\nDisease: {sample['disease']}",
            },
        ],
    }
    return {"messages": [user_turn, assistant_turn]}

# Model before Fine-Tuning

In [None]:
from transformers import TextStreamer

# Baseline check: let the not-yet-fine-tuned model answer for one training
# image so the output can be compared with the result after fine-tuning.
FastVisionModel.for_inference(model)

# NOTE(review): same botanist instruction as in convert_to_conversation -
# confirm it fits the dataset loaded above.
instruction = "You are a botanist expert and have to identify and describe the crop and disease (if any) present in the image provided."
image = train_set[2]['image']

# One user turn: an image placeholder followed by the text instruction.
user_content = [
    {'type': 'image'},
    {'type': 'text', 'text': instruction},
]
messages = [{'role': 'user', 'content': user_content}]

# Render the chat template to text, then build the model inputs from the
# image and that text and move them to the GPU.
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt")
inputs = inputs.to('cuda')

# Stream the generated tokens to the output as they are produced, without
# echoing the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
generation_options = {
    "max_new_tokens": 128,
    "use_cache": True,
    "temperature": 1.5,
    "min_p": 0.1,
}
_ = model.generate(**inputs, streamer=text_streamer, **generation_options)

# Fine-Tune the Model

In [None]:
# Put the model back into training mode (the preview cell switched it to
# inference mode) and build the supervised fine-tuning trainer.
FastVisionModel.for_training(model)
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    # The formatting function is passed by keyword so the call does not depend
    # on the positional parameter order of the collator.
    data_collator = UnslothVisionDataCollator(
        model, tokenizer, formatting_func = convert_to_conversation
    ),
    train_dataset = train_set,
    # Needed by the validation settings below: step-wise evaluation and
    # load_best_model_at_end cannot work without an evaluation dataset.
    eval_dataset = val_set,
    args = SFTConfig(
        per_device_train_batch_size=2,  #Each GPU processes 2 samples per batch,
        gradient_accumulation_steps=4,  #Gradients are accumulated for 4 steps before updating model
        warmup_steps=50,                #Gradually increases learning rate for first n steps to prevent instability
        num_train_epochs=1,             #Parameter to perform full fine-tune (use max_steps=30 for a quick test)
        # Optimisation & Mixed Precision
        learning_rate=2e-4,
        fp16=not is_bf16_supported(),   #Use float16 if GPU does not support bf16
        bf16=is_bf16_supported(),       #Use bfloat16 if GPU supports it (better stability)
        # Optimiser & Weight Decay
        optim="adamw_8bit",
        weight_decay=0.01,              #Regularisation to prevent overfitting
        lr_scheduler_type='linear',     #Decay type for learning rate from learning_rate to 0
        seed=3407,
        output_dir='outputs',
        # Logging & Reporting
        report_to='none',               #Integration with Weights & Biases ('none' disables, 'wandb' enables)
        # Settings for Vision Fine-Tuning
        remove_unused_columns=False,
        dataset_text_field="",
        dataset_kwargs={"skip_prepare_dataset": True},
        dataset_num_proc=8,             #CPU processes for parallel dataset processing
        max_seq_length=256,
        gradient_checkpointing = True,
        # Validation Settings (evaluated on eval_dataset above)
        do_eval=True,
        eval_strategy='steps',
        eval_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
        per_device_eval_batch_size=4,
        # Save Settings (save_steps matches eval_steps so the best checkpoint can be restored)
        save_strategy='steps',
        save_steps=500,
        save_total_limit=2
    )
)

In [None]:
# @title Reset memory allocation
import gc

# Collect unreachable Python objects first so the CUDA memory they hold can
# be handed back by the calls below.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# reset_max_memory_allocated() is deprecated and only forwards to
# reset_peak_memory_stats(); call the supported function directly so the
# peak counters read by the stats cells start from a clean state.
torch.cuda.reset_peak_memory_stats()

In [None]:
# Print PyTorch's CUDA memory report for the current device.
memory_report = torch.cuda.memory_summary()
print(memory_report)

In [None]:
# @title Show current memory stats
# Baseline taken before training; the final-stats cell subtracts
# start_gpu_memory from the post-training peak.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes / BYTES_PER_GIB, 3)
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run the fine-tuning loop. The result is kept because the final-stats cell
# reads trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
# Peak reserved memory after training, compared with the baseline
# (start_gpu_memory) and the card's capacity (max_memory) recorded by the
# earlier stats cell.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

train_seconds = trainer_stats.metrics['train_runtime']
train_minutes = round(train_seconds / 60, 2)

print(f"{train_seconds} seconds used for training.")
print(f"{train_minutes} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

# Save the Model

In [None]:
# Upload the fine-tuned model and its tokenizer/processor to the Hub
# repository selected in the "Define Models" section (save_repo).
# save_repo is deliberately NOT reassigned here: a hard-coded
# "hamzamooraj99/AgriPath-..." value used to override it, which would have
# pushed this model into a repository belonging to a different project.
model.push_to_hub(save_repo)
tokenizer.push_to_hub(save_repo)

In [None]:
# @title Reset memory allocation
import gc

# Collect unreachable Python objects first so the CUDA memory they hold can
# be handed back by the calls below.
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# reset_max_memory_allocated() is deprecated and only forwards to
# reset_peak_memory_stats(); call the supported function directly.
torch.cuda.reset_peak_memory_stats()