<a name="Data"></a>
### DATA
This section will unload all of the .npz info I had saved and the axial, saggital, and coronal slices to create individual data entries.

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Thu Mar  6 19:05:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             45W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import torch
print(torch.__version__)

2.5.1+cu121


In [1]:
# Unload all data such that it's easily accessible
main_data_path = "/content/drive/MyDrive/ENGSCI/4TH YEAR/fall 4th year/ESC499/Code Test/Finetuning/Data/"
training_data_path = main_data_path+"training"
test_data_path = main_data_path+"test"

training_seg_path = main_data_path+"train_seg_poly.npz"
test_seg_path = main_data_path+"test_seg_poly.npz"

training_bb_path = main_data_path+"train_seg_poly_beg_end.npz"
test_bb_path = main_data_path+"test_seg_poly_beg_end.npz"

In [2]:
# Import libraries
from PIL import Image
import numpy as np
import os
import matplotlib.pyplot as plt
import random

random.seed(42)

In [3]:
# Mount the Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Functions

def load_scan_from_npz(file_path):
    data = np.load(file_path)
    return data['voxel'], data['ax'], data['sag'], data['cor'], data['label']

def pair_segs(filename, patient_specific_bb, patient_bbb):
    v, a, s, c, l = load_scan_from_npz(filename)
    begin, end = patient_bbb
    image_seg_pairs = []
    for i in range(begin, end, 1):
        polygon_indices = patient_specific_bb[i]
        # remove any indices that are [-1, -1] from the list and make a new list
        polygon_indices = [(index[0], index[1]) for index in polygon_indices if not np.array_equal(index, [-1, -1])]
        np_array, label = v[i], l
        np_array = np.uint8(255 * (np_array - np.min(np_array)) / (np.max(np_array) - np.min(np_array)))
        image = Image.fromarray(np_array)
        image_seg_pairs.append({"query_image":image, "segpoly": polygon_indices})
    return image_seg_pairs

def load_normal_npz(file_path):
    loaded_data = np.load(file_path, allow_pickle = True)
    return loaded_data['array']

def convert_to_conversation(sample, ref_image, ref_gt):
    instruction = '''
    For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.
    These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.
    The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.
    The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
    For example, for the [reference image], the output should be [reference ground truth].
    For the other provided image, the query image, what is the output?
    Output only the polygon coordinates and no additional text or explanations.
    '''
    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : ref_image},
            {"type" : "text",  "text"  : f"{ref_gt}"},
            {"type" : "image", "image" : sample['query_image']}
          ]
        },
        { "role" : "assistant",
          "content" : [
            {"type": "text", "text": f"{sample['segpoly']}"} ]
        },
    ]
    return { "messages" : conversation }


def create_conversation_dataset(data_path, segs_path, bb_path):

    segs_bb = load_normal_npz(segs_path)
    beg_end = load_normal_npz(bb_path)

    # Extract all the patients and the corresponding filenames
    filenames = []
    for filename in os.listdir(data_path):
        if filename.endswith(".npz"):
            file_path = os.path.join(data_path, filename)
            filenames.append(file_path)
    filenames = sorted(filenames, key=lambda x: int(x.split('_')[-1].split('.')[0]))

    # Now we build the dataset
    patients = []
    for index in range(len(filenames)):
        filename = filenames[index]
        patient_specific_bb = segs_bb[index]
        patient_bbb = beg_end[index]
        # print(patient_bbb)
        image_seg_pairs = pair_segs(filename, patient_specific_bb, patient_bbb)
        # patients.append(image_seg_pairs)
        patients += image_seg_pairs

    # we need to get the reference patient - 0 and a centre ish slice is good

    ref_patient = patients[5]
    ref_image = ref_patient['query_image']
    ref_gt = ref_patient['segpoly']

    # Now convert the dataset into input for LLM
    converted_dataset = [convert_to_conversation(sample, ref_image, ref_gt) for sample in patients]
    print(f"Dataset size: {len(converted_dataset)}")
    print(converted_dataset[0])

    return patients, converted_dataset, ref_image, ref_gt


In [5]:
# test
# dataset, converted_dataset, ref_image, ref_gt = create_conversation_dataset(test_data_path, test_seg_path, test_bb_path)

# training
dataset, converted_dataset, ref_image, ref_gt = create_conversation_dataset(training_data_path, training_seg_path, training_bb_path)
random.shuffle(converted_dataset)

Dataset size: 11880
{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': '\n    For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.\n    These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.\n    The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.\n    The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.\n    For example, for the [reference image], the output should be [reference ground truth].\n    For the other provided image, the query image, what is the output?\n    Output only the polygon coordinates and no additional text or explanations.\n    '}, {'type': 'image', 'image': <PIL.Image.Image image mode=L size=128x128 at 0x7C00F4E20910>}, {'type': 'text', '

In [None]:
random.shuffle(converted_dataset)

<a name="LLM Setup"></a>
### LLM SETUP
This notebook finetunes **Llama 3.2 11B Vision Instruct** to (hopefully) better detect Low VS High Grade Gliomas from Flair Torso Scans.

In [6]:
# 42s 44s
%%capture
!pip uninstall unsloth unsloth_zoo -y
!pip install unsloth==2025.3.6
!pip install unsloth_zoo==2025.3.4
# !pip install unsloth
# # Also get the latest nightly Unsloth!
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [7]:
# import unsloth
# !pip install --upgrade torchvision
import torchvision
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer
from transformers import TrainerCallback, TrainingArguments

# def config_get(self, key, default=None):
#     return getattr(self, key, default)

# SFTConfig.get = config_get

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [9]:
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# 2mins
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

==((====))==  Unsloth 2025.3.6: Fast Mllama vision patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We now add LoRA adapters for parameter efficient finetuning - this allows us to only efficiently train 1% of all parameters.

**[NEW]** We also support finetuning ONLY the vision part of the model, or ONLY the language part. Or you can select both! You can also select to finetune the attention or the MLP layers!

In [10]:
# 5s
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


<a name="LLM Inference"></a>
### LLM INFERENCE
This let's us run the model for a few examples, so we can see what's happening as the model predicts inherently.

In [11]:
FastVisionModel.for_inference(model) # Enable for inference!

image = dataset[1202]["query_image"]

instruction = '''
For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.
These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.
The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.
The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
For example, for the [reference image], the output should be [reference ground truth].
For the other provided image, the query image, what is the output?
Output only the polygon coordinates and no additional text or explanations.
'''
messages = [
    {"role": "user", "content": [
        {"type" : "text",  "text"  : instruction},
        {"type" : "image", "image" : ref_image},
        {"type" : "text",  "text"  : f"{ref_gt}"}
      ]
    }
]

input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 140,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

[(47, 25), (58, 17), (60, 17), (62, 18), (65, 28), (65, 30), (63, 31), (47, 26)]<|eot_id|>


In [11]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.639 GB of memory reserved.


In [11]:
model = FastVisionModel.for_training(model) # Enable for training!

random.seed(42)
random.shuffle(converted_dataset)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(

        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 2e-5,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # output_dir=main_data_path+"checkpoints", # Directory to save checkpoints
        # save_steps=30,                        # Save every 100 steps
        # save_total_limit=3,                    # Keep only the last 3 checkpoints
        # logging_dir=main_data_path+'logs',     # Directory for logging
        # logging_steps=1,                     # Log every 100 steps

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 1024, # changed from 2048!
    ),
)

In [13]:
torch.cuda.empty_cache()


def config_get(self, key, default=None):
    return getattr(self, key, default)

SFTConfig.get = config_get

In [14]:
# Start the training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 11,880 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 100
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!
Unsloth: Not an error, but MllamaForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.8532
2,2.8186
3,2.8492
4,2.8308
5,2.8051
6,2.7481
7,2.7028
8,2.6519
9,2.585
10,2.4956


In [17]:
FastVisionModel.for_inference(model) # Enable for inference!

image = dataset[0]["query_image"]
ground_truth = dataset[0]["segpoly"]

instruction = '''
For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.
These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.
The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.
The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
For example, for the [reference image], the output should be [reference ground truth].
For the other provided image, the query image, what is the output?
Output only the polygon coordinates and no additional text or explanations.
'''
messages = [
    {"role": "user", "content": [
        {"type" : "text",  "text"  : instruction},
        {"type" : "image", "image" : ref_image},
        {"type" : "text",  "text"  : f"{ref_gt}"}
      ]
    }
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

print(f"Ground truth: {ground_truth}")
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 140,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Ground truth: [(50, 24), (52, 24), (51, 25)]
[(42, 30), (56, 21), (59, 22), (61, 24), (64, 33), (64, 35), (62, 36), (43, 31)]<|eot_id|>


In [None]:
nameee = "liufelic/segpoly_100step_model"

model.save_pretrained(nameee)
tokenizer.save_pretrained(nameee)
model.push_to_hub(nameee, token = "token")
tokenizer.push_to_hub(nameee, token = "token")

No files have been modified since last commit. Skipping to prevent empty commit.


Saved model to https://huggingface.co/liufelic/segpoly_100step_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
ll
