In [1]:
# Import libraries
import numpy as np
import os
from PIL import Image
import random

# Seed Python's built-in random module
random.seed(42)

# Root data folder on the mounted Drive (ends with "/", so names are appended directly)
main_data_path = "/content/drive/MyDrive/ENGSCI/4TH YEAR/fall 4th year/ESC499/Code Test/Finetuning/Data/"

# Folders containing the scan files of each split
training_data_path = f"{main_data_path}training"
test_data_path = f"{main_data_path}test"

# Segmentation-polygon archives of each split
training_seg_path = f"{main_data_path}train_seg_poly.npz"
test_seg_path = f"{main_data_path}test_seg_poly.npz"

# Begin/end archives of each split
training_bb_path = f"{main_data_path}train_seg_poly_beg_end.npz"
test_bb_path = f"{main_data_path}test_seg_poly_beg_end.npz"

In [2]:
# Mount Google Drive at /content/drive; the data and result paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# 42s 44s
%%capture
!pip uninstall unsloth unsloth_zoo -y
!pip install unsloth==2025.3.6
!pip install unsloth_zoo==2025.3.4

In [4]:
# !pip install --upgrade torch torchvision torchaudio # Upgrading PyTorch to the latest version
import torchvision
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer
from transformers import TrainerCallback, TrainingArguments

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
# Functions (TEST VERSION)

def load_scan_from_npz(file_path):
    """Load one scan archive and return its five stored arrays.

    Args:
        file_path: path to an ``.npz`` archive containing the keys
            'voxel', 'ax', 'sag', 'cor' and 'label'.

    Returns:
        Tuple ``(voxel, ax, sag, cor, label)`` of the arrays stored under
        those keys, in that order.

    Raises:
        KeyError: if one of the expected keys is missing from the archive.
    """
    # np.load on an .npz returns a lazily-read NpzFile that keeps the file open;
    # the context manager closes it once the arrays have been read into memory.
    with np.load(file_path) as data:
        return data['voxel'], data['ax'], data['sag'], data['cor'], data['label']

def pair_segs(filename, patient_specific_bb, patient_bbb):
    """Pair each annotated slice of one scan with its segmentation polygon.

    Args:
        filename: path to the scan ``.npz`` (read with ``load_scan_from_npz``;
            only its 'voxel' array is used here).
        patient_specific_bb: per-slice polygon points, indexed by slice number.
            Entries equal to ``[-1, -1]`` are treated as padding and dropped.
        patient_bbb: ``(begin, end)`` slice indices; slices ``begin`` up to but
            not including ``end`` are processed.

    Returns:
        List of dicts ``{"query_image": PIL.Image, "segpoly": [(r, c), ...]}``,
        one per processed slice, in slice order.
    """
    voxels = load_scan_from_npz(filename)[0]
    begin, end = patient_bbb

    image_seg_pairs = []
    for i in range(begin, end):
        # Drop the [-1, -1] padding entries and keep the real polygon points
        polygon_indices = [
            (index[0], index[1])
            for index in patient_specific_bb[i]
            if not np.array_equal(index, [-1, -1])
        ]

        slice_array = voxels[i]
        # Integer voxel dtypes can overflow in `255 * (x - min)`; promote them to
        # float first. Float inputs keep their dtype, so results are unchanged.
        if not np.issubdtype(slice_array.dtype, np.floating):
            slice_array = slice_array.astype(np.float64)

        lowest, highest = np.min(slice_array), np.max(slice_array)
        if highest > lowest:
            # Min-max scale to 0..255
            scaled = np.uint8(255 * (slice_array - lowest) / (highest - lowest))
        else:
            # Constant (or all-NaN) slice: avoid the 0/0 division and emit a black image
            scaled = np.zeros(slice_array.shape, dtype=np.uint8)

        image = Image.fromarray(scaled)
        image_seg_pairs.append({"query_image":image, "segpoly": polygon_indices})
    return image_seg_pairs

def load_normal_npz(file_path):
    """Return the array stored under the key 'array' of an ``.npz`` archive.

    Args:
        file_path: path to the ``.npz`` archive.

    Returns:
        The array saved under 'array'.

    Raises:
        KeyError: if the archive has no 'array' entry.
    """
    # SECURITY NOTE: allow_pickle=True unpickles object arrays, which can execute
    # arbitrary code. Only use this on archives from a trusted source.
    # The context manager closes the underlying file after the array is read.
    with np.load(file_path, allow_pickle = True) as loaded_data:
        return loaded_data['array']


def convert_to_conversation(sample):
    """Wrap one sample as a two-turn chat record.

    The user turn carries the fixed instruction text followed by the sample's
    'query_image'; the assistant turn carries the sample's 'segpoly' rendered
    as text.

    Args:
        sample: mapping with the keys 'query_image' and 'segpoly'.

    Returns:
        ``{"messages": [user_turn, assistant_turn]}``.
    """
    # The whitespace inside this literal is part of the prompt text.
    instruction = '''
    For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.
    These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.
    The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.
    The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
    For the other provided image, the query image, what is the output?
    Output only the polygon coordinates and no additional text or explanations.
    '''

    user_turn = {
        "role": "user",
        "content": [
            dict(type="text", text=instruction),
            dict(type="image", image=sample['query_image']),
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [dict(type="text", text=format(sample['segpoly']))],
    }
    return dict(messages=[user_turn, assistant_turn])


def create_conversation_dataset(data_path, segs_path, bb_path):
    """Build the per-patient sample lists and their chat-formatted counterparts.

    The ``.npz`` files in ``data_path`` are ordered by the integer that follows
    the last underscore of their file name; the i-th file is paired with the
    i-th entry of the two annotation archives.

    Args:
        data_path: folder containing one scan ``.npz`` per patient.
        segs_path: ``.npz`` archive of per-patient polygon annotations
            (read with ``load_normal_npz``).
        bb_path: ``.npz`` archive of per-patient (begin, end) slice ranges
            (read with ``load_normal_npz``).

    Returns:
        ``(patients, llm_patients)``: for every patient, the list produced by
        ``pair_segs`` and the same samples passed through
        ``convert_to_conversation``.

    Raises:
        ValueError: if a file name does not end in ``_<integer>.npz``.
    """

    segs_bb = load_normal_npz(segs_path)
    beg_end = load_normal_npz(bb_path)

    def _patient_number(path):
        # Parse the number from the file name only, so underscores or dots in
        # the directory part of the path cannot affect the ordering.
        return int(os.path.basename(path).split('_')[-1].split('.')[0])

    # Collect the scan files in patient order
    filenames = sorted(
        (
            os.path.join(data_path, name)
            for name in os.listdir(data_path)
            if name.endswith(".npz")
        ),
        key=_patient_number,
    )

    # One list of {"query_image", "segpoly"} samples per patient
    patients = [
        pair_segs(filename, segs_bb[index], beg_end[index])
        for index, filename in enumerate(filenames)
    ]
    # Preview the first sample; skipped when there is nothing to show
    if patients and patients[0]:
        print(patients[0][0])

    llm_patients = [
        [convert_to_conversation(sample) for sample in patient]
        for patient in patients
    ]
    print(f"Number of patients: {len(llm_patients)}")
    if llm_patients and llm_patients[0]:
        print(llm_patients[0][0])

    return patients, llm_patients


In [6]:
# Build the per-patient samples (and their chat-formatted versions) from the test split
data_patients, llm_patients = create_conversation_dataset(
    data_path=test_data_path,
    segs_path=test_seg_path,
    bb_path=test_bb_path,
)

{'query_image': <PIL.Image.Image image mode=L size=128x128 at 0x7B80303972D0>, 'segpoly': [(30, 93), (30, 88), (36, 88), (42, 90), (50, 94), (51, 98), (47, 106), (44, 109), (40, 111), (31, 98)]}
Number of patients: 55
{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': '\n    For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.\n    These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.\n    The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.\n    The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.\n    For the other provided image, the query image, what is the output?\n    Output only the polygon coordinates and no additional text or explanations.\n    '}, {'t

In [8]:
# Name of the checkpoint to load. Despite the variable name, the active value
# below is the one passed to from_pretrained; the commented-out line is an
# alternative checkpoint name.
# lora_model_name = "liufelic/segpoly_noref_150step_model"
lora_model_name = "unsloth/Llama-3.2-11B-Vision-Instruct"

# Load the selected checkpoint and its tokenizer through Unsloth
from unsloth import FastVisionModel
model, tokenizer = FastVisionModel.from_pretrained(
    model_name = lora_model_name, # checkpoint name selected above
    load_in_4bit = True, # Set to False for 16bit LoRA
)
FastVisionModel.for_inference(model) # Enable for inference!

==((====))==  Unsloth 2025.3.6: Fast Mllama vision patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

MllamaForConditionalGeneration(
  (vision_model): MllamaVisionModel(
    (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)
    (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(
      (tile_embedding): Embedding(9, 8197120)
    )
    (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
      (embedding): Embedding(9, 5120)
    )
    (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(
      (embedding): Embedding(9, 5120)
    )
    (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (transformer): MllamaVisionEncoder(
      (layers): ModuleList(
        (0-12): 13 x MllamaVisionEncoderLayer(
          (self_attn): MllamaVisionSdpaAttention(
            (q_proj): Linear4bit(in_features=1280, out_features=1280, bias=False)
            (k_proj): Linear4bit(in_features=1280, out_features

In [9]:
import re

def extract_choice(output_text):
    """Return the text after the last "assistant" marker as a single trimmed line.

    Newline characters are removed and surrounding whitespace is stripped. If
    the marker does not occur, the whole input is processed the same way.
    """
    _, _, reply = output_text.rpartition("assistant")
    return "".join(reply.split("\n")).strip()

    # cleaned_line = output_text.split("assistant")[-1].replace("\n", "||").strip()
    # match = re.search(r'\((\d+),\s*(\d+)\)', cleaned_line)
    # if match: return (int(match.group(1)), int(match.group(2)))
    # else: return (-1, -1)

def write_file_for_one_patient(patient, path):
    """Run the model on every sample of one patient and log the results to a text file.

    One line is written per sample, in the form
    ``Ground truth: <segpoly> || Model output: <generated text>``.

    Args:
        patient: sequence of samples, each a mapping with the keys
            'query_image' (image handed to the tokenizer/processor) and
            'segpoly' (ground-truth polygon, written verbatim).
        path: output text file. It is opened in write mode, so an existing
            file at this path is overwritten (not appended to).

    Relies on the notebook-level ``model``, ``tokenizer``, ``FastVisionModel``
    and ``extract_choice``.
    """
    # Everything up to the `with` block is identical for every sample, so it is
    # done once instead of once per iteration.
    FastVisionModel.for_inference(model) # Enable for inference!

    # The whitespace inside this literal is part of the prompt and is kept
    # exactly as it was.
    # NOTE(review): this prompt differs from the one in convert_to_conversation
    # (different indentation inside the literal, and the image is placed before
    # the text here) - confirm this is intended.
    instruction = '''
          For each glioma in a 128×128 grayscale FLAIR-mode brain scan, output coordinates of a 10 to 15 point polygon that encloses the tumor region.
          These points should be arranged in a clockwise direction and should accurately trace the tumor boundary.
          The output should be a tuple in the format of ((row1, col1), (row2, col2), ..., (rowN, colN)), where N is between 10 and 15 points, and row and col are integers between 0 and 127.
          The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
          For the other provided image, the query image, what is the output?
          Output only the polygon coordinates and no additional text or explanations.
          '''

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": instruction}
        ]}
    ]
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)

    # Explicit UTF-8 so that whatever characters the model emits can be written
    # regardless of the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as file:
        for sample in patient:
            image = sample["query_image"]
            ground_truth = sample["segpoly"]

            inputs = tokenizer(
                image,
                input_text,
                add_special_tokens = False,
                return_tensors = "pt",
            ).to("cuda")

            # NOTE(review): sampling parameters are passed here; if sampling is
            # active the outputs vary between runs unless torch is seeded.
            output_tokens = model.generate(**inputs, max_new_tokens=140,
                                           use_cache=True, temperature=1.5, min_p=0.1)
            # The decoded sequence contains the prompt as well; extract_choice
            # keeps only the part after the last "assistant" marker.
            generated_text = extract_choice(tokenizer.decode(output_tokens[0], skip_special_tokens=True))

            # One result line per sample
            result_message = f"Ground truth: {ground_truth} || Model output: {generated_text}"
            file.write(result_message + '\n')


In [10]:
# Folder on the mounted Drive that receives one result file per patient
main_results_folder = "/content/drive/MyDrive/ENGSCI/4TH YEAR/fall 4th year/ESC499/Code Test/Finetuning/Results_2/"

# open(..., 'w') does not create missing directories, so make sure the folder
# exists before the first patient file is written.
os.makedirs(main_results_folder, exist_ok=True)

FastVisionModel.for_inference(model)

# The hard-coded start index restricts the run to a subset of patients; the
# commented-out variants are other ranges that were used.
# for patient_id in range(len(data_patients)):
# for patient_id in range(42, 55, 1):
for patient_id in range(41, 55, 1):

    print(f"Patient ID: {patient_id}")
    patient = data_patients[patient_id]

    # path for patient data storage
    patient_results_text = main_results_folder + f"patient_{patient_id}.txt"
    write_file_for_one_patient(patient, patient_results_text)

Patient ID: 31
Patient ID: 32
Patient ID: 33
Patient ID: 34
Patient ID: 35
Patient ID: 36
Patient ID: 37
Patient ID: 38
Patient ID: 39
Patient ID: 40
Patient ID: 41
Patient ID: 42
Patient ID: 43
Patient ID: 44
Patient ID: 45
Patient ID: 46
Patient ID: 47
Patient ID: 48
Patient ID: 49
Patient ID: 50
Patient ID: 51
Patient ID: 52
Patient ID: 53
Patient ID: 54


In [None]:
# Prints a single empty line; this cell has no other effect.
print()