<a name="Data"></a>
### DATA
This section will unload all of the .npz info I had saved and the axial, saggital, and coronal slices to create individual data entries.

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Wed Feb 12 18:03:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             47W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# Unload all data such that it's easily accessible
main_data_path = "/content/drive/MyDrive/ENGSCI/4TH YEAR/fall 4th year/ESC499/Code Test/Finetuning/Data/"
training_data_path = main_data_path+"training"
test_data_path = main_data_path+"test"
training_seg_path = main_data_path+"train_seg_bbounds.npz"
test_seg_path = main_data_path+"test_seg_bbounds.npz"

In [2]:
# Import libraries
from PIL import Image
import numpy as np
import os
import matplotlib.pyplot as plt
import random

random.seed(42)

In [3]:
# Functions

def load_scan_from_npz(file_path):
    data = np.load(file_path)
    return data['voxel'], data['ax'], data['sag'], data['cor'], data['label']

def pair_segs(filename, patient_specific_bb):
    v, a, s, c, l = load_scan_from_npz(filename)
    begin, end = a
    image_seg_pairs = []
    for i in range(begin+20, end+1-20, 1):
        row_min, col_min, row_max, col_max = patient_specific_bb[i]
        np_array, label = v[i], l
        np_array = np.uint8(255 * (np_array - np.min(np_array)) / (np.max(np_array) - np.min(np_array)))
        image = Image.fromarray(np_array)
        image_seg_pairs.append({"image":image, "bb":[(row_min, col_min), (row_min, col_max), (row_max, col_max), (row_max, col_min)] })
    return image_seg_pairs

def load_normal_npz(file_path):
    loaded_data = np.load(file_path)
    return loaded_data['array']


def convert_to_conversation(sample):
    instruction = '''
You are an expert medical AI assistant specializing in glioma segmentation on FLAIR-mode brain scans.
Given a 128x128 grayscale brain scan, output the bounding box around the tumor using the four corner vertices.
The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
Ensure the bounding box tightly encloses the entire tumor without extending into non-tumor regions.
The bounding box output must be formatted strictly as:[(row_min, col_min), (row_min, col_max), (row_max, col_max), (row_max, col_min)] where (row, col) are integers between 0 and 127, with (0,0) at the top-left and row increasing downward, and col increasing rightward.
Do not output any other text or explanation, only the coordinate list in the exact format above.
    '''

    conversation = [
        { "role": "user",
          "content" : [
            {"type" : "text",  "text"  : instruction},
            {"type" : "image", "image" : sample['image']} ]
        },
        { "role" : "assistant",
          "content" : [
            {"type": "text", "text": f"{sample['bb']}"} ]
        },
    ]
    return { "messages" : conversation }


def create_conversation_dataset(data_path, segs_path):

    segs_bb = load_normal_npz(segs_path)

    # Extract all the patients and the corresponding filenames
    filenames = []
    for filename in os.listdir(data_path):
        if filename.endswith(".npz"):
            file_path = os.path.join(data_path, filename)
            filenames.append(file_path)
    filenames = sorted(filenames, key=lambda x: int(x.split('_')[-1].split('.')[0]))

    # Now we build the dataset
    patients = []
    for index in range(len(filenames)):
        filename = filenames[index]
        patient_specific_bb = segs_bb[index]
        image_seg_pairs = pair_segs(filename, patient_specific_bb)
        # patients.append(image_seg_pairs)
        patients += image_seg_pairs

    # # Now convert the dataset into input for LLM
    converted_dataset = [convert_to_conversation(sample) for sample in patients]
    print(f"Dataset size: {len(converted_dataset)}")
    print(converted_dataset[0])

    return patients, converted_dataset


In [4]:
# test
# dataset, converted_dataset = create_conversation_dataset(test_data_path, test_seg_path)

# training
dataset, converted_dataset = create_conversation_dataset(training_data_path, training_seg_path)
random.shuffle(converted_dataset)

KeyboardInterrupt: 

In [None]:
random.shuffle(converted_dataset)

<a name="LLM Setup"></a>
### LLM SETUP
This notebook finetunes **Llama 3.2 11B Vision Instruct** to (hopefully) better detect Low VS High Grade Gliomas from Flair Torso Scans.

In [5]:
# 42s 44s
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [7]:
# 44s
!pip install --upgrade torchvision
import torchvision
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig
from transformers import TextStreamer
from transformers import TrainerCallback, TrainingArguments

Collecting torchaudio
  Downloading torchaudio-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Downloading torchaudio-2.6.0-cp311-cp311-manylinux1_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchaudio
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 2.5.1+cu121
    Uninstalling torchaudio-2.5.1+cu121:
      Successfully uninstalled torchaudio-2.5.1+cu121
Successfully installed torchaudio-2.6.0



Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastVisionModel # FastLanguageModel for LLMs


RuntimeError: Unsloth: Failed to create dynamic compiled modules!

In [None]:
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit", # Llama 3.2 vision support
    "unsloth/Llama-3.2-11B-Vision-bnb-4bit",
    "unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit", # Can fit in a 80GB card!
    "unsloth/Llama-3.2-90B-Vision-bnb-4bit",
    "unsloth/Pixtral-12B-2409-bnb-4bit",              # Pixtral fits in 16GB!
    "unsloth/Pixtral-12B-Base-2409-bnb-4bit",         # Pixtral base model
    "unsloth/Qwen2-VL-2B-Instruct-bnb-4bit",          # Qwen2 VL support
    "unsloth/Qwen2-VL-7B-Instruct-bnb-4bit",
    "unsloth/Qwen2-VL-72B-Instruct-bnb-4bit",
    "unsloth/llava-v1.6-mistral-7b-hf-bnb-4bit",      # Any Llava variant works!
    "unsloth/llava-1.5-7b-hf-bnb-4bit",
] # More models at https://huggingface.co/unsloth

# 2mins
model, tokenizer = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

==((====))==  Unsloth 2025.2.4: Fast Mllama vision patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.9k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

We now add LoRA adapters for parameter efficient finetuning - this allows us to only efficiently train 1% of all parameters.

**[NEW]** We also support finetuning ONLY the vision part of the model, or ONLY the language part. Or you can select both! You can also select to finetune the attention or the MLP layers!

In [None]:
# 5s
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers     = True, # False if not finetuning vision layers
    finetune_language_layers   = True, # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True, # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # target_modules = "all-linear", # Optional now! Can specify a list if needed
)

Unsloth: Making `model.base_model.model.vision_model.transformer` require gradients


<a name="LLM Inference"></a>
### LLM INFERENCE
This let's us run the model for a few examples, so we can see what's happening as the model predicts inherently.

In [None]:
FastVisionModel.for_inference(model) # Enable for inference!

image = dataset[1202]["image"]

instruction = '''
You are an expert medical AI assistant specializing in glioma segmentation on FLAIR-mode brain scans.
Given a 128x128 grayscale brain scan, output the bounding box around the tumor using the four corner vertices.
The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
Ensure the bounding box tightly encloses the entire tumor without extending into non-tumor regions.
The bounding box output must be formatted strictly as:[(row_min, col_min), (row_min, col_max), (row_max, col_max), (row_max, col_min)] where (row, col) are integers between 0 and 127, with (0,0) at the top-left and row increasing downward, and col increasing rightward.
Do not output any other text or explanation, only the coordinate list in the exact format above.
    '''

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 35,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

[(37, 41), (37, 126), (120, 126), (120, 40)]<|eot_id|>


In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
8.688 GB of memory reserved.


In [None]:
FastVisionModel.for_training(model) # Enable for training!

random.seed(27)
random.shuffle(converted_dataset)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
    train_dataset = converted_dataset,
    args = SFTConfig(

        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
        learning_rate = 8e-6,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",     # For Weights and Biases

        # output_dir=main_data_path+"checkpoints", # Directory to save checkpoints
        # save_steps=30,                        # Save every 100 steps
        # save_total_limit=3,                    # Keep only the last 3 checkpoints
        # logging_dir=main_data_path+'logs',     # Directory for logging
        # logging_steps=1,                     # Log every 100 steps

        # You MUST put the below items for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 1024, # changed from 2048!
    ),
)

# trainer = SFTTrainer(
#     model=model,
#     tokenizer=tokenizer,
#     data_collator=UnslothVisionDataCollator(model, tokenizer),  # Must use!
#     train_dataset=converted_train_dataset,
#     eval_dataset=converted_val_dataset,  # Add validation dataset here
#     args=SFTConfig(
#         per_device_train_batch_size=16,
#         gradient_accumulation_steps=4,
#         warmup_steps=10,
#         num_train_epochs=1,  # Set this instead of max_steps for full training runs
#         learning_rate=2e-6,
#         fp16=not is_bf16_supported(),
#         bf16=is_bf16_supported(),
#         logging_steps=10,  # Log every 10 steps
#         evaluation_strategy="steps",  # Run validation during training
#         eval_steps=50,  # Evaluate every 50 steps
#         save_strategy="steps",  # Save checkpoints periodically
#         save_steps=50,  # Save every 50 steps
#         save_total_limit=3,  # Keep only the last 3 checkpoints
#         optim="adamw_8bit",
#         weight_decay=0.01,
#         lr_scheduler_type="linear",
#         seed=3407,
#         output_dir="outputs",
#         report_to="none",  # For Weights and Biases
#         remove_unused_columns=False,  # For vision fine-tuning
#         dataset_text_field="",
#         dataset_kwargs={"skip_prepare_dataset": True},
#         dataset_num_proc=4,
#         max_seq_length=1024,
#     ),
# )


In [None]:
torch.cuda.empty_cache()

In [None]:
# Start the training
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,118 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 4
\        /    Total batch size = 64 | Total steps = 100
 "-____-"     Number of trainable parameters = 67,174,400
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!


Step,Training Loss
1,3.0171
2,3.0037
3,2.994
4,2.9749
5,2.9314
6,2.903
7,2.8484
8,2.7918
9,2.75
10,2.693


Step,Training Loss
1,3.0171
2,3.0037
3,2.994
4,2.9749
5,2.9314
6,2.903
7,2.8484
8,2.7918
9,2.75
10,2.693


In [None]:
FastVisionModel.for_inference(model) # Enable for inference!

image = dataset[0]["image"]
ground_truth = dataset[0]["bb"]
instruction = '''
You are an expert medical AI assistant specializing in glioma segmentation on FLAIR-mode brain scans.
Given a 128x128 grayscale brain scan, output the bounding box around the tumor using the four corner vertices.
The tumor region is the brightest, high-intensity abnormality distinct from normal brain structures.
Ensure the bounding box tightly encloses the entire tumor without extending into non-tumor regions.
The bounding box output must be formatted strictly as:[(row_min, col_min), (row_min, col_max), (row_max, col_max), (row_max, col_min)] where (row, col) are integers between 0 and 127, with (0,0) at the top-left and row increasing downward, and col increasing rightward.
Do not output any other text or explanation, only the coordinate list in the exact format above.
    '''

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

print(f"Ground truth: {ground_truth}")
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 35,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

Ground truth: [(50, 15), (50, 34), (70, 34), (70, 15)]
[(37, 41), (37, 126), (120, 126), (120, 44)]<|eot_id|>


In [None]:
nameee = "liufelic/seg_200step_model"

model.save_pretrained(nameee)
tokenizer.save_pretrained(nameee)
model.push_to_hub(nameee, token = "token")
tokenizer.push_to_hub(nameee, token = "token")

README.md:   0%|          | 0.00/631 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Saved model to https://huggingface.co/liufelic/seg_200step_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:

# Save the model
model.save_pretrained("liufelic/Llama-3.2-11B-Vision-lora_full_model")
tokenizer.save_pretrained("liufelic/Llama-3.2-11B-Vision-lora_full_model")
model.push_to_hub("liufelic/Llama-3.2-11B-Vision-lora_full_model", token = "token")
tokenizer.push_to_hub("liufelic/Llama-3.2-11B-Vision-lora_full_model", token="token")

# model.save_pretrained_merged("liufelic/Llama-3.2-11B-Vision-lora_full_model", tokenizer,)
# model.push_to_hub_merged("liufelic/Llama-3.2-11B-Vision-lora_full_model", tokenizer, save_method = "merged_16bit", token = "token")

README.md:   0%|          | 0.00/635 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Saved model to https://huggingface.co/liufelic/Llama-3.2-11B-Vision-lora_full_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Save the model and tokenizer
model.save_pretrained("liufelic/Llama-3.2-11B-Vision-lora_16full_model")
tokenizer.save_pretrained("liufelic/Llama-3.2-11B-Vision-lora_16full_model")

# Save the model weights in float16
torch.save(model.state_dict(), "liufelic/Llama-3.2-11B-Vision-lora_16full_model/pytorch_model.bin", _use_new_zipfile_serialization=False)

# Push to Hugging Face Hub (optional)
model.push_to_hub("liufelic/Llama-3.2-11B-Vision-lora_16full_model", token = "token")
tokenizer.push_to_hub("liufelic/Llama-3.2-11B-Vision-lora_16full_model", token = "token")

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Load model directly
# from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "liufelic/Llama-3.2-11B-Vision-lora_16full_model"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto")
processor = AutoProcessor.from_pretrained(model_id)


# processor = AutoProcessor.from_pretrained("liufelic/Llama-3.2-11B-Vision-lora_full_model")
# model = AutoModelForImageTextToText.from_pretrained("liufelic/Llama-3.2-11B-Vision-lora_full_model")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

image = balanced_train_dataset[0]["image"]
ground_truth = balanced_train_dataset[0]["bb"]


instruction = "Classify the brain scan as Low Grade Glioma (0), High Grade Glioma (1), or No Glioma (2) based on the scan's visual features. Respond only in the following format: Choice: <0, 1, or 2>."

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]

input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Prepare inputs
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

# inputs = {key: value.to(torch.float16) for key, value in inputs.items()}

# # Change aspect_ratio_ids to torch.long
# inputs['aspect_ratio_ids'] = inputs['aspect_ratio_ids'].type(torch.long)

# # Converting other tensors to float16 (if necessary)
# # Make sure to exclude 'aspect_ratio_ids' from the conversion
# for key, value in inputs.items():
#     if key != 'aspect_ratio_ids':  # Exclude aspect_ratio_ids from float16 conversion
#         inputs[key] = value.to(torch.float16)

# have the model generate a response
output = model.generate(**inputs, max_new_tokens=20)
response = processor.decode(output[0])

result_message = f"Ground truth: {ground_truth} || Model output: {response}"

print(result_message)

# FastVisionModel.for_inference(model) # Enable for inference!

# output_tokens = model.generate(**inputs, max_new_tokens=20,
#                                 use_cache=True, temperature=1.5, min_p=0.1)
# generated_text = extract_choice(tokenizer.decode(output_tokens[0], skip_special_tokens=True))





# print(f"Ground truth: {ground_truth}")
# text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 20,
#                    use_cache = True, temperature = 1.5, min_p = 0.1)




RuntimeError: self and mat2 must have the same dtype, but got Half and Float

In [None]:
import torch
    from transformers import MllamaForConditionalGeneration, AutoProcessor

    model_id = "decapoda-research/llama-7b-hf"  # Replace with your base model ID
    adapter_path = "lora_full_model"  # Replace with the path to your saved LoRA adapter

    # Load the base model in float16
    base_model = MllamaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    # Load the LoRA adapter state dictionary
    lora_state_dict = torch.load(os.path.join(adapter_path, "adapter_model.bin"), map_location="cuda")

def find_lora_modules(model):
        lora_modules = []
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear) and name.endswith((".lora_A", ".lora_B")):
                lora_modules.append(module)
        return lora_modules

    lora_modules = find_lora_modules(base_model)

for lora_module in lora_modules:
        lora_name = lora_module.weight.data
        lora_module.weight.data = lora_state_dict[lora_name].type(torch.float16)
        # Assuming you saved the LoRA weights in float32, convert to float16

processor = AutoProcessor.from_pretrained(model_id)