In [None]:
%%capture
# Environment setup: installs Unsloth and the libraries it needs.
# `%%capture` is a cell magic and has to remain the very first line of this cell;
# it keeps the (long) pip output out of the notebook.
import os
# Colab detection: look for the substring "COLAB_" in the concatenation of all
# environment-variable names. Outside Colab a plain install is used.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs pass --no-deps so pip does not pull in their
    # dependency trees (presumably to leave Colab's preinstalled packages
    # untouched -- TODO confirm). xformers is the only package pinned to an
    # exact version here.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    # These are installed normally (with dependencies); datasets has a minimum version.
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
import torch
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset

# --- Step 1: load the base checkpoint -------------------------------------
# `max_seq_length` is passed to the loader here and read again by the trainer
# cell further down, so it stays a notebook-level name.
max_seq_length = 2048

base_checkpoint = "unsloth/medgemma-4b-pt"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_checkpoint,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # request 4-bit quantized weights
    dtype=None,         # let Unsloth choose the dtype for the current GPU
)

# --- Step 2: attach LoRA adapters -----------------------------------------
# Adapters are placed on the attention projections and the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,            # adapter rank
    lora_alpha=16,   # adapter scaling factor
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

print("Unsloth model configured for 4-bit LoRA fine-tuning!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.7: Fast Gemma3 patching. Transformers: 4.53.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Unsloth: Making `base_model.model.model.vision_tower.vision_model` require gradients
Unsloth model configured for 4-bit LoRA fine-tuning!


In [None]:
from datasets import load_dataset

# Peek at the dataset schema: open the train split in streaming mode and keep
# only its first record.
guidelines_stream = load_dataset("epfl-llm/guidelines", split="train", streaming=True)
ds_preview = guidelines_stream.take(1)
example = list(ds_preview)[0]

# List the column names of that record.
print("Dataset Columns Found:", example.keys(), sep="\n")

README.md: 0.00B [00:00, ?B/s]

Dataset Columns Found:
dict_keys(['id', 'source', 'title', 'clean_text', 'raw_text', 'url', 'overview'])


In [None]:
# --- Step 2: prompt layout and end-of-sequence marker ---

# Training prompt with two positional slots: the guideline's source first,
# then the guideline text. Built from adjacent literals so every newline is
# explicit.
prompt_template = (
    "### Source:\n"
    "{}\n"
    "\n"
    "### Guideline Text:\n"
    "{}"
)

# End-of-sequence token taken from the loaded tokenizer; format_prompt appends
# it to each training example.
EOS_TOKEN = tokenizer.eos_token

def format_prompt(example):
    """Turn one dataset record into a single training string.

    The record's ``source`` and ``clean_text`` values are inserted, in that
    order, into ``prompt_template`` and ``EOS_TOKEN`` is appended.

    Args:
        example: Mapping that provides ``'source'`` and ``'clean_text'``.

    Returns:
        A dict with exactly one key, ``"text"``, holding the formatted string
        (the trainer cell reads this column via ``dataset_text_field``).
    """
    source_name = example['source']
    guideline_body = example['clean_text']
    filled_prompt = prompt_template.format(source_name, guideline_body)
    return {"text": filled_prompt + EOS_TOKEN}


# Load the complete (non-streaming) train split and run format_prompt over
# every record with 4 worker processes; each record gains a "text" field.
ds = load_dataset("epfl-llm/guidelines", split="train").map(format_prompt, num_proc=4)

# Sanity check: show the first formatted record.
print(
    "\nDataset loaded and formatted successfully!",
    "Here is an example of a formatted prompt:",
    ds[0]['text'],
    sep="\n",
)

open_guidelines.jsonl:   0%|          | 0.00/878M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map (num_proc=4):   0%|          | 0/37970 [00:00<?, ? examples/s]


Dataset loaded and formatted successfully!
Here is an example of a formatted prompt:
### Source:
cco

### Guideline Text:
# QUESTIONS Diagnosis/Staging
What benefit to clinical management does positron emission tomography (PET) or positron emission tomography/computed tomography (PET/CT) contribute to the diagnosis or staging of head and neck cancer? What benefit to clinical management does PET or PET/CT contribute to the assessment of treatment response for head and neck cancer?
What benefit to clinical management does PET or PET/CT contribute when recurrence of head and neck cancer is suspected but not proven? What benefit to clinical management does PET or PET/CT contribute to restaging at the time of documented recurrence for head and neck cancer? What is the role of PET when a solitary metastasis is identified at the time of recurrence and a metastectomy is being contemplated?

# TARGET POPULATION
Patients with head and neck cancer are the target population for this recommendatio

In [None]:
# Mixed-precision flags are derived from one hardware query: bf16 when the GPU
# supports it, fp16 otherwise. (In the logged T4 run Unsloth reports switching
# to float32 because this model cannot train in float16.)
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size: 2 * 4 = 8
    warmup_steps=5,
    # Hard cap on optimizer steps. 500 steps at batch 8 covers 4,000 samples,
    # a fraction of the 37,970-example dataset; raise it for a fuller run.
    max_steps=500,
    learning_rate=2e-4,
    fp16=not bf16_available,
    bf16=bf16_available,
    logging_steps=1,      # log the loss at every step
    optim="adamw_8bit",   # 8-bit AdamW optimizer
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    output_dir="outputs",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds,
    dataset_text_field="text",  # column produced by format_prompt
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

# Run the fine-tuning loop and keep the returned statistics.
print("Starting the fine-tuning process...")
trainer_stats = trainer.train()
print("Fine-tuning complete!")

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"]:   0%|          | 0/37970 [00:00<?, ? examples/s]

Starting the fine-tuning process...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 37,970 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 32,788,480 of 4,332,867,952 (0.76% trained)


Step,Training Loss
1,1.7672
2,1.9928
3,2.0467
4,2.0577
5,1.7961
6,2.149
7,2.0342
8,1.9147
9,1.9019
10,1.8525


Step,Training Loss
1,1.7672
2,1.9928
3,2.0467
4,2.0577
5,1.7961
6,2.149
7,2.0342
8,1.9147
9,1.9019
10,1.8525


Fine-tuning complete!


In [None]:
# Qualitative check of the fine-tuned model: fill only the "Source" slot of
# the training prompt and let the model continue after "### Guideline Text:".
from transformers import pipeline

# This cell reuses `trainer.model` from the current session. In a fresh
# session, reload the saved adapters first, for example:
#   from unsloth import FastLanguageModel
#   model, tokenizer = FastLanguageModel.from_pretrained(
#       model_name = "outputs/checkpoint-60",  # placeholder -- use your own saved checkpoint/adapter path
#   )
text_pipeline = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
)

# Build the prompt: the second slot is left empty for the model to complete.
test_prompt_input = "American College of Cardiology"
test_prompt_formatted = prompt_template.format(test_prompt_input, "")

# Generate up to 256 new tokens.
output = text_pipeline(test_prompt_formatted, max_new_tokens=256)

# Show prompt and generation between two separator rules. In the logged run
# the generated text starts with the prompt itself.
print(
    "="*50,
    "PROMPT:",
    test_prompt_formatted,
    "\nMODEL OUTPUT:",
    output[0]['generated_text'],
    "="*50,
    sep="\n",
)

Device set to use cuda:0


PROMPT:
### Source:
American College of Cardiology

### Guideline Text:


MODEL OUTPUT:
### Source:
American College of Cardiology

### Guideline Text:
<b><i>1.5.4.4.1</i></b>  The patient with a history of acute coronary syndrome who is receiving a noninvasive test for suspected acute coronary syndrome should be referred to a cardiac catheterization laboratory or, if appropriate, to a hospital with appropriate cardiac catheterization laboratory resources for evaluation and possible treatment of the suspected acute coronary syndrome.
### Source:
American College of Cardiology

### Guideline Text:
<b><i>1.5.4.4.2</i></b>  For patients who are at risk for acute coronary syndrome, the use of stress testing in the absence of chest pain is not recommended.
### Source:
American College of Cardiology

### Guideline Text:
<b><i>1.5.4.4.3</i></b>  Patients with a history of acute coronary syndrome who are considered candidates for stress testing should be referred to a cardiac catheterization l

In [None]:
# --- Step 5: Save the fine-tuned LoRA adapters and tokenizer locally ---
# Both are written into the same directory so it can be reloaded as one unit.

lora_model_name = "medgemma-guidelines-4b-4bit-lora"
model.save_pretrained(lora_model_name)
tokenizer.save_pretrained(lora_model_name)

print(f"LoRA adapters saved locally to '{lora_model_name}'")


# --- Step 6: Log in and upload to the Hugging Face Hub ---

from huggingface_hub import notebook_login

# 1. Log in to your Hugging Face account.
# A widget will appear. Paste an access token with 'write' permissions there;
# never hard-code the token in the notebook.
notebook_login()

# 2. Push the adapters and tokenizer to the Hub.
# push_to_hub creates the repository if it does not exist yet.
# The repo id has the form "<hf-username>/<repo-name>"; change the username
# part below to your own account before running.
hf_repo_name = "huseyincavus/medgemma-4b-guidelines-lora"

print(f"Uploading adapters to Hugging Face Hub repository: {hf_repo_name}")
# `token=True` tells the Hub client to use the credentials stored by
# notebook_login(). It replaces the deprecated `use_auth_token=True` argument.
model.push_to_hub(hf_repo_name, token=True)
tokenizer.push_to_hub(hf_repo_name, token=True)
print("Upload complete!")

LoRA adapters saved locally to 'medgemma-guidelines-4b-4bit-lora'


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Uploading adapters to Hugging Face Hub repository: huseyincavus/medgemma-4b-guidelines-lora


README.md:   0%|          | 0.00/571 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/131M [00:00<?, ?B/s]

Saved model to https://huggingface.co/huseyincavus/medgemma-4b-guidelines-lora


  0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

Upload complete!
