In [None]:
%%capture
# Install Unsloth and its dependencies. The %%capture cell magic hides pip's output.
import os
# Colab is detected by checking whether any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last commands use --no-deps, so pip installs only the listed
    # packages (xformers pinned to 0.0.29.post3) without resolving their dependencies;
    # the middle command installs the remaining requirements normally.
    # NOTE(review): presumably this avoids replacing Colab's preinstalled torch stack - confirm.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch

# Load the Qwen3-14B checkpoint and its tokenizer through Unsloth, quantized to 4 bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B",
    max_seq_length = 2048,   # Context length
    load_in_4bit = True, # Load in 4 bit
    load_in_8bit = False, # 8-bit loading is not used
    full_finetuning = False, # Not a full fine-tune; LoRA adapters are attached in a later cell
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.2: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/168k [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.67k [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters on its attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,           # LoRA rank; a higher rank means more trainable parameters
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", # Attention projections that receive LoRA adapters
                      "gate_proj", "up_proj", "down_proj",], # MLP projections that receive LoRA adapters
    lora_alpha = 32,  # Scaling factor for the LoRA updates; set equal to the rank (r) here
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # Unsloth's own gradient-checkpointing mode, used to save memory
    random_state = 3407, # Same seed value that the trainer config uses
    use_rslora = False,   # Rank-stabilized LoRA is off
    loftq_config = None,  # No LoftQ quantization config
)

Unsloth 2025.6.2 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
from datasets import load_dataset

# Load the "pqa_artificial" configuration of PubMedQA.
# The result is a DatasetDict; later cells read its "train" split.
dataset = load_dataset("pubmed_qa", "pqa_artificial")

# Format each record as a training prompt
def format_prompt(example):
    """Render one PubMedQA record as a single ChatML-style training string.

    The user turn holds the record's ``question`` and its
    ``context['contexts']`` passages joined with single spaces. The assistant
    turn states the short ``final_decision`` first and then the ``long_answer``
    explanation, so the model learns to answer in that order.

    Args:
        example: Mapping with the keys ``question``, ``context`` (itself a
            mapping with a ``contexts`` list of strings), ``final_decision``
            and ``long_answer``.

    Returns:
        A dict with the single key ``"text"`` holding the rendered prompt.

    NOTE(review): this layout is written by hand rather than produced by the
    tokenizer's chat template; confirm it matches the prompt that the
    inference cell builds with ``tokenizer.apply_chat_template``.
    """
    system_turn = "You are a helpful biomedical assistant. Your task is to answer the given question based on the provided context. First, provide a simple 'yes', 'no', or 'maybe' answer, followed by a detailed explanation."
    question = example["question"]
    passages = " ".join(example["context"]["contexts"])
    answer = f"{example['final_decision']}. {example['long_answer']}"

    # One entry per output line; every turn is closed with <|im_end|>.
    segments = (
        "<|im_start|>system",
        system_turn + "<|im_end|>",
        "<|im_start|>user",
        f"Question: {question}",
        f"Context: {passages}<|im_end|>",
        "<|im_start|>assistant",
        f"{answer}<|im_end|>",
    )
    return {"text": "\n".join(segments)}

# Apply format_prompt to every record. All original columns are removed,
# so each row of the result holds only the "text" field that format_prompt returns.
formatted_dataset = dataset.map(
    format_prompt,
    remove_columns=list(dataset["train"].features),
)

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

Map:   0%|          | 0/211269 [00:00<?, ? examples/s]

In [None]:
# Display the raw (unformatted) DatasetDict: its splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 211269
    })
})

In [None]:
# Preview the first two formatted training strings to check the prompt layout.
train_texts = formatted_dataset["train"]
for heading, row_index in (("First example:", 0), ("\nSecond example:", 1)):
    print(heading)
    print(train_texts[row_index]["text"])

First example:
<|im_start|>system
You are a helpful biomedical assistant. Your task is to answer the given question based on the provided context. First, provide a simple 'yes', 'no', or 'maybe' answer, followed by a detailed explanation.<|im_end|>
<|im_start|>user
Question: Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?
Context: Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated. The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease. A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were

In [None]:
from trl import SFTTrainer, SFTConfig
# Build the supervised fine-tuning trainer over the formatted PubMedQA strings.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_dataset["train"], # Use the formatted pubmed_qa training split
    eval_dataset = None, # No evaluation set is used; one can be supplied here
    args = SFTConfig(
        dataset_text_field = "text", # The column produced by format_prompt
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Accumulate 4 micro-batches: 2 x 4 = 8 sequences per optimizer step on one GPU
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run -thousands of steps-
        max_steps = 300, # Demonstration run: 300 steps x 8 sequences covers only a small part of the training split
        learning_rate = 2e-5, # Common practice
        logging_steps = 1, # Log the training loss at every step
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # No experiment tracker; change this to report to WandB etc
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/211269 [00:00<?, ? examples/s]

In [None]:
# Record the GPU's capacity and the memory already reserved before training starts.
# `start_gpu_memory` and `max_memory` are read again by the post-training report.
bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
for baseline_line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(baseline_line)

GPU = Tesla T4. Max memory = 14.741 GB.
11.898 GB of memory reserved.


In [None]:
# Run fine-tuning. The returned object's `metrics` are read by a later cell to report the training runtime.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 211,269 | Num Epochs = 1 | Total steps = 300
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 128,450,560/14,000,000,000 (0.92% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1069
2,2.011
3,2.144
4,2.0888
5,1.9048
6,2.2044
7,2.0747
8,1.9317
9,1.9338
10,2.0513


In [None]:
# Report the training runtime and the peak reserved GPU memory, both in total
# and relative to the baseline captured before training (`start_gpu_memory`).
bytes_per_gib = 1024 ** 3
runtime_seconds = trainer_stats.metrics['train_runtime']

used_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)  # growth since the baseline
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = (
    f"{runtime_seconds} seconds used for training.",
    f"{round(runtime_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
for report_line in report_lines:
    print(report_line)

7151.5903 seconds used for training.
119.19 minutes used for training.
Peak reserved memory = 13.963 GB.
Peak reserved memory for training = 2.065 GB.
Peak reserved memory % of max memory = 94.722 %.
Peak reserved memory for training % of max memory = 14.009 %.


In [None]:
from transformers import TextStreamer

# Run one sample question through the fine-tuned model and stream its answer.
# The fine-tuned model and tokenizer are already in memory;
# the model is taken from the completed trainer object.
model = trainer.model

# Build the conversation as role/content messages so the tokenizer's chat template can render it.
# The system prompt is the same text that format_prompt puts in every training example.
system_prompt = "You are a helpful biomedical assistant. Your task is to answer the given question based on the provided context. First, provide a simple 'yes', 'no', or 'maybe' answer, followed by a detailed explanation."

user_question = "Is there a definitive link between coffee consumption and a reduced risk of Parkinson's disease?"
user_context = "Several epidemiological studies have suggested an inverse association between coffee consumption and the risk of Parkinson's disease (PD). A large meta-analysis of 26 studies found that the risk of PD was, on average, 30% lower in coffee drinkers compared to non-drinkers. The association appears to be dose-dependent. However, the mechanism is not fully understood, though caffeine's role as an adenosine A2A receptor antagonist is a leading hypothesis. It's important to note that these are observational studies, which show correlation but cannot prove causation. Confounding factors, such as genetics and lifestyle, may also play a role. Randomized controlled trials are needed to establish a causal relationship definitively."

# Combine the question and context into the user's message, using the same
# "Question: ...\nContext: ..." layout as the training examples.
user_prompt = f"Question: {user_question}\nContext: {user_context}"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

# Render the messages to a prompt string (tokenize=False returns text, not token ids).
# add_generation_prompt=True ends the prompt with the opening of the assistant turn,
# so the model continues from there.
# NOTE(review): with enable_thinking=False the Qwen3 template is understood to append an
# empty <think></think> block after the assistant header, which the hand-written training
# strings from format_prompt do not contain - print `prompt` to confirm and align if needed.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False, # This prevents the <think>...</think> block
)

# Tokenize the prompt and move the tensors to the GPU.
# NOTE(review): assumes the model is on the default CUDA device - confirm.
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# The TextStreamer prints the output as it is generated; skip_prompt=True leaves the prompt itself out.
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Print a banner, then generate the response
print("\n" + "="*50)
print("       FINE-TUNED MODEL RESPONSE (Direct Output)")
print("="*50 + "\n")

# The return value is discarded because the streamer already prints the text.
_ = model.generate(
    **model_inputs,
    streamer=streamer,
    max_new_tokens=256, # Upper bound on the number of generated tokens
    temperature=0.6,
    top_p=0.9,
    do_sample=True, # Sample instead of greedy decoding, so temperature and top_p take effect
)


       FINE-TUNED MODEL RESPONSE (Direct Output)

Yes, there is a definitive link between coffee consumption and a reduced risk of Parkinson's disease. The evidence from epidemiological studies, including a large meta-analysis of 26 studies, consistently shows a dose-dependent inverse association between coffee consumption and the risk of Parkinson's disease. Caffeine, a major component of coffee, is a leading candidate for the protective effect, as it acts as an adenosine A2A receptor antagonist. However, the exact mechanism remains to be fully elucidated, and further research is needed to establish a causal relationship definitively.<|im_end|>


In [None]:
# Write the fine-tuned LoRA adapters held by the trainer to a local directory,
# then confirm where they were written.
adapter_output_dir = "qwen3-14b-pubmedqa-lora"
trainer.save_model(adapter_output_dir)

print(f"Model adapters saved successfully to the directory '{adapter_output_dir}'")

Model adapters saved successfully to the directory 'qwen3-14b-pubmedqa-lora'


In [None]:
# Upload the LoRA adapters to the Hugging Face Hub.
import os

# Read the Hub token from Colab secrets when running in Colab. Outside Colab
# (which the install cell explicitly supports) `google.colab` cannot be imported,
# so fall back to the HF_TOKEN environment variable instead of crashing.
# A value of None is passed through unchanged to push_to_hub.
try:
    from google.colab import userdata
except ImportError:
    hf_token = os.environ.get("HF_TOKEN")
else:
    hf_token = userdata.get('HF_TOKEN')

# Define the name for your repository
adapters_repo_name = "huseyincavus/Qwen3-14B-PubMedQA-lora-adapters"

# Set this to False to skip the upload (e.g. on a Restart & Run All without credentials).
PUSH_TO_HUB = True

if PUSH_TO_HUB:
    print(f"Starting push of LoRA adapters to Hub repository: {adapters_repo_name}")

    # Push the adapter weights and config held by `model` to the repository.
    model.push_to_hub(
        adapters_repo_name,
        token = hf_token
    )

    print(f"\nSuccessfully pushed LoRA adapters to: https://huggingface.co/{adapters_repo_name}")

else:
    print("Skipping push to Hub. Set the `if` condition to `True` to run.")

Starting push of LoRA adapters to Hub repository: huseyincavus/Qwen3-14B-PubMedQA-lora-adapters


README.md:   0%|          | 0.00/593 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/514M [00:00<?, ?B/s]

Saved model to https://huggingface.co/huseyincavus/Qwen3-14B-PubMedQA-lora-adapters

Successfully pushed LoRA adapters to: https://huggingface.co/huseyincavus/Qwen3-14B-PubMedQA-lora-adapters
