In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np

# Download MedMCQA, take its "train" split as a pandas DataFrame, and keep only
# the columns this notebook reads: the question text, the four option columns,
# the correct-option index ("cop") and the explanation ("exp").
dataset = load_dataset("openlifescienceai/medmcqa")["train"].to_pandas()[
    ["question", "opa", "opb", "opc", "opd", "cop", "exp"]
]

# Last expression of the cell: render the frame in the notebook output.
dataset

In [None]:
# Turn one dataset entry into the prompt/response pair used for the LLM.
def convert_row(row):
    """Render one MedMCQA row as a prompt string and a response string.

    Args:
        row: Mapping-like object (a dict or a pandas Series) providing the keys
            "question", "opa", "opb", "opc", "opd", "cop" and "exp".

    Returns:
        dict with two keys:
            "prompt":   the question, the four options labelled a(0)..d(3),
                        and the closing instruction line.
            "response": "<answer>…</answer>" holding the raw `cop` value,
                        a newline, then "<think>…</think>" holding `exp`.
    """
    # One "letter(index) text" line per option, in a..d order.
    option_lines = [
        f"{letter}({position}) {row[column]}"
        for position, (letter, column) in enumerate(
            [("a", "opa"), ("b", "opb"), ("c", "opc"), ("d", "opd")]
        )
    ]

    prompt = "\n".join(
        [
            f"Question: {row['question']}",
            "Options:",
            *option_lines,
            "Choose the correct answer and explain why.",
        ]
    )

    # `cop` and `exp` are inserted exactly as stored in the row (no mapping).
    response = "<answer>{}</answer>\n<think>{}</think>".format(row["cop"], row["exp"])

    return {"prompt": prompt, "response": response}

In [None]:
# Apply convert_row to every row; result_type="expand" spreads the returned
# dict into "prompt" and "response" columns of a new DataFrame.
converted = dataset.apply(
    convert_row,
    axis=1,
    result_type="expand",
)

In [None]:
# Environment setup: set an Unsloth environment flag, then (re)install the
# PyTorch stack, Unsloth and a pinned vLLM build via pip.
import os
# NOTE(review): the effect of this flag is not visible in this notebook; Unsloth
# documents UNSLOTH_VLLM_STANDBY as its vLLM "standby" mode — confirm, and keep
# it set before `import unsloth` runs.
os.environ["UNSLOTH_VLLM_STANDBY"] = "1"
# Remove existing PyTorch to avoid conflicts
!pip install -qqq pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
# Reinstall torch/torchvision/torchaudio/xformers from the CUDA 12.1 wheel index.
!pip install -qqq torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
# NOTE(review): unsloth is installed without a version pin, unlike vllm below.
!pip install -qqq unsloth
!pip install -qqq vllm==0.9.2

In [None]:
import unsloth
from unsloth import FastLanguageModel
import torch
# Sequence-length budget passed to the loader, and the LoRA rank shared by the
# loader (max_lora_rank) and the adapter (r). Both can be raised.
max_seq_length = 512
lora_rank = 32

# Load the base model and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-4B-Base",
    max_seq_length=max_seq_length,
    load_in_4bit=False,  # 4-bit loading is off
    fast_inference=False,  # vLLM fast inference is switched OFF in this notebook
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.9,  # lower this if the GPU runs out of memory
)

# Wrap the model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,  # any positive rank; 8, 16, 32, 64, 128 are the suggested values
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=2 * lora_rank,  # alpha is tied to twice the rank
    use_gradient_checkpointing="unsloth",  # Unsloth's checkpointing mode, to reduce memory use
    random_state=3407,
)

In [None]:
#@title System Prompt { display-mode: "form" }
# Tag pairs that delimit the two parts of every response:
# the chosen answer and the explanation.
solution_start, solution_end = "<answer>", "</answer>"
explanation_start, explanation_end = "<think>", "</think>"

# System instruction placed in front of every conversation. Several lines end
# in two spaces; those are part of the prompt text and are spelled out here so
# they cannot be lost to editor whitespace trimming.
system_prompt = (
    "You are given a question from a Medical Entrance Exam.  \n"
    'Each question has four options: "a(0)", "b(1)", "c(2)", and "d(3)".  \n'
    "Some questions may have multiple correct answers.  \n"
    f"First, provide the correct answer(s) enclosed between {solution_start} and {solution_end}\n"
    "Then, think through the question carefully and explain your reasoning.  \n"
    f"Enclose your reasoning between {explanation_start} and {explanation_end}."
)

# Last expression of the cell: show the assembled prompt.
system_prompt

In [None]:
#@title Chat Template { display-mode: "form" }
# Jinja chat template installed on the tokenizer.
#
# Rendered layout:
#   * the system message followed by EOS (a default system prompt is used when
#     the conversation does not start with a system message),
#   * then, for each remaining message: user content as-is, assistant content
#     followed by EOS,
#   * and, when add_generation_prompt is set, the solution-start tag so the
#     model continues directly with its answer.
#
# '{system_prompt}' and '{solution_start}' are placeholders that are filled in
# below with the real values.
chat_template = (
    "{% if messages[0]['role'] == 'system' %}"
        "{{ messages[0]['content'] + eos_token }}"
        "{% set loop_messages = messages[1:] %}"
    "{% else %}"
        "{{ '{system_prompt}' + eos_token }}"
        "{% set loop_messages = messages %}"
    "{% endif %}"
    "{% for message in loop_messages %}"
        "{% if message['role'] == 'user' %}"
            "{{ message['content'] }}"
        "{% elif message['role'] == 'assistant' %}"
            "{{ message['content'] + eos_token }}"
        "{% endif %}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '{solution_start}' }}"
    "{% endif %}"
)

# Replace with our specific template.
# The second placeholder must be '{solution_start}' (the name used in the
# template above); replacing '{reasoning_start}' matched nothing and left the
# literal text "{solution_start}" in every generation prompt.
# The values are inlined inside single-quoted Jinja string literals, so they
# must not contain a single quote themselves.
chat_template = (
    chat_template
    .replace("'{system_prompt}'", f"'{system_prompt}'")
    .replace("'{solution_start}'", f"'{solution_start}'")
)
tokenizer.chat_template = chat_template

In [None]:
#@title GRPO Template { display-mode: "form" }
def format_dataset(x):
    """Build the system/user/assistant chat messages for one dataset row.

    Args:
        x: One row of `dataset` as a pandas Series (as passed by
            `dataset.apply(format_dataset, axis=1)`). Reads `x["cop"]`,
            `x["exp"]` and the row's index label `x.name`.

    Returns:
        A list of three message dicts (roles "system", "user", "assistant").
        The assistant content is the correct option letter wrapped in the
        solution tags, followed by the explanation wrapped in the explanation
        tags (empty when the row has no usable explanation).

    Raises:
        KeyError: if `x["cop"]` is not one of 0, 1, 2, 3, or if `x.name` is
            not a label of `converted`.
    """
    # The correct option is stored as an integer [0,1,2,3], while the options
    # are labelled with letters [a,b,c,d]; map the integer to its letter.
    idx2letter = {0: "a", 1: "b", 2: "c", 3: "d"}
    correct_answer = idx2letter[x["cop"]]

    # The user prompt is the question + options rendered earlier into
    # `converted`. `x.name` is an index *label*, so look it up by label
    # (`.at`) rather than by position (`.iloc`), which only coincides with the
    # label for a default RangeIndex and builds a whole row per call.
    question = converted.at[x.name, "prompt"]

    # Some questions have no explanation. Normalise first, then treat the
    # usual placeholder spellings as "no explanation" so that padded values
    # such as " nan " are caught as well.
    explanation = x["exp"]
    explanation = "" if explanation is None else str(explanation).strip()
    if explanation.lower() in ("none", "nan", "null"):
        explanation = ""

    # Add our custom formatting: <answer>letter</answer><think>explanation</think>
    final_prompt = (
        solution_start + correct_answer + solution_end
        + explanation_start + explanation + explanation_end
    )

    return [
        {"role" : "system",    "content" : system_prompt},
        {"role" : "user",      "content" : question},
        {"role" : "assistant", "content" : final_prompt},
    ]

# Build the chat-message list for every row and store it in a new column.
dataset["Messages"] = dataset.apply(
    format_dataset,
    axis=1,
)

In [None]:
# Sanity check: render one conversation (index label 1) to plain text with the
# chat template, without tokenizing, and display it.
tokenizer.apply_chat_template(
    dataset["Messages"][1],
    tokenize=False,
)

In [None]:
from datasets import Dataset

# Render every conversation to a single training string with the chat template.
dataset["text"] = tokenizer.apply_chat_template(
    dataset["Messages"].values.tolist(),
    tokenize=False,
)

# Convert the pandas frame into a Hugging Face Dataset. Note that this rebinds
# `dataset`, so re-running this cell on its own will not work.
dataset = Dataset.from_pandas(dataset)

# Dataset in the required prompt-response format.
dataset

In [None]:
# This stage only has to get the model used to the prompt/response style, so it
# trains on a fixed-size random sample of the data instead of the full set.
subset_size = 5000

# Seeded shuffle followed by taking the first `subset_size` rows.
dataset_small = (
    dataset
    .shuffle(seed=42)
    .select(range(subset_size))
)

print(f"Using {len(dataset_small)} examples for pre-finetune")
dataset_small

In [None]:
#@title Fine Tune Model { display-mode: "form" }
from trl import SFTTrainer, SFTConfig
# Supervised fine-tuning on the pre-rendered chat strings in the "text" column
# of the sampled dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_small,
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,  # raise to emulate a larger batch size
        warmup_steps=5,
        num_train_epochs=2,  # two passes over the sampled subset
        learning_rate=2e-4,  # consider 2e-5 for long training runs
        logging_steps=5,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        report_to="none",  # no experiment tracker; set e.g. "wandb" to enable one
    ),
)

In [None]:
# Run the fine-tuning loop with the `trainer` built in the previous cell; as
# the cell's last expression, the call's return value is shown in the output.
trainer.train()

In [None]:
# Build an inference prompt from one example: keep only its first two messages
# (system + user) and drop the reference assistant answer.
text = tokenizer.apply_chat_template(
    dataset[48]["Messages"][:2],
    tokenize=False,              # return the rendered string, not token ids
    add_generation_prompt=True,  # append the template's generation-prompt suffix
)

print(text)

In [None]:
# TextStreamer lives in transformers; import it here so this cell does not
# fail with a NameError when the notebook is run top to bottom.
from transformers import TextStreamer

# Generate a completion for `text` on the GPU and stream the decoded tokens to
# the cell output as they are produced. The returned ids are discarded.
_ = model.generate(
    **tokenizer(text, return_tensors="pt").to("cuda"),
    temperature=0,
    max_new_tokens=128,
    streamer=TextStreamer(tokenizer, skip_prompt=False),  # skip_prompt=False: the prompt is echoed before the completion
)

In [None]:
# Directory that receives the fine-tuned artifacts.
save_dir = "./pre_finetuned_model_4B"

# Write the trainer's model output, then the tokenizer files, to the same
# directory so it can be reloaded as one unit.
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Archive the saved model directory (fine-tuned for the prompt/response style)
# into a gzip-compressed tarball, listing each file as it is added.
!tar -czvf pre_finetuned_model_4B.tar.gz pre_finetuned_model_4B