## 1. Import libraries

In [5]:
import os

from datasets import load_dataset
from unsloth import FastLanguageModel
from huggingface_hub import login
from dotenv import load_dotenv
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

## 2. Load dataset

In [6]:
# Download MedMCQA from the Hugging Face Hub, then discard its "test" split
dataset_id = "openlifescienceai/medmcqa"
ds = load_dataset(dataset_id)
del ds["test"]

Generating train split: 100%|██████████| 182822/182822 [00:00<00:00, 813730.76 examples/s]
Generating test split: 100%|██████████| 6150/6150 [00:00<00:00, 1416138.87 examples/s]
Generating validation split: 100%|██████████| 4183/4183 [00:00<00:00, 827193.48 examples/s]


### 2.1. Format data

In [7]:
# Prompt template: the first slot takes the question, the second the rendered
# choices; the answer text is appended after the trailing "### Answer:" line.
data_prompt = """Choose the correct option for the following question.

### Question:
{}

### Choice:
{}

### Answer:
"""

# Maps the integer answer index (0-3) to its option letter
id2label = dict(enumerate("ABCD"))


def formatting_prompt(examples):
    """Build the full training text for every example in a batch.

    Args:
        examples: Column-oriented batch (dict of lists) with the keys
            "question", "opa", "opb", "opc", "opd" and "cop", where "cop"
            holds the index of the correct option.

    Returns:
        dict: {"text": [...]} with one string per example, made of the filled
        prompt template followed by "<letter> <correct option text>".

    Raises:
        KeyError: if a "cop" value is not a key of `id2label`.
    """
    questions = examples["question"]
    # One column of option texts per letter, in A-D order
    option_columns = {
        letter: examples[key]
        for letter, key in zip("ABCD", ("opa", "opb", "opc", "opd"))
    }
    correct_indices = examples["cop"]

    texts = []
    for row, question in enumerate(questions):
        options = {letter: column[row] for letter, column in option_columns.items()}

        # Answer line: the correct letter followed by that option's text
        correct_letter = id2label[correct_indices[row]]
        answer = correct_letter + " " + options[correct_letter]

        # Render all four options on a single line
        choices = " ".join(f"{letter}. {text}" for letter, text in options.items())
        texts.append(data_prompt.format(question, choices) + answer)

    return {"text": texts}

# Run the prompt builder over the dataset in batched mode; its result adds a "text" column
process_ds = ds.map(
    function=formatting_prompt,
    batched=True,
)

Map: 100%|██████████| 182822/182822 [00:00<00:00, 204977.73 examples/s]
Map: 100%|██████████| 4183/4183 [00:00<00:00, 173667.64 examples/s]


In [8]:
# Peek at the first formatted training record to check the generated "text" field
process_ds['train'][0]

{'id': 'e9ad821a-c438-4965-9f77-760819dfa155',
 'question': 'Chronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma',
 'opa': 'Hyperplasia',
 'opb': 'Hyperophy',
 'opc': 'Atrophy',
 'opd': 'Dyplasia',
 'cop': 2,
 'choice_type': 'single',
 'exp': 'Chronic urethral obstruction because of urinary calculi, prostatic hyperophy, tumors, normal pregnancy, tumors, uterine prolapse or functional disorders cause hydronephrosis which by definition is used to describe dilatation of renal pelvis and calculus associated with progressive atrophy of the kidney due to obstruction to the outflow of urine Refer Robbins 7yh/9,1012,9/e. P950',
 'subject_name': 'Anatomy',
 'topic_name': 'Urinary tract',
 'text': 'Choose the correct option for the following question.\n\n### Question:\nChronic urethral obstruction due to benign prismatic hyperplasia can lead to the following change in kidney parenchyma\n\n### Choice:\nA. Hyperplasia B. Hyperophy 

## 3. Load pre-trained model

In [11]:
# Maximum sequence length passed to the model loader
max_seq_length = 2048

# Load the pre-quantized 4-bit base model together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Wrap the model with LoRA adapters (PEFT) on the attention and MLP projections
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "up_proj",
        "down_proj", "o_proj", "gate_proj"
    ],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    random_state=42,
    loftq_config=None,
)

# Report the trainable-parameter count. The method prints the summary itself
# and returns None, so it is called directly: wrapping it in print() emitted
# an extra "None" line.
model.print_trainable_parameters()


==((====))==  Unsloth 2025.7.3: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.7.3 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039
None


## 4. Finetuning

In [12]:
# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise
use_bf16 = is_bfloat16_supported()

# Evaluation, checkpointing and logging all happen on the same step interval
step_interval = 50

# Training hyperparameters
args = TrainingArguments(
    # Output locations
    output_dir="med-mcqa-llama-3.2-1B-4bit-lora",
    logging_dir="logs",
    # Optimisation schedule
    num_train_epochs=2,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=16,
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=10,
    weight_decay=0.01,
    optim="adamw_8bit",
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # Evaluation / checkpoint / logging cadence
    eval_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    eval_steps=step_interval,
    save_steps=step_interval,
    logging_steps=step_interval,
    save_total_limit=1,
    load_best_model_at_end=True,
    # Reproducibility
    seed=0,
)

# Build the supervised fine-tuning trainer on the formatted "text" column
train_split = process_ds["train"]
eval_split = process_ds["validation"]
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    dataset_text_field="text",
)

# Start training
trainer.train()


Unsloth: Tokenizing ["text"]: 100%|██████████| 182822/182822 [00:04<00:00, 38128.51 examples/s]
Unsloth: Tokenizing ["text"]: 100%|██████████| 4183/4183 [00:00<00:00, 37564.04 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 182,822 | Num Epochs = 2 | Total steps = 358
O^O/ \_/ \    Batch size per device = 64 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (64 x 16 x 1) = 1,024
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Step,Training Loss,Validation Loss
50,1.3405,1.415393
100,1.1474,1.402965
150,1.1365,1.397485
200,1.109,1.39856
250,1.0795,1.398535
300,1.0736,1.394387
350,1.0702,1.392415


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=358, training_loss=1.1351087293145377, metrics={'train_runtime': 13945.5162, 'train_samples_per_second': 26.219, 'train_steps_per_second': 0.026, 'total_flos': 3.670647073422213e+17, 'train_loss': 1.1351087293145377})

## 5. Save model

In [16]:
# Load environment variables (python-dotenv reads a local .env file if present)
# and authenticate with the Hugging Face Hub using HF_TOKEN.
load_dotenv()

hf_token = os.getenv('HF_TOKEN')
login(token=hf_token)

# Save the trained adapter weights to a local directory
model.save_pretrained("unsloth-llama-trained")

# Hub repository that receives the adapter
PEFT_MODEL = "dainlieu/Llama-3.2-1B-bnb-4bit-MedMCQA"

# `use_auth_token` is deprecated in favour of `token`; pass the token explicitly.
model.push_to_hub(PEFT_MODEL, token=hf_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
adapter_model.safetensors: 100%|██████████| 45.1M/45.1M [00:07<00:00, 5.86MB/s]


Saved model to https://huggingface.co/dainlieu/Llama-3.2-1B-bnb-4bit-MedMCQA


## 6. Inference

In [17]:
def infer_from_hf(
    model_path="dainlieu/Llama-3.2-3B-bnb-4bit-MedMCQA",
    prompt="""Question: What is the capital of France?
Choices:
A. Berlin
B. Paris
C. Madrid
D. Rome
Answer:"""
):
    """Load a fine-tuned model from the Hugging Face Hub and print its completion of `prompt`.

    Decoding is greedy (`do_sample=False`) and limited to 32 new tokens. The
    decoded text that is printed contains the prompt followed by the completion.

    Args:
        model_path: Hub repository id handed to `FastLanguageModel.from_pretrained`.
        prompt: Text the model should continue.

    Returns:
        None. The decoded output is printed.
    """
    # NOTE(review): the default `model_path` names a 3B repository, while this
    # notebook pushes its adapter to "dainlieu/Llama-3.2-1B-bnb-4bit-MedMCQA" —
    # confirm which model is intended.
    # NOTE(review): the default prompt layout differs from the `data_prompt`
    # template used to build the training text — confirm this is intentional.

    # Load the fine-tuned model from the Hugging Face Hub
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path,
        max_seq_length = 2048,
        dtype = None,              # auto-select float16/bfloat16
        load_in_4bit = True,
    )

    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Greedy decoding: sampling-only flags (temperature, top_p) are left out
    # because they are ignored when do_sample=False and only trigger a warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\n--- Output ---\n", answer)

# Run inference with the default model and prompt
infer_from_hf()

==((====))==  Unsloth 2025.7.3: Fast Llama patching. Transformers: 4.53.2.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



--- Output ---
 Question: What is the capital of France?
Choices:
A. Berlin
B. Paris
C. Madrid
D. Rome
Answer: B
