In [None]:
!pip -q install transformers accelerate bitsandbytes trl mlflow boto3

In [None]:
import os
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    AutoModelForSeq2SeqLM,          # AutoModel for language modeling tasks
    AutoTokenizer,                # AutoTokenizer for tokenization
    BitsAndBytesConfig,           # Configuration for BitsAndBytes
    HfArgumentParser,             # Argument parser for Hugging Face models
    TrainingArguments,            # Training arguments for model training
    pipeline,                     # Creating pipelines for model inference
    logging,                      # Logging information during training
    TrainerCallback
)
from peft import LoraConfig, PeftModel,PeftConfig
from trl import SFTTrainer

import pandas as pd
import numpy as np
import os
import logging as log
from pydantic import BaseModel, Field
import json
from datetime import datetime
import mlflow
from mlflow.tracking import MlflowClient
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

def setup_logging():
    """Reset the root logger to a single stream handler at WARNING level.

    ``force=True`` makes ``basicConfig`` remove *and close* any handlers that
    are already attached to the root logger before installing the new one, so
    re-running this cell neither duplicates output nor leaks handler resources.
    """
    log.basicConfig(
        level=log.WARNING,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[log.StreamHandler()],
        force=True,
    )


logger = log.getLogger(__name__)
setup_logging()

In [None]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget so the Hub token is entered
# at run time instead of being written into the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Utils

In [None]:
SYSTEM_PROMPT = "Bạn là một trợ lý y tế thông minh, trả lời ngắn gọn, chính xác, dựa trên kiến thức y tế Việt Nam."


def preprocess_function(example):
    """Convert one question/answer record into a three-turn chat conversation.

    Args:
        example: Mapping with string fields ``"question"`` and ``"answer"``.

    Returns:
        ``{"messages": [...]}`` containing, in order, the fixed system prompt,
        the whitespace-stripped question as the user turn and the
        whitespace-stripped answer as the assistant turn.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", example["question"].strip()),
        ("assistant", example["answer"].strip()),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

def get_dataset(num_samples=100):
    """Load ViHealthQA (train + validation) as chat-formatted conversations.

    Args:
        num_samples: Number of leading examples to keep. ``None`` keeps the
            whole concatenated dataset. Defaults to 100, the subset size this
            notebook has always used.

    Returns:
        A ``datasets.Dataset`` whose only column is ``"messages"``.
    """
    ds = load_dataset("tarudesu/ViHealthQA")
    full_ds = concatenate_datasets([ds["train"], ds["validation"]])

    # Subset BEFORE mapping: the map is per-example, so the result is the same,
    # but only the kept rows are preprocessed instead of all of them.
    if num_samples is not None:
        full_ds = full_ds.select(range(min(num_samples, len(full_ds))))

    return full_ds.map(preprocess_function, remove_columns=full_ds.column_names)


In [None]:
# Intentionally no `mlflow.start_run()` here. The training cell opens its own
# run with `with mlflow.start_run() as run:`; starting one at this point would
# leave a run active (created before the tracking URI and experiment are set),
# and MLflow refuses to start a second, non-nested run while one is active.


# Dataset EDA

In [None]:
# Load every split as a pandas frame for exploration.
ds = load_dataset("tarudesu/ViHealthQA")
ds_train = ds["train"].to_pandas()
ds_val = ds["validation"].to_pandas()
# `ds_test` was used below without ever being defined (NameError on a fresh kernel).
ds_test = ds["test"].to_pandas()

# Stack the train and test rows; the original row indices of each split are kept.
ds_train = pd.concat([ds_train, ds_test])

README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

val.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/7009 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/993 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2013 [00:00<?, ? examples/s]

In [None]:
# Preview the frame built in the previous cell (pandas elides the middle rows in the repr).
ds_train

Unnamed: 0,id,question,answer,link
0,1,Đang chích ngừa viêm gan B có chích ngừa Covid...,Nếu anh/chị đang tiêm ngừa vaccine phòng bệnh ...,https://vnexpress.net/tu-van-tiem-vaccine-covi...
1,2,"Đau đầu, căng thẳng do công việc, suy giảm trí...",Tình trạng đau đầu theo bạn mô tả thì chưa rõ....,https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
2,3,Đặt lưu lượng khí hệ thống Jackson-Rees thấp h...,Hệ thống Jackson – Rees dùng khi gây mê để trá...,https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
3,4,Bé 13 tháng tuổi uống thuốc Acyclovir có được ...,Acyclovir có thể sử dụng cho cả trẻ dưới 13 th...,https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
4,5,Vừa qua ngày 4/6 tôi có bị con chó ở nhà cắn x...,Bệnh dại là bệnh nguy hiểm và nếu có chỉ định ...,https://vnexpress.net/tu-van-tiem-vaccine-covi...
...,...,...,...,...
2008,2009,Sốt kèm nhức đầu sau khi ngủ dậy là bệnh gì?,"Bạn có biểu hiện sốt, nhức đầu sau khi ngủ dậy...",https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
2009,2010,Trước Tết em đã làm IVF một lần ở một bệnh việ...,"Theo các nghiên cứu, dự trữ buồng trứng của ng...",https://vnexpress.net/tu-van-vo-sinh-hiem-muon...
2010,2011,Bệnh nhân tiền sử tiểu đường tuýp 2 nóng rát t...,Anh đã xuất hiện biến chứng viêm đa thần kinh ...,https://www.vinmec.com/vi/tin-tuc/hoi-dap-bac-...
2011,2012,Cháu 34 tuổi có tiền sử bị dị ứng với đồ ăn nh...,"Với tiền sử như đã nêu, anh nên thực hiện tiêm...",https://vnexpress.net/tu-van-tiem-vaccine-covi...


# Set up PEFT and LoRA configs

In [None]:
# Hugging Face Hub id of the base checkpoint; used to load both the model and the tokenizer.
model_name = "VietAI/vit5-base"
# Only used as the prefix of the MLflow run name in the TrainingArguments cell.
# NOTE(review): the name says "law" while the training data is ViHealthQA — confirm the intended label.
model_name_finetuned = "VietAI/vit5-law-base"

In [None]:
# LoRA attention dimension (passed to LoraConfig as `r`)
lora_r = 64

# Alpha parameter for LoRA scaling (passed to LoraConfig as `lora_alpha`)
lora_alpha = 16

# Dropout probability for LoRA layers (passed to LoraConfig as `lora_dropout`)
lora_dropout = 0.1

In [None]:
# Activate 4-bit precision base model loading (passed as `load_in_4bit`)
use_4bit = True

# Compute dtype for 4-bit base models; the string is resolved to a torch dtype
# with `getattr(torch, ...)` in the quantization-config cell
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

In [None]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
# NOTE(review): not forwarded to the TrainingArguments call in this notebook
per_device_eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
# NOTE(review): not forwarded to the TrainingArguments call in this notebook,
# so this flag currently has no effect — confirm whether it should be passed
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
# NOTE(review): presumably ignored by the plain "constant" schedule selected
# above; "constant_with_warmup" may be what was intended — confirm
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 5000

# Log every X updates steps
logging_steps = 50

In [None]:
# Maximum sequence length to use
# NOTE(review): not passed to the SFTTrainer call in this notebook
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
# NOTE(review): not passed to the SFTTrainer call in this notebook
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
# Step 2 :Build the bitsandbytes quantization config (the model and tokenizer are loaded in later cells)
# Resolve the dtype name (e.g. "float16") to the matching torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
# Step 3 :Check GPU compatibility with bfloat16
# The device capability is only queried when 4-bit loading with a float16
# compute dtype is configured; a major version of 8 or more triggers the hint.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.get_device_capability()[0] >= 8:
    banner = "=" * 80
    print(banner, "Your GPU supports bfloat16: accelerate training with bf16=True", banner, sep="\n")

In [None]:
# Step 4 :Load base model
# Loads the seq2seq checkpoint named by `model_name` with the quantization
# settings from `bnb_config`, placed according to `device_map`.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
# Turn off the generation cache flag on the config for training.
model.config.use_cache = False
# NOTE(review): `pretraining_tp` looks like a setting carried over from
# LLaMA-style recipes — confirm it has any effect on this model.
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/904M [00:00<?, ?B/s]

In [None]:
# Step 5 :Load the tokenizer that belongs to the base checkpoint (`model_name`)
# NOTE(review): `trust_remote_code=True` allows repository code to run — confirm it is needed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Register the three role markers used by the chat template as special tokens.
special_tokens = {"additional_special_tokens": ["<|system|>", "<|user|>", "<|assistant|>"]}
tokenizer.add_special_tokens(special_tokens)

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "right"

# Jinja chat template: each turn is rendered as "<|role|>\n" + content + EOS;
# when `add_generation_prompt` is set, a bare "<|assistant|>" marker is emitted
# after the last message.
tokenizer.chat_template = """{% for message in messages %}
{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}
{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + eos_token }}
{% elif message['role'] == 'assistant' %}{{ '<|assistant|>\n' + message['content'] + eos_token }}
{% endif %}
{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}
{% endif %}
{% endfor %}"""

# The vocabulary grew by the added special tokens, so the embedding matrix must match.
model.resize_token_embeddings(len(tokenizer))

# Keep the model config's special-token ids in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# Step 6 :Load LoRA configuration
# The base model is loaded with AutoModelForSeq2SeqLM (encoder-decoder), so the
# PEFT task type must be the sequence-to-sequence one rather than "CAUSAL_LM";
# the task type decides which PEFT model wrapper is built around the base model.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)

## S3 config

In [None]:
import boto3
from getpass import getpass

# SECURITY: credentials must never be written into the notebook source.
# They are taken from the environment when already set and otherwise asked for
# interactively (the input is not echoed and is not saved in the notebook).
# The key pair that used to be embedded in this cell has been exposed and must
# be revoked/rotated in AWS IAM.
for _credential_var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if not os.environ.get(_credential_var):
        os.environ[_credential_var] = getpass(f"{_credential_var}: ")

# Keep the previous default region unless the environment already specifies one.
os.environ.setdefault("AWS_DEFAULT_REGION", "ap-southeast-2")

# Bucket that receives the checkpoint and merged-model uploads.
S3_BUCKET = "mlflow-artifacts-monitor"
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    region_name=os.environ["AWS_DEFAULT_REGION"],
)

## MLflow tracking

In [None]:
# mlflow set tracking
# The server address can be overridden with the MLFLOW_TRACKING_URI environment
# variable; the previous hard-coded address remains the default.
# NOTE(review): the default looks like a temporary tunnel URL — confirm it is still reachable.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "https://ridgy-receptually-mike.ngrok-free.dev")
)
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")

Current tracking uri: https://ridgy-receptually-mike.ngrok-free.dev


In [None]:
mlflow.set_experiment("Model finetuning")

# Step 7 :Set training parameters
# NOTE(review): `gradient_checkpointing` and `per_device_eval_batch_size` from
# the hyper-parameter cell are not forwarded here — confirm whether they should be.
training_arguments = TrainingArguments(
    # Set this to mlflow for logging your training
    report_to="mlflow",
    # Name the MLflow run. `%S` is the two-digit seconds field; the previous
    # lowercase `%s` is a non-portable platform extension (epoch seconds on
    # glibc, unsupported elsewhere).
    run_name=f"{model_name_finetuned}-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)

In [None]:
ds_train = get_dataset()

# Preview the chat template on ONE conversation (a list of role/content dicts).
# Passing the whole `messages` column made the template treat each conversation
# as if it were a single message, so no turn was rendered and the decoded
# preview contained only the trailing "<|assistant|>" generation prompt.
message = ds_train[0]['messages']

tokenized_chat = tokenizer.apply_chat_template(message, tokenize=True, add_generation_prompt=True, return_tensors="pt")
print(tokenizer.decode(tokenized_chat[0]))

Map:   0%|          | 0/8002 [00:00<?, ? examples/s]

<|assistant|> 


In [None]:
# Inspect the first chat-formatted example (system, user and assistant turns).
ds_train[0]

{'messages': [{'content': 'Bạn là một trợ lý y tế thông minh, trả lời ngắn gọn, chính xác, dựa trên kiến thức y tế Việt Nam.',
   'role': 'system'},
  {'content': 'Đang chích ngừa viêm gan B có chích ngừa Covid-19 được không?',
   'role': 'user'},
  {'content': 'Nếu anh/chị đang tiêm ngừa vaccine phòng bệnh viêm gan B, anh/chị vẫn có thể tiêm phòng vaccine phòng Covid-19, tuy nhiên vaccine Covid-19 phải được tiêm cách trước và sau mũi vaccine viêm gan B tối thiểu là 14 ngày.',
   'role': 'assistant'}]}

In [None]:
from mlflow.models import infer_signature

sample = ds_train[0]["messages"]
# Conversation layout is [system, user, assistant]: the user text is the model
# input example and the assistant text is the model output example.
user_turn, assistant_turn = sample[1], sample[2]

# MLflow derives the schema from the example input/output; the params are
# recorded together with these default values.
signature = infer_signature(
    model_input=user_turn["content"],
    model_output=assistant_turn["content"],
    params=dict(max_new_tokens=256, repetition_penalty=1.15, return_full_text=False),
)

signature

inputs: 
  [string (required)]
outputs: 
  [string (required)]
params: 
  ['max_new_tokens': long (default: 256), 'repetition_penalty': double (default: 1.15), 'return_full_text': boolean (default: False)]

In [None]:
max_length = 1024

def tokenize_and_mask(example):
    """Tokenize one conversation and build labels that cover only the answer.

    The conversation's last message is the completion; everything before it is
    rendered with the chat template (including the generation prompt) and forms
    the prompt.

    Args:
        example: Mapping with a ``"messages"`` list of ``{"role", "content"}``
            dicts whose last entry is the assistant answer.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each padded
        or truncated to ``max_length``. ``labels`` is ``-100`` on prompt
        positions and on padding positions, and the token id elsewhere.

    NOTE(review): the same sequence is used as model input and as label source
    (decoder-only style) although the model is an encoder-decoder — confirm
    this is the intended training setup.
    """
    messages = example["messages"]
    prompt_messages = messages[:-1]
    completion = messages[-1]["content"] + tokenizer.eos_token

    # build prompt text using the chat template (no generation content)
    prompt_text = tokenizer.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
    full_text = prompt_text + completion

    tokenized_full = tokenizer(full_text, truncation=True, max_length=max_length, padding="max_length")
    # The prompt is tokenized WITHOUT special tokens: an automatically appended
    # EOS is not present at that position in the full sequence, so counting it
    # would shift the boundary by one and mask the first answer token.
    tokenized_prompt = tokenizer(
        prompt_text, truncation=True, max_length=max_length, add_special_tokens=False
    )

    input_ids = list(tokenized_full["input_ids"])
    attention_mask = list(tokenized_full.get("attention_mask", [1] * len(input_ids)))
    prompt_len = min(len(tokenized_prompt["input_ids"]), len(input_ids))

    # labels: -100 for prompt tokens AND for padding (attention_mask == 0), so
    # the loss is computed on real completion tokens only.
    labels = [
        token_id if (position >= prompt_len and is_real_token) else -100
        for position, (token_id, is_real_token) in enumerate(zip(input_ids, attention_mask))
    ]

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Tokenize example by example (non-batched for simplicity; a batched map would be faster),
# drop the raw conversations and expose the three model columns as torch tensors.
train_dataset = ds_train.map(tokenize_and_mask, remove_columns=["messages"])
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized = train_dataset


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Inspect the first tokenized example (input_ids, attention_mask and labels tensors).
train_dataset[0]

{'input_ids': tensor([36096,  4440,    49,  ...,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor([-100, -100, -100,  ...,    0,    0,    0])}

In [None]:
# Step 8 :Set supervised fine-tuning parameters
# The dataset passed here is already tokenized (input_ids / attention_mask / labels).
# NOTE(review): `max_seq_length`, `packing` and the customised `tokenizer` from
# earlier cells are not passed to this call — confirm that is intentional.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=training_arguments,
)

Truncating train dataset:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
class MLflowLossCallback(TrainerCallback):
    """Trainer callback that forwards numeric log entries to MLflow."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record every int/float value in ``logs`` as an MLflow metric.

        Each metric is keyed by its log name and stamped with
        ``state.global_step``; non-numeric entries are skipped, and nothing
        happens when ``logs`` is ``None``.
        """
        if logs is None:
            return
        numeric_entries = {
            name: value for name, value in logs.items() if isinstance(value, (int, float))
        }
        for name, value in numeric_entries.items():
            mlflow.log_metric(name, value, step=state.global_step)


trainer.add_callback(MLflowLossCallback)

In [None]:
# Step 9 : Train model
from mlflow.exceptions import MlflowException


def _upload_dir_to_s3(local_dir, s3_prefix):
    """Upload every file under ``local_dir`` to ``S3_BUCKET`` below ``s3_prefix``, keeping the relative layout."""
    for root, _dirs, files in os.walk(local_dir):
        for file in files:
            local_path = os.path.join(root, file)
            # S3 keys use "/" whatever the local path separator is.
            relative_key = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
            s3_client.upload_file(local_path, S3_BUCKET, f"{s3_prefix}/{relative_key}")


# Name in the MLflow Model Registry. Deliberately NOT stored in `model_name`:
# that variable holds the base checkpoint id used by the model/tokenizer cells,
# and overwriting it would break them when they are re-run.
registered_model_name = "MyModel"

with mlflow.start_run() as run:
    trainer.train()

    # Log training loss curve
    history = trainer.state.log_history
    steps = [h["step"] for h in history if "loss" in h]
    losses = [h["loss"] for h in history if "loss" in h]

    # Log loss per step as metrics
    for step, loss in zip(steps, losses):
        mlflow.log_metric("train_loss", loss, step=step)

    # Step 10 : Save trained adapter model (checkpoint)
    checkpoint_dir = "checkpoint_model"
    trainer.model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

    # Upload checkpoint folder recursively to S3
    checkpoint_s3_prefix = f"{run.info.run_id}/checkpoint"
    _upload_dir_to_s3(checkpoint_dir, checkpoint_s3_prefix)

    # Log S3 path metadata in MLflow
    mlflow.log_param("checkpoint_s3_path", f"s3://{S3_BUCKET}/{checkpoint_s3_prefix}")

    # Step 11: Merge and save final model with adapter
    # Stays None when the trained model cannot be merged, so the code below
    # never reads an unbound name.
    merged_s3_prefix = None
    model = trainer.model
    if hasattr(model, "merge_and_unload"):
        # NOTE(review): the base model was loaded in 4-bit; confirm that merging
        # the adapter into quantized weights gives the expected quality.
        model = model.merge_and_unload()
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.bos_token_id = tokenizer.bos_token_id

        merged_dir = "merged_model"
        model.save_pretrained(merged_dir)
        tokenizer.save_pretrained(merged_dir)

        # Upload merged model to S3
        merged_s3_prefix = f"{run.info.run_id}/final_model"
        _upload_dir_to_s3(merged_dir, merged_s3_prefix)

        # Log S3 path metadata for merged model
        mlflow.log_param("merged_model_s3_path", f"s3://{S3_BUCKET}/{merged_s3_prefix}")

    print("Artifacts uploaded to S3:")
    print(f"  Checkpoint: s3://{S3_BUCKET}/{checkpoint_s3_prefix}")

    if merged_s3_prefix is None:
        # Nothing to register: only the merged model is published to the registry.
        print("  Merged model: not produced (model has no merge_and_unload); registration skipped")
    else:
        print(f"  Merged model: s3://{S3_BUCKET}/{merged_s3_prefix}")

        model_uri = f"s3://{S3_BUCKET}/{merged_s3_prefix}"  # URI of the merged model

        client = MlflowClient()

        # Create the model in the Registry if it does not exist yet. Only the
        # "already exists" error is tolerated; any other failure is re-raised
        # instead of being silently swallowed.
        try:
            client.create_registered_model(registered_model_name)
        except MlflowException as e:
            if e.error_code != "RESOURCE_ALREADY_EXISTS":
                raise
            print(f"Model {registered_model_name} có thể đã tồn tại: {e}")

        # Create a new version
        result = client.create_model_version(
            name=registered_model_name,
            source=model_uri,
            run_id=run.info.run_id
        )

        # Promote the version to Staging
        # NOTE(review): registry stages may be deprecated in newer MLflow releases — confirm against the server version.
        client.transition_model_version_stage(
            name=registered_model_name,
            version=result.version,
            stage="Staging"
        )

        print(f"Registered model '{registered_model_name}' version {result.version} in stage 'Staging'")


Step,Training Loss
50,30.2967


Artifacts uploaded to S3:
  Checkpoint: s3://mlflow-artifacts-monitor/774900a8f5f145fea1839e90d7921e89/checkpoint
  Merged model: s3://mlflow-artifacts-monitor/774900a8f5f145fea1839e90d7921e89/final_model


2025/10/04 09:30:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MyModel, version 1


Registered model 'MyModel' version 1 in stage 'Staging'
🏃 View run melodic-koi-780 at: https://ridgy-receptually-mike.ngrok-free.dev/#/experiments/1/runs/774900a8f5f145fea1839e90d7921e89
🧪 View experiment at: https://ridgy-receptually-mike.ngrok-free.dev/#/experiments/1
