In [2]:
!pip -q install transformers accelerate bitsandbytes trl mlflow boto3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.6/564.6 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import (
    AutoModelForSeq2SeqLM,          # AutoModel for language modeling tasks
    AutoTokenizer,                # AutoTokenizer for tokenization
    BitsAndBytesConfig,           # Configuration for BitsAndBytes
    TrainingArguments,            # Training arguments for model training
    TrainerCallback
)
from peft import LoraConfig, PeftModel,PeftConfig
from trl import SFTTrainer

import pandas as pd
import numpy as np
import os
import logging as log
from datetime import datetime
import matplotlib.pyplot as plt

import mlflow
from mlflow.tracking import MlflowClient

import warnings
warnings.filterwarnings('ignore')

from data_prep import get_dataset, tokenize_and_mask
from peft_lora_config import Peft_Config

def setup_logging():
    # Remove all handlers associated with the root logger object.
    for handler in log.root.handlers[:]:
        log.root.removeHandler(handler)

    log.basicConfig(
        level=log.WARNING,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[log.StreamHandler()],
    )

logger = log.getLogger(__name__)
setup_logging()

# Dataset EDA

In [None]:
ds = load_dataset("tarudesu/ViHealthQA")
ds_train = ds["train"].to_pandas()
ds_test = ds["test"].to_pandas()

ds_train = pd.concat([ds_train, ds_test])

In [None]:
ds_train

Unnamed: 0,id,question,answer,link
0,1,Đang chích ngừa viêm gan B có chích ngừa Covid...,Nếu anh/chị đang tiêm ngừa vaccine phòng bệnh ...,https://vnexpress.net/tu-van-tiem-vaccine-covi...
1,2,"Đau đầu, căng thẳng do công việc, suy giảm trí...",Tình trạng đau đầu theo bạn mô tả thì chưa rõ....,https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
2,3,Đặt lưu lượng khí hệ thống Jackson-Rees thấp h...,Hệ thống Jackson – Rees dùng khi gây mê để trá...,https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
3,4,Bé 13 tháng tuổi uống thuốc Acyclovir có được ...,Acyclovir có thể sử dụng cho cả trẻ dưới 13 th...,https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
4,5,Vừa qua ngày 4/6 tôi có bị con chó ở nhà cắn x...,Bệnh dại là bệnh nguy hiểm và nếu có chỉ định ...,https://vnexpress.net/tu-van-tiem-vaccine-covi...
...,...,...,...,...
2008,2009,Sốt kèm nhức đầu sau khi ngủ dậy là bệnh gì?,"Bạn có biểu hiện sốt, nhức đầu sau khi ngủ dậy...",https://www.vinmec.com/vi/suc-khoe-tong-quat/t...
2009,2010,Trước Tết em đã làm IVF một lần ở một bệnh việ...,"Theo các nghiên cứu, dự trữ buồng trứng của ng...",https://vnexpress.net/tu-van-vo-sinh-hiem-muon...
2010,2011,Bệnh nhân tiền sử tiểu đường tuýp 2 nóng rát t...,Anh đã xuất hiện biến chứng viêm đa thần kinh ...,https://www.vinmec.com/vi/tin-tuc/hoi-dap-bac-...
2011,2012,Cháu 34 tuổi có tiền sử bị dị ứng với đồ ăn nh...,"Với tiền sử như đã nêu, anh nên thực hiện tiêm...",https://vnexpress.net/tu-van-tiem-vaccine-covi...


# Set up PEFT and Lora Configs

In [None]:
config = Peft_Config()

In [None]:
# Step 2 :Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, config.bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=config.use_4bit,
    bnb_4bit_quant_type=config.bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=config.use_nested_quant,
)

In [None]:
# Step 3 :Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and config.use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [None]:
# Step 4 :Load base model
model = AutoModelForSeq2SeqLM.from_pretrained(
    config.model_name,
    quantization_config=bnb_config,
    device_map=config.device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

In [None]:
# Step 5 :Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(config.model_name, trust_remote_code=True)

special_tokens = {"additional_special_tokens": ["<|system|>", "<|user|>", "<|assistant|>"]}
tokenizer.add_special_tokens(special_tokens)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "right"

tokenizer.chat_template = """{% for message in messages %}
{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}
{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + eos_token }}
{% elif message['role'] == 'assistant' %}{{ '<|assistant|>\n' + message['content'] + eos_token }}
{% endif %}
{% if loop.last and add_generation_prompt %}{{ '<|assistant|>' }}
{% endif %}
{% endfor %}"""

model.resize_token_embeddings(len(tokenizer))

model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# Step 6 :Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    r=config.lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

## s3 Config

In [None]:
import boto3
from dotenv import load_dotenv
import os

load_dotenv()

S3_BUCKET = "mlflow-artifacts-monitor"
s3_client = boto3.client(
    "s3",
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    region_name=os.environ["AWS_DEFAULT_REGION"]
)

## mlflow tracking

In [None]:
# mlflow set tracking
mlflow.set_tracking_uri("https://ridgy-receptually-mike.ngrok-free.dev")
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")

Current tracking uri: https://ridgy-receptually-mike.ngrok-free.dev


In [None]:
mlflow.set_experiment("healthcarechatbot")

# Step 7 :Set training parameters
training_arguments = TrainingArguments(
    # Set this to mlflow for logging your training
    report_to="mlflow",
    # Name the MLflow run
    run_name=f"{config.model_name_finetuned}-{datetime.now().strftime('%Y-%m-%d-%H-%M-%s')}",
    output_dir=config.output_dir,
    num_train_epochs=config.num_train_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    optim=config.optim,
    save_steps=config.save_steps,
    logging_steps=config.logging_steps,
    learning_rate=config.learning_rate,
    weight_decay=config.weight_decay,
    fp16=config.fp16,
    bf16=config.bf16,
    max_grad_norm=config.max_grad_norm,
    max_steps=config.max_steps,
    warmup_ratio=config.warmup_ratio,
    group_by_length=config.group_by_length,
    lr_scheduler_type=config.lr_scheduler_type,
)

In [None]:
ds_train = get_dataset()

message = ds_train['messages']

tokenized_chat = tokenizer.apply_chat_template(message, tokenize=True, add_generation_prompt=True, return_tensors="pt")
print(tokenizer.decode(tokenized_chat[0]))

<|assistant|> 


In [None]:
ds_train[0]

{'messages': [{'content': 'Bạn là một trợ lý y tế thông minh, trả lời ngắn gọn, chính xác, dựa trên kiến thức y tế Việt Nam.',
   'role': 'system'},
  {'content': 'Đang chích ngừa viêm gan B có chích ngừa Covid-19 được không?',
   'role': 'user'},
  {'content': 'Nếu anh/chị đang tiêm ngừa vaccine phòng bệnh viêm gan B, anh/chị vẫn có thể tiêm phòng vaccine phòng Covid-19, tuy nhiên vaccine Covid-19 phải được tiêm cách trước và sau mũi vaccine viêm gan B tối thiểu là 14 ngày.',
   'role': 'assistant'}]}

In [None]:
from mlflow.models import infer_signature

sample = ds_train[0]["messages"]

# MLflow infers schema from the provided sample input/output/params
signature = infer_signature(
  model_input=sample[1]["content"],
  model_output=sample[2]["content"],
  # Parameters are saved with default values if specified
  params={"max_new_tokens": 256, "repetition_penalty": 1.15, "return_full_text": False},
)

signature

inputs: 
  [string (required)]
outputs: 
  [string (required)]
params: 
  ['max_new_tokens': long (default: 256), 'repetition_penalty': double (default: 1.15), 'return_full_text': boolean (default: False)]

In [None]:
max_length = 1024

# Apply mapping (non-batched for simplicity; batched mapping can be used for speed)
tokenized = ds_train.map(
    lambda example: tokenize_and_mask(example, tokenizer, max_length), remove_columns=["messages"])
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
train_dataset = tokenized

In [None]:
train_dataset[0]

{'input_ids': tensor([36096,  4440,    49,  ...,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor([-100, -100, -100,  ...,    0,    0,    0])}

In [None]:
# Step 8 :Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=training_arguments,
)

In [None]:
class MLflowLossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            for k, v in logs.items():
                if isinstance(v, (int, float)):
                    mlflow.log_metric(k, v, step=state.global_step)

trainer.add_callback(MLflowLossCallback)

In [None]:
with mlflow.start_run() as run:
    trainer.train()

    # Log training loss curve
    history = trainer.state.log_history
    steps = [h["step"] for h in history if "loss" in h]
    losses = [h["loss"] for h in history if "loss" in h]

    # Log loss per step as metrics
    for step, loss in zip(steps, losses):
        mlflow.log_metric("train_loss", loss, step=step)

    # Step 10: Save trained adapter model (checkpoint)
    checkpoint_dir = "checkpoint_model"
    trainer.model.save_pretrained(checkpoint_dir)
    tokenizer.save_pretrained(checkpoint_dir)

    # Upload checkpoint folder recursively to S3
    checkpoint_s3_prefix = f"{run.info.run_id}/checkpoint"
    for root, _, files in os.walk(checkpoint_dir):
        for file in files:
            local_path = os.path.join(root, file)
            relative_path = os.path.relpath(local_path, checkpoint_dir)
            s3_client.upload_file(local_path, S3_BUCKET, f"{checkpoint_s3_prefix}/{relative_path}")

    # Log S3 path metadata in MLflow
    mlflow.log_param("checkpoint_s3_path", f"s3://{S3_BUCKET}/{checkpoint_s3_prefix}")

    # Step 11: Merge and save final model with adapter
    model = trainer.model
    if hasattr(model, "merge_and_unload"):
        model = model.merge_and_unload()
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.bos_token_id = tokenizer.bos_token_id

    merged_dir = "merged_model"
    model.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)

    REGISTERED_MODEL_NAME = "health-llm"

    model_info = mlflow.transformers.log_model(
        transformers_model=merged_dir,
        tokenizer=tokenizer,
        name="model",
        task="text2text-generation",
        model_card=None,
        pip_requirements=["transformers", "accelerate", "bitsandbytes"],
        registered_model_name=REGISTERED_MODEL_NAME
    )

    client = MlflowClient()

    client.set_registered_model_tag(
        name=REGISTERED_MODEL_NAME, key="use_case", value="patient_service"
    )

    client.update_registered_model(
        name=REGISTERED_MODEL_NAME,
        description="A health-specific chatbot about daily Vietnamese sickness question",
    )

    client.set_model_version_tag(
        name=REGISTERED_MODEL_NAME,
        version=model_info.registered_model_version,
        key="validation_status",
        value="testing",
    )

    print(f"Model registered with version: {model_info.registered_model_version}")

    # Create an alias for easy reference
    client.set_registered_model_alias(
        name=REGISTERED_MODEL_NAME,
        alias="Little-testing",
        version=model_info.registered_model_version,
    )

Step,Training Loss
50,30.2752



Repository Not Found for url: https://huggingface.co/merged_model/resolve/main/README.md.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication
Invalid username or password.
Registered model 'health-llm' already exists. Creating a new version of this model...
2025/10/08 16:49:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: health-llm, version 3
Created version '3' of model 'health-llm'.


Model registered with version: 3
🏃 View run zealous-foal-159 at: https://ridgy-receptually-mike.ngrok-free.dev/#/experiments/10/runs/acaee44c6918425eba079f5e18ae050c
🧪 View experiment at: https://ridgy-receptually-mike.ngrok-free.dev/#/experiments/10


# Load model

In [3]:
import mlflow
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Lấy đường dẫn artifacts thật
model_uri = mlflow.get_artifact_uri("models:/health-llm@Little-testing")

# In ra để xem đúng chưa
print("Model URI:", model_uri)




Model URI: file:///content/mlruns/0/0dea1fb9aa3445ee9d03cbbc7917a2bb/artifacts/models:/health-llm@Little-testing


In [4]:
# Tải model trực tiếp từ artifact local
model_dir = mlflow.artifacts.download_artifacts(model_uri)

# Load thủ công qua HF
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device_map="auto")

test_query = "Con tôi bị chảy máu mũi thì bị gì?"
response = pipe(test_query, max_new_tokens=128)
print(response[0]["generated_text"])

OSError: No such file or directory: '/content/mlruns/0/0dea1fb9aa3445ee9d03cbbc7917a2bb/artifacts/models:/health-llm@Little-testing'