In [4]:
!pip -q install mlflow transformers boto3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import mlflow
import boto3

In [8]:
# mlflow set tracking
url = "https://victoria-communicable-sometimes.ngrok-free.dev"
mlflow.set_tracking_uri(url)
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")

Current tracking uri: https://victoria-communicable-sometimes.ngrok-free.dev


In [9]:
mlflow.set_experiment("healthcarechatbot")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1760322730708, experiment_id='1', last_update_time=1760322730708, lifecycle_stage='active', name='healthcarechatbot', tags={}>

In [15]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "health-llm"

# Lấy thông tin model versions
versions = client.get_latest_versions(model_name)

for v in versions:
    print("Version:", v.version)
    path = v.source
    print("Model URI:", path)
    print("Run ID:", v.run_id)


  versions = client.get_latest_versions(model_name)


Version: 1
Model URI: s3://mlflow-artifacts-monitor/models/health-llm/a067fa2b0e724057a797ead349550265
Run ID: 


In [16]:
import os
import boto3
from tqdm import tqdm
from dotenv import load_dotenv

def load_model_from_s3(s3_prefix: str, local_dir: str = "downloaded_model"):
    """
    Download an entire model directory (e.g., from MLflow-registered S3 prefix).
    Example s3_prefix: "models/health-llm/b3f91d2b6f42464aab9b9ff07d22ad89"
    """

    load_dotenv()

    aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
    aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    aws_region = os.getenv("AWS_DEFAULT_REGION", "ap-southeast-2")
    bucket_name = os.getenv("AWS_BUCKET_NAME", "mlflow-artifacts-monitor")

    if not all([aws_access_key, aws_secret_key, bucket_name]):
        raise ValueError("Missing AWS credentials or bucket name in .env file")

    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key,
        aws_secret_access_key=aws_secret_key,
        region_name=aws_region
    )

    os.makedirs(local_dir, exist_ok=True)

    paginator = s3.get_paginator("list_objects_v2")
    total_files = 0

    # Đếm file trước (để tqdm chạy đẹp)
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix):
        for obj in page.get("Contents", []):
            total_files += 1

    with tqdm(total=total_files, desc=f"Downloading model from {s3_prefix}") as pbar:
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                local_path = os.path.join(local_dir, os.path.relpath(key, s3_prefix))
                os.makedirs(os.path.dirname(local_path), exist_ok=True)

                s3.download_file(bucket_name, key, local_path)
                pbar.update(1)

    print(f"Model downloaded successfully → {local_dir}")
    return local_dir


In [18]:
local_dir = load_model_from_s3("models/health-llm/a067fa2b0e724057a797ead349550265")

Downloading model from models/health-llm/a067fa2b0e724057a797ead349550265: 100%|██████████| 9/9 [00:19<00:00,  2.15s/it]

Model downloaded successfully → downloaded_model





In [19]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_dir = "downloaded_model"

# Load tokenizer và model từ local
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)


In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_dir = "downloaded_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

# Chat history
messages = [
    {"role": "system", "content": "Bạn là một trợ lý y tế thông minh, trả lời ngắn gọn, chính xác, dựa trên kiến thức y tế Việt Nam."},
    {"role": "user", "content": "Con tôi bị chảy máu mũi thì nên làm gì?"}
]

# Dùng built-in chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,   # chỉ tạo text string, không tokenize luôn
    add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt")

outputs = model.generate(
    **inputs,
    max_new_tokens=80,
    temperature=0.7,
    do_sample=True,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


* Tôi bị chảy máu mũi+, _Z, []]!> j thực tế.& W vàỴ y tế Việt Nam. Ẽ Tôi bị chảy máu mũiỠ tôi là một trợ lý y tế thông minh.Ẫ lý y tế thông minh.Ẳ!Ằ tư vấn viênÕ Ẻ Tôi là một


# Streaming

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

In [25]:
from threading import Thread
# --- Tạo streamer ---
streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)

# --- Chạy generate trong thread riêng để không block ---
generation_kwargs = dict(
    **inputs,
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    streamer=streamer
)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

# --- Stream output dần ---
print("Assistant:", end=" ", flush=True)
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()

Assistant: *+_Z  Tôi bị chảy máu mũi[]]! > j và phương pháp Việt Nam. & W ỴẼ, vàỠẪẲ có thểẰ làÕ Ẻ Ặ.Ỗ, thích hợp cho nhiều người Việt Nam.      Ẹ Ỹ tôi bị chảy máu mũi thì nên làm gì?          È.Ữ chảy máu mũiỰỢ  Ử tôi bị chảy máu mũi thì nên làm gì? Ể Tôi bị chảy máu mũi thì nên làm gì