In [28]:
import pandas as pd
from openai import OpenAI
from openai import AsyncOpenAI
import asyncio
from dotenv import load_dotenv
from tqdm.asyncio import tqdm_asyncio
import os

In [29]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Hand the key to the constructor instead of assigning it onto the client
# afterwards: the constructor already resolves OPENAI_API_KEY (and fails if it
# is absent), so the later assignment was redundant.
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [30]:
async def translator(sentence):
    """Translate one English medical sentence into Vietnamese using gpt-4o-mini.

    Args:
        sentence: English text to translate. ``None``, float NaN and blank
            values are returned unchanged without calling the API.

    Returns:
        The model's reply with surrounding whitespace stripped, or the
        original value when there was nothing to translate.

    Raises:
        ValueError: If the first choice in the response carries no text.
        Exception: Anything raised by the client call propagates unchanged.
    """
    # Missing cells would otherwise be formatted into the prompt as the
    # literal text "None" / "nan" and sent to the model.
    is_missing = sentence is None or (
        isinstance(sentence, float) and sentence != sentence
    )
    if is_missing or not str(sentence).strip():
        return sentence

    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a professional medical translator. Your task is to translate medical sentences from English "
                    "to Vietnamese accurately and using correct medical terminology. "
                    "Return only the translated sentence without any explanation, comment, or formatting."
                ),
            },
            {
                "role": "user",
                "content": f"Translate the following medical sentence into Vietnamese: '{sentence}'",
            },
        ]
    )

    content = response.choices[0].message.content
    if content is None:
        # The content field is nullable; fail with a clear message rather than
        # an AttributeError from calling .strip() on None.
        raise ValueError(f"Model returned no text content for: {sentence!r}")
    return content.strip()


In [31]:
# Read the training set; lines=True parses the file as one JSON record per line.
df = pd.read_json(
    "data/train.json",
    lines=True,
)
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,question,exp,cop,opa,opb,opc,opd,subject_name,topic_name,id,choice_type
0,Chronic urethral obstruction due to benign pri...,Chronic urethral obstruction because of urinar...,3,Hyperplasia,Hyperophy,Atrophy,Dyplasia,Anatomy,Urinary tract,e9ad821a-c438-4965-9f77-760819dfa155,single
1,Which vitamin is supplied from only animal sou...,Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...,3,Vitamin C,Vitamin B7,Vitamin B12,Vitamin D,Biochemistry,Vitamins and Minerals,e3d3c4e1-4fb2-45e7-9f88-247cc8f373b3,single
2,All of the following are surgical options for ...,"Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...",4,Adjustable gastric banding,Biliopancreatic diversion,Duodenal Switch,Roux en Y Duodenal By pass,Surgery,Surgical Treatment Obesity,5c38bea6-787a-44a9-b2df-88f4218ab914,multi
3,Following endaerectomy on the right common car...,The central aery of the retina is a branch of ...,1,Central aery of the retina,Infraorbital aery,Lacrimal aery,Nasociliary aretry,Ophthalmology,,cdeedb04-fbe9-432c-937c-d53ac24475de,multi
4,Growth hormone has its effect on growth through?,"Ans. is 'b' i.e., IGI-1GH has two major functi...",2,Directly,IG1-1,Thyroxine,Intranuclear receptors,Physiology,,dc6794a3-b108-47c5-8b1b-3b4931577249,single


In [32]:
async def translate_df(df: pd.DataFrame, start: int, end: int, output_csv_path: str):
    """Translate the question and four options of rows ``start:end`` and save them as CSV.

    Rows are processed one after another; within a row the five text fields
    are translated concurrently. If any translation in a row raises, a warning
    is printed and that row is written with its original, untranslated text.

    Args:
        df: Frame containing the ``id``, ``cop``, ``question``, ``opa``,
            ``opb``, ``opc`` and ``opd`` columns.
        start: Position of the first row to translate (``iloc`` semantics).
        end: Position one past the last row to translate.
        output_csv_path: Destination CSV path. Missing parent directories are
            created. The file is written as UTF-8 with a BOM.

    Returns:
        None. The result is written to ``output_csv_path``.
    """
    passthrough_fields = ["id", "cop"]
    text_fields = ["question", "opa", "opb", "opc", "opd"]

    # Create the output directory before doing any work, so a missing folder
    # cannot discard the finished translations at the final write.
    out_dir = os.path.dirname(output_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    subset = df.iloc[start:end]
    results = []

    for idx, row in tqdm_asyncio(subset.iterrows(), total=len(subset)):
        record = {name: row[name] for name in passthrough_fields}
        try:
            # Translate the five text fields of this row concurrently.
            translated = await asyncio.gather(
                *(translator(row[name]) for name in text_fields)
            )
            record.update(zip(text_fields, translated))
        except Exception as e:
            # Best effort: keep the original text so the row is not lost.
            print(f"⚠️ Lỗi dịch dòng {idx} - ID {row.get('id')}: {e}")
            record.update({name: row[name] for name in text_fields})
        results.append(record)

    # Write the results. Pinning the columns keeps the header present even
    # when the slice is empty.
    df_out = pd.DataFrame(results, columns=passthrough_fields + text_fields)
    df_out.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"✅ Dịch xong {len(df_out)} dòng. Đã lưu vào {output_csv_path}")


In [None]:
await translate_df(df, start=0, end=10, output_csv_path="data/translated.csv")

100%|██████████| 10/10 [00:18<00:00,  1.81s/it]

✅ Dịch xong 10 dòng. Đã lưu vào translated.csv



