<a href="https://colab.research.google.com/github/lapatradaa/M-MMT4NL/blob/main/promt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q textdistance pandas openpyxl


In [2]:
from google.colab import files
import pandas as pd

# Ask the user for a file through Colab's upload widget; the first key of the
# returned mapping is used as the filename to read.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload surfaces as a bare StopIteration.
    raise RuntimeError("No file was uploaded.")
fname = next(iter(uploaded))

# Choose the reader from the extension, case-insensitively so that a name such
# as "DATA.XLSX" is not handed to the CSV parser.
is_excel = fname.lower().endswith((".xlsx", ".xls"))
df = pd.read_excel(fname) if is_excel else pd.read_csv(fname)

# Fill missing references *before* casting: `astype(str)` turns NaN into the
# literal text "nan", after which `fillna("")` has nothing left to fill.
df["references"] = df["references"].fillna("").astype(str)

print("✅ Loaded:", fname, " — rows:", len(df))
df.head()


Saving MMT4NL_result - LLMs evaluation.csv to MMT4NL_result - LLMs evaluation.csv
✅ Loaded: MMT4NL_result - LLMs evaluation.csv  — rows: 50


Unnamed: 0,references,candidates,BLEU,BERT(F1)
0,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,100.0,1.0
1,วันนี้อากาศดีและสดชื่น,อากาศวันนี้สดชื่นและน่ารื่นรมย์,15.107,0.956
2,หล่อนรู้สึกเสียใจที่ไม่สามารถไปงานได้,เธอเสียใจที่ไม่สามารถไปร่วมงานได้,49.626,0.959
3,อาหารมื้อนี้อร่อยสุดๆ,อาหารมื้อนี้อร่อยอย่างยิ่ง,66.874,0.981
4,ฉันไม่พอใจกับบริการที่ได้รับ,ฉันไม่พอใจกับบริการที่ได้รับ,100.0,1.0


In [3]:
# Step 1 prompts: Thai -> English.
# System message: sets the translator persona and restricts the reply to the
# translation only. The sentences are joined with single spaces.
TH2EN_SYSTEM = " ".join([
    "You are a professional translator and language expert.",
    "Translate the following Thai sentence into English.",
    "Output only the final English translation.",
])
# User message: nothing but the Thai sentence, substituted for `{thai_text}`.
TH2EN_USER_TPL = "{thai_text}"

In [4]:
# Step 2 prompts: perturb the English sentence while keeping its meaning.
PERTURB_SYSTEM = "You are a careful paraphraser. Make small, meaning-preserving perturbations."

# User message: one instruction line, four bullet constraints, a blank line,
# then the sentence itself substituted for `{english_text}`.
PERTURB_USER_TPL = "\n".join([
    "Rephrase the following English sentence with small, natural perturbations.",
    "- Keep semantics the same.",
    "- Prefer synonyms, light reordering, minor tense/voice changes.",
    "- Keep length roughly similar.",
    "- Output only the final English sentence.",
    "",
    "{english_text}",
])

In [5]:
# Step 3 prompts: English -> Thai back-translation.
# The wording is the author's exact text and is reproduced verbatim.
EN2TH_SYSTEM = " ".join([
    "You are a professional translator and language expert.",
    "Please translate the following English sentence into Thai.",
    "Use English at a C1 proficiency level when framing any instructions or explanations—and output only the final Thai translation.",
])

# The placeholder name contains a space, so callers have to supply it through
# a dict (`**{"English text": ...}`) rather than as a plain keyword argument.
EN2TH_USER_TPL = "\n\n".join([
    "Given this English text, translate it into Thai. Preserve the semantic of the original English text in the translate version as much as possible.",
    "{English text}",
])

In [17]:
# Backend selector read by llm_call(): "openai" or "gemini"; any other value
# makes llm_call() raise ValueError.
USE_PROVIDER = "gemini"

In [18]:
!pip install openai
!pip install google-generativeai



In [19]:
import os
from getpass import getpass

# Never hard-code credentials in a notebook: they end up in version control and
# in every shared copy. The key that was previously committed here must be
# treated as compromised and revoked in the Google console.
# Use a key already present in the environment; otherwise ask for it
# interactively (getpass does not echo the input).
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ").strip()

In [20]:
import google.generativeai as genai
# Hand the key from the environment to the Gemini client.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
# This flag only reports that a non-empty key is present in the environment;
# it does not check the key against the API.
print("Gemini key ok:", os.environ.get("GOOGLE_API_KEY") not in (None, ""))

Gemini key ok: True


In [21]:
def llm_call(system_prompt: str, user_prompt: str, temperature: float = 0.2) -> str:
    """Send one system/user prompt pair to the selected LLM and return its reply.

    The backend is chosen by the module-level ``USE_PROVIDER`` ("openai" or
    "gemini"). The API key is read from ``OPENAI_API_KEY`` or
    ``GOOGLE_API_KEY`` respectively.

    Args:
        system_prompt: Instructions that frame the task for the model.
        user_prompt: The actual input to act on.
        temperature: Sampling temperature, applied to both providers so their
            outputs are generated under the same setting. Defaults to 0.2.

    Returns:
        The model's reply with surrounding whitespace stripped. For OpenAI, an
        empty string if the reply carries no text content.

    Raises:
        RuntimeError: If the API key for the selected provider is not set.
        ValueError: If ``USE_PROVIDER`` is neither "openai" nor "gemini".
    """
    import os

    if USE_PROVIDER == "openai":
        # Imported lazily so the notebook runs without the unused provider's SDK.
        from openai import OpenAI

        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY not set. Either set it or switch USE_PROVIDER='gemini'.")
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_prompt},
            ],
            temperature=temperature,
        )
        # `content` can be None (e.g. a refusal); avoid crashing on `.strip()`.
        return (resp.choices[0].message.content or "").strip()

    if USE_PROVIDER == "gemini":
        import google.generativeai as genai

        api_key = os.getenv("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError("GOOGLE_API_KEY not set. Either set it or switch USE_PROVIDER='openai'.")
        genai.configure(api_key=api_key)
        # Pass the system prompt through the dedicated `system_instruction`
        # channel instead of pasting "[SYSTEM]"/"[USER]" tags into the user
        # text, so the model never sees the tags as content to act on.
        model = genai.GenerativeModel(
            "gemini-1.5-flash",
            system_instruction=system_prompt or None,
        )
        resp = model.generate_content(
            user_prompt,
            generation_config={"temperature": temperature},
        )
        return resp.text.strip()

    raise ValueError("Set USE_PROVIDER to 'openai' or 'gemini'.")


In [22]:
def make_user_prompt(template: str, **kwargs) -> str:
    """Fill ``{name}`` placeholders in ``template`` with the given values.

    Placeholders are matched literally, so names may contain spaces (pass them
    via ``**{"English text": value}``). Placeholders with no matching keyword
    are left untouched.

    All substitutions happen in a single pass over the template: text inserted
    for one placeholder is never re-scanned, so a value that itself contains
    something like ``{other}`` (e.g. model output) is kept verbatim instead of
    being expanded again.

    Args:
        template: Text containing ``{name}`` placeholders.
        **kwargs: Placeholder names mapped to replacement values; non-string
            values are converted with ``str()``.

    Returns:
        The template with every known placeholder replaced.
    """
    import re

    if not kwargs:
        return template

    # One alternation of the escaped literal placeholders, e.g. "\{a\}|\{b\}".
    pattern = re.compile("|".join(re.escape("{" + key + "}") for key in kwargs))
    # m.group(0) is "{name}"; strip the braces to look the value up.
    return pattern.sub(lambda m: str(kwargs[m.group(0)[1:-1]]), template)

# Output columns, reset on every run so stale results never survive a re-run.
df["en_raw"]      = ""   # TH -> EN
df["en_perturb"]  = ""   # Perturbed EN
df["final_thai"]  = ""   # EN -> TH (back translation)

# Snapshot (index label, text) pairs up front. Results are written back by the
# row's real index label rather than its loop position: with a non-default
# index, `df.at[position, ...]` would hit the wrong row or append a new one.
reference_rows = list(df["references"].items())
total_rows = len(reference_rows)

for position, (row_label, thai) in enumerate(reference_rows, start=1):
    print(f"\n--- Processing row {position}/{total_rows} ---")
    print("Input Thai:", thai)

    # 1) Thai → English
    u_th2en = make_user_prompt(TH2EN_USER_TPL, thai_text=thai)
    en_raw  = llm_call(TH2EN_SYSTEM, u_th2en)
    print("EN raw:", en_raw)

    # 2) Perturb English
    u_pert  = make_user_prompt(PERTURB_USER_TPL, english_text=en_raw)
    en_p    = llm_call(PERTURB_SYSTEM, u_pert)
    print("EN perturb:", en_p)

    # 3) English → Thai (the placeholder name contains a space, hence the dict)
    u_en2th = make_user_prompt(EN2TH_USER_TPL, **{"English text": en_p})
    th_out  = llm_call(EN2TH_SYSTEM, u_en2th)
    print("TH back:", th_out)

    # Save results immediately so rows finished before a failure are kept.
    df.at[row_label, "en_raw"]     = en_raw
    df.at[row_label, "en_perturb"] = en_p
    df.at[row_label, "final_thai"] = th_out

print("✅ Pipeline complete!")
df[["references", "en_raw", "en_perturb", "final_thai"]].head()



--- Processing row 1/50 ---
Input Thai: ฉันมีความสุขมากที่ได้พบเพื่อนใหม่




BadRequest: 400 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: API Key not found. Please pass a valid API key.