<a href="https://colab.research.google.com/github/lapatradaa/M-MMT4NL/blob/main/llms_evaluation_direct_translate_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install sacrebleu bert-score
!pip install sacrebleu bert-score pythainlp
!pip -q install pandas openpyxl jellyfish bert-score tqdm openai --upgrade



In [18]:
import os, random, math
from pathlib import Path
import pandas as pd
import random
import os
from tqdm import tqdm
import jellyfish
from bert_score import score as bertscore


In [19]:
# OpenAI chat model names, one per step (translation / perturbation, going by the names).
OPENAI_MODEL_TRANSLATE = "gpt-4o-mini"
OPENAI_MODEL_PERTURB  = "gpt-4o-mini"
# Sampling temperatures paired with the two model names above.
TEMPERATURE_TRANSLATE  = 0.2
TEMPERATURE_PERTURB    = 0.7


In [20]:
# Multilingual BERTScore model (per the original author: works well for Thai).
BERT_MODEL = "xlm-roberta-large"
# Weights named for the BERTScore and Jaro components; they sum to 1.0 and match
# the default values of fitness()'s W_BERT / W_JARO parameters.
W_BERT = 0.8
W_JARO = 0.2

# Seeds Python's built-in `random` module only; it has no effect on text sampled
# by a remote chat model.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [21]:
from openai import OpenAI

# Shared OpenAI client. Constructed without arguments, the client reads the key
# from the OPENAI_API_KEY environment variable, so no secret is stored in the
# notebook. Set that variable before running this cell; construction fails
# immediately when no key is available.
_client = OpenAI()


def _chat_once(system_prompt: str, user_prompt: str, *, model: str, temperature: float) -> str:
    """Send one system + user message pair to the chat completions API.

    Args:
        system_prompt: Text of the system message (surrounding whitespace is stripped).
        user_prompt: Text of the user message (surrounding whitespace is stripped).
        model: Name of the chat model to call.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first returned choice, stripped; an empty string
        when the API returns no content.
    """
    out = _client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user",   "content": user_prompt.strip()},
        ],
    )
    # `content` may be None; normalise it to "" before stripping.
    return (out.choices[0].message.content or "").strip()

In [22]:


# --- Prompt Templates ---
# Instruction text of the synonym task (no sentence placeholder yet). Kept as a
# separate piece so that both PROMPT_SYNONYM and the legacy PROMPT_TAXONOMY can
# be assembled from it.
_SYNONYM_INSTRUCTIONS = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by substituting a word with its respective synonym, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

I'll give you some examples of converting one sentence to another sentence: "I'm so tired" is converted to "I'm so exhausted" "I'm really hungry" is converted to "I'm really starving" "I'm not sure if I'm up for that" is converted to "I'm not certain if I'm up for that" "I'm not sure if I can make it to the event" is converted to "I'm not confident if I can make it to the event"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.
"""

# Template for the "synonym" taxonomy; fill it with .format(thai_text=...).
PROMPT_SYNONYM = _SYNONYM_INSTRUCTIONS + "\n{thai_text}\n\n"

# Template for the "negation" taxonomy; fill it with .format(thai_text=...).
PROMPT_NEGATION = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by negating the sentence, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

"I'm so tired" is converted to "I'm so not energetic"
"I'm really hungry" is converted to "I'm really not full"
"I'm not sure if I'm up for that" is converted to "I'm sure I'm not up for that"
"I'm not sure if I can make it to the event" is converted to "I'm unsure if I can make it to the event"
"I'm feeling a bit confused right now" is converted to "I'm feeling a bit not clear right now"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.

{thai_text}

"""

# Legacy combined template (synonym instructions followed directly by the
# negation prompt). Its value is unchanged; it is kept only so that any code
# still referring to this name keeps working.
PROMPT_TAXONOMY = _SYNONYM_INSTRUCTIONS + PROMPT_NEGATION


In [23]:
# Template for the "ner" taxonomy; fill it with .format(thai_text=...).
# The task description comes first and the "apply this concept" instruction
# appears exactly once, after the examples it refers to.
PROMPT_NER = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by replacing the named entity with a new name, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

"I'm so tired" is converted to "Jane is so tired"
"I'm really hungry" is converted to "Jack is really hungry" "I'm not sure if I'm up for that" is converted to "Jones is not sure if she is up for that"
"I'm not sure if I can make it to the event" is converted to "Jill is not sure if she can make it to the event"
"I'm feeling a bit confused right now" is converted to "Andy is feeling a bit confused right now"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.

{thai_text}
"""

In [24]:
def perturb_thai(thai_text: str, taxonomy: str):
    """Ask the chat model for a perturbed version of a Thai sentence.

    Args:
        thai_text: Sentence inserted into the prompt template.
        taxonomy: Which perturbation to request: "synonym", "negation" or "ner".

    Returns:
        The model's reply with surrounding whitespace removed (an empty string
        if the model returns no content).

    Raises:
        ValueError: If `taxonomy` is not one of the three supported values.
    """
    # An if/elif chain (rather than a dict of templates) keeps each template
    # lookup lazy: only the template that is actually requested is touched.
    if taxonomy == "synonym":
        prompt = PROMPT_SYNONYM.format(thai_text=thai_text)
    elif taxonomy == "negation":
        prompt = PROMPT_NEGATION.format(thai_text=thai_text)
    elif taxonomy == "ner":
        prompt = PROMPT_NER.format(thai_text=thai_text)
    else:
        raise ValueError("taxonomy must be synonym / negation / ner")

    # Go through the shared chat helper with the notebook's configured model and
    # temperature instead of repeating the API call with hardcoded values.
    return _chat_once(
        "You are a helpful assistant.",
        prompt,
        model=OPENAI_MODEL_PERTURB,
        temperature=TEMPERATURE_PERTURB,
    )

In [25]:
def jaro(a: str, b: str) -> float:
    """Return the Jaro similarity of two strings as a float.

    Falsy inputs (such as None or "") are compared as empty strings. If
    anything goes wrong while computing the similarity, 0.0 is returned.
    """
    try:
        left, right = (a or ""), (b or "")
        similarity = jellyfish.jaro_similarity(left, right)
    except Exception:
        return 0.0
    else:
        return float(similarity)

In [26]:
def _get_bert_scorer(model_type: str):
    """Return a BERTScorer for `model_type`, building it once and reusing it afterwards."""
    from bert_score import BERTScorer  # same package the notebook already imports from

    # Cache on the function object so the (large) model is loaded a single time
    # per kernel rather than once per scored sentence pair.
    cache = _get_bert_scorer.__dict__.setdefault("_cache", {})
    if model_type not in cache:
        cache[model_type] = BERTScorer(
            model_type=model_type,
            lang="th",  # Thai
            rescale_with_baseline=False,  # no baseline rescaling for Thai
        )
    return cache[model_type]


def bert_f1(ref: str, cand: str) -> float:
    """Compute the BERTScore F1 between a reference and a candidate sentence.

    Args:
        ref: Reference sentence.
        cand: Candidate sentence.

    Returns:
        The F1 value as a float. 0.0 when either input is None, empty or
        whitespace-only, and 0.0 (with a RuntimeWarning) when scoring fails.
    """
    import warnings

    # Treat None like an empty string, the same way jaro() does.
    if not (ref or "").strip() or not (cand or "").strip():
        return 0.0
    try:
        scorer = _get_bert_scorer(BERT_MODEL)
        _precision, _recall, f1 = scorer.score([cand], [ref], verbose=False)
        return float(f1[0].item())
    except Exception as exc:
        # Stay best-effort, but make the failure visible instead of silently
        # reporting a zero score.
        warnings.warn(f"BERTScore failed ({exc!r}); scoring this pair as 0.0", RuntimeWarning)
        return 0.0

In [27]:
def fitness(ref_thai: str, cand_thai: str, W_BERT=0.8, W_JARO=0.2):
    """Blend BERTScore F1 and Jaro similarity into one weighted score.

    Args:
        ref_thai: Reference sentence.
        cand_thai: Candidate sentence compared against the reference.
        W_BERT: Weight applied to the BERTScore F1 value.
        W_JARO: Weight applied to the Jaro similarity.

    Returns:
        A tuple (fitness, bert, jaro), each rounded to 4 decimal places.
    """
    semantic = bert_f1(ref_thai, cand_thai)
    surface = jaro(ref_thai, cand_thai)
    combined = W_BERT * semantic + W_JARO * surface
    return tuple(round(value, 4) for value in (combined, semantic, surface))

In [28]:
# Evaluate Sentence with Perturbation
# --------------------------
def evaluate_sentence(thai_text: str, taxonomy: str, rounds: int = 3):
    """Perturb a sentence repeatedly and score every round against the original.

    Each round perturbs the previous round's output (the first round perturbs
    `thai_text`), while the fitness is always measured against `thai_text`.

    Args:
        thai_text: The original Thai sentence.
        taxonomy: 'synonym', 'negation' or 'ner'.
        rounds: Number of perturbation rounds to run.

    Returns:
        A dict with the key "original", four keys per round
        (round{r}_perturb / _bert / _jaro / _fitness) and the best_* summary
        of the round with the highest fitness (the earliest one on ties).
    """
    record = {"original": thai_text}
    top_score, top_round, top_text, top_bert, top_jaro = -1.0, None, "", 0.0, 0.0

    source = thai_text
    for round_no in range(1, rounds + 1):
        candidate = perturb_thai(source, taxonomy)
        fit, bert, jaro_sim = fitness(thai_text, candidate)

        prefix = f"round{round_no}"
        record[prefix + "_perturb"] = candidate
        record[prefix + "_bert"] = bert
        record[prefix + "_jaro"] = jaro_sim
        record[prefix + "_fitness"] = fit

        # Strict comparison: a later round must beat the current best to replace it.
        if fit > top_score:
            top_score, top_round, top_text = fit, round_no, candidate
            top_bert, top_jaro = bert, jaro_sim

        # The new sentence is the input for the next round.
        source = candidate

    record.update(
        best_round=top_round,
        best_fitness=top_score,
        best_bert=top_bert,
        best_jaro=top_jaro,
        best_text=top_text,
    )
    return record

In [29]:
def run_file(infile: str, outfile: str, taxonomy: str, rounds: int = 3):
    """Evaluate every sentence in an Excel file and write the results to another.

    Args:
        infile: Path of the Excel workbook to read; it must have an "original" column.
        outfile: Path of the Excel workbook to write.
        taxonomy: Perturbation type passed on to evaluate_sentence.
        rounds: Number of perturbation rounds per sentence.

    Returns:
        A DataFrame with one row per evaluated sentence (also saved to `outfile`).

    Raises:
        ValueError: If the input has no "original" column.
    """
    df = pd.read_excel(infile)
    if "original" not in df.columns:
        raise ValueError("request original column")

    results = []
    for cell in tqdm(df["original"], total=len(df), desc="Evaluating"):
        # Empty Excel cells are read as NaN; str(NaN) is the non-empty text
        # "nan", so missing values must be skipped before converting to str.
        if pd.isna(cell):
            continue
        thai_text = str(cell).strip()
        if not thai_text:
            continue
        results.append(evaluate_sentence(thai_text, taxonomy, rounds=rounds))

    out_df = pd.DataFrame(results)
    out_df.to_excel(outfile, index=False)
    print(f"✅ Results saved to {outfile}")
    return out_df

In [30]:
# Final cell: run the perturbation evaluation on one Excel file and show the first rows.
if __name__ == "__main__":
    infile = "/Test2_mmmt4nl (1).xlsx"       # input workbook; run_file requires an "original" column
    outfile = "results.xlsx"    # output workbook written by run_file
    taxonomy = "ner"       # one of "synonym", "negation", "ner"; perturb_thai raises ValueError otherwise
    rounds = 3                  # number of chained perturbation rounds per sentence

    results_df = run_file(infile, outfile, taxonomy, rounds)
    display(results_df.head())

Evaluating: 100%|██████████| 10/10 [02:27<00:00, 14.80s/it]

✅ Results saved to results.xlsx





Unnamed: 0,original,round1_perturb,round1_bert,round1_jaro,round1_fitness,round2_perturb,round2_bert,round2_jaro,round2_fitness,round3_perturb,round3_bert,round3_jaro,round3_fitness,best_round,best_fitness,best_bert,best_jaro,best_text
0,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ปอมีความสุขมากที่ได้พบเพื่อนใหม่,0.946,0.9444,0.9457,นัทมีความสุขมากที่ได้พบเพื่อนใหม่,0.9511,0.9444,0.9497,มิลานมีความสุขมากที่ได้พบเพื่อนใหม่,0.9487,0.8752,0.934,2,0.9497,0.9511,0.9444,นัทมีความสุขมากที่ได้พบเพื่อนใหม่
1,วันนี้อากาศดีและสดชื่น,วันนี้อากาศดีและสดชื่นของสมชาย,0.9571,0.8889,0.9434,วันนี้อากาศดีและสดชื่นของมิ่งขวัญ,0.9487,0.8889,0.9367,วันนี้อากาศดีและสดชื่นของสันติภาพ,0.9492,0.88,0.9353,1,0.9434,0.9571,0.8889,วันนี้อากาศดีและสดชื่นของสมชาย
2,หล่อนรู้สึกเสียใจที่ไม่สามารถไปงานได้,นางสาวมีนา รู้สึกเสียใจที่ไม่สามารถไปงานได้,0.9736,0.7427,0.9274,นางสาวจันทร์ รู้สึกเสียใจที่ไม่สามารถไปงานได้,0.9606,0.7357,0.9156,นางสาวมาลี รู้สึกเสียใจที่ไม่สามารถไปงานได้,0.9654,0.7427,0.9209,1,0.9274,0.9736,0.7427,นางสาวมีนา รู้สึกเสียใจที่ไม่สามารถไปงานได้
3,อาหารมื้อนี้อร่อยสุดๆ,อาหารมื้อนี้อร่อยสุดๆ ของนัท,0.9629,0.9048,0.9513,อาหารมื้อนี้อร่อยสุดๆ ของมิกซ์,0.9491,0.8939,0.938,อาหารมื้อนี้อร่อยสุดๆ ของนัท,0.9629,0.9048,0.9513,1,0.9513,0.9629,0.9048,อาหารมื้อนี้อร่อยสุดๆ ของนัท
4,ฉันไม่พอใจกับบริการที่ได้รับ,จอห์นไม่พอใจกับบริการที่ได้รับ,0.947,0.8852,0.9347,สมชายไม่พอใจกับบริการที่ได้รับ,0.9395,0.8016,0.9119,สมหญิงไม่พอใจกับบริการที่ได้รับ,0.9388,0.8942,0.9299,1,0.9347,0.947,0.8852,จอห์นไม่พอใจกับบริการที่ได้รับ
