<a href="https://colab.research.google.com/github/lapatradaa/M-MMT4NL/blob/main/llms_evaluation_direct_translate_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sacrebleu bert-score
!pip install sacrebleu bert-score pythainlp
!pip -q install pandas openpyxl jellyfish bert-score tqdm openai --upgrade

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading color

In [2]:
import os, random, math
from pathlib import Path
import pandas as pd
import random
import os
from tqdm import tqdm
import jellyfish
from bert_score import score as bertscore


In [3]:
# OpenAI chat model and sampling temperature, one pair per task
# (translation and perturbation).
# NOTE(review): no cell visible in this notebook reads the *_TRANSLATE settings —
# confirm they are still needed.
OPENAI_MODEL_TRANSLATE = "gpt-4o-mini"
OPENAI_MODEL_PERTURB  = "gpt-4o-mini"
TEMPERATURE_TRANSLATE  = 0.2
TEMPERATURE_PERTURB    = 0.7


In [4]:
# multilingual BERTScore model (works well for Thai)
BERT_MODEL = "xlm-roberta-large"
# Weights of the BERTScore-F1 and Jaro terms in the combined fitness score
# (the same values appear as the defaults of fitness()).
W_BERT = 0.8
W_JARO = 0.2

# Seed Python's `random` module so any use of it is repeatable.
# NOTE(review): this presumably has no effect on the remote model's sampling — confirm.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [5]:
from openai import OpenAI

def _get_client():
    """Return the shared OpenAI client, creating it on first use.

    A module-level ``_client`` that already exists is reused as-is. Otherwise a
    client is built from the ``OPENAI_API_KEY`` environment variable, so the key
    never has to be written into the notebook.

    Raises:
        RuntimeError: if no client exists yet and ``OPENAI_API_KEY`` is unset.
    """
    global _client
    try:
        return _client
    except NameError:
        pass  # no client yet: build one below

    import os  # local import keeps this cell self-contained

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it (or define `_client`) before calling the API."
        )
    _client = OpenAI(api_key=api_key)
    return _client


def _chat_once(system_prompt: str, user_prompt: str, *, model: str, temperature: float) -> str:
    """Send one system + user exchange to the chat-completions API.

    Args:
        system_prompt: system message; surrounding whitespace is stripped.
        user_prompt: user message; surrounding whitespace is stripped.
        model: name of the chat model to call.
        temperature: sampling temperature passed to the API.

    Returns:
        The first choice's text with surrounding whitespace removed, or ""
        when the reply carries no content.

    Raises:
        RuntimeError: if the client has to be created and ``OPENAI_API_KEY`` is unset.
    """
    out = _get_client().chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user",   "content": user_prompt.strip()},
        ],
    )
    # `content` can be None; normalise to "" before stripping.
    return (out.choices[0].message.content or "").strip()

In [6]:


# --- Prompt Templates ---
# Each template has exactly one `{thai_text}` slot, filled in with str.format.
# The synonym and negation instructions used to be fused into a single string;
# they are now separate templates.

_SYNONYM_INSTRUCTIONS = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by substituting a word with its respective synonym, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

I'll give you some examples of converting one sentence to another sentence: "I'm so tired" is converted to "I'm so exhausted" "I'm really hungry" is converted to "I'm really starving" "I'm not sure if I'm up for that" is converted to "I'm not certain if I'm up for that" "I'm not sure if I can make it to the event" is converted to "I'm not confident if I can make it to the event"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.
"""

_NEGATION_INSTRUCTIONS = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by negating the sentence, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

"I'm so tired" is converted to "I'm so not energetic"
"I'm really hungry" is converted to "I'm really not full"
"I'm not sure if I'm up for that" is converted to "I'm sure I'm not up for that"
"I'm not sure if I can make it to the event" is converted to "I'm unsure if I can make it to the event"
"I'm feeling a bit confused right now" is converted to "I'm feeling a bit not clear right now"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.
"""

# Blank line, the sentence to rewrite, blank line.
_THAI_TEXT_SLOT = "\n{thai_text}\n\n"

# Synonym-substitution prompt.
PROMPT_SYNONYM = _SYNONYM_INSTRUCTIONS + _THAI_TEXT_SLOT

# Negation prompt.
PROMPT_NEGATION = _NEGATION_INSTRUCTIONS + _THAI_TEXT_SLOT

# Legacy combined template (synonym instructions followed by the full negation
# prompt). Its value is unchanged, for any code that still refers to this name.
PROMPT_TAXONOMY = _SYNONYM_INSTRUCTIONS + PROMPT_NEGATION


In [7]:
# Named-entity replacement prompt; `{thai_text}` is filled in with str.format.
# The template opens with the role/instruction text; a stray copy of the closing
# "Can you apply this concept..." line that used to precede it has been removed.
PROMPT_NER = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by replacing the named entity with a new name, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

"I'm so tired" is converted to "Jane is so tired"
"I'm really hungry" is converted to "Jack is really hungry" "I'm not sure if I'm up for that" is converted to "Jones is not sure if she is up for that"
"I'm not sure if I can make it to the event" is converted to "Jill is not sure if she can make it to the event"
"I'm feeling a bit confused right now" is converted to "Andy is feeling a bit confused right now"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.

{thai_text}
"""

In [8]:
def perturb_thai(thai_text: str, taxonomy: str):
    """Ask the perturbation model to rewrite ``thai_text`` according to ``taxonomy``.

    The request goes through ``_chat_once`` with the notebook's perturbation
    settings (``OPENAI_MODEL_PERTURB`` / ``TEMPERATURE_PERTURB``) rather than
    hard-coded copies of them.

    Args:
        thai_text: sentence to perturb; substituted into the prompt template.
        taxonomy: "synonym", "negation" or "ner"; selects the prompt template.

    Returns:
        The model's reply with surrounding whitespace removed ("" when the
        reply carries no text).

    Raises:
        ValueError: if ``taxonomy`` is not one of the supported names.
    """
    # Templates are looked up lazily, so only the requested one has to exist.
    if taxonomy == "synonym":
        template = PROMPT_SYNONYM
    elif taxonomy == "negation":
        template = PROMPT_NEGATION
    elif taxonomy == "ner":
        template = PROMPT_NER
    else:
        raise ValueError("taxonomy must be synonym / negation / ner")

    return _chat_once(
        "You are a helpful assistant.",
        template.format(thai_text=thai_text),
        model=OPENAI_MODEL_PERTURB,
        temperature=TEMPERATURE_PERTURB,
    )

In [9]:
def jaro(a: str, b: str) -> float:
    """Jaro similarity of two strings via ``jellyfish.jaro_similarity``.

    Falsy arguments (``None``, ``""``) are compared as the empty string.
    Any exception raised while computing the score yields 0.0 instead.
    """
    try:
        left = a if a else ""
        right = b if b else ""
        similarity = jellyfish.jaro_similarity(left, right)
    except Exception:
        return 0.0
    else:
        return float(similarity)

In [10]:
def bert_f1(ref: str, cand: str) -> float:
    """BERTScore F1 of ``cand`` against ``ref`` using the configured ``BERT_MODEL``.

    Args:
        ref: reference sentence.
        cand: candidate sentence.

    Returns:
        The F1 value as a float, or 0.0 when either input is ``None``, empty,
        or whitespace-only, or when scoring fails. A failure is reported with
        a ``RuntimeWarning`` rather than hidden.
    """
    # `(x or "")` makes None behave like an empty string instead of raising.
    if not (ref or "").strip() or not (cand or "").strip():
        return 0.0
    try:
        _, _, f1 = bertscore(
            cands=[cand],
            refs=[ref],
            model_type=BERT_MODEL,
            lang="th",  # Thai
            rescale_with_baseline=False,  # no baseline rescaling for Thai
            verbose=False,
        )
        return float(f1[0].item())
    except Exception as exc:
        import warnings

        # Still best-effort (score 0.0), but make the failure visible.
        warnings.warn(f"BERTScore failed, scoring 0.0: {exc!r}", RuntimeWarning, stacklevel=2)
        return 0.0

In [11]:
def fitness(ref_thai: str, cand_thai: str, W_BERT=0.8, W_JARO=0.2):
    """Score a candidate against the reference sentence.

    Combines ``bert_f1`` and ``jaro`` as ``W_BERT * bert + W_JARO * jaro``.

    Returns:
        ``(fitness, bert, jaro)``, each rounded to 4 decimal places.
    """
    semantic = bert_f1(ref_thai, cand_thai)
    surface = jaro(ref_thai, cand_thai)
    combined = W_BERT * semantic + W_JARO * surface
    return tuple(round(value, 4) for value in (combined, semantic, surface))

In [12]:
# Iterative perturbation of one sentence, scored against the original
# --------------------------
def evaluate_sentence(thai_text: str, taxonomy: str, rounds: int = 3):
    """
    Perturb a sentence repeatedly and score every candidate against the original.

    thai_text: the original Thai sentence; every round is scored against it
    taxonomy: 'synonym', 'negation', 'ner'
    rounds: number of perturbation rounds; each round rewrites the output of
            the previous round (round 1 rewrites the original)

    Returns a dict holding 'original', the per-round entries
    'round{r}_perturb' / '_bert' / '_jaro' / '_fitness', and the 'best_*'
    entries of the highest-fitness round (the earliest round wins ties).
    """
    report = {"original": thai_text}
    # Running best; -1.0 sits below any fitness in [0, 1], so round 1 always wins first.
    top_fitness, top_round, top_text, top_bert, top_jaro = -1.0, None, "", 0.0, 0.0

    previous = thai_text
    for round_no in range(1, rounds + 1):
        candidate = perturb_thai(previous, taxonomy)
        fit_part, bert_part, jaro_part = fitness(thai_text, candidate)

        prefix = f"round{round_no}_"
        report.update({
            prefix + "perturb": candidate,
            prefix + "bert": bert_part,
            prefix + "jaro": jaro_part,
            prefix + "fitness": fit_part,
        })

        if fit_part > top_fitness:  # strict comparison keeps the earliest best
            top_fitness, top_round, top_text = fit_part, round_no, candidate
            top_bert, top_jaro = bert_part, jaro_part

        previous = candidate  # the new sentence is the input for the next round

    report.update(
        best_round=top_round,
        best_fitness=top_fitness,
        best_bert=top_bert,
        best_jaro=top_jaro,
        best_text=top_text,
    )
    return report

In [13]:
def run_file(infile: str, outfile: str, taxonomy: str, rounds: int = 3):
    """Run ``evaluate_sentence`` on every sentence of an Excel sheet and save the results.

    Args:
        infile: path of the input workbook; it must have an ``original`` column.
        outfile: path of the Excel file to write.
        taxonomy: perturbation type, forwarded to ``evaluate_sentence``.
        rounds: number of perturbation rounds per sentence.

    Returns:
        A DataFrame with one row per evaluated sentence (also written to ``outfile``).

    Raises:
        ValueError: if the input has no ``original`` column.
    """
    df = pd.read_excel(infile)
    if "original" not in df.columns:
        raise ValueError("request original column")

    # Blank cells load as NaN. Drop them before the string conversion, which
    # would otherwise turn them into the literal text "nan" and evaluate it.
    texts = df["original"].dropna().astype(str).str.strip()
    texts = texts[texts != ""]

    results = [
        evaluate_sentence(thai_text, taxonomy, rounds=rounds)
        for thai_text in tqdm(texts, total=len(texts), desc="Evaluating")
    ]

    out_df = pd.DataFrame(results)
    out_df.to_excel(outfile, index=False)
    print(f"✅ Results saved to {outfile}")
    return out_df

In [16]:
# Driver cell: evaluate every sentence in the input workbook and preview the result.
if __name__ == "__main__":
    # Input workbook and output workbook.
    infile, outfile = "/Test2_mmmt4nl (1).xlsx", "results.xlsx"
    # taxonomy must be one of "synonym", "negation", "ner"; rounds is the
    # number of perturbation rounds per sentence.
    taxonomy, rounds = "ner", 3

    results_df = run_file(infile=infile, outfile=outfile, taxonomy=taxonomy, rounds=rounds)
    display(results_df.head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Evaluating: 100%|██████████| 10/10 [03:29<00:00, 20.93s/it]

✅ Results saved to results.xlsx





Unnamed: 0,original,round1_perturb,round1_bert,round1_jaro,round1_fitness,round2_perturb,round2_bert,round2_jaro,round2_fitness,round3_perturb,round3_bert,round3_jaro,round3_fitness,best_round,best_fitness,best_bert,best_jaro,best_text
0,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ฉันมีความสุขมากที่ได้พบสมชายใหม่,0.937,0.8778,0.9252,ฉันมีความสุขมากที่ได้พบแจ็คใหม่,0.9479,0.901,0.9385,ฉันมีความสุขมากที่ได้พบมาร์คใหม่,0.9641,0.8889,0.9491,3,0.9491,0.9641,0.8889,ฉันมีความสุขมากที่ได้พบมาร์คใหม่
1,วันนี้อากาศดีและสดชื่น,วันนี้อากาศดีและสดชื่นของมาร์ค,0.9486,0.8986,0.9386,วันนี้อากาศดีและสดชื่นของสตีฟ,0.9601,0.9091,0.9499,วันนี้อากาศดีและสดชื่นของนิกซ์,0.9559,0.9091,0.9465,2,0.9499,0.9601,0.9091,วันนี้อากาศดีและสดชื่นของสตีฟ
2,หล่อนรู้สึกเสียใจที่ไม่สามารถไปงานได้,จูดีรู้สึกเสียใจที่ไม่สามารถไปงานได้,0.9784,0.9267,0.968,มาร์กี้รู้สึกเสียใจที่ไม่สามารถไปงานได้,0.9744,0.9048,0.9605,นาตาลีรู้สึกเสียใจที่ไม่สามารถไปงานได้,0.9783,0.785,0.9396,1,0.968,0.9784,0.9267,จูดีรู้สึกเสียใจที่ไม่สามารถไปงานได้
3,อาหารมื้อนี้อร่อยสุดๆ,อาหารมื้อนี้อร่อยสุดๆ ของนัท,0.9629,0.9048,0.9513,อาหารมื้อนี้อร่อยสุดๆ ของมินต์,0.9509,0.8939,0.9395,อาหารมื้อนี้อร่อยสุดๆ ของนัท,0.9629,0.9048,0.9513,1,0.9513,0.9629,0.9048,อาหารมื้อนี้อร่อยสุดๆ ของนัท
4,ฉันไม่พอใจกับบริการที่ได้รับ,อันนีไม่พอใจกับบริการที่ได้รับ,0.9432,0.9516,0.9448,นัทไม่พอใจกับบริการที่ได้รับ,0.9553,0.9333,0.9509,นัทไม่พอใจกับบริการที่ได้รับ,0.9553,0.9333,0.9509,2,0.9509,0.9553,0.9333,นัทไม่พอใจกับบริการที่ได้รับ
