<a href="https://colab.research.google.com/github/lapatradaa/M-MMT4NL/blob/main/llms_evaluation_direct_translate_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
!pip install sacrebleu bert-score
!pip install sacrebleu bert-score pythainlp
!pip install textdistance

!pip -q install pandas openpyxl jellyfish bert-score tqdm openai --upgrade



In [16]:
import os, random, math
from pathlib import Path
import pandas as pd
import random
import os
from tqdm import tqdm
import jellyfish
from bert_score import score as bertscore


In [17]:
# OpenAI chat model names and sampling temperatures, grouped here so they can
# be tuned in one place.
# NOTE(review): the "translate" pair is presumably meant for translation calls
# and the "perturb" pair for perturbation calls — confirm against the call sites.
OPENAI_MODEL_TRANSLATE = "gpt-4o-mini"
OPENAI_MODEL_PERTURB  = "gpt-4o-mini"
TEMPERATURE_TRANSLATE  = 0.2
TEMPERATURE_PERTURB    = 0.7


In [18]:
# multilingual BERTScore model (works well for Thai)
BERT_MODEL = "xlm-roberta-large"
# Weights for the BERTScore and Jaro terms of the combined score; they sum to 1.
# NOTE(review): fitness() declares parameters with these same names and values,
# while evaluate_sentence()/run_file() default to weight_bert=0.7 — confirm
# which weighting is intended.
W_BERT = 0.8
W_JARO = 0.2

# Seeds Python's built-in `random` module; nothing else is seeded here.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [30]:
from openai import OpenAI

# Shared OpenAI client used by every API call below. The key is deliberately
# not written into the notebook: OpenAI() reads it from the OPENAI_API_KEY
# environment variable, so set that (e.g. via os.environ or Colab secrets)
# before running this cell. Construction fails immediately if it is missing.
_client = OpenAI()

def _chat_once(system_prompt: str, user_prompt: str, *, model: str, temperature: float) -> str:
    """Send one system + user exchange to the chat API and return the reply text.

    Both prompts are stripped of surrounding whitespace before being sent.
    The content of the first returned choice is stripped and returned; a
    missing (None) or empty content yields an empty string.
    """
    conversation = [
        {"role": "system", "content": system_prompt.strip()},
        {"role": "user",   "content": user_prompt.strip()},
    ]
    completion = _client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation,
    )
    reply = completion.choices[0].message.content
    if not reply:
        reply = ""
    return reply.strip()

In [20]:
# --- Prompt Templates ---
# Every template has exactly one `{thai_text}` placeholder, filled with str.format().

# Taxonomy perturbation: substitute one word with a synonym.
PROMPT_TAXONOMY = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by substituting a word with its respective synonym, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

I'll give you some examples of converting one sentence to another sentence: "I'm so tired" is converted to "I'm so exhausted" "I'm really hungry" is converted to "I'm really starving" "I'm not sure if I'm up for that" is converted to "I'm not certain if I'm up for that" "I'm not sure if I can make it to the event" is converted to "I'm not confident if I can make it to the event"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.

{thai_text}

"""

# Negation perturbation: negate the sentence while keeping its meaning.
# Kept as its own template so that PROMPT_TAXONOMY carries a single,
# unambiguous instruction (synonym substitution only).
PROMPT_NEGATION = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by negating the sentence, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

"I'm so tired" is converted to "I'm so not energetic"
"I'm really hungry" is converted to "I'm really not full"
"I'm not sure if I'm up for that" is converted to "I'm sure I'm not up for that"
"I'm not sure if I can make it to the event" is converted to "I'm unsure if I can make it to the event"
"I'm feeling a bit confused right now" is converted to "I'm feeling a bit not clear right now"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.

{thai_text}

"""


In [21]:
# NER perturbation: replace the named entity with a new name.
# The template states the task first and asks for the output once, at the end.
PROMPT_NER = """You're an expert linguist in English and Thai. You need to modify this Thai sentence by replacing the named entity with a new name, while still keeping the whole semantic of the sentence.

Examples of this modifications in English are as follows.

"I'm so tired" is converted to "Jane is so tired"
"I'm really hungry" is converted to "Jack is really hungry" "I'm not sure if I'm up for that" is converted to "Jones is not sure if she is up for that"
"I'm not sure if I can make it to the event" is converted to "Jill is not sure if she can make it to the event"
"I'm feeling a bit confused right now" is converted to "Andy is feeling a bit confused right now"

Can you apply this concept to the Thai sentence below. Only show the modified sentence without any explanation.

{thai_text}
"""

In [22]:
def perturb_thai(thai_text: str, taxonomy: str):
    """Ask the chat model for one perturbed version of a Thai sentence.

    Args:
        thai_text: Sentence substituted into the prompt template.
        taxonomy: Which template to use: "taxonomy" or "ner".

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string when the reply carries no content.

    Raises:
        ValueError: If `taxonomy` is neither "taxonomy" nor "ner".
    """
    if taxonomy == "taxonomy":
        template = PROMPT_TAXONOMY
    elif taxonomy == "ner":
        template = PROMPT_NER
    else:
        raise ValueError("taxonomy must be taxonomy / ner")

    # Route through the shared helper so the model and temperature come from
    # the config constants and a None reply is handled in one place instead
    # of crashing on `.strip()`.
    return _chat_once(
        system_prompt="You are a helpful assistant.",
        user_prompt=template.format(thai_text=thai_text),
        model=OPENAI_MODEL_PERTURB,
        temperature=TEMPERATURE_PERTURB,
    )

In [23]:
import textdistance

def jaro_score(ref: str, cand: str) -> float:
    """
    Jaro-Winkler similarity of a reference and a candidate string.

    Returns 0.0 when either string is empty or whitespace-only; otherwise the
    value computed by `textdistance.jaro_winkler`, where 0 means completely
    different and 1 means an exact match.
    """
    either_blank = ref.strip() == "" or cand.strip() == ""
    if either_blank:
        return 0.0
    similarity = textdistance.jaro_winkler(ref, cand)
    return similarity


In [24]:
def bert_f1(ref: str, cand: str) -> float:
    if not ref.strip() or not cand.strip():
        return 0.0
    try:
        P, R, F1 = bertscore(
            cands=[cand],
            refs=[ref],
            model_type="xlm-roberta-large",
            lang="th",
            rescale_with_baseline=False,  # non-baseline for th
            verbose=False
        )
        return float(F1[0].item())
    except Exception:
        return 0.0

In [25]:
def fitness(ref_thai: str, cand_thai: str, W_BERT=0.8, W_JARO=0.2):
    """Weighted blend of BERTScore F1 and Jaro-Winkler similarity.

    Args:
        ref_thai: Reference text.
        cand_thai: Candidate text compared against the reference.
        W_BERT: Weight applied to the BERTScore F1 term.
        W_JARO: Weight applied to the Jaro-Winkler term.

    Returns:
        A tuple `(fitness, bert_f1, jaro)`, each rounded to 4 decimal places.
    """
    b = bert_f1(ref_thai, cand_thai)
    # The similarity helper in this notebook is `jaro_score`; no `jaro` exists.
    j = jaro_score(ref_thai, cand_thai)
    f = W_BERT * b + W_JARO * j
    return round(f, 4), round(b, 4), round(j, 4)

In [26]:
def evaluate_sentence(sentence: str, rounds: int = 3, perturb_type: str = "taxonomy", weight_bert: float = 0.7):
    """
    Evaluate a sentence with perturbations and compute a fitness score per round.

    Each round asks the model for a perturbed version of `sentence`, gets the
    model's reply to that perturbed text, and compares it with the model's
    reply to the unperturbed sentence.

    Args:
        sentence: Input sentence substituted into the prompt template.
        rounds: Number of perturbation rounds; values below 1 yield no rounds.
        perturb_type: Prompt template to use: "taxonomy" or "ner".
        weight_bert: Weight of the BERTScore term; the Jaro-Winkler term gets
            `1 - weight_bert`.

    Returns:
        One dict per round with keys "round", "original", "perturb_type",
        "perturbed", "bert_f1", "jaro", "fitness" and "best_round". The first
        round reaching the highest fitness has "best_round" set to True.

    Raises:
        ValueError: If `perturb_type` is not "taxonomy" or "ner".
    """
    # Validate and choose the template once, up front, so a bad perturb_type is
    # reported even when no round runs.
    if perturb_type == "taxonomy":
        template = PROMPT_TAXONOMY
    elif perturb_type == "ner":
        template = PROMPT_NER
    else:
        raise ValueError("Invalid perturb_type. Choose 'taxonomy' or 'ner'.")

    if rounds < 1:
        return []

    prompt = template.format(thai_text=sentence)
    response_model = "gpt-4o-mini"

    # The reply to the unperturbed sentence does not depend on the round
    # (same input, temperature 0), so request it once. This saves one API call
    # per round and scores every round against the same baseline.
    original = _chat_once(
        system_prompt="You are a helpful assistant.",
        user_prompt=sentence,
        model=response_model,
        temperature=0
    )

    results = []
    for i in range(1, rounds + 1):
        # --- Perturbation ---
        perturbed = _chat_once(
            system_prompt="You are a helpful Thai linguist.",
            user_prompt=prompt,
            model=OPENAI_MODEL_PERTURB,
            temperature=TEMPERATURE_PERTURB
        )

        # --- Response to the perturbed sentence ---
        perturbed_resp = _chat_once(
            system_prompt="You are a helpful assistant.",
            user_prompt=perturbed,
            model=response_model,
            temperature=0
        )

        # --- Scores ---
        bert_sim = bert_f1(original, perturbed_resp)
        jaro_sim = jaro_score(original, perturbed_resp)
        # Named `score`, not `fitness`, so the module-level fitness() is not shadowed.
        score = weight_bert * bert_sim + (1 - weight_bert) * jaro_sim

        results.append({
            "round": i,
            "original": sentence,
            "perturb_type": perturb_type,
            "perturbed": perturbed,
            "bert_f1": bert_sim,
            "jaro": jaro_sim,
            "fitness": score
        })

    # Mark the best round; max() keeps the earliest round on ties.
    best_round = max(results, key=lambda r: r["fitness"])["round"]
    for r in results:
        r["best_round"] = (r["round"] == best_round)

    return results


In [27]:
def run_file(infile: str, outfile: str, perturb_type: str, rounds: int = 3, weight_bert: float = 0.7):
    """
    Run perturbation evaluation for all sentences in file.

    Args:
        infile: Path of the Excel workbook to read; it must contain a column
            named "original".
        outfile: Path of the Excel workbook the per-round results are written to.
        perturb_type: Passed through to `evaluate_sentence`.
        rounds: Number of perturbation rounds per sentence.
        weight_bert: Weight of the BERTScore term in the fitness score.

    Returns:
        A DataFrame with one row per (sentence, round), as written to `outfile`.

    Raises:
        ValueError: If the workbook has no "original" column (the message is in
            Thai: the Excel file must have an 'original' column containing
            Thai sentences).
    """
    df = pd.read_excel(infile)
    if "original" not in df.columns:
        raise ValueError("Excel file ต้องมี column 'original' ที่เป็นประโยคภาษาไทย")

    # Drop missing cells before converting to text: str(NaN) is the non-empty
    # string "nan", which would otherwise be evaluated as if it were a sentence.
    sentences = [str(value).strip() for value in df["original"].dropna()]
    sentences = [text for text in sentences if text]

    results = []
    for thai_text in tqdm(sentences, desc=f"Evaluating {perturb_type}"):
        results.extend(
            evaluate_sentence(thai_text, rounds=rounds, perturb_type=perturb_type, weight_bert=weight_bert)
        )

    out_df = pd.DataFrame(results)
    out_df.to_excel(outfile, index=False)
    print(f"✅ {perturb_type} results saved to {outfile}")
    return out_df


In [29]:
# Evaluate the same workbook with both perturbation types, five rounds per
# sentence; each call writes its own output workbook.
# The second call is the cell's last expression, so its returned DataFrame
# (the "ner" results) is what the notebook displays below.
# NOTE(review): the input path is an absolute Colab path — adjust when running elsewhere.
run_file("/content/Test2_mmmt4nl (2).xlsx", "output_taxonomy.xlsx", perturb_type="taxonomy", rounds=5)
run_file("/content/Test2_mmmt4nl (2).xlsx", "output_ner.xlsx", perturb_type="ner", rounds=5)


Evaluating taxonomy: 100%|██████████| 50/50 [52:11<00:00, 62.64s/it]


✅ taxonomy results saved to output_taxonomy.xlsx


Evaluating ner: 100%|██████████| 50/50 [52:39<00:00, 63.19s/it]

✅ ner results saved to output_ner.xlsx





Unnamed: 0,round,original,perturb_type,perturbed,bert_f1,jaro,fitness,best_round
0,1,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ner,ฉันมีความสุขมากที่ได้พบอารีย์ใหม่,0.874114,0.618697,0.797489,False
1,2,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ner,ฉันมีความสุขมากที่ได้พบสมชายใหม่,0.890621,0.594844,0.801888,False
2,3,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ner,ฉันมีความสุขมากที่ได้พบมาร์คใหม่,0.876134,0.604220,0.794560,False
3,4,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ner,ฉันมีความสุขมากที่ได้พบมินต์ใหม่,0.893531,0.660755,0.823698,True
4,5,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,ner,ฉันมีความสุขมากที่ได้พบปกรณ์ใหม่,0.878463,0.631798,0.804463,False
...,...,...,...,...,...,...,...,...
245,1,ฉันรู้สึกกังวลเกี่ยวกับอนาคต,ner,น้องสาวรู้สึกกังวลเกี่ยวกับอนาคต,0.923350,0.713282,0.860330,False
246,2,ฉันรู้สึกกังวลเกี่ยวกับอนาคต,ner,สมชายรู้สึกกังวลเกี่ยวกับอนาคต,0.921177,0.829286,0.893610,True
247,3,ฉันรู้สึกกังวลเกี่ยวกับอนาคต,ner,ปูรู้สึกกังวลเกี่ยวกับอนาคต,0.941106,0.760644,0.886967,False
248,4,ฉันรู้สึกกังวลเกี่ยวกับอนาคต,ner,นัทรู้สึกกังวลเกี่ยวกับอนาคต,0.933959,0.724682,0.871176,False
