<a href="https://colab.research.google.com/github/lapatradaa/M-MMT4NL/blob/main/llms_evaluation2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
!pip install sacrebleu bert-score
!pip install sacrebleu bert-score pythainlp
!pip -q install pandas openpyxl jellyfish bert-score tqdm openai --upgrade



In [77]:
import os, random, math
from pathlib import Path
import pandas as pd
import random
import os
from tqdm import tqdm
import jellyfish
from bert_score import score as bertscore


In [78]:
# Chat model names passed to _chat_once(): one for the translation wrappers
# (thai_to_english / english_to_thai) and one for perturb_english.
OPENAI_MODEL_TRANSLATE = "gpt-4o-mini"
OPENAI_MODEL_PERTURB  = "gpt-4o-mini"
# Sampling temperatures passed alongside the model names above.
TEMPERATURE_TRANSLATE  = 0.2
TEMPERATURE_PERTURB    = 0.7


In [79]:
# multilingual BERTScore model (works well for Thai)
BERT_MODEL = "xlm-roberta-large"
# Weights of the two similarity terms combined in fitness():
#   fitness = W_BERT * bert_f1 + W_JARO * jaro
W_BERT = 0.8
W_JARO = 0.2

# Seeds Python's `random` module only.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [80]:
from openai import OpenAI

# The API key is read from the OPENAI_API_KEY environment variable so that no
# secret is ever stored in the notebook. A client that already exists in this
# session is kept; otherwise one is created on first use by _get_client().
_client = globals().get("_client")


def _get_client():
    """Return the shared OpenAI client, creating it on first use.

    Raises:
        RuntimeError: if no client exists yet and OPENAI_API_KEY is not set.
    """
    global _client
    if _client is None:
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "OPENAI_API_KEY is not set; export it before calling the OpenAI API."
            )
        _client = OpenAI(api_key=api_key)
    return _client


def _chat_once(system_prompt: str, user_prompt: str, *, model: str, temperature: float) -> str:
    """Send one system + user message pair to the chat completions API.

    Args:
        system_prompt: Text of the system message (stripped before sending).
        user_prompt: Text of the user message (stripped before sending).
        model: Chat model name.
        temperature: Sampling temperature.

    Returns:
        The stripped content of the first choice, or "" when the content is empty/None.

    Raises:
        RuntimeError: if the client cannot be created because no API key is configured.
    """
    out = _get_client().chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user",   "content": user_prompt.strip()},
        ],
    )
    # `content` may be None, so fall back to an empty string before stripping.
    return (out.choices[0].message.content or "").strip()

In [81]:
# (A) Thai -> English (step 2)
# System/user prompt pair used by thai_to_english(); the {thai_text} placeholder
# is filled in with str.format before the request is sent.
SYSTEM_THAI_TO_EN = (
    "You are a professional translator and language expert. "
    "Translate the following Thai sentence into English. "
    "Use English at a C1 proficiency level and output only the final English translation."
)
USER_THAI_TO_EN = (
    "Given this Thai text, translate it into English. Preserve the semantics as much as possible.\n\n{thai_text}"
)


In [82]:
# (B) English -> Thai, used for back-translation
# System/user prompt pair used by english_to_thai(); the {english_text}
# placeholder is filled in with str.format before the request is sent.
SYSTEM_EN_TO_THAI = (
    "You are a professional translator and language expert. \n"
    "Please translate the following English sentence into Thai. \n"
    "Use English at a C1 proficiency level when framing any instructions or explanations—and output only the final Thai translation."
)
USER_EN_TO_THAI = (
    "Given this English text, translate it into Thai. Preserve the semantic of the original English text in the translate version as much as possible.\n\n{english_text}"
)

In [83]:
# (C) English perturbation (step 3)
# System/user prompt pair used by perturb_english(); the {english_text}
# placeholder is filled in with str.format before the request is sent.
SYSTEM_PERTURB_EN = "You are an expert paraphraser. You lightly perturb English text while preserving meaning."
USER_PERTURB_EN = (
    "Paraphrase the following English sentence with very small changes "
    "(synonyms, slight reordering, optional mild intensifiers). "
    "Do NOT change the meaning. Output only the sentence.\n\n{english_text}"
)

In [84]:
# ---------- LLM wrappers ----------
def thai_to_english(thai: str) -> str:
    """Translate Thai text into English using the translation model settings."""
    user_prompt = USER_THAI_TO_EN.format(thai_text=thai)
    return _chat_once(
        SYSTEM_THAI_TO_EN,
        user_prompt,
        model=OPENAI_MODEL_TRANSLATE,
        temperature=TEMPERATURE_TRANSLATE,
    )

def english_to_thai(eng: str) -> str:
    """Translate English text into Thai using the translation model settings."""
    user_prompt = USER_EN_TO_THAI.format(english_text=eng)
    return _chat_once(
        SYSTEM_EN_TO_THAI,
        user_prompt,
        model=OPENAI_MODEL_TRANSLATE,
        temperature=TEMPERATURE_TRANSLATE,
    )

def perturb_english(eng: str) -> str:
    """Ask the perturbation model for a lightly paraphrased version of `eng`."""
    user_prompt = USER_PERTURB_EN.format(english_text=eng)
    return _chat_once(
        SYSTEM_PERTURB_EN,
        user_prompt,
        model=OPENAI_MODEL_PERTURB,
        temperature=TEMPERATURE_PERTURB,
    )

In [85]:
def jaro(a: str, b: str) -> float:
    """Return the Jaro similarity of two strings as a float.

    Falsy inputs (None, "") are treated as the empty string. The function is
    best-effort: if the similarity cannot be computed it returns 0.0, but the
    failure is logged so that a broken metric is not mistaken for a genuine
    zero similarity.

    Args:
        a: First string.
        b: Second string.

    Returns:
        The similarity reported by jellyfish, or 0.0 when the computation fails.
    """
    import logging  # local import: this cell may run before the cell that imports logging

    try:
        return float(jellyfish.jaro_similarity(a or "", b or ""))
    except Exception as exc:
        logging.warning(
            "Jaro similarity failed (%s: %s); returning 0.0", type(exc).__name__, exc
        )
        return 0.0

def fitness(ref_thai: str, cand_thai: str):
    """Score a candidate Thai sentence against the reference.

    Returns:
        A tuple (fitness, bert, jaro), each rounded to 4 decimals, where
        fitness = W_BERT * bert + W_JARO * jaro.
    """
    bert_part = bert_f1(ref_thai, cand_thai)
    jaro_part = jaro(ref_thai, cand_thai)
    combined = W_BERT * bert_part + W_JARO * jaro_part
    return tuple(round(value, 4) for value in (combined, bert_part, jaro_part))


In [86]:
from bert_score import BERTScorer
import torch
import logging

# Re-assigned with the same value that the configuration cell gives BERT_MODEL.
BERT_MODEL = "xlm-roberta-large"

# Build the shared BERTScorer once at cell execution; bert_f1() reuses it for
# every call. Uses the GPU when torch reports CUDA as available, otherwise the CPU.
scorer = BERTScorer(
    model_type=BERT_MODEL,
    lang="th",
    rescale_with_baseline=False,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# CPU fallback scorer used after a CUDA out-of-memory error. It is built at most
# once, because constructing a BERTScorer loads the whole model again.
_cpu_scorer = None


def bert_f1(ref: str, cand: str) -> float:
    """
    Compute BERTScore F1 between a reference (Thai) and candidate (Thai).

    The shared `scorer` is tried first. If it raises a RuntimeError whose message
    contains "CUDA out of memory", the pair is re-scored with a CPU scorer that
    is created on first need and reused afterwards.

    Args:
        ref: Reference sentence.
        cand: Candidate sentence.

    Returns:
        The F1 score as a float, or 0.0 when scoring fails with a
        non-RuntimeError exception (the failure is logged).

    Raises:
        RuntimeError: any RuntimeError from the shared scorer that is not a
            CUDA out-of-memory error.
    """
    global _cpu_scorer
    try:
        _, _, F1 = scorer.score([cand], [ref])
        return float(F1[0])
    except RuntimeError as e:
        if "CUDA out of memory" not in str(e):
            raise
        logging.warning("⚠️ CUDA OOM, retrying on CPU for BERTScore.")
        if _cpu_scorer is None:
            _cpu_scorer = BERTScorer(
                model_type=BERT_MODEL,
                lang="th",
                rescale_with_baseline=False,
                device="cpu",
            )
        _, _, F1 = _cpu_scorer.score([cand], [ref])
        return float(F1[0])
    except Exception as e:
        logging.error(f"BERTScore failed: {e}")
        return 0.0


In [87]:
def evaluate_sentence(thai_text: str, rounds: int = 5):
    """
    Evaluate one Thai sentence by round-tripping translations.

    Steps:
      1. Translate Thai -> English (base_en).
      2. Iteratively perturb English, back-translate to Thai, and score.
         Each round perturbs the previous round's English; a failed round
         leaves the chain on the last successful English text.
      3. Track per-round BERT/Jaro/Fitness.
      4. Select the best round by fitness. Only rounds that completed without
         an exception are eligible, so a failed round (recorded with zero
         scores and empty texts) is never reported as the best one.

    Args:
      thai_text: Thai input sentence.
      rounds: Number of perturb/back-translate rounds.

    Returns:
      dict with:
        - original (Thai input)
        - base_en
        - per-round: roundN_en, roundN_back_th, roundN_bert, roundN_jaro, roundN_fitness
          (plus roundN_error when that round raised)
        - best_* summary; best_round is None and the other best_* fields keep
          their zero/empty defaults when no round succeeded
      If the initial Thai -> English translation fails, only `original` and
      `error` are returned.
    """
    out = {"original": thai_text}

    try:
        base_en = thai_to_english(thai_text)
    except Exception as e:
        return {
            "original": thai_text,
            "error": f"thai_to_english failed: {type(e).__name__}: {e}",
        }

    out["base_en"] = base_en

    best = {
        "score": 0.0, "round": None,
        "en": "", "th_back": "",
        "bert": 0.0, "jaro": 0.0,
    }

    cur_en = base_en
    for r in range(1, rounds + 1):
        succeeded = True
        try:
            cand_en = perturb_english(cur_en)   # Step 3: perturb
            cand_th = english_to_thai(cand_en)  # back-translate
            f, b, j = fitness(thai_text, cand_th)
        except Exception as e:
            succeeded = False
            cand_en, cand_th, f, b, j = "", "", 0.0, 0.0, 0.0
            out[f"round{r}_error"] = f"{type(e).__name__}: {e}"

        out[f"round{r}_en"]       = cand_en
        out[f"round{r}_back_th"]  = cand_th
        out[f"round{r}_bert"]     = b
        out[f"round{r}_jaro"]     = j
        out[f"round{r}_fitness"]  = f

        # The first successful round always becomes the current best; later
        # rounds replace it only with a strictly higher fitness.
        if succeeded and (best["round"] is None or f > best["score"]):
            best.update({
                "score": f, "round": r,
                "en": cand_en, "th_back": cand_th,
                "bert": b, "jaro": j,
            })

        # A failed round has cand_en == "", so the chain stays on the previous text.
        cur_en = cand_en or cur_en

    # --- best result summary ---
    out["best_round"]   = best["round"]
    out["best_fitness"] = best["score"]
    out["best_bert"]    = best["bert"]
    out["best_jaro"]    = best["jaro"]
    out["best_en"]      = best["en"]
    out["best_th_back"] = best["th_back"]

    return out


In [88]:
def load_table(path: str) -> pd.DataFrame:
    """Read a spreadsheet (.xlsx/.xls) or .csv file into a DataFrame.

    The reader is picked from the file suffix, compared case-insensitively.

    Raises:
        ValueError: if the suffix is not .xlsx, .xls or .csv.
    """
    readers = {
        ".xlsx": pd.read_excel,
        ".xls": pd.read_excel,
        ".csv": pd.read_csv,
    }
    suffix = Path(path).suffix.lower()
    reader = readers.get(suffix)
    if reader is None:
        raise ValueError("Use .xlsx/.xls or .csv")
    return reader(path)

def save_table(df: pd.DataFrame, path: str):
    """Write `df` without its index to a spreadsheet (.xlsx/.xls) or .csv file.

    The writer is picked from the file suffix, compared case-insensitively.

    Raises:
        ValueError: if the suffix is not .xlsx, .xls or .csv.
    """
    suffix = Path(path).suffix.lower()
    if suffix == ".csv":
        df.to_csv(path, index=False)
        return
    if suffix not in (".xlsx", ".xls"):
        raise ValueError("Use .xlsx/.xls or .csv")
    df.to_excel(path, index=False)

def run_file(infile: str, outfile: str, rounds: int = 5):
    """Evaluate every Thai sentence in `infile` and write the scores to `outfile`.

    Each non-empty cell of the `original` column is passed to
    evaluate_sentence(); missing cells (NaN/None) and blank strings are skipped.
    The result table is saved, its head is displayed, and the output path is
    printed.

    Args:
        infile: Path of the input table (.xlsx/.xls/.csv) with an `original` column.
        outfile: Path of the result table (.xlsx/.xls/.csv).
        rounds: Number of perturbation rounds per sentence.

    Raises:
        ValueError: if the input has no `original` column, or a path has an
            unsupported suffix.
    """
    df = load_table(infile)
    if "original" not in df.columns:
        raise ValueError("Input must have a column named 'original' (Thai).")

    rows = []
    for raw in tqdm(df["original"].tolist(), desc="Evaluating"):
        # Missing cells would otherwise turn into the literal text "nan"/"None"
        # and be sent to the translation model.
        if pd.isna(raw):
            continue
        thai = str(raw).strip()
        if not thai:
            continue
        try:
            rows.append(evaluate_sentence(thai, rounds=rounds))
        except Exception as e:
            rows.append({"original": thai, "error": f"{type(e).__name__}: {e}"})

    out_df = pd.DataFrame(rows)

    # pretty column ordering; per-round error columns are listed too, otherwise
    # the reindex below would silently drop them
    cols = ["original", "base_en"]
    for r in range(1, rounds + 1):
        cols += [
            f"round{r}_en", f"round{r}_back_th",
            f"round{r}_bert", f"round{r}_jaro", f"round{r}_fitness",
            f"round{r}_error",
        ]
    cols += ["best_round", "best_fitness", "best_bert", "best_jaro", "best_en", "best_th_back", "error"]
    cols = [c for c in cols if c in out_df.columns]
    out_df = out_df.reindex(columns=cols)

    save_table(out_df, outfile)
    display(out_df.head())
    print(f"Saved → {outfile}")


In [89]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [90]:

# Load the raw dataset. The raw file is never overwritten, so this cell can be
# re-run and the original workbook stays recoverable.
raw_path = "/content/test_dataset-llm.xlsx"
df_raw = load_table(raw_path)

# clean headers: trim whitespace (str() guards against non-string column names)
# and expose the Thai text column under the name run_file() expects
df = df_raw.rename(columns=lambda x: str(x).strip())
if "Sentiment" in df.columns:
    df = df.rename(columns={"Sentiment": "original"})

# write the cleaned copy next to the raw file instead of on top of it
infile  = "/content/test_dataset-llm.cleaned.xlsx"
df.to_excel(infile, index=False)

# NOTE(review): the recorded output of this cell mentions /content/scored.xlsx —
# confirm which file name is intended.
outfile = "/content/scoreds.xlsx"

run_file(infile, outfile, rounds=6)


Evaluating: 100%|██████████| 50/50 [08:16<00:00,  9.94s/it]


Unnamed: 0,original,base_en,round1_en,round1_back_th,round1_bert,round1_jaro,round1_fitness,round2_en,round2_back_th,round2_bert,...,round6_back_th,round6_bert,round6_jaro,round6_fitness,best_round,best_fitness,best_bert,best_jaro,best_en,best_th_back
0,ฉันมีความสุขมากที่ได้พบเพื่อนใหม่,I am very happy to have met new friends.,I am extremely pleased to have made new friends.,ฉันรู้สึกยินดีเป็นอย่างยิ่งที่ได้ทำความรู้จักก...,0.9405,0.605,0.8734,I am very happy to have made new friends.,ฉันรู้สึกมีความสุขมากที่ได้ทำความรู้จักกับเพื่...,0.9611,...,ฉันรู้สึกตื่นเต้นอย่างแท้จริงที่ได้สร้างมิตรภา...,0.9203,0.5496,0.8462,2,0.9288,0.9611,0.7994,I am very happy to have made new friends.,ฉันรู้สึกมีความสุขมากที่ได้ทำความรู้จักกับเพื่...
1,วันนี้อากาศดีและสดชื่น,Today the weather is nice and refreshing.,"Today, the weather is pleasant and invigorating.",วันนี้อากาศดีและสดชื่น,1.0,1.0,1.0,"Today, the weather is delightful and refreshing.",วันนี้อากาศดีและสดชื่นมาก,0.9883,...,วันนี้อากาศดีและสดชื่นมาก,0.9883,0.9474,0.9801,1,1.0,1.0,1.0,"Today, the weather is pleasant and invigorating.",วันนี้อากาศดีและสดชื่น
2,หล่อนรู้สึกเสียใจที่ไม่สามารถไปงานได้,She feels regretful that she cannot attend the...,She feels remorseful that she is unable to att...,เธอรู้สึกเสียใจที่ไม่สามารถเข้าร่วมงานได้,0.9475,0.7965,0.9173,She feels regretful that she cannot attend the...,เธอรู้สึกเสียใจที่ไม่สามารถเข้าร่วมงานได้,0.9475,...,เธอรู้สึกเสียดายที่ไม่สามารถเข้าร่วมงานได้,0.9341,0.6751,0.8823,1,0.9173,0.9475,0.7965,She feels remorseful that she is unable to att...,เธอรู้สึกเสียใจที่ไม่สามารถเข้าร่วมงานได้
3,อาหารมื้อนี้อร่อยสุดๆ,This meal is incredibly delicious.,This dish is extremely tasty.,จานนี้อร่อยมาก,0.9126,0.5582,0.8417,This dish is exceptionally flavorful.,จานนี้มีรสชาติอร่อยเป็นพิเศษ,0.8791,...,จานนี้มีรสชาติอร่อยเป็นพิเศษ,0.8791,0.5548,0.8142,1,0.8417,0.9126,0.5582,This dish is extremely tasty.,จานนี้อร่อยมาก
4,ฉันไม่พอใจกับบริการที่ได้รับ,I am not satisfied with the service I received.,I am not pleased with the service I got.,ฉันไม่พอใจกับบริการที่ได้รับ,1.0,1.0,1.0,I am not satisfied with the service I received.,ฉันไม่พอใจกับบริการที่ฉันได้รับ,0.9585,...,ฉันไม่พอใจกับบริการที่ฉันได้รับ,0.9585,0.9697,0.9608,1,1.0,1.0,1.0,I am not pleased with the service I got.,ฉันไม่พอใจกับบริการที่ได้รับ


Saved → /content/scored.xlsx
