In [35]:
import pandas as pd
import evaluate, sacrebleu
from tqdm import tqdm
import os
import glob

### Get respond

In [None]:
Instruction = pd.read_csv('Thai_Chinese_Dataset.csv')

In [None]:
from google import genai

client = genai.Client(api_key='ํYOUR_KEY')

start_index = 0
batch_num = 1

for index, row in tqdm(Instruction.iterrows()):
    
    '''
    This loop generates responses for each instruction in the Instruction DataFrame using the Gemini-2.5-flash model.
    It saves the responses in batches of 10 to CSV files in the 'respond_batch' directory.

    '''

    # print(f"Processing row {index} \n")

    prompt = f"""You are an intelligent language model. 
    Follow the instruction carefully and respond concisely.

    Instruction: {row['instruction']}
    Input: "{row['input']}"
    Output:"""

    # print(prompt, '\n')

    response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=prompt,
    )
    
    Instruction.loc[index, 'respond'] = response.text

    if (index+1) % 10 == 0 or (index+1) == len(Instruction):
        print(f'BATCH {batch_num} SAVE : from {start_index} to {index}')
        batch_df = Instruction.iloc[start_index:index+1]
        os.makedirs("respond_batch", exist_ok=True)
        batch_df.to_csv(f"respond_batch/batch_{batch_num}.csv", index=True)
        batch_num += 1
        start_index = index+1



In [None]:
# combine the bacthes into one csv file
files = glob.glob("respond_batch/batch_*.csv")
respond_df = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
respond_df.to_csv("outputs/responded_dataset.csv", index=False)
respond_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,file,type,instruction,input,ref,respond
0,0,0,/kikis_coding/Nectec/InstructionDataset/Datase...,Conversation-10.json,Translate this casual conversation from Chines...,A: 你吃饭了吗？\nB: 吃过了，你呢？,ก: กินข้าวหรือยัง?\nข: กินแล้ว แล้วเธอล่ะ?,output
1,1,1,/kikis_coding/Nectec/InstructionDataset/Datase...,Conversation-10.json,Translate this casual conversation from Chines...,A: 今天天气很好，我们去散步吧。\nB: 好的，我正好也想出去走走。,ก: วันนี้อากาศดี ไปเดินเล่นกันเถอะ\nข: ได้เลย ...,output
2,2,2,/kikis_coding/Nectec/InstructionDataset/Datase...,Conversation-10.json,Translate this casual conversation from Chines...,A: 你周末有什么计划？\nB: 想在家休息，你呢？,ก: เสาร์อาทิตย์นี้มีแผนอะไรไหม?\nข: อยากพักผ่อ...,output
3,3,3,/kikis_coding/Nectec/InstructionDataset/Datase...,Conversation-10.json,Translate this casual conversation from Chines...,A: 你喜欢看电影吗？\nB: 喜欢，尤其是科幻片。,ก: คุณชอบดูหนังไหม?\nข: ชอบ โดยเฉพาะหนังไซไฟ,output
4,4,4,/kikis_coding/Nectec/InstructionDataset/Datase...,Conversation-10.json,Translate this casual conversation from Chines...,A: 你最近怎么样？\nB: 还不错，就是工作有点忙。,ก: ช่วงนี้เป็นยังไงบ้าง?\nข: ก็โอเคนะ แค่ช่วงน...,output
...,...,...,...,...,...,...,...,...
405,85,85,/kikis_coding/Nectec/InstructionDataset/Datase...,idiom_20.json,Translate this Thai idiom to Chinese:,ช้าๆ ได้พร้าเล่มงาม,欲速则不达,output
406,86,86,/kikis_coding/Nectec/InstructionDataset/Datase...,idiom_20.json,Translate this Thai idiom to Chinese:,ตีงูให้หลังหัก,一击即中,output
407,87,87,/kikis_coding/Nectec/InstructionDataset/Datase...,idiom_20.json,Translate this Thai idiom to Chinese:,ตำน้ำพริกละลายแม่น้ำ,徒劳无功,output
408,88,88,/kikis_coding/Nectec/InstructionDataset/Datase...,idiom_20.json,Translate this Thai idiom to Chinese:,น้ำท่วมปาก,守口如瓶,output


### Load Dataset

In [None]:
df = pd.read_csv("outputs/responded_dataset.csv") #llm responses csv

### Evaluate

In [None]:
# chrF++ per-sample (solid for ZH/TH)
chrf = evaluate.load("chrf")
df["chrf"] = [
    chrf.compute(predictions=[h], references=[[r]])["score"]
    for h, r in tqdm(list(zip(df["respond"], df["ref"])),
                     total=len(df), desc="chrF")
]

# sentence BLEU
df["bleu_sent"] = [
    sacrebleu.sentence_bleu(h, [r]).score
    for h, r in tqdm(list(zip(df["respond"], df["ref"])),
                     total=len(df), desc="BLEU (sent)")
]

# corpus BLEU
corpus_bleu = sacrebleu.corpus_bleu(df["respond"].tolist(), [df["ref"].tolist()]).score

# Optional semantic similarity
try:
    bs = evaluate.load("bertscore")
    df["bertscore_f1"] = bs.compute(
        predictions=df["respond"].tolist(),
        references=df["ref"].tolist(),
        lang="th"  # language of the hypothesis strings
    )["f1"]
except Exception as e:
    print("Skipping BERTScore (install torch + bert-score to enable). Reason:", e)

print("\n=== Qwen corpus summary ===")
print(f"chrF++ avg     : {df['chrf'].mean():.3f}")
print(f"BLEU (avg)     : {df['bleu_sent'].mean():.3f}")
print(f"BLEU (corpus)  : {corpus_bleu:.3f}")
if "bertscore_f1" in df:
    print(f"BERTScore F1   : {df['bertscore_f1'].mean():.4f}")

chrF: 100%|██████████| 410/410 [00:05<00:00, 68.79it/s] 
BLEU (sent): 100%|██████████| 410/410 [00:00<00:00, 6075.95it/s]



=== Qwen corpus summary ===
chrF++ avg     : 0.421
BLEU (avg)     : 0.000
BLEU (corpus)  : 0.000
BERTScore F1   : 0.5630


### Save evaluation result

In [None]:
df.to_csv('outputs/evaluations.csv')

### Summary

In [None]:
df = pd.read_csv('outputs/evaluations.csv')

In [None]:
df = df.copy()[['type','chrf', 'bleu_sent', 'bertscore_f1']]

In [None]:
# add domain column
df['domain'] = df['type'].str.replace('.json', '').str.replace('-', '_').str.split('_').str[0]
df['domain'] = df['domain'].str.title().replace({
    'word': 'Word_Alignment',
    'Partial': 'Partial_Translation'
})

In [None]:
df.groupby('domain')[['chrf', 'bleu_sent', 'bertscore_f1']].agg(['mean', 'std'])

Unnamed: 0_level_0,chrf,chrf,bleu_sent,bleu_sent,bertscore_f1,bertscore_f1
Unnamed: 0_level_1,mean,std,mean,std,mean,std
domain,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Conversation,0.0,0.0,0.0,0.0,0.541247,0.010512
Dictionary,0.0,0.0,0.0,0.0,0.607971,0.040265
Domain,0.0,0.0,0.0,0.0,0.634418,0.040957
Error,0.818714,0.349038,0.0,0.0,0.55651,0.013618
Hsk,0.614624,0.069913,0.0,0.0,0.519975,0.011132
Html,1.332404,0.653978,0.0,0.0,0.581172,0.025394
Idiom,0.0,0.0,0.0,0.0,0.651071,0.037839
Partial_Translation,0.549679,0.10707,0.0,0.0,0.544251,0.014828
Sentence,0.0,0.0,0.0,0.0,0.593765,0.017007
Summarization,0.335978,0.192404,0.0,0.0,0.477649,0.034722


In [None]:
df.to_csv('outputs/evaluations_summary.csv')