In [24]:
import pandas as pd
import os
from src.prompts import language_dict

def get_keys_by_value(d, target_value):
    """Return the first key of ``d`` whose value equals ``target_value``.

    Despite the plural name, exactly one key is returned. Keys are examined
    in the dictionary's iteration order and the search stops at the first hit.

    Args:
        d: Mapping to search.
        target_value: Value to look for (compared with ``==``).

    Returns:
        The first key whose value equals ``target_value``.

    Raises:
        IndexError: If no value in ``d`` equals ``target_value``. The
            exception type matches the previous list-indexing implementation,
            but the message now names the missing value.
    """
    for key, value in d.items():
        if value == target_value:
            return key
    raise IndexError(f"no key in mapping has value {target_value!r}")

# Languages (score-file column names) and models (score-file rows) to export.
languages = ["Afrikaans", "Arabic", "Chinese_(Simplified)", "English", "French", "German", "Hebrew", "Indonesian", "Italian", "Japanese", "Korean", "Spanish", "Turkish", "Vietnamese"]
models = [
    "jwhj/Qwen2.5-Math-1.5B-OREO", "nvidia/AceMath-1.5B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "Qwen/Qwen2.5-Math-1.5B-Instruct",
    "PRIME-RL/Eurus-2-7B-PRIME", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    "nvidia/AceMath-7B-Instruct", "Qwen/Qwen2.5-Math-7B-Instruct",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "o3-mini"
]
# Output column header for each language: its key in language_dict.
iso = [get_keys_by_value(language_dict, l) for l in languages]

# Read both score tables once; they do not change between languages.
df_500 = pd.read_csv(os.path.join("score_result", "math500.csv"))
df_100 = pd.read_csv(os.path.join("score_result", "math100.csv"))

# The score files store model names with "/" replaced by "_".
model_keys = [m.replace("/", "_") for m in models]
models_500, models_100 = list(df_500["model"]), list(df_100["model"])
# Row position of every model in each table (first match wins; a model that
# is missing raises ValueError, exactly as the per-iteration lookup did).
rows_500 = [models_500.index(key) for key in model_keys]
# Only needed if the math500-vs-math100 gap is exported instead, i.e.
# abs(scores_500[r500] - scores_100[r100]); kept so a model missing from
# math100.csv is still reported.
rows_100 = [models_100.index(key) for key in model_keys]

res = {"model": models}
for code, l in zip(iso, languages):
    # Convert the language column once instead of once per model.
    scores_500 = list(df_500[l])
    res[code] = [scores_500[row] for row in rows_500]
res = pd.DataFrame(res)
res.to_excel("check.xlsx", index=False)

In [47]:
import scipy.stats
import pickle

import numpy as np  # np.mean is used below; numpy was not imported by any visible cell

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load a boot.pickle that was produced by this project.
with open("boot.pickle", "rb") as f:
    a = pickle.load(f)

# NOTE(review): the banner says "T-test", but the statistic computed below is
# Kendall's tau between the per-key means of the two benchmarks.
print(f"--------------------- Evaluation Score T-test ---------------------")
# One mean per key of a["math500"], paired with the mean stored under the same
# key in a["math100"] (presumably bootstrap samples per model - TODO confirm).
check_500, check_100 = [], []
for key in a["math500"].keys():
    check_500.append(np.mean(a["math500"][key]))
    check_100.append(np.mean(a["math100"][key]))
# Last expression of the cell: displayed as the cell output.
scipy.stats.kendalltau(check_500, check_100)

--------------------- Evaluation Score T-test ---------------------


SignificanceResult(statistic=0.9555555555555554, pvalue=5.5114638447971785e-06)

In [4]:
from metrics import evaluate_consistency
import os

# Print the value returned by evaluate_consistency for one model's result
# directory under each task (the recorded output shows a Fleiss' kappa).
root_path = "prm_result/1.5B/prm_72B_5_8"
model_dir = "text-completion-openai_Qwen_Qwen2.5-Math-1.5B-Instruct"
for task in ("math100", "aime2024"):
    result_dir = os.path.join(root_path, task, model_dir)
    a = evaluate_consistency(result_dir)
    print(f"Qwen2.5-Math-1.5B-Instruct - {task}: ", a)

100%|██████████| 99/99 [00:04<00:00, 20.66it/s]


Overall Fleiss' kappa (multi-rater agreement): 0.598
Qwen2.5-Math-1.5B-Instruct - math100:  0.5982547552960662


100%|██████████| 30/30 [00:01<00:00, 19.98it/s]


Overall Fleiss' kappa (multi-rater agreement): 0.618
Qwen2.5-Math-1.5B-Instruct - aime2024:  0.6178097740597748


In [None]:
from litellm import batch_completion
from src.prompts import judge_template, language_dict
from datasets import load_dataset
import pandas as pd
import os

# Credentials are read from the environment. The placeholders below are only
# filled in when a variable is not set at all, so real keys that were exported
# before starting the kernel are no longer overwritten. Never commit real keys.
os.environ.setdefault("OPENAI_API_KEY", "sk-xxx")
os.environ.setdefault("HF_TOKEN", "hf_xxx")

def get_keys_by_value(d, target_value):
    """Return the first key of ``d`` whose value equals ``target_value``.

    Despite the plural name, exactly one key is returned. Keys are examined
    in the dictionary's iteration order and the search stops at the first hit.

    Args:
        d: Mapping to search.
        target_value: Value to look for (compared with ``==``).

    Returns:
        The first key whose value equals ``target_value``.

    Raises:
        IndexError: If no value in ``d`` equals ``target_value``. The
            exception type matches the previous list-indexing implementation,
            but the message now names the missing value.
    """
    for key, value in d.items():
        if value == target_value:
            return key
    raise IndexError(f"no key in mapping has value {target_value!r}")

# Root directories to evaluate; each one is split into per-task sub-directories.
path_list = [
    "prm_result/1.5B/prm_72B_5_8",
    # "results"
]
save_path, log_path = "new_mo_result", "new_mo_logs"
os.makedirs(save_path, exist_ok=True)
os.makedirs(log_path, exist_ok=True)

# Model directory whose generations are stored under each root path.
model_by_root = {
    "prm_result/1.5B/prm_72B_5_8": "text-completion-openai_Qwen_Qwen2.5-Math-1.5B-Instruct",
    "results": "deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B",
}

for root_path in path_list:
    # Fail loudly for an unconfigured path instead of hitting an unbound (or
    # stale, left over from the previous iteration) `model`.
    if root_path not in model_by_root:
        raise ValueError(f"No model is configured for root path {root_path!r}")
    model = model_by_root[root_path]

    for data in ["IMO", "MMO"]:
        if data == "IMO":
            # Only consulted while scoring IMO (see the row filter below).
            dataset = load_dataset("OLAIR/M-IMO-extended", split="train").to_pandas()
        res = {
            "question": [],
            "response": [],
            "answer": [],
            "model": [],
            "language": [],
            "input_prompt": [],
            "judge": []
        }
        languages = list(language_dict.values())
        result_dict = {key: [] for key in languages}
        # Name under which each language's rows are stored on disk and in the
        # log: MMO files use "Chinese" for "Chinese_(Simplified)". The same
        # mapping is used for loading and for scoring so the two cannot drift.
        stored_name = {
            lang: "Chinese" if (lang == "Chinese_(Simplified)" and data == "MMO") else lang
            for lang in languages
        }

        # ---- 1. Collect responses and build one judge prompt per row. ----
        for lang in languages:
            stored = stored_name[lang]
            jsonl_path = os.path.join(root_path, data, model, f"{stored}.jsonl")
            if not os.path.exists(jsonl_path):
                continue  # this language was not generated for this task
            df = pd.read_json(jsonl_path, lines=True)
            for _, row in df.iterrows():
                res["model"].append(model)
                res["language"].append(stored)
                res["question"].append(row["question"])
                res["response"].append(row["response"])
                res["answer"].append(row["answer"])
                user_prompt = (
                    judge_template
                    .replace("<math_question>", str(row["question"]))
                    .replace("<correct_answer>", str(row["answer"]))
                    .replace("<model_solution>", str(row["response"]))
                )
                res["input_prompt"].append([
                    {"role": "system", "content": "You are a good judge."},
                    {"role": "user", "content": user_prompt}
                ])

        # ---- 2. Ask the judge model about every prompt. ----
        print(f"{data} is Evaluating!")
        outputs = batch_completion(
            messages=res["input_prompt"],
            temperature=0,
            max_tokens=4096,
            model="gpt-4o-mini"
        )
        for output in outputs:
            try:
                res["judge"].append(output.choices[0].message.content)
            except Exception:
                # Best effort: an entry without a usable completion is logged
                # as an empty verdict (and therefore scored as incorrect).
                # `Exception` rather than a bare `except` so that
                # KeyboardInterrupt / SystemExit still stop the run.
                res["judge"].append("")
        res = pd.DataFrame(res)
        res.to_csv(f"{log_path}/{data}-{model}_log.csv", index=False)
        print(f"{data} Evaluation is done!")

        # ---- 3. Accuracy per language. ----
        for lang in languages:
            stored = stored_name[lang]
            subset = res[res["language"] == stored].reset_index(drop=True)
            # For IMO, a row is counted only when the dataset column for this
            # language is truthy at the same position.
            # NOTE(review): this assumes the jsonl rows are in dataset order
            # and no longer than the dataset - confirm.
            flag_column = get_keys_by_value(language_dict, stored) if data == "IMO" else None
            true, false = 0, 0
            for i, row in subset.iterrows():
                if flag_column is not None and not dataset.loc[i, flag_column]:
                    continue
                if row.judge == "[[TRUE]]":
                    true += 1
                else:
                    false += 1

            counted = true + false
            # No counted rows (file missing, or every row filtered out) is
            # reported as None instead of raising ZeroDivisionError.
            acc = true / counted * 100 if counted else None
            result_dict[lang].append(acc)

        result = pd.DataFrame(result_dict)
        result.to_csv(f"{save_path}/{data}-{model}.csv", index=False)