In [None]:
## LM-Eval Harness를 활용해 언어 모델 평가하기
# pip install vllm 
# git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
# cd lm-evaluation-harness
# pip install -e .

# lm_eval --model vllm \
#     --model_args pretrained=kakaocorp/kanana-1.5-2.1b-instruct-2505,dtype=auto \
#     --tasks kmmlu_direct \
#     --log_samples \
#     --output_path results \
#     --batch_size auto


In [2]:
# Adding a new dataset to KoSimpleEval
# short-form
from datasets import load_dataset

# Fetch the GSM8K config of HRM8K.
ds = load_dataset("HAERAE-HUB/HRM8K", "GSM8K")

# Keep only question/answer from the test split and expose the answer
# under the column name `gold`.
df = (
    ds["test"]
    .to_pandas()
    .loc[:, ["question", "answer"]]
    .rename(columns={"answer": "gold"})
)

# Write the two-column table to disk without the index column.
df.to_csv("hrm8k_gsm.csv", index=False)

In [13]:
# Preview the first rows of `df`.
# NOTE(review): the saved output of this cell shows `Unnamed: 0` and `category`
# columns, which the question/gold frame built in the previous cell does not
# have -- presumably it was produced from a different `df`; re-run to confirm.
df.head()

Unnamed: 0,question,gold,category
0,$y=\frac{2}{x^2+x-6}$의 그래프는 몇 개의 수직 점근선을 가지나요?,2.0,Level 3
1,$30$의 $120\%$와 $20$의 $130\%$의 양의 차는 무엇인가?,10.0,Level 1
2,"$2^8=4^x$일 때, $x$의 값은 얼마입니까?",4.0,Level 1
3,"등차수열 6, 10, 14, 18, ...의 100번째 항은 무엇인가요?",402.0,Level 2
4,Mr. Madoff는 매년 일정한 이자율로 복리 계산되는 펀드에 1000달러를 투자...,7.0,Level 4


In [1]:
# Adding a new dataset to KoSimpleEval
# mcqa
# NOTE(review): the config loaded below is `hrm_gsm`; confirm that the
# "mcqa" label above is still the intended one for this cell.
from datasets import load_dataset

# Log in first (e.g. `huggingface-cli login`) to access this dataset.
ds = load_dataset(path="HAERAE-HUB/KoSimpleEval", name="hrm_gsm")

In [2]:
# Show the first test question.
# Index the row first and the column second: `ds['test']['question'][0]`
# asks for the whole `question` column just to read a single item.
print(ds['test'][0]['question'])

Janet의 오리는 하루에 16개의 알을 낳습니다. 그녀는 매일 아침으로 3개를 먹고, 친구들을 위해 머핀을 구울 때 4개를 사용합니다. 남은 계란은 매일 농산물 시장에서 신선한 오리 알 하나당 2달러에 판매합니다. 그녀는 매일 농산물 시장에서 얼마를 버나요?


In [5]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# ---- configuration ------------------------------------------------------
DATASET = "hrm_gsm"
N_SAMPLES = 20  # number of test rows to run through the model
SEED = 42       # fixed sampling seed so a re-run gives comparable generations

SYSTEM_PROMPT = "You are a helpful Korean assistant."
# Appended to every question: asks for the final answer in \boxed{N} form.
ANSWER_INSTRUCTION = "\n최종 답을 \\boxed{N}의 형태로 적어주세요."

# Local GGUF weights, paired with the tokenizer of the original HF checkpoint.
llm = LLM(
    model='./qwen2.5-0.5b-instruct-q4_0.gguf',
    tokenizer="Qwen/Qwen2.5-0.5B-Instruct",
    trust_remote_code=True,
    dtype="auto",
    max_model_len=2048,
    gpu_memory_utilization=0.6,
)

# ---- load data ----------------------------------------------------------
# Take an explicit copy of the first N_SAMPLES rows, because a `response`
# column is added to this frame at the end of the cell.
df = (
    load_dataset('HAERAE-HUB/KoSimpleEval', DATASET, split='test')
    .to_pandas()
    .head(N_SAMPLES)
    .copy()
)

# ---- craft prompts ------------------------------------------------------
# One chat conversation (system + user turn) per question.
prompts = [
    [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": question + ANSWER_INSTRUCTION},
    ]
    for question in df["question"]
]

# ---- generate -----------------------------------------------------------
# NOTE(review): max_tokens equals max_model_len above, so the prompt and the
# generation share the same 2048-token budget -- confirm this is intended.
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.95,
    max_tokens=2048,
    repetition_penalty=1.01,
    seed=SEED,  # temperature > 0: without a seed every run samples differently
)
outputs = llm.chat(prompts, sampling_params)

# Keep the text of the first (only requested) completion for each prompt.
df["response"] = [o.outputs[0].text for o in outputs]

Adding requests:   0%|          | 0/20 [00:00<?, ?it/s]

Processed prompts:   0%|                     | 0/20 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.…

In [10]:
## Math Evaluation
# Scores the `response` column of the in-memory `df` against `gold`.
import pandas as pd 
from math_verify import parse, verify

output = []
for _, row in df.iterrows():
    try:
        response = row.response  # .split('</think>')[1] for reasoning models
        resp = parse(response)

        # Wrap the gold value in \boxed{} so it goes through the same parser.
        gold = parse('\\boxed{' + str(row.gold) + '}')

        # Primary check: math_verify's equivalence test.
        matches_verify = verify(gold, resp)

        # Fallback: literal match against the last parsed item. Compare as
        # strings -- `row.gold == resp[-1]` is always False when gold is
        # numeric and the parsed item is text.
        # (presumably resp[-1] is the extracted answer string; confirm
        # against the math_verify documentation)
        matches_literal = bool(resp) and str(row.gold).strip() == str(resp[-1]).strip()

        output.append(bool(matches_verify or matches_literal))
    except Exception:
        # Best effort: a response that cannot be parsed or verified counts as
        # wrong. `Exception` (not a bare except) keeps Ctrl-C working.
        output.append(False)
df['correct'] = output

overall_acc = df["correct"].mean()
print(f"Overall accuracy: {overall_acc:.2%}")

Overall accuracy: 10.00%


In [1]:
## Math Evaluation
# Scores a saved generations file (columns used: `response`, `gold`).
# Requires the `math_verify` module; the saved output of this cell shows a
# ModuleNotFoundError, so install it in the kernel environment first
# (presumably `pip install math-verify` -- confirm the package name).
import pandas as pd 
from math_verify import parse, verify

df = pd.read_csv('HRM_MATH--Qwen_Qwen2.5-1.5B-Instruct.csv')

output = []
for _, row in df.iterrows():
    try:
        response = row.response  # .split('</think>')[1] for reasoning models
        resp = parse(response)

        # Wrap the gold value in \boxed{} so it goes through the same parser.
        gold = parse('\\boxed{' + str(row.gold) + '}')

        # Primary check: math_verify's equivalence test.
        matches_verify = verify(gold, resp)

        # Fallback: literal match against the last parsed item. Compare as
        # strings -- after read_csv `gold` may be numeric, and
        # `row.gold == resp[-1]` is then always False against parsed text.
        # (presumably resp[-1] is the extracted answer string; confirm
        # against the math_verify documentation)
        matches_literal = bool(resp) and str(row.gold).strip() == str(resp[-1]).strip()

        output.append(bool(matches_verify or matches_literal))
    except Exception:
        # Best effort: a response that cannot be parsed or verified (including
        # a missing/NaN response) counts as wrong. `Exception` rather than a
        # bare except keeps Ctrl-C working.
        output.append(False)
df['correct'] = output

overall_acc = df["correct"].mean()
print(f"Overall accuracy: {overall_acc:.2%}")

ModuleNotFoundError: No module named 'math_verify'

In [None]:
# KMMLU-Redux Evaluation
# Step 0: extract one predicted answer per row into the `pred` column.
import pandas as pd 
from math_verify import parse

df = pd.read_csv('HRB1_0--Qwen_Qwen2.5-1.5B-Instruct.csv')

output = []
for _, row in df.iterrows():
    try:
        # Drop a leading reasoning block if there is one. With maxsplit=1 and
        # [-1] this yields the text after the first `</think>`, or the whole
        # response when the tag is absent. The previous `[1]` raised
        # IndexError for every response without the tag, turning all of them
        # into '-1'.
        response = row.response.split('</think>', 1)[-1]
        resp = parse(response)
        # Last parsed item is used as the prediction; an empty parse result
        # raises IndexError and falls through to the '-1' sentinel below.
        output.append(resp[-1])
    except Exception:
        # Sentinel for "no usable answer" (unparsable or missing response).
        output.append('-1')
df['pred'] = output


# 1) Define mappings between 1-based option numbers and option letters
num2letter = {1: "A", 2: "B", 3: "C", 4: "D", 5: "E"}
letter2num = {v: k for k, v in num2letter.items()}

# 2) Normalize a raw prediction to an option number (1-5)
def to_pred_num(x):
    """Convert a raw prediction into an integer option number.

    Args:
        x: The raw prediction -- an int/float, a digit string such as "3",
            or an option letter such as "b" (case-insensitive). Surrounding
            whitespace on strings is ignored.

    Returns:
        The integer value for numeric input, the mapped number for a letter
        found in ``letter2num``, or ``None`` when the input is not recognised
        (including NaN, infinities, signed strings such as "-1", and ``None``).
    """
    if isinstance(x, str):
        x = x.strip()

    # Already numeric, or a string made only of digits.
    if isinstance(x, (int, float)) or (isinstance(x, str) and x.isdigit()):
        try:
            return int(x)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError and int(inf) raises OverflowError;
            # digit-like characters that int() cannot read raise ValueError.
            pass

    # One of the option letters.
    if isinstance(x, str) and x.upper() in letter2num:
        return letter2num[x.upper()]

    # Anything else is unrecognised.
    return None

# Map every raw prediction to its option number via to_pred_num.
pred_numbers = df["pred"].apply(to_pred_num)
df["pred_num"] = pred_numbers

# 3) Optional letter view of the gold answers
gold_letters = df["gold"].map(num2letter)
df["gold_letter"] = gold_letters

# 4) A row is correct when the normalized prediction equals the gold number
df["correct"] = df["pred_num"].eq(df["gold"])

# 5) Overall accuracy = mean of the boolean `correct` column
overall_acc = df["correct"].mean()
print(f"Overall accuracy: {overall_acc:.2%}")

# 6) Accuracy per category, best category first
per_category = df.groupby("category")["correct"].mean()
acc_by_cat = per_category.sort_values(ascending=False)
print("\nAccuracy by category:")
print(acc_by_cat)