# Benchmark with KoBEST

In [1]:
!pip install -q datasets transformers peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Hugging Face Hub ID of the base checkpoint; used to load both the base model
# and the tokenizer, and evaluated as the "vanilla" baseline.
base_model_id = "meta-llama/Llama-3.2-1B"
# Directory holding the LoRA adapter passed to PeftModel.from_pretrained.
# It lives under /content/drive, so the Drive mount cell must run before it is read.
adapter_path = "/content/drive/MyDrive/Llama3_Korean_Finetune/final-korean-adapted-llama1b"
# Local directory the merged model and tokenizer are saved to; the evaluation
# cell later passes this path as `pretrained=` for the fine-tuned run.
merged_model_path = "./merged_korean_model"

In [2]:
from huggingface_hub import login
from google.colab import drive
# Authenticate with the Hugging Face Hub before any model download is attempted.
login()
# Mount Google Drive at /content/drive so paths under that prefix resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Build a standalone checkpoint: load the base model, attach the LoRA adapter,
# merge it, and save model + tokenizer to `merged_model_path` so the evaluation
# cell can load it by path.

# --- Load Base Model ---
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# --- Load LoRA Adapters ---
# Wraps `base_model` with the adapter stored at `adapter_path`.
print(f"Loading LoRA adapters from {adapter_path}...")
model = PeftModel.from_pretrained(base_model, adapter_path)

# --- Merge and Unload ---
# `model` is rebound to the result of merge_and_unload(); the PEFT wrapper is no
# longer referenced by this name afterwards.
print("Merging adapters into the base model...")
model = model.merge_and_unload()
print("Merge complete.")

# --- Save the Merged Model for the Harness ---
print(f"Saving the final merged model to {merged_model_path}...")
# NOTE(review): the tokenizer is loaded from `base_model_id`, not `adapter_path`.
# If fine-tuning changed the tokenizer (e.g. added tokens), load it from the
# adapter directory instead — confirm against the training notebook.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)
print("Merged model saved successfully.")


Loading base model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Loading LoRA adapters from /content/drive/MyDrive/Llama3_Korean_Finetune/final-korean-adapted-adapters...
Merging adapters into the base model...
Merge complete.
Saving the final merged model to ./merged_korean_model...


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Merged model saved successfully.


# Evaluation

In [29]:
# Helpers for running lm-evaluation-harness: `evaluate_model` wraps `evaluator.simple_evaluate`,
# prints the result tables, and optionally writes the results to a JSON file.

import argparse
import json
import logging
import os
import numpy as np
from pathlib import Path
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.utils import make_table

def _handle_non_serializable(o):
    if isinstance(o, np.int64) or isinstance(o, np.int32):
        return int(o)
    elif isinstance(o, set):
        return list(o)
    else:
        return str(o)

def evaluate_model(model_name, model_args, tasks, num_fewshot=0, batch_size=8, output_path=None, device="cuda"):
    """Run lm-evaluation-harness on ``tasks`` and print the result tables.

    Args:
        model_name: Model type forwarded as ``model`` to
            ``evaluator.simple_evaluate`` (the notebook uses ``"hf-auto"``).
        model_args: Model argument string forwarded unchanged, e.g.
            ``"pretrained=<path-or-id>,dtype=float16"``.
        tasks: Comma-separated task names, or an iterable of task names.
            Whitespace around each name is ignored and empty entries are dropped.
        num_fewshot: Number of few-shot examples forwarded to the harness.
        batch_size: Batch size; forwarded to the harness as a string.
        output_path: Optional path of a JSON file to write the results to.
            Missing parent directories are created.
        device: Device string forwarded to the harness.

    Returns:
        The value returned by ``evaluator.simple_evaluate``. When it is ``None``
        nothing is printed or written.

    Raises:
        ValueError: If ``tasks`` contains no task names.
    """
    # Normalize the task list: tolerate "a, b" style input and trailing commas.
    raw_tasks = tasks.split(",") if isinstance(tasks, str) else list(tasks)
    task_names = [t.strip() if isinstance(t, str) else t for t in raw_tasks]
    task_names = [t for t in task_names if t != ""]
    if not task_names:
        raise ValueError("`tasks` must contain at least one task name")

    # Set before the harness runs so the tokenizers library does not use parallelism.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    print(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model=model_name,
        model_args=model_args,
        tasks=task_names,
        num_fewshot=num_fewshot,
        batch_size=str(batch_size),
        max_batch_size=None,
        device=device,
        use_cache=None,
        limit=None,
        check_integrity=False,
        write_out=False,
        log_samples=False,
        gen_kwargs=None,
    )

    if results is not None:
        if output_path:
            output_file_path = Path(output_path)
            output_file_path.parent.mkdir(parents=True, exist_ok=True)

            # ensure_ascii=False writes non-ASCII text verbatim, so the file
            # encoding must be explicit rather than locale-dependent.
            with open(output_file_path, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2, default=_handle_non_serializable, ensure_ascii=False)

        print(make_table(results))
        if "groups" in results:
            print(make_table(results, "groups"))

    return results

In [30]:
# --- 1. Baseline: the untouched base checkpoint from the Hub ---
print("=" * 50, "Running evaluation on: Vanilla Llama-3.2-1B", "=" * 50, sep="\n")
vanilla_model_args = "pretrained={},dtype=float16".format(base_model_id)
vanilla_results = evaluate_model(
    "hf-auto",
    vanilla_model_args,
    "kobest_hellaswag,kobest_copa,kobest_boolq,kobest_sentineg",
    num_fewshot=0,
    batch_size=8,
)

# --- 2. Fine-tuned: the merged checkpoint saved locally by the merge cell ---
# Same tasks, few-shot count and batch size as the baseline so the two tables are comparable.
print("\n", "=" * 50, "Running evaluation on: Your Finetuned Model", "=" * 50, sep="\n")
finetuned_model_args = "pretrained={},dtype=float16".format(merged_model_path)
finetuned_results = evaluate_model(
    "hf-auto",
    finetuned_model_args,
    "kobest_hellaswag,kobest_copa,kobest_boolq,kobest_sentineg",
    num_fewshot=0,
    batch_size=8,
)

Running evaluation on: Vanilla Llama-3.2-1B-Instruct
Selected Tasks: ['kobest_hellaswag', 'kobest_copa', 'kobest_boolq', 'kobest_sentineg']


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.20k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/2.28M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/578k [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2029 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

train.jsonl:   0%|          | 0.00/864k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/291k [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/141k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3076 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

train.jsonl:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/692k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3665 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1404 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/700 [00:00<?, ? examples/s]

train.jsonl:   0%|          | 0.00/426k [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/40.7k [00:00<?, ?B/s]

test_originated.jsonl:   0%|          | 0.00/40.3k [00:00<?, ?B/s]

validation.jsonl:   0%|          | 0.00/46.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3649 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/397 [00:00<?, ? examples/s]

Generating test_originated split:   0%|          | 0/397 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/400 [00:00<?, ? examples/s]

100%|██████████| 397/397 [00:00<00:00, 138796.26it/s]
100%|██████████| 1404/1404 [00:00<00:00, 2047.71it/s]
100%|██████████| 1000/1000 [00:00<00:00, 2587.92it/s]
100%|██████████| 500/500 [00:00<00:00, 2840.11it/s]
Running loglikelihood requests: 100%|██████████| 7602/7602 [01:46<00:00, 71.22it/s] 


|     Tasks      |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|----------------|------:|------|-----:|--------|---|-----:|---|------|
|kobest_boolq    |      1|none  |     0|acc     |↑  |0.4986|±  |0.0133|
|                |       |none  |     0|f1      |↑  |0.3732|±  |   N/A|
|kobest_copa     |      1|none  |     0|acc     |↑  |0.5300|±  |0.0158|
|                |       |none  |     0|f1      |↑  |0.5295|±  |   N/A|
|kobest_hellaswag|      1|none  |     0|acc     |↑  |0.3900|±  |0.0218|
|                |       |none  |     0|acc_norm|↑  |0.5080|±  |0.0224|
|                |       |none  |     0|f1      |↑  |0.3857|±  |   N/A|
|kobest_sentineg |      1|none  |     0|acc     |↑  |0.4736|±  |0.0251|
|                |       |none  |     0|f1      |↑  |0.4735|±  |   N/A|


Running evaluation on: Your Finetuned Model
Selected Tasks: ['kobest_hellaswag', 'kobest_copa', 'kobest_boolq', 'kobest_sentineg']


100%|██████████| 397/397 [00:00<00:00, 145376.17it/s]
100%|██████████| 1404/1404 [00:00<00:00, 1998.37it/s]
100%|██████████| 1000/1000 [00:00<00:00, 110521.84it/s]
100%|██████████| 500/500 [00:00<00:00, 2652.89it/s]
Running loglikelihood requests: 100%|██████████| 7602/7602 [01:44<00:00, 72.79it/s] 


|     Tasks      |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|----------------|------:|------|-----:|--------|---|-----:|---|------|
|kobest_boolq    |      1|none  |     0|acc     |↑  |0.5021|±  |0.0133|
|                |       |none  |     0|f1      |↑  |0.3343|±  |   N/A|
|kobest_copa     |      1|none  |     0|acc     |↑  |0.5250|±  |0.0158|
|                |       |none  |     0|f1      |↑  |0.5239|±  |   N/A|
|kobest_hellaswag|      1|none  |     0|acc     |↑  |0.3660|±  |0.0216|
|                |       |none  |     0|acc_norm|↑  |0.4740|±  |0.0224|
|                |       |none  |     0|f1      |↑  |0.3627|±  |   N/A|
|kobest_sentineg |      1|none  |     0|acc     |↑  |0.4962|±  |0.0251|
|                |       |none  |     0|f1      |↑  |0.3360|±  |   N/A|

