# Evaluation of pretrained and PEFT finetuned models

## Setup

In [1]:
import sys
import os

# Put the parent of the current working directory on the import path so the
# `src.*` package import below can resolve.
# NOTE(review): assumes the kernel is started from a direct child of the
# project root (e.g. a notebooks/ folder) — confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if not any(entry == project_root for entry in sys.path):
    # Prepend (rather than append) so this checkout is searched first.
    sys.path.insert(0, project_root)

from src.eval.evaluation import run_transformer_evaluation, run_adapter_evaluation

## Evaluation configuration

This section defines the evaluation tasks to run, the location where results are written, and the transformer and adapter (fine-tuned) models to evaluate.

In [3]:
# Forwarded to every evaluation run as `custom_tasks_path`.
CUSTOM_TASKS_PATH = "src.eval.custom_usmle_qa"

# Task identifiers; each one is passed unchanged as the `tasks` argument of a run.
# NOTE(review): these look like lighteval `suite|task|few_shot|truncate` specs — confirm.
tasks_to_run = [
    "community|usmle-qa-letter|1|0",
    "community|usmle-qa-text|1|0",
    "community|usmle-qa-letter-text|1|0",
    "community|usmle-qa-mcf|4|0",
    "community|usmle-qa-cf|4|0",
]

# Destination for evaluation results (passed to each run as `output_dir`).
gcs_bucket_name = "open-llm-finetuning"
output_directory = "gcs://" + gcs_bucket_name + "/evaluation_results"

# Stand-alone transformer checkpoints, given as (model_name, batch_size).
# None of them is evaluated with a chat template.
transformer_models_to_eval = [
    {
        "model_name": checkpoint,
        "use_chat_template": False,
        "batch_size": per_model_batch,
    }
    for checkpoint, per_model_batch in (
        ("meta-llama/Llama-2-7b-hf", 4),
        ("meta-llama/Meta-Llama-3-8B", 1),
        ("Sirius27/BeingWell_llama2_7b", 4),
        ("johnsnowlabs/JSL-MedLlama-3-8B-v1.0", 2),
    )
]

# Fine-tuned adapters. Every entry shares the same base model, tokenizer,
# batch size and chat-template setting; only the adapter repository differs.
adapter_models_to_eval = [
    {
        "model_name": adapter_repo,
        "base_model": "meta-llama/Meta-Llama-3-8B",
        "adapter_weights": True,
        "use_chat_template": False,
        "batch_size": 1,
        "tokenizer_name": "meta-llama/Meta-Llama-3-8B",
    }
    for adapter_repo in (
        "pippalap/llama3-8b-usmle-prefix-letters",
        "jihbr/usmle-llama8b-dora-letters-v1",
        "jihbr/usmle-llama8b-qlora_letters",
    )
]

## Run evaluations

This section runs every configured model on every task: for each task, the adapter models are evaluated first, followed by the transformer models.

In [None]:
# Full sweep: tasks form the outer loop, models the inner loops, so each
# (task, model) pair gets its own evaluation call.
for task in tasks_to_run:
    print(f"===== Running Task: {task} =====")

    # Adapter (fine-tuned) models come first for each task.
    for model_details in adapter_models_to_eval:
        run_adapter_evaluation(
            # Settings shared by every run
            tasks=task,
            output_dir=output_directory,
            custom_tasks_path=CUSTOM_TASKS_PATH,
            # Per-model settings
            model_name=model_details["model_name"],
            base_model=model_details["base_model"],
            adapter_weights=model_details["adapter_weights"],
            batch_size=model_details["batch_size"],
            use_chat_template=model_details["use_chat_template"],
            # Optional key: .get() yields None when the entry omits it
            tokenizer_name=model_details.get("tokenizer_name"),
        )

    # Then the stand-alone transformer models.
    for model_details in transformer_models_to_eval:
        run_transformer_evaluation(
            # Settings shared by every run
            tasks=task,
            output_dir=output_directory,
            custom_tasks_path=CUSTOM_TASKS_PATH,
            # Per-model settings
            model_name=model_details["model_name"],
            batch_size=model_details["batch_size"],
            use_chat_template=model_details["use_chat_template"],
            # The transformer entries define no "tokenizer_name", so this passes None
            tokenizer_name=model_details.get("tokenizer_name"),
        )
print("===== All Evaluations Complete =====")

===== Running Task: community|usmle-qa-letter-text|1|0 =====
Starting adapter evaluation for adapter: pippalap/llama3-8b-usmle-prefix-letters
On base model: meta-llama/Meta-Llama-3-8B
Tasks: community|usmle-qa-letter-text|1|0
Output Directory: gcs://open-llm-finetuning/evaluation_results


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

If you want to use extended_tasks, make sure you installed their dependencies using `pip install -e .[extended_tasks]`.
You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring.
Splits:   0%|          | 0/1 [00:00<?, ?it/s]

Greedy generation:   0%|          | 1/1273 [00:01<41:25,  1.95s/it][A
Greedy generation:   0%|          | 2/1273 [00:02<19:53,  1.06it/s][A
Greedy generation:   0%|          | 3/1273 [00:02<12:21,  1.71it/s][A
Greedy generation:   0%|          | 4/1273 [00:02<08:47,  2.40it/s][A
Greedy generation:   0%|          | 5/1273 [00:03<13:18,  1.59it/s][A
Greedy generation:   1%|          | 7/1273 [00:03<07:52,  2.68it/s][A
Greedy generation:   1%|          | 8/1273 [00:04<12:11,  1.73it/s][A
Greedy generation:   1%|          | 9/1273 [00:05<15:04,  1.40it/s][A
Greedy generation:   1%|          | 10/1273 [00:06<13:19,  1.58it/s][A
Greedy generation:   1%|          | 12/1273 [00:06<08:32,  2.46it/s][A
Greedy