In [91]:
import os
import json
import numpy as np

In [92]:
def load_json(fn):
    """Read the JSON document stored at path ``fn`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the interchange encoding for JSON)
    so the result does not depend on the platform's default locale encoding.

    Raises ``OSError`` if the file cannot be opened and
    ``json.JSONDecodeError`` if its content is not valid JSON.
    """
    with open(fn, "r", encoding="utf-8") as f:
        return json.load(f)

In [93]:
# Question numbers, keyed by step, that are left out of scoring: the evaluation
# loop skips every question whose "no" field appears in the list for its step.
# NOTE(review): the reason these questions are excluded is not recorded in this
# notebook (presumably they cannot be answered from text alone) — confirm.
exclude = {
    "step1": [4, 7, 9, 11, 18, 21, 30, 32, 34, 36, 40, 42, 44, 47, 53, 56, 62, 68, 77, 80, 85, 88, 89, 100, 105, 113, 114],
    "step2": [25, 26, 33, 53, 60, 76, 80, 87, 107, 114],
    "step3": [5, 21, 34, 41, 63, 66, 67, 88, 96, 102, 113, 118, 130]
}

In [94]:
# Identifiers of the models to evaluate. Each identifier is also the prefix of
# that model's answers file, "<model>-step<N>.json", so the strings must match
# the file names on disk exactly.
# NOTE(review): "medalapca" looks like a misspelling of "medalpaca"; do not
# correct it here without renaming the corresponding result files as well.
available_models = [
    "llama-7b-hf",
    "alpaca-lora",
    "medalapca-7b",
    "medalapca-lora-7b-8bit",
    "medalapca-lora-13b-8bit",
    "medalapca-lora-30b-8bit",
]

In [95]:
def accuracy(eval_results):
    """Compute the mean accuracy of every model on every step.

    ``eval_results`` maps ``model -> step -> list`` of per-question
    correctness flags (truthy = answered correctly). The returned dict has
    the same ``model -> step`` keys, in the same order, with each list
    replaced by the fraction of truthy flags as a Python ``float``.

    A step with no recorded flags yields ``nan``. The result is built from
    the keys of ``eval_results`` alone, so the function works for any set of
    models and never returns a placeholder list in place of a number.
    """
    acc = {}
    for model, results in eval_results.items():
        acc[model] = {
            # np.mean([]) would also give nan, but only after emitting a
            # "Mean of empty slice" RuntimeWarning; handle the case explicitly.
            step: float(np.mean(answers)) if len(answers) else float("nan")
            for step, answers in results.items()
        }
    return acc


In [96]:
# Build eval_results[model]["step<N>"]: one boolean per scored question, True
# when the model's first answer begins with the correct option string.
eval_results = {
    model: {"step1": [], "step2": [], "step3": []}
    for model in available_models
}
for model in available_models:
    for s in [1, 2, 3]:
        # A model with no answers file for this step keeps its empty list.
        if os.path.exists(f"{model}-step{s}.json"):
            step = load_json(f"{model}-step{s}.json")
            solutions = load_json(f"step{s}_solutions.json")
            for question in step:
                # Excluded question numbers are not scored at all.
                if question["no"] not in exclude[f"step{s}"]:
                    # Solutions are keyed by the question number as a string.
                    correct_option = solutions[str(question["no"])]
                    # A missing "answer0" is treated as an empty answer.
                    eval_results[model][f"step{s}"].append(
                        question.get("answer0", "").startswith(correct_option)
                    )

In [97]:
accuracy(eval_results)

{'llama-7b-hf': {'step1': 0.17391304347826086,
  'step2': 0.10909090909090909,
  'step3': nan},
 'alpaca-lora': {'step1': 0.2608695652173913,
  'step2': 0.2636363636363636,
  'step3': 0.2661290322580645},
 'medalapca-7b': {'step1': 0.2608695652173913,
  'step2': 0.3,
  'step3': 0.3629032258064516},
 'medalapca-lora-7b-8bit': {'step1': 0.1956521739130435,
  'step2': 0.20909090909090908,
  'step3': 0.08064516129032258},
 'medalapca-lora-13b-8bit': {'step1': 0.21739130434782608,
  'step2': 0.15454545454545454,
  'step3': 0.23387096774193547},
 'medalapca-lora-30b-8bit': {'step1': 0.0, 'step2': nan, 'step3': nan}}

In [98]:
def md_table(data):
    """Print a Markdown table of per-step scores, one row per model.

    ``data`` maps ``model -> {step: value}``. The model name is padded to
    25 characters; each value is printed with three decimals, and a float
    NaN is printed as the text ``nan``. The column headers are fixed to
    Model / Step1 / Step2 / Step3. Nothing is returned; the table is
    written to standard output.
    """
    header = [f"{'Model':25}", 'Step1   ', 'Step2   ', 'Step3   ']

    def as_row(cells):
        # Markdown row: cells separated by " | " and wrapped in outer pipes.
        return '| ' + ' | '.join(cells) + ' |'

    def as_cell(v):
        # Only float NaN gets the textual placeholder; everything else is
        # formatted as a fixed-point number.
        if isinstance(v, float) and np.isnan(v):
            return f"{'nan':8}"
        return f"{v:.3f}   "

    # Each separator segment spans its header cell plus the two padding spaces.
    separator = '|' + '|'.join('-' * (len(cell) + 2) for cell in header) + '|'

    lines = [as_row(header), separator]
    for model, values in data.items():
        lines.append(as_row([f"{model:25}", *map(as_cell, values.values())]))

    print('\n'.join(lines))

In [99]:
md_table(accuracy(eval_results))

| Model                     | Step1    | Step2    | Step3    |
|---------------------------|----------|----------|----------|
| llama-7b-hf               | 0.174    | 0.109    | nan      |
| alpaca-lora               | 0.261    | 0.264    | 0.266    |
| medalapca-7b              | 0.261    | 0.300    | 0.363    |
| medalapca-lora-7b-8bit    | 0.196    | 0.209    | 0.081    |
| medalapca-lora-13b-8bit   | 0.217    | 0.155    | 0.234    |
| medalapca-lora-30b-8bit   | 0.000    | nan      | nan      |


| Model                     | Step1    | Step2    | Step3    |
|---------------------------|----------|----------|----------|
| llama-7b-hf               | 0.174    | 0.109    | nan      |
| alpaca naive              | 0.243    | 0.222    | 0.329    |
| alpaca-lora               | 0.261    | 0.264    | 0.266    |
| chatdoctor                | 0.187    | 0.185    | 0.148    |
| medalapca-7b              | 0.261    | 0.300    | 0.363    |
| medalapca-lora-7b-8bit    | 0.011    | nan      | nan      |
| medalapca-lora-13b-8bit   | 0.217    | 0.155    | 0.234    |
| medalapca-lora-30b-8bit   | 0.000    | nan      | nan      |



In [None]:
| step  | model                     | top n accuracy      |
|-------|---------------------------|---------------------|
| step1 | alpaca_naive              | mean accuracy:  0.243|
| step1 | chatdoctor                | mean accuracy:  0.187|
| step2 | alpaca_naive              | mean accuracy:  0.222|
| step2 | chatdoctor                | mean accuracy:  0.185|
| step3 | alpaca_naive              | mean accuracy:  0.329|
| step3 | chatdoctor                | mean accuracy:  0.148|
