# Check FT/PEFT results

In [None]:
import json
from pathlib import Path
from collections import defaultdict

In [None]:
# Benchmark datasets whose score files are inspected in the cells below.
datasets = [
    "temporal-nli",
    "torque",
    "tddiscourse",
    "matres",
]

In [None]:
# Print the score stored in every fine-tuning ("*_ft_*") result file, grouped
# by dataset.
for dataset in datasets:
    print(f"<{dataset}>")
    for filepath in Path(f"../output_score/benchmark/{dataset}/").glob("*_ft_*/*.json"):
        with open(filepath, "r") as f:
            data = json.load(f)
        print(f"  {data['args']['model_id']}")
        if dataset == "temporal-nli":
            # Bind the label explicitly: `metric` used to be set only by the
            # loop in the `else` branch, so this branch raised NameError on a
            # fresh kernel (or printed a stale label from an earlier dataset).
            metric = "macro avg f1-score"
            # Only the entry stored under key '0' is read here.
            report = data['individuals']['example-wise-scores']['0']
            score = report['macro avg']['f1-score']
            print(f"    {metric}: {score:.3f}")
        elif dataset == 'matres':
            metric = "micro avg f1-score"
            report = data['individuals']['example-wise-scores']['0']
            # Same fallback the few-shot cell applies for matres: some reports
            # carry 'accuracy' instead of a 'micro avg' entry.
            # NOTE(review): presumably the two are equivalent here — confirm.
            if 'micro avg' in report:
                score = report['micro avg']['f1-score']
            else:
                score = report['accuracy']
            print(f"    {metric}: {score:.3f}")
        else:
            # Other datasets: print the median of every aggregated metric.
            for metric, stats in data['average'].items():
                print(f"    {metric}: {stats['median']:.3f}")
        

In [None]:
# Print the score stored in every PEFT ("*_peft_*") result file, grouped by
# dataset.
for dataset in datasets:
    print(f"<{dataset}>")
    for filepath in Path(f"../output_score/benchmark/{dataset}/").glob("*_peft_*/*.json"):
        with open(filepath, "r") as f:
            data = json.load(f)
        print(f"  {data['args']['model_id']}")

        if dataset == "temporal-nli":
            # Bind the label explicitly: `metric` used to be set only by the
            # loop in the `else` branch, so this branch raised NameError on a
            # fresh kernel (or printed a stale label from an earlier dataset).
            metric = "macro avg f1-score"
            # Only the entry stored under key '0' is read here.
            report = data['individuals']['example-wise-scores']['0']
            score = report['macro avg']['f1-score']
            print(f"    {metric}: {score:.3f}")
        elif dataset == 'matres':
            metric = "micro avg f1-score"
            report = data['individuals']['example-wise-scores']['0']
            # Same fallback the few-shot cell applies for matres: some reports
            # carry 'accuracy' instead of a 'micro avg' entry.
            # NOTE(review): presumably the two are equivalent here — confirm.
            if 'micro avg' in report:
                score = report['micro avg']['f1-score']
            else:
                score = report['accuracy']
            print(f"    {metric}: {score:.3f}")
        else:
            # Other datasets: print the median of every aggregated metric.
            for metric, stats in data['average'].items():
                print(f"    {metric}: {stats['median']:.3f}")

In [None]:
# Placeholder path: replace it with the directory that contains one
# sub-directory per dataset (the next cell globs `dirpath / dataset` for
# per-model folders holding `*.log` files).
dirpath = Path("#TODO: dir")

In [None]:
# Walk dirpath/<dataset>/<model>/*.log (each log is parsed as JSON), count the
# logs per model and remember the highest "best" score together with the log's
# path minus its ".log" suffix.
datasets = ["matres", "torque", "tddiscourse", "temporal-nli"]
best = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
count_log = defaultdict(lambda: defaultdict(int))
for dataset in datasets:
    for model_dir in (dirpath / dataset).glob('*'):
        model_name = model_dir.name
        for log_path in model_dir.glob('*.log'):
            with open(log_path, "r") as fp:
                run_log = json.load(fp)
            # A falsy "best" entry counts as a score of zero.
            run_score = run_log["best"]["score"] if run_log["best"] else .0
            count_log[dataset][model_name] += 1
            record = best[dataset][model_name]
            if record["score"] < run_score:
                record["score"] = run_score
                record["path"] = log_path.parent / log_path.stem

# Report: model name, best score, number of logs seen, and the best log's name.
for dataset, per_model in best.items():
    print(f"\n[{dataset}]")
    for model, record in per_model.items():
        # "path" defaults to 0.0 (falsy) when no log scored above zero.
        best_path = record['path']
        path_name = best_path.name if best_path else "None"
        n_logs = count_log[dataset][model]
        print(f"  {model:55}: {record['score']:.3f}, ({n_logs}), {path_name}")

In [None]:
# Datasets and model identifiers used to build the few-shot result globs.
datasets = [
    "temporal-nli",
    "matres",
    "tddiscourse",
    "torque",
]
model_ids = [
    "Llama-2-7b-hf",
    "Llama-2-7b-chat-hf",
    "Llama-2-13b-hf",
    "Llama-2-70b-hf",
    "flan-t5-xl",
    "flan-t5-xxl",
    "t5-3b",
]

In [None]:
import statistics

# For every dataset/model pair, read each few-shot result file and append one
# median score per file to scores[dataset][model_id].
dirpath = Path("../output_score/benchmark/")
scores = defaultdict(lambda: defaultdict(list))
for dataset in datasets:
    print(dataset)
    for model_id in model_ids:
        print(f"  {model_id}")
        pattern = f"{dataset}/*{model_id}*few-shot*/*.json"
        for filepath in dirpath.glob(pattern):
            with open(filepath, "r") as f:
                data = json.load(f)

            if dataset == "temporal-nli":
                # Median of the macro-averaged f1 over all example-wise reports.
                reports = data['individuals']['example-wise-scores'].values()
                median_score = statistics.median(
                    [report['macro avg']['f1-score'] for report in reports]
                )
            elif dataset == 'matres':
                # Median of the micro-averaged f1; reports without a
                # 'micro avg' entry contribute their 'accuracy' instead.
                reports = data['individuals']['example-wise-scores'].values()
                median_score = statistics.median(
                    [
                        report['micro avg']['f1-score'] if 'micro avg' in report else report['accuracy']
                        for report in reports
                    ]
                )
            elif dataset == "torque":
                median_score = data['average']['exact-match-relaxed']['median']
            else:
                median_score = data['average']['example-wise-scores']['median']
            scores[dataset][model_id].append(median_score)

In [None]:
# Print, for each dataset and model, the highest of the collected scores.
for dataset, per_model in scores.items():
    print(f"{dataset}")
    for model, collected in per_model.items():
        top_score = max(collected)
        print(f"  {model}: {top_score:.3f}")