In [1]:
from pprint import pprint
from collections import defaultdict
import json
import re
import sys
from typing import Union, List, Optional, Set, Tuple, Dict, Optional, Callable
from pprint import pprint
from IPython.display import display, HTML
from collections import OrderedDict

import numpy as np
from lab.utils import shorten
import pandas as pd
from pathlib import Path
from machine_learning.analysis.dataframe import (
    pivot_rotate,

    slice_rows,
    slice_cols,
    sort_rows,
    sort_cols,
    aggregate,
    percentize,
    round,

    rename_index,
    rename_cols,
    rename_cells,

    isnan,
    to_latex,
    color_by_rank,
)
from machine_learning.analysis.series import (
    maybe_numeric_series,
)
from machine_learning.analysis.utils import (
    maybe_round,
)
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 1000)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def find_result_paths(top_dir: Union[str, Path], regexps: Optional[List[str]] = None) -> List[str]:
    regexps = regexps or []
    top_dir = Path(top_dir)
    return [
        str(path) for path in top_dir.glob('**/results.tsv')
        if all(re.match(regexp, str(path)) for regexp in regexps)
    ]

In [3]:
def name_method(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
            
    df[COL_METHOD] = df.apply(
        lambda row: '__'.join(str(row[col]) for col in METHOD_DEFINE_COLS),
        axis=1,
    )

    df = sort_cols(df, [COL_METHOD, '.*'])

    return df

def name_task(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df[COL_TASK] = df.apply(
        lambda row: '__'.join(str(row[col]) for col in TASK_DEFINE_COLS),
        axis=1,
    )

    df = sort_cols(df, [COL_TASK, '.*'])
    
    return df

def prettify_df(df: pd.DataFrame) -> pd.DataFrame:
    df = percentize(df)
    df = round(df)
    return df

In [4]:
# --------------- LREC_2024_submission ---------------
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230826.jpn/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230901.overfit/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230904.LLM_FS/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230905.LLM_FS/'

# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230910.preliminary'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230916.jpn/'

# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230916.jpn.FT/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230919.jpn/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230919.jpn.seed--1/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230919.jpn.seed--0-1/'   # LREC_2024_submission!
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20230919.jpn.seed--1/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20231005.jpn.seed--0'

# --------------- NLP_2024 ---------------
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20231203.jpn/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20231203.jpn.no_subproof_for_unknown20231203.jpn/'
# _TOP_DIR = '../outputs/02.aggregate_tf_results.py/20231206.new_models/'
_TOP_DIR = '../outputs/02.aggregate_tf_results.py/20231213.jpn'

In [5]:
# version = 'LREC_2024_submission'
version = 'NLP_2024'

In [6]:
if version == 'NLP_2024':
    COL_DATASET = 'FLD_dataset_uname'

    METHOD_RENAMES = OrderedDict([
        ('^retrieva-jp/t5-base-long$', 'retrieva-t5-base'),
        ('^retrieva-jp/t5-xl$', 'retrieva-t5-xl'),
    
        # ('^line-corporation/japanese-large-lm-1.7b$', 'line-1B'),
        # ('^line-corporation/japanese-large-lm-1.7b-instruction-sft$', 'line-1B-instruct'),
        ('^line-corporation/japanese-large-lm-3.6b$', 'line-4B'),
        ('^line-corporation/japanese-large-lm-3.6b-instruction-sft$', 'line-4B-instruct'),
       
        ('^rinna/japanese-gpt-neox-3.6b$', 'rinna-4B'),
        ('^rinna/japanese-gpt-neox-3.6b-instruction-ppo$', 'rinna-4B-instruct'),
        
        # ('^cyberagent/open-calm-medium$', 'calm-0.4B'),
        # ('^cyberagent/open-calm-1b$', 'calm-1B'),
        # ('^cyberagent/open-calm-3b$', 'calm-3B'),
        ('^cyberagent/open-calm-7b$', 'calm-7B'),
        ('^cyberagent/calm2-7b$', 'calm2-7B'),
        ('^cyberagent/calm2-7b-chat$', 'calm2-7B-instruct'),
        
        ('^stabilityai/japanese-stablelm-base-alpha-7b$', 'stablelm-7B'),
        ('^stabilityai/japanese-stablelm-instruct-alpha-7b-v2$', 'stablelm-7B-instruct'),
        
        ('^elyza/ELYZA-japanese-Llama-2-7b-fast$', 'elyza-7B'),
        ('^elyza/ELYZA-japanese-Llama-2-7b-fast-instruct$', 'elyza-7B-instruct'),
        
        ('^matsuo-lab/weblab-10b$$', 'weblab-10B'),
        ('^matsuo-lab/weblab-10b-instruction-sft$', 'weblab-10B-instruct'),
        
        ('^stockmark/stockmark-13b$', 'stockmark-13B'),
        ('^pfnet/plamo-13b$', 'plamo-13B'),
        ('^llm-jp/llm-jp-13b-v1.0$', 'llmjp-13B'),
        ('^llm-jp/llm-jp-13b-instruct-full-jaster-v1.0$', 'llmjp-13B-instruct'),

        ('^tokyotech-llm/Swallow-13b-hf$', 'swallow-13b'),
        ('^tokyotech-llm/Swallow-13b-instruct-hf$', 'swallow-13b'),
    ])
    
    LRATE = 1e-05
    DO_SLICE_BY_LRATE = True
    
    METRIC_RENAMES = OrderedDict([
        ('train/FLD_proof_eval_strct.D-all.proof_accuracy.zero_one', 'prf.strct'),
        # ('train/FLD_proof_eval_strct.D-all.answer_accuracy', 'ans'),
    ])

elif version == 'LREC_2024_submission':
    COL_DATASET = 'dataset_uname'

    METHOD_RENAMES = OrderedDict([
        ('^retrieva-jp/t5-base-long$', 'retrieva-t5-base'),
        ('^retrieva-jp/t5-xl$', 'retrieva-t5-xl'),
    
        # ('^line-corporation/japanese-large-lm-1.7b$', 'line-1B'),
        # ('^line-corporation/japanese-large-lm-1.7b-instruction-sft$', 'line-1B-instruct'),
        ('^line-corporation/japanese-large-lm-3.6b$', 'line-4B'),
        ('^line-corporation/japanese-large-lm-3.6b-instruction-sft$', 'line-4B-instruct'),
       
        ('^rinna/japanese-gpt-neox-3.6b$', 'rinna-4B'),
        ('^rinna/japanese-gpt-neox-3.6b-instruction-ppo$', 'rinna-4B-instruct'),
        
        # ('^cyberagent/open-calm-medium$', 'calm-0.4B'),
        # ('^cyberagent/open-calm-1b$', 'calm-1B'),
        # ('^cyberagent/open-calm-3b$', 'calm-3B'),
        ('^cyberagent/open-calm-7b$', 'calm-7B'),
        
        ('^stabilityai/japanese-stablelm-base-alpha-7b$', 'stablelm-7B'),
        
        ('^elyza/ELYZA-japanese-Llama-2-7b-fast$', 'elyza-7B'),
        ('^elyza/ELYZA-japanese-Llama-2-7b-fast-instruct$', 'elyza-7B-instruct'),
        
        ('^matsuo-lab/weblab-10b$$', 'weblab-10B'),
        ('^matsuo-lab/weblab-10b-instruction-sft$', 'weblab-10B-instruct'),
        
        ('^pfnet/plamo-13b$', 'plamo-13B'),

    ])
    
    LRATE = 1e-05
    DO_SLICE_BY_LRATE = False
    
    METRIC_RENAMES = OrderedDict([
        # ('eval/extr_stps.D-all.proof_accuracy.zero_one', 'prf.extr'),
        ('eval/strct.D-all.proof_accuracy.zero_one', 'prf.strct'),
        # ('eval/strct.D-all.answer_accuracy', 'ans'),
    ])


else:
    raise ValueError()

COL_LEARNING = 'learning'
COL_MODEL_NAME_OR_PATH = 'model_name_or_path'
COL_LRATE = 'learning_rate'

TASK_DEFINE_COLS = [COL_DATASET, COL_LEARNING]
COL_TASK = 'task'

METHOD_DEFINE_COLS = [COL_MODEL_NAME_OR_PATH]
COL_METHOD = 'method'

METRIC_NAMES = list(METRIC_RENAMES.values())

COLOR_SCALE_LOWER = 0
COLOR_SCALE_UPPER = 100
COLOR_PARETTE_LOWER = 3
COLOR_PARETTE_UPPER = 70

DARK = True

In [7]:
result_paths = find_result_paths(_TOP_DIR)
if len(result_paths) == 0:
    raise Exception(f'Results not found under {_TOP_DIR}')
elif len(result_paths) == 1:
    results_path = result_paths[0]
else:
    print('Choose the result fomr the following paths:')
    pprint(result_paths)
    results_path = input('path = ')

In [8]:
print(f'loading results from {str(results_path)}')
master_df = pd.read_csv(results_path, sep='\t')
master_df

loading results from ../outputs/02.aggregate_tf_results.py/20231213.jpn/results.tsv


Unnamed: 0,FLD_dataset_uname,learning,model_name_or_path,seed,learning_rate,eval/extr_stps.D-0.proof_accuracy.zero_one,eval/extr_stps.D-1.proof_accuracy.zero_one,eval/extr_stps.D-2.proof_accuracy.zero_one,eval/extr_stps.D-3.proof_accuracy.zero_one,eval/extr_stps.D-4.proof_accuracy.zero_one,eval/extr_stps.D-5.proof_accuracy.zero_one,eval/extr_stps.D-6.proof_accuracy.zero_one,eval/extr_stps.D-7.proof_accuracy.zero_one,eval/extr_stps.D-8.proof_accuracy.zero_one,eval/extr_stps.D-None.proof_accuracy.zero_one,eval/extr_stps.D-all.proof_accuracy.zero_one,eval/strct.D-0.proof_accuracy.zero_one,eval/strct.D-1.proof_accuracy.zero_one,eval/strct.D-2.proof_accuracy.zero_one,eval/strct.D-3.proof_accuracy.zero_one,eval/strct.D-4.proof_accuracy.zero_one,eval/strct.D-5.proof_accuracy.zero_one,eval/strct.D-6.proof_accuracy.zero_one,eval/strct.D-7.proof_accuracy.zero_one,eval/strct.D-8.proof_accuracy.zero_one,eval/strct.D-None.proof_accuracy.zero_one,eval/strct.D-all.proof_accuracy.zero_one,eval/extr_stps.D-0.answer_accuracy,eval/extr_stps.D-1.answer_accuracy,eval/extr_stps.D-2.answer_accuracy,eval/extr_stps.D-3.answer_accuracy,eval/extr_stps.D-4.answer_accuracy,eval/extr_stps.D-5.answer_accuracy,eval/extr_stps.D-6.answer_accuracy,eval/extr_stps.D-7.answer_accuracy,eval/extr_stps.D-8.answer_accuracy,eval/extr_stps.D-None.answer_accuracy,eval/extr_stps.D-all.answer_accuracy,eval/strct.D-0.answer_accuracy,eval/strct.D-1.answer_accuracy,eval/strct.D-2.answer_accuracy,eval/strct.D-3.answer_accuracy,eval/strct.D-4.answer_accuracy,eval/strct.D-5.answer_accuracy,eval/strct.D-6.answer_accuracy,eval/strct.D-7.answer_accuracy,eval/strct.D-8.answer_accuracy,eval/strct.D-None.answer_accuracy,eval/strct.D-all.answer_accuracy,train/FLD_proof_eval_extr_stps.D-0.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-1.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-2.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-3.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-4.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-5.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-6.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-7.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-8.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-None.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-all.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-0.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-1.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-2.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-3.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-4.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-5.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-6.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-7.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-8.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-None.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-all.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-0.answer_accuracy,train/FLD_proof_eval_extr_stps.D-1.answer_accuracy,train/FLD_proof_eval_extr_stps.D-2.answer_accuracy,train/FLD_proof_eval_extr_stps.D-3.answer_accuracy,train/FLD_proof_eval_extr_stps.D-4.answer_accuracy,train/FLD_proof_eval_extr_stps.D-5.answer_accuracy,train/FLD_proof_eval_extr_stps.D-6.answer_accuracy,train/FLD_proof_eval_extr_stps.D-7.answer_accuracy,train/FLD_proof_eval_extr_stps.D-8.answer_accuracy,train/FLD_proof_eval_extr_stps.D-None.answer_accuracy,train/FLD_proof_eval_extr_stps.D-all.answer_accuracy,train/FLD_proof_eval_strct.D-0.answer_accuracy,train/FLD_proof_eval_strct.D-1.answer_accuracy,train/FLD_proof_eval_strct.D-2.answer_accuracy,train/FLD_proof_eval_strct.D-3.answer_accuracy,train/FLD_proof_eval_strct.D-4.answer_accuracy,train/FLD_proof_eval_strct.D-5.answer_accuracy,train/FLD_proof_eval_strct.D-6.answer_accuracy,train/FLD_proof_eval_strct.D-7.answer_accuracy,train/FLD_proof_eval_strct.D-8.answer_accuracy,train/FLD_proof_eval_strct.D-None.answer_accuracy,train/FLD_proof_eval_strct.D-all.answer_accuracy
0,20231213.jpn.D1,LLM_FS.shot-10,cyberagent/calm2-7b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.066667,0.000000,,,,,,,,0.575758,0.198020,0.066667,0.000000,,,,,,,,0.575758,0.198020,0.200000,0.113208,,,,,,,,0.575758,0.277228,0.200000,0.113208,,,,,,,,0.575758,0.277228
1,20231213.jpn.D1,LLM_FS.shot-10,cyberagent/calm2-7b-chat,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.018868,,,,,,,,0.363636,0.128713,0.000000,0.018868,,,,,,,,0.363636,0.128713,0.400000,0.264151,,,,,,,,0.363636,0.316832,0.400000,0.264151,,,,,,,,0.363636,0.316832
2,20231213.jpn.D1,LLM_FS.shot-10,elyza/ELYZA-japanese-Llama-2-7b-fast,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,,,,,,,,1.000000,0.326733,0.000000,0.000000,,,,,,,,1.000000,0.326733,0.000000,0.000000,,,,,,,,1.000000,0.326733,0.000000,0.000000,,,,,,,,1.000000,0.326733
3,20231213.jpn.D1,LLM_FS.shot-10,elyza/ELYZA-japanese-Llama-2-7b-fast-instruct,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.056604,,,,,,,,0.121212,0.069307,0.000000,0.056604,,,,,,,,0.121212,0.069307,0.666667,0.528302,,,,,,,,0.121212,0.415842,0.666667,0.528302,,,,,,,,0.121212,0.415842
4,20231213.jpn.D1,LLM_FS.shot-10,line-corporation/japanese-large-lm-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,,,,,,,,0.303030,0.099010,0.000000,0.000000,,,,,,,,0.303030,0.099010,0.400000,0.301887,,,,,,,,0.303030,0.316832,0.400000,0.301887,,,,,,,,0.303030,0.316832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,20231213.jpn.D8,LLM_FS.shot-30000,rinna/japanese-gpt-neox-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.142857,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.287129,0.000000,0.142857,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.287129,0.000000,0.428571,0.083333,0.0,0.00,0.000000,0.0,0.0,0.0,1.000000,0.316832,0.000000,0.428571,0.083333,0.0,0.00,0.000000,0.0,0.0,0.0,1.000000,0.316832
344,20231213.jpn.D8,LLM_FS.shot-30000,rinna/japanese-gpt-neox-3.6b-instruction-ppo,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.428571,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.316832,0.000000,0.285714,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.306931,0.000000,0.714286,0.083333,0.0,0.00,0.000000,0.0,0.0,0.0,1.000000,0.336634,0.000000,0.714286,0.083333,0.0,0.00,0.000000,0.0,0.0,0.0,1.000000,0.336634
345,20231213.jpn.D8,LLM_FS.shot-30000,stabilityai/japanese-stablelm-base-alpha-7b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000000,0.714286,0.250000,0.0,0.0,0.0,0.0,0.0,0.0,0.964286,0.356436,1.000000,0.571429,0.250000,0.0,0.0,0.0,0.0,0.0,0.0,0.964286,0.346535,1.000000,0.714286,0.250000,0.0,0.00,0.076923,0.0,0.0,0.0,0.964286,0.366337,1.000000,0.714286,0.250000,0.0,0.00,0.076923,0.0,0.0,0.0,0.964286,0.366337
346,20231213.jpn.D8,LLM_FS.shot-30000,stabilityai/japanese-stablelm-instruct-alpha-7b-v2,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000000,0.714286,0.250000,0.0,0.0,0.0,0.0,0.0,0.0,0.964286,0.356436,1.000000,0.714286,0.250000,0.0,0.0,0.0,0.0,0.0,0.0,0.964286,0.356436,1.000000,0.714286,0.250000,0.0,0.25,0.000000,0.0,0.0,0.0,0.964286,0.366337,1.000000,0.714286,0.250000,0.0,0.25,0.000000,0.0,0.0,0.0,0.964286,0.366337


In [9]:
df = name_method(master_df)
df = rename_cells(df, [COL_METHOD], METHOD_RENAMES)
df = sort_rows(df, COL_METHOD, [f'^{name}$' for name in METHOD_RENAMES.values()])


if DO_SLICE_BY_LRATE:
    df = slice_rows(df, lambda row: row[COL_LRATE] == LRATE)
    if len(df) == 0:
        raise ValueError()


def rename_task(task_name: str):
    task_name = re.sub('_wo_dist', '-', task_name)
    task_name = re.sub('.*jpn.', '', task_name)
    task_name = re.sub('__LLM_FS.shot-', '.', task_name)
    return task_name

df = name_task(df)

task_renames = OrderedDict(
    sorted(
        [(f'^{task_name}$', rename_task(task_name))
        for task_name in df[COL_TASK].unique()],
        key = lambda name_rename: name_rename[1]
    )
)
print(task_renames)
df = rename_cells(df, [COL_TASK], task_renames)

TASK_NAMES = [task_name for task_name in task_renames.values()
              if task_name in df[COL_TASK].values]
for task_name in df[COL_TASK].values:
    if task_name not in TASK_NAMES:
        TASK_NAMES.append(task_name)
MAJOR_TASK = TASK_NAMES[0]
 
df = rename_cols(df, METRIC_RENAMES)
                  
df = aggregate(
    df,
    [COL_METHOD, COL_TASK],
    {metric_name: lambda vals: np.mean([val for val in vals if not isnan(val)]) for metric_name in METRIC_NAMES},
)

df

OrderedDict([('^20231213.jpn.D1_wo_dist__LLM_FS.shot-10$', 'D1-.10'), ('^20231213.jpn.D1_wo_dist__LLM_FS.shot-100$', 'D1-.100'), ('^20231213.jpn.D1_wo_dist__LLM_FS.shot-1000$', 'D1-.1000'), ('^20231213.jpn.D1_wo_dist__LLM_FS.shot-10000$', 'D1-.10000'), ('^20231213.jpn.D1_wo_dist__LLM_FS.shot-30000$', 'D1-.30000'), ('^20231213.jpn.D1__LLM_FS.shot-10$', 'D1.10'), ('^20231213.jpn.D1__LLM_FS.shot-100$', 'D1.100'), ('^20231213.jpn.D1__LLM_FS.shot-1000$', 'D1.1000'), ('^20231213.jpn.D1__LLM_FS.shot-10000$', 'D1.10000'), ('^20231213.jpn.D1__LLM_FS.shot-30000$', 'D1.30000'), ('^20231213.jpn.D3__LLM_FS.shot-10$', 'D3.10'), ('^20231213.jpn.D3__LLM_FS.shot-100$', 'D3.100'), ('^20231213.jpn.D3__LLM_FS.shot-1000$', 'D3.1000'), ('^20231213.jpn.D3__LLM_FS.shot-10000$', 'D3.10000'), ('^20231213.jpn.D3__LLM_FS.shot-30000$', 'D3.30000'), ('^20231213.jpn.D8__LLM_FS.shot-10$', 'D8.10'), ('^20231213.jpn.D8__LLM_FS.shot-100$', 'D8.100'), ('^20231213.jpn.D8__LLM_FS.shot-1000$', 'D8.1000'), ('^20231213.jpn.D8

Unnamed: 0,method,task,FLD_dataset_uname,learning,model_name_or_path,seed,learning_rate,eval/extr_stps.D-0.proof_accuracy.zero_one,eval/extr_stps.D-1.proof_accuracy.zero_one,eval/extr_stps.D-2.proof_accuracy.zero_one,eval/extr_stps.D-3.proof_accuracy.zero_one,eval/extr_stps.D-4.proof_accuracy.zero_one,eval/extr_stps.D-5.proof_accuracy.zero_one,eval/extr_stps.D-6.proof_accuracy.zero_one,eval/extr_stps.D-7.proof_accuracy.zero_one,eval/extr_stps.D-8.proof_accuracy.zero_one,eval/extr_stps.D-None.proof_accuracy.zero_one,eval/extr_stps.D-all.proof_accuracy.zero_one,eval/strct.D-0.proof_accuracy.zero_one,eval/strct.D-1.proof_accuracy.zero_one,eval/strct.D-2.proof_accuracy.zero_one,eval/strct.D-3.proof_accuracy.zero_one,eval/strct.D-4.proof_accuracy.zero_one,eval/strct.D-5.proof_accuracy.zero_one,eval/strct.D-6.proof_accuracy.zero_one,eval/strct.D-7.proof_accuracy.zero_one,eval/strct.D-8.proof_accuracy.zero_one,eval/strct.D-None.proof_accuracy.zero_one,eval/strct.D-all.proof_accuracy.zero_one,eval/extr_stps.D-0.answer_accuracy,eval/extr_stps.D-1.answer_accuracy,eval/extr_stps.D-2.answer_accuracy,eval/extr_stps.D-3.answer_accuracy,eval/extr_stps.D-4.answer_accuracy,eval/extr_stps.D-5.answer_accuracy,eval/extr_stps.D-6.answer_accuracy,eval/extr_stps.D-7.answer_accuracy,eval/extr_stps.D-8.answer_accuracy,eval/extr_stps.D-None.answer_accuracy,eval/extr_stps.D-all.answer_accuracy,eval/strct.D-0.answer_accuracy,eval/strct.D-1.answer_accuracy,eval/strct.D-2.answer_accuracy,eval/strct.D-3.answer_accuracy,eval/strct.D-4.answer_accuracy,eval/strct.D-5.answer_accuracy,eval/strct.D-6.answer_accuracy,eval/strct.D-7.answer_accuracy,eval/strct.D-8.answer_accuracy,eval/strct.D-None.answer_accuracy,eval/strct.D-all.answer_accuracy,train/FLD_proof_eval_extr_stps.D-0.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-1.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-2.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-3.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-4.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-5.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-6.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-7.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-8.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-None.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-all.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-0.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-1.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-2.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-3.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-4.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-5.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-6.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-7.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-8.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-None.proof_accuracy.zero_one,prf.strct,train/FLD_proof_eval_extr_stps.D-0.answer_accuracy,train/FLD_proof_eval_extr_stps.D-1.answer_accuracy,train/FLD_proof_eval_extr_stps.D-2.answer_accuracy,train/FLD_proof_eval_extr_stps.D-3.answer_accuracy,train/FLD_proof_eval_extr_stps.D-4.answer_accuracy,train/FLD_proof_eval_extr_stps.D-5.answer_accuracy,train/FLD_proof_eval_extr_stps.D-6.answer_accuracy,train/FLD_proof_eval_extr_stps.D-7.answer_accuracy,train/FLD_proof_eval_extr_stps.D-8.answer_accuracy,train/FLD_proof_eval_extr_stps.D-None.answer_accuracy,train/FLD_proof_eval_extr_stps.D-all.answer_accuracy,train/FLD_proof_eval_strct.D-0.answer_accuracy,train/FLD_proof_eval_strct.D-1.answer_accuracy,train/FLD_proof_eval_strct.D-2.answer_accuracy,train/FLD_proof_eval_strct.D-3.answer_accuracy,train/FLD_proof_eval_strct.D-4.answer_accuracy,train/FLD_proof_eval_strct.D-5.answer_accuracy,train/FLD_proof_eval_strct.D-6.answer_accuracy,train/FLD_proof_eval_strct.D-7.answer_accuracy,train/FLD_proof_eval_strct.D-8.answer_accuracy,train/FLD_proof_eval_strct.D-None.answer_accuracy,train/FLD_proof_eval_strct.D-all.answer_accuracy
"(line-4B, D1.10)",line-4B,D1.10,20231213.jpn.D1,LLM_FS.shot-10,line-corporation/japanese-large-lm-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,,,,,,,,0.303030,0.099010,0.000000,0.000000,,,,,,,,0.303030,0.099010,0.400000,0.301887,,,,,,,,0.303030,0.316832,0.400000,0.301887,,,,,,,,0.303030,0.316832
"(line-4B, D1.100)",line-4B,D1.100,20231213.jpn.D1,LLM_FS.shot-100,line-corporation/japanese-large-lm-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.037736,,,,,,,,0.303030,0.118812,0.000000,0.037736,,,,,,,,0.303030,0.118812,0.200000,0.566038,,,,,,,,0.303030,0.425743,0.200000,0.566038,,,,,,,,0.303030,0.425743
"(line-4B, D1.1000)",line-4B,D1.1000,20231213.jpn.D1,LLM_FS.shot-1000,line-corporation/japanese-large-lm-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.133333,0.226415,,,,,,,,0.393939,0.267327,0.133333,0.226415,,,,,,,,0.393939,0.267327,0.666667,0.698113,,,,,,,,0.393939,0.594059,0.666667,0.698113,,,,,,,,0.393939,0.594059
"(line-4B, D1.10000)",line-4B,D1.10000,20231213.jpn.D1,LLM_FS.shot-10000,line-corporation/japanese-large-lm-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.800000,0.283019,,,,,,,,0.515152,0.435644,0.800000,0.283019,,,,,,,,0.515152,0.435644,1.000000,0.735849,,,,,,,,0.515152,0.702970,1.000000,0.735849,,,,,,,,0.515152,0.702970
"(line-4B, D1.30000)",line-4B,D1.30000,20231213.jpn.D1,LLM_FS.shot-30000,line-corporation/japanese-large-lm-3.6b,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.933333,0.716981,,,,,,,,0.969697,0.831683,0.933333,0.716981,,,,,,,,0.969697,0.831683,0.933333,0.886792,,,,,,,,0.969697,0.920792,0.933333,0.886792,,,,,,,,0.969697,0.920792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(swallow-13b, D3.10000)",swallow-13b,D3.10000,20231213.jpn.D3,LLM_FS.shot-10000,tokyotech-llm/Swallow-13b-hf,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000000,0.347826,0.05,0.0,,,,,,0.966667,0.405941,1.000000,0.304348,0.05,0.0,,,,,,0.966667,0.371287,1.000000,0.521739,0.300000,0.04,,,,,,0.966667,0.504951,1.000000,0.521739,0.300000,0.04,,,,,,0.966667,0.504951
"(swallow-13b, D8.10)",swallow-13b,D8.10,20231213.jpn.D8,LLM_FS.shot-10,tokyotech-llm/Swallow-13b-hf,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.034653,0.000000,0.000000,0.083333,0.00,0.0,0.0,0.0,0.181818,0.0,0.000000,0.029703,0.000000,0.000000,0.083333,0.00,0.0,0.0,0.0,0.181818,0.0,0.000000,0.029703
"(swallow-13b, D8.100)",swallow-13b,D8.100,20231213.jpn.D8,LLM_FS.shot-100,tokyotech-llm/Swallow-13b-hf,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.142857,0.000000,0.20,0.0,0.0,0.0,0.090909,0.0,0.000000,0.039604,0.000000,0.142857,0.000000,0.20,0.0,0.0,0.0,0.090909,0.0,0.000000,0.039604
"(swallow-13b, D8.1000)",swallow-13b,D8.1000,20231213.jpn.D8,LLM_FS.shot-1000,tokyotech-llm/Swallow-13b-hf,0,0.00001,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.000000,0.285714,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,0.287129,1.000000,0.285714,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,0.282178,1.000000,0.428571,0.083333,0.00,0.0,0.0,0.0,0.000000,0.0,0.928571,0.306931,1.000000,0.428571,0.083333,0.00,0.0,0.0,0.0,0.000000,0.0,0.928571,0.306931


In [10]:
task_dfs: Dict[str, pd.DataFrame] = OrderedDict()
for task_name in TASK_NAMES:
    task_dfs[task_name] = slice_rows(
        df,
        lambda row: row[COL_TASK] == task_name
    )

task_dfs[MAJOR_TASK]

Unnamed: 0,method,task,FLD_dataset_uname,learning,model_name_or_path,seed,learning_rate,eval/extr_stps.D-0.proof_accuracy.zero_one,eval/extr_stps.D-1.proof_accuracy.zero_one,eval/extr_stps.D-2.proof_accuracy.zero_one,eval/extr_stps.D-3.proof_accuracy.zero_one,eval/extr_stps.D-4.proof_accuracy.zero_one,eval/extr_stps.D-5.proof_accuracy.zero_one,eval/extr_stps.D-6.proof_accuracy.zero_one,eval/extr_stps.D-7.proof_accuracy.zero_one,eval/extr_stps.D-8.proof_accuracy.zero_one,eval/extr_stps.D-None.proof_accuracy.zero_one,eval/extr_stps.D-all.proof_accuracy.zero_one,eval/strct.D-0.proof_accuracy.zero_one,eval/strct.D-1.proof_accuracy.zero_one,eval/strct.D-2.proof_accuracy.zero_one,eval/strct.D-3.proof_accuracy.zero_one,eval/strct.D-4.proof_accuracy.zero_one,eval/strct.D-5.proof_accuracy.zero_one,eval/strct.D-6.proof_accuracy.zero_one,eval/strct.D-7.proof_accuracy.zero_one,eval/strct.D-8.proof_accuracy.zero_one,eval/strct.D-None.proof_accuracy.zero_one,eval/strct.D-all.proof_accuracy.zero_one,eval/extr_stps.D-0.answer_accuracy,eval/extr_stps.D-1.answer_accuracy,eval/extr_stps.D-2.answer_accuracy,eval/extr_stps.D-3.answer_accuracy,eval/extr_stps.D-4.answer_accuracy,eval/extr_stps.D-5.answer_accuracy,eval/extr_stps.D-6.answer_accuracy,eval/extr_stps.D-7.answer_accuracy,eval/extr_stps.D-8.answer_accuracy,eval/extr_stps.D-None.answer_accuracy,eval/extr_stps.D-all.answer_accuracy,eval/strct.D-0.answer_accuracy,eval/strct.D-1.answer_accuracy,eval/strct.D-2.answer_accuracy,eval/strct.D-3.answer_accuracy,eval/strct.D-4.answer_accuracy,eval/strct.D-5.answer_accuracy,eval/strct.D-6.answer_accuracy,eval/strct.D-7.answer_accuracy,eval/strct.D-8.answer_accuracy,eval/strct.D-None.answer_accuracy,eval/strct.D-all.answer_accuracy,train/FLD_proof_eval_extr_stps.D-0.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-1.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-2.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-3.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-4.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-5.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-6.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-7.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-8.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-None.proof_accuracy.zero_one,train/FLD_proof_eval_extr_stps.D-all.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-0.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-1.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-2.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-3.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-4.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-5.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-6.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-7.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-8.proof_accuracy.zero_one,train/FLD_proof_eval_strct.D-None.proof_accuracy.zero_one,prf.strct,train/FLD_proof_eval_extr_stps.D-0.answer_accuracy,train/FLD_proof_eval_extr_stps.D-1.answer_accuracy,train/FLD_proof_eval_extr_stps.D-2.answer_accuracy,train/FLD_proof_eval_extr_stps.D-3.answer_accuracy,train/FLD_proof_eval_extr_stps.D-4.answer_accuracy,train/FLD_proof_eval_extr_stps.D-5.answer_accuracy,train/FLD_proof_eval_extr_stps.D-6.answer_accuracy,train/FLD_proof_eval_extr_stps.D-7.answer_accuracy,train/FLD_proof_eval_extr_stps.D-8.answer_accuracy,train/FLD_proof_eval_extr_stps.D-None.answer_accuracy,train/FLD_proof_eval_extr_stps.D-all.answer_accuracy,train/FLD_proof_eval_strct.D-0.answer_accuracy,train/FLD_proof_eval_strct.D-1.answer_accuracy,train/FLD_proof_eval_strct.D-2.answer_accuracy,train/FLD_proof_eval_strct.D-3.answer_accuracy,train/FLD_proof_eval_strct.D-4.answer_accuracy,train/FLD_proof_eval_strct.D-5.answer_accuracy,train/FLD_proof_eval_strct.D-6.answer_accuracy,train/FLD_proof_eval_strct.D-7.answer_accuracy,train/FLD_proof_eval_strct.D-8.answer_accuracy,train/FLD_proof_eval_strct.D-None.answer_accuracy,train/FLD_proof_eval_strct.D-all.answer_accuracy
"(line-4B, D1-.10)",line-4B,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,line-corporation/japanese-large-lm-3.6b,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,1.0,0.326733,0.0,0.0,,,,,,,,1.0,0.326733,0.0,0.0,,,,,,,,1.0,0.326733,0.0,0.0,,,,,,,,1.0,0.326733
"(line-4B-instruct, D1-.10)",line-4B-instruct,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,line-corporation/japanese-large-lm-3.6b-instruction-sft,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.176471,0.137255,,,,,,,,0.606061,0.29703,0.176471,0.137255,,,,,,,,0.606061,0.29703,0.176471,0.215686,,,,,,,,0.606061,0.336634,0.176471,0.215686,,,,,,,,0.606061,0.336634
"(rinna-4B, D1-.10)",rinna-4B,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,rinna/japanese-gpt-neox-3.6b,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.411765,0.254902,,,,,,,,0.060606,0.217822,0.411765,0.254902,,,,,,,,0.060606,0.217822,0.411765,0.509804,,,,,,,,0.060606,0.346535,0.411765,0.509804,,,,,,,,0.060606,0.346535
"(rinna-4B-instruct, D1-.10)",rinna-4B-instruct,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,rinna/japanese-gpt-neox-3.6b-instruction-ppo,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,,,,,,,,1.0,0.326733,0.0,0.0,,,,,,,,1.0,0.326733,0.0,0.0,,,,,,,,1.0,0.326733,0.0,0.0,,,,,,,,1.0,0.326733
"(calm2-7B, D1-.10)",calm2-7B,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,cyberagent/calm2-7b,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.215686,,,,,,,,0.69697,0.336634,0.0,0.215686,,,,,,,,0.69697,0.336634,0.0,0.352941,,,,,,,,0.69697,0.405941,0.0,0.352941,,,,,,,,0.69697,0.405941
"(calm2-7B-instruct, D1-.10)",calm2-7B-instruct,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,cyberagent/calm2-7b-chat,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.058824,0.0,,,,,,,,1.0,0.336634,0.058824,0.0,,,,,,,,1.0,0.336634,0.058824,0.0,,,,,,,,1.0,0.336634,0.058824,0.0,,,,,,,,1.0,0.336634
"(stablelm-7B, D1-.10)",stablelm-7B,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,stabilityai/japanese-stablelm-base-alpha-7b,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.176471,0.078431,,,,,,,,0.939394,0.376238,0.176471,0.078431,,,,,,,,0.939394,0.376238,0.176471,0.078431,,,,,,,,0.939394,0.376238,0.176471,0.078431,,,,,,,,0.939394,0.376238
"(stablelm-7B-instruct, D1-.10)",stablelm-7B-instruct,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,stabilityai/japanese-stablelm-instruct-alpha-7b-v2,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.235294,0.254902,,,,,,,,0.545455,0.346535,0.235294,0.254902,,,,,,,,0.545455,0.346535,0.235294,0.352941,,,,,,,,0.545455,0.39604,0.235294,0.352941,,,,,,,,0.545455,0.39604
"(elyza-7B, D1-.10)",elyza-7B,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,elyza/ELYZA-japanese-Llama-2-7b-fast,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.137255,,,,,,,,1.0,0.39604,0.0,0.137255,,,,,,,,1.0,0.39604,0.0,0.137255,,,,,,,,1.0,0.39604,0.0,0.137255,,,,,,,,1.0,0.39604
"(elyza-7B-instruct, D1-.10)",elyza-7B-instruct,D1-.10,20231213.jpn.D1_wo_dist,LLM_FS.shot-10,elyza/ELYZA-japanese-Llama-2-7b-fast-instruct,0,1e-05,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.176471,0.235294,,,,,,,,0.636364,0.356436,0.176471,0.235294,,,,,,,,0.636364,0.356436,0.176471,0.313726,,,,,,,,0.636364,0.39604,0.176471,0.313726,,,,,,,,0.636364,0.39604


In [11]:
metric_dfs: Dict[str, pd.DataFrame] = OrderedDict()
for task_name, task_df in task_dfs.items():
    metric_dfs[task_name] = slice_cols(task_df, [COL_TASK, COL_METHOD] + METRIC_NAMES)

for task_name in TASK_NAMES:
    print('\n\n')
    print(f'========================= {task_name} ========================')
    display(metric_dfs[task_name])






Unnamed: 0,task,method,prf.strct
"(line-4B, D1-.10)",D1-.10,line-4B,0.326733
"(line-4B-instruct, D1-.10)",D1-.10,line-4B-instruct,0.29703
"(rinna-4B, D1-.10)",D1-.10,rinna-4B,0.217822
"(rinna-4B-instruct, D1-.10)",D1-.10,rinna-4B-instruct,0.326733
"(calm2-7B, D1-.10)",D1-.10,calm2-7B,0.336634
"(calm2-7B-instruct, D1-.10)",D1-.10,calm2-7B-instruct,0.336634
"(stablelm-7B, D1-.10)",D1-.10,stablelm-7B,0.376238
"(stablelm-7B-instruct, D1-.10)",D1-.10,stablelm-7B-instruct,0.346535
"(elyza-7B, D1-.10)",D1-.10,elyza-7B,0.39604
"(elyza-7B-instruct, D1-.10)",D1-.10,elyza-7B-instruct,0.356436







Unnamed: 0,task,method,prf.strct
"(line-4B, D1-.100)",D1-.100,line-4B,0.613861
"(line-4B-instruct, D1-.100)",D1-.100,line-4B-instruct,0.594059
"(rinna-4B, D1-.100)",D1-.100,rinna-4B,0.554455
"(rinna-4B-instruct, D1-.100)",D1-.100,rinna-4B-instruct,0.683168
"(calm2-7B, D1-.100)",D1-.100,calm2-7B,0.633663
"(calm2-7B-instruct, D1-.100)",D1-.100,calm2-7B-instruct,0.554455
"(stablelm-7B, D1-.100)",D1-.100,stablelm-7B,0.623762
"(stablelm-7B-instruct, D1-.100)",D1-.100,stablelm-7B-instruct,0.584158
"(elyza-7B, D1-.100)",D1-.100,elyza-7B,0.772277
"(elyza-7B-instruct, D1-.100)",D1-.100,elyza-7B-instruct,0.673267







Unnamed: 0,task,method,prf.strct
"(line-4B, D1-.1000)",D1-.1000,line-4B,0.871287
"(line-4B-instruct, D1-.1000)",D1-.1000,line-4B-instruct,0.90099
"(rinna-4B, D1-.1000)",D1-.1000,rinna-4B,0.950495
"(rinna-4B-instruct, D1-.1000)",D1-.1000,rinna-4B-instruct,0.930693
"(calm2-7B, D1-.1000)",D1-.1000,calm2-7B,0.930693
"(calm2-7B-instruct, D1-.1000)",D1-.1000,calm2-7B-instruct,0.80198
"(stablelm-7B, D1-.1000)",D1-.1000,stablelm-7B,0.940594
"(stablelm-7B-instruct, D1-.1000)",D1-.1000,stablelm-7B-instruct,0.940594
"(elyza-7B, D1-.1000)",D1-.1000,elyza-7B,0.970297
"(elyza-7B-instruct, D1-.1000)",D1-.1000,elyza-7B-instruct,0.980198







Unnamed: 0,task,method,prf.strct
"(line-4B, D1-.10000)",D1-.10000,line-4B,0.960396
"(line-4B-instruct, D1-.10000)",D1-.10000,line-4B-instruct,0.960396
"(rinna-4B, D1-.10000)",D1-.10000,rinna-4B,0.960396
"(rinna-4B-instruct, D1-.10000)",D1-.10000,rinna-4B-instruct,0.950495
"(calm2-7B, D1-.10000)",D1-.10000,calm2-7B,0.970297
"(calm2-7B-instruct, D1-.10000)",D1-.10000,calm2-7B-instruct,0.970297
"(stablelm-7B, D1-.10000)",D1-.10000,stablelm-7B,0.990099
"(stablelm-7B-instruct, D1-.10000)",D1-.10000,stablelm-7B-instruct,0.990099
"(elyza-7B, D1-.10000)",D1-.10000,elyza-7B,0.990099
"(elyza-7B-instruct, D1-.10000)",D1-.10000,elyza-7B-instruct,0.990099







Unnamed: 0,task,method,prf.strct
"(line-4B, D1-.30000)",D1-.30000,line-4B,1.0
"(line-4B-instruct, D1-.30000)",D1-.30000,line-4B-instruct,1.0
"(rinna-4B, D1-.30000)",D1-.30000,rinna-4B,1.0
"(rinna-4B-instruct, D1-.30000)",D1-.30000,rinna-4B-instruct,1.0
"(calm2-7B, D1-.30000)",D1-.30000,calm2-7B,1.0
"(calm2-7B-instruct, D1-.30000)",D1-.30000,calm2-7B-instruct,1.0
"(stablelm-7B, D1-.30000)",D1-.30000,stablelm-7B,1.0
"(stablelm-7B-instruct, D1-.30000)",D1-.30000,stablelm-7B-instruct,1.0
"(elyza-7B, D1-.30000)",D1-.30000,elyza-7B,0.990099
"(elyza-7B-instruct, D1-.30000)",D1-.30000,elyza-7B-instruct,1.0







Unnamed: 0,task,method,prf.strct
"(line-4B, D1.10)",D1.10,line-4B,0.09901
"(line-4B-instruct, D1.10)",D1.10,line-4B-instruct,0.227723
"(rinna-4B, D1.10)",D1.10,rinna-4B,0.19802
"(rinna-4B-instruct, D1.10)",D1.10,rinna-4B-instruct,0.207921
"(calm2-7B, D1.10)",D1.10,calm2-7B,0.19802
"(calm2-7B-instruct, D1.10)",D1.10,calm2-7B-instruct,0.128713
"(stablelm-7B, D1.10)",D1.10,stablelm-7B,0.29703
"(stablelm-7B-instruct, D1.10)",D1.10,stablelm-7B-instruct,0.207921
"(elyza-7B, D1.10)",D1.10,elyza-7B,0.326733
"(elyza-7B-instruct, D1.10)",D1.10,elyza-7B-instruct,0.069307







Unnamed: 0,task,method,prf.strct
"(line-4B, D1.100)",D1.100,line-4B,0.118812
"(line-4B-instruct, D1.100)",D1.100,line-4B-instruct,0.079208
"(rinna-4B, D1.100)",D1.100,rinna-4B,0.09901
"(rinna-4B-instruct, D1.100)",D1.100,rinna-4B-instruct,0.079208
"(calm2-7B, D1.100)",D1.100,calm2-7B,0.089109
"(calm2-7B-instruct, D1.100)",D1.100,calm2-7B-instruct,0.059406
"(stablelm-7B, D1.100)",D1.100,stablelm-7B,0.19802
"(stablelm-7B-instruct, D1.100)",D1.100,stablelm-7B-instruct,0.128713
"(elyza-7B, D1.100)",D1.100,elyza-7B,0.118812
"(elyza-7B-instruct, D1.100)",D1.100,elyza-7B-instruct,0.118812







Unnamed: 0,task,method,prf.strct
"(line-4B, D1.1000)",D1.1000,line-4B,0.267327
"(line-4B-instruct, D1.1000)",D1.1000,line-4B-instruct,0.188119
"(rinna-4B, D1.1000)",D1.1000,rinna-4B,0.168317
"(rinna-4B-instruct, D1.1000)",D1.1000,rinna-4B-instruct,0.227723
"(calm2-7B, D1.1000)",D1.1000,calm2-7B,0.336634
"(calm2-7B-instruct, D1.1000)",D1.1000,calm2-7B-instruct,0.079208
"(stablelm-7B, D1.1000)",D1.1000,stablelm-7B,0.356436
"(stablelm-7B-instruct, D1.1000)",D1.1000,stablelm-7B-instruct,0.326733
"(elyza-7B, D1.1000)",D1.1000,elyza-7B,0.514852
"(elyza-7B-instruct, D1.1000)",D1.1000,elyza-7B-instruct,0.554455







Unnamed: 0,task,method,prf.strct
"(line-4B, D1.10000)",D1.10000,line-4B,0.435644
"(line-4B-instruct, D1.10000)",D1.10000,line-4B-instruct,0.49505
"(rinna-4B, D1.10000)",D1.10000,rinna-4B,0.386139
"(rinna-4B-instruct, D1.10000)",D1.10000,rinna-4B-instruct,0.386139
"(calm2-7B, D1.10000)",D1.10000,calm2-7B,0.792079
"(stablelm-7B, D1.10000)",D1.10000,stablelm-7B,0.762376
"(stablelm-7B-instruct, D1.10000)",D1.10000,stablelm-7B-instruct,0.722772
"(elyza-7B, D1.10000)",D1.10000,elyza-7B,0.90099
"(elyza-7B-instruct, D1.10000)",D1.10000,elyza-7B-instruct,0.881188
"(weblab-10B, D1.10000)",D1.10000,weblab-10B,0.831683







Unnamed: 0,task,method,prf.strct
"(line-4B, D1.30000)",D1.30000,line-4B,0.831683
"(line-4B-instruct, D1.30000)",D1.30000,line-4B-instruct,0.80198
"(rinna-4B, D1.30000)",D1.30000,rinna-4B,0.653465
"(rinna-4B-instruct, D1.30000)",D1.30000,rinna-4B-instruct,0.752475
"(calm2-7B, D1.30000)",D1.30000,calm2-7B,0.970297
"(calm2-7B-instruct, D1.30000)",D1.30000,calm2-7B-instruct,0.742574
"(stablelm-7B, D1.30000)",D1.30000,stablelm-7B,0.980198
"(stablelm-7B-instruct, D1.30000)",D1.30000,stablelm-7B-instruct,0.920792
"(elyza-7B, D1.30000)",D1.30000,elyza-7B,0.970297
"(elyza-7B-instruct, D1.30000)",D1.30000,elyza-7B-instruct,0.970297







Unnamed: 0,task,method,prf.strct
"(line-4B, D3.10)",D3.10,line-4B,0.009901
"(line-4B-instruct, D3.10)",D3.10,line-4B-instruct,0.009901
"(rinna-4B, D3.10)",D3.10,rinna-4B,0.049505
"(rinna-4B-instruct, D3.10)",D3.10,rinna-4B-instruct,0.0
"(calm2-7B, D3.10)",D3.10,calm2-7B,0.0
"(calm2-7B-instruct, D3.10)",D3.10,calm2-7B-instruct,0.0
"(stablelm-7B, D3.10)",D3.10,stablelm-7B,0.049505
"(stablelm-7B-instruct, D3.10)",D3.10,stablelm-7B-instruct,0.059406
"(elyza-7B, D3.10)",D3.10,elyza-7B,0.09901
"(elyza-7B-instruct, D3.10)",D3.10,elyza-7B-instruct,0.059406







Unnamed: 0,task,method,prf.strct
"(line-4B, D3.100)",D3.100,line-4B,0.09901
"(line-4B-instruct, D3.100)",D3.100,line-4B-instruct,0.079208
"(rinna-4B, D3.100)",D3.100,rinna-4B,0.069307
"(rinna-4B-instruct, D3.100)",D3.100,rinna-4B-instruct,0.089109
"(calm2-7B, D3.100)",D3.100,calm2-7B,0.059406
"(calm2-7B-instruct, D3.100)",D3.100,calm2-7B-instruct,0.039604
"(stablelm-7B, D3.100)",D3.100,stablelm-7B,0.059406
"(stablelm-7B-instruct, D3.100)",D3.100,stablelm-7B-instruct,0.079208
"(elyza-7B, D3.100)",D3.100,elyza-7B,0.079208
"(elyza-7B-instruct, D3.100)",D3.100,elyza-7B-instruct,0.039604







Unnamed: 0,task,method,prf.strct
"(line-4B, D3.1000)",D3.1000,line-4B,0.128713
"(line-4B-instruct, D3.1000)",D3.1000,line-4B-instruct,0.118812
"(rinna-4B, D3.1000)",D3.1000,rinna-4B,0.138614
"(rinna-4B-instruct, D3.1000)",D3.1000,rinna-4B-instruct,0.09901
"(calm2-7B, D3.1000)",D3.1000,calm2-7B,0.089109
"(calm2-7B-instruct, D3.1000)",D3.1000,calm2-7B-instruct,0.069307
"(stablelm-7B, D3.1000)",D3.1000,stablelm-7B,0.148515
"(stablelm-7B-instruct, D3.1000)",D3.1000,stablelm-7B-instruct,0.118812
"(elyza-7B, D3.1000)",D3.1000,elyza-7B,0.089109
"(elyza-7B-instruct, D3.1000)",D3.1000,elyza-7B-instruct,0.09901







Unnamed: 0,task,method,prf.strct
"(line-4B, D3.10000)",D3.10000,line-4B,0.29703
"(line-4B-instruct, D3.10000)",D3.10000,line-4B-instruct,0.29703
"(rinna-4B, D3.10000)",D3.10000,rinna-4B,0.277228
"(rinna-4B-instruct, D3.10000)",D3.10000,rinna-4B-instruct,0.29703
"(calm2-7B, D3.10000)",D3.10000,calm2-7B,0.306931
"(calm2-7B-instruct, D3.10000)",D3.10000,calm2-7B-instruct,0.257426
"(stablelm-7B, D3.10000)",D3.10000,stablelm-7B,0.465347
"(stablelm-7B-instruct, D3.10000)",D3.10000,stablelm-7B-instruct,0.485149
"(elyza-7B, D3.10000)",D3.10000,elyza-7B,0.534653
"(elyza-7B-instruct, D3.10000)",D3.10000,elyza-7B-instruct,0.554455







Unnamed: 0,task,method,prf.strct
"(line-4B, D3.30000)",D3.30000,line-4B,0.386139
"(line-4B-instruct, D3.30000)",D3.30000,line-4B-instruct,0.39604
"(rinna-4B, D3.30000)",D3.30000,rinna-4B,0.336634
"(rinna-4B-instruct, D3.30000)",D3.30000,rinna-4B-instruct,0.356436
"(calm2-7B, D3.30000)",D3.30000,calm2-7B,0.60396
"(calm2-7B-instruct, D3.30000)",D3.30000,calm2-7B-instruct,0.514852
"(stablelm-7B, D3.30000)",D3.30000,stablelm-7B,0.663366
"(stablelm-7B-instruct, D3.30000)",D3.30000,stablelm-7B-instruct,0.60396
"(elyza-7B, D3.30000)",D3.30000,elyza-7B,0.60396
"(elyza-7B-instruct, D3.30000)",D3.30000,elyza-7B-instruct,0.643564







Unnamed: 0,task,method,prf.strct
"(line-4B, D8.10)",D8.10,line-4B,0.0
"(line-4B-instruct, D8.10)",D8.10,line-4B-instruct,0.0
"(rinna-4B, D8.10)",D8.10,rinna-4B,0.069307
"(rinna-4B-instruct, D8.10)",D8.10,rinna-4B-instruct,0.059406
"(calm2-7B, D8.10)",D8.10,calm2-7B,0.0
"(calm2-7B-instruct, D8.10)",D8.10,calm2-7B-instruct,0.019802
"(stablelm-7B, D8.10)",D8.10,stablelm-7B,0.0
"(stablelm-7B-instruct, D8.10)",D8.10,stablelm-7B-instruct,0.0
"(elyza-7B, D8.10)",D8.10,elyza-7B,0.0
"(elyza-7B-instruct, D8.10)",D8.10,elyza-7B-instruct,0.0







Unnamed: 0,task,method,prf.strct
"(line-4B, D8.100)",D8.100,line-4B,0.029703
"(line-4B-instruct, D8.100)",D8.100,line-4B-instruct,0.029703
"(rinna-4B, D8.100)",D8.100,rinna-4B,0.029703
"(rinna-4B-instruct, D8.100)",D8.100,rinna-4B-instruct,0.039604
"(calm2-7B, D8.100)",D8.100,calm2-7B,0.029703
"(calm2-7B-instruct, D8.100)",D8.100,calm2-7B-instruct,0.019802
"(stablelm-7B, D8.100)",D8.100,stablelm-7B,0.019802
"(stablelm-7B-instruct, D8.100)",D8.100,stablelm-7B-instruct,0.029703
"(elyza-7B, D8.100)",D8.100,elyza-7B,0.0
"(elyza-7B-instruct, D8.100)",D8.100,elyza-7B-instruct,0.0







Unnamed: 0,task,method,prf.strct
"(line-4B, D8.1000)",D8.1000,line-4B,0.019802
"(line-4B-instruct, D8.1000)",D8.1000,line-4B-instruct,0.039604
"(rinna-4B, D8.1000)",D8.1000,rinna-4B,0.019802
"(rinna-4B-instruct, D8.1000)",D8.1000,rinna-4B-instruct,0.049505
"(calm2-7B, D8.1000)",D8.1000,calm2-7B,0.059406
"(calm2-7B-instruct, D8.1000)",D8.1000,calm2-7B-instruct,0.039604
"(stablelm-7B, D8.1000)",D8.1000,stablelm-7B,0.049505
"(stablelm-7B-instruct, D8.1000)",D8.1000,stablelm-7B-instruct,0.029703
"(elyza-7B, D8.1000)",D8.1000,elyza-7B,0.0
"(elyza-7B-instruct, D8.1000)",D8.1000,elyza-7B-instruct,0.0







Unnamed: 0,task,method,prf.strct
"(line-4B, D8.10000)",D8.10000,line-4B,0.277228
"(line-4B-instruct, D8.10000)",D8.10000,line-4B-instruct,0.277228
"(rinna-4B, D8.10000)",D8.10000,rinna-4B,0.287129
"(rinna-4B-instruct, D8.10000)",D8.10000,rinna-4B-instruct,0.287129
"(calm2-7B, D8.10000)",D8.10000,calm2-7B,0.316832
"(calm2-7B-instruct, D8.10000)",D8.10000,calm2-7B-instruct,0.277228
"(stablelm-7B, D8.10000)",D8.10000,stablelm-7B,0.316832
"(stablelm-7B-instruct, D8.10000)",D8.10000,stablelm-7B-instruct,0.306931
"(elyza-7B, D8.10000)",D8.10000,elyza-7B,0.356436
"(elyza-7B-instruct, D8.10000)",D8.10000,elyza-7B-instruct,0.346535







Unnamed: 0,task,method,prf.strct
"(line-4B, D8.30000)",D8.30000,line-4B,0.316832
"(line-4B-instruct, D8.30000)",D8.30000,line-4B-instruct,0.306931
"(rinna-4B, D8.30000)",D8.30000,rinna-4B,0.287129
"(rinna-4B-instruct, D8.30000)",D8.30000,rinna-4B-instruct,0.306931
"(calm2-7B, D8.30000)",D8.30000,calm2-7B,0.405941
"(calm2-7B-instruct, D8.30000)",D8.30000,calm2-7B-instruct,0.326733
"(stablelm-7B, D8.30000)",D8.30000,stablelm-7B,0.346535
"(stablelm-7B-instruct, D8.30000)",D8.30000,stablelm-7B-instruct,0.356436
"(elyza-7B, D8.30000)",D8.30000,elyza-7B,0.445545
"(elyza-7B-instruct, D8.30000)",D8.30000,elyza-7B-instruct,0.435644


In [12]:

pretty_dfs: Dict[str, pd.DataFrame] = OrderedDict()

for task_name, metric_df in metric_dfs.items():
    pretty_df = prettify_df(metric_df)
    pretty_df = rename_cols(pretty_df, METRIC_RENAMES)
    
    pretty_df.index = pretty_df[COL_METHOD]
    pretty_df = pretty_df.drop(columns=[COL_METHOD])
    
    pretty_df = color_by_rank(pretty_df,
                              'col',
                              scale_lower=COLOR_SCALE_LOWER,
                              scale_upper=COLOR_SCALE_UPPER,
                              color_lower=COLOR_PARETTE_LOWER,
                              color_upper=COLOR_PARETTE_UPPER)
    
    pretty_dfs[task_name] = pretty_df


for task_name in TASK_NAMES:
    print('\n\n')
    print(f'========================= {task_name} ========================')
    display(pretty_dfs[task_name])






Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1-.10,\cellcolor{blue!24} 32.7
line-4B-instruct,D1-.10,\cellcolor{blue!22} 29.7
rinna-4B,D1-.10,\cellcolor{blue!17} 21.8
rinna-4B-instruct,D1-.10,\cellcolor{blue!24} 32.7
calm2-7B,D1-.10,\cellcolor{blue!25} 33.7
calm2-7B-instruct,D1-.10,\cellcolor{blue!25} 33.7
stablelm-7B,D1-.10,\cellcolor{blue!28} 37.6
stablelm-7B-instruct,D1-.10,\cellcolor{blue!26} 34.7
elyza-7B,D1-.10,\cellcolor{blue!29} 39.6
elyza-7B-instruct,D1-.10,\cellcolor{blue!26} 35.6







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1-.100,\cellcolor{blue!44} 61.4
line-4B-instruct,D1-.100,\cellcolor{blue!42} 59.4
rinna-4B,D1-.100,\cellcolor{blue!40} 55.4
rinna-4B-instruct,D1-.100,\cellcolor{blue!48} 68.3
calm2-7B,D1-.100,\cellcolor{blue!45} 63.4
calm2-7B-instruct,D1-.100,\cellcolor{blue!40} 55.4
stablelm-7B,D1-.100,\cellcolor{blue!44} 62.4
stablelm-7B-instruct,D1-.100,\cellcolor{blue!42} 58.4
elyza-7B,D1-.100,\cellcolor{blue!54} 77.2
elyza-7B-instruct,D1-.100,\cellcolor{blue!48} 67.3







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1-.1000,\cellcolor{blue!61} 87.1
line-4B-instruct,D1-.1000,\cellcolor{blue!63} 90.1
rinna-4B,D1-.1000,\cellcolor{blue!66} 95.0
rinna-4B-instruct,D1-.1000,\cellcolor{blue!65} 93.1
calm2-7B,D1-.1000,\cellcolor{blue!65} 93.1
calm2-7B-instruct,D1-.1000,\cellcolor{blue!56} 80.2
stablelm-7B,D1-.1000,\cellcolor{blue!66} 94.1
stablelm-7B-instruct,D1-.1000,\cellcolor{blue!66} 94.1
elyza-7B,D1-.1000,\cellcolor{blue!67} 97.0
elyza-7B-instruct,D1-.1000,\cellcolor{blue!68} 98.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1-.10000,\cellcolor{blue!67} 96.0
line-4B-instruct,D1-.10000,\cellcolor{blue!67} 96.0
rinna-4B,D1-.10000,\cellcolor{blue!67} 96.0
rinna-4B-instruct,D1-.10000,\cellcolor{blue!66} 95.0
calm2-7B,D1-.10000,\cellcolor{blue!67} 97.0
calm2-7B-instruct,D1-.10000,\cellcolor{blue!67} 97.0
stablelm-7B,D1-.10000,\cellcolor{blue!69} 99.0
stablelm-7B-instruct,D1-.10000,\cellcolor{blue!69} 99.0
elyza-7B,D1-.10000,\cellcolor{blue!69} 99.0
elyza-7B-instruct,D1-.10000,\cellcolor{blue!69} 99.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1-.30000,\cellcolor{blue!70} 100.0
line-4B-instruct,D1-.30000,\cellcolor{blue!70} 100.0
rinna-4B,D1-.30000,\cellcolor{blue!70} 100.0
rinna-4B-instruct,D1-.30000,\cellcolor{blue!70} 100.0
calm2-7B,D1-.30000,\cellcolor{blue!70} 100.0
calm2-7B-instruct,D1-.30000,\cellcolor{blue!70} 100.0
stablelm-7B,D1-.30000,\cellcolor{blue!70} 100.0
stablelm-7B-instruct,D1-.30000,\cellcolor{blue!70} 100.0
elyza-7B,D1-.30000,\cellcolor{blue!69} 99.0
elyza-7B-instruct,D1-.30000,\cellcolor{blue!70} 100.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1.10,\cellcolor{blue!9} 9.9
line-4B-instruct,D1.10,\cellcolor{blue!18} 22.8
rinna-4B,D1.10,\cellcolor{blue!16} 19.8
rinna-4B-instruct,D1.10,\cellcolor{blue!16} 20.8
calm2-7B,D1.10,\cellcolor{blue!16} 19.8
calm2-7B-instruct,D1.10,\cellcolor{blue!11} 12.9
stablelm-7B,D1.10,\cellcolor{blue!22} 29.7
stablelm-7B-instruct,D1.10,\cellcolor{blue!16} 20.8
elyza-7B,D1.10,\cellcolor{blue!24} 32.7
elyza-7B-instruct,D1.10,\cellcolor{blue!7} 6.9







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1.100,\cellcolor{blue!10} 11.9
line-4B-instruct,D1.100,\cellcolor{blue!8} 7.9
rinna-4B,D1.100,\cellcolor{blue!9} 9.9
rinna-4B-instruct,D1.100,\cellcolor{blue!8} 7.9
calm2-7B,D1.100,\cellcolor{blue!8} 8.9
calm2-7B-instruct,D1.100,\cellcolor{blue!6} 5.9
stablelm-7B,D1.100,\cellcolor{blue!16} 19.8
stablelm-7B-instruct,D1.100,\cellcolor{blue!11} 12.9
elyza-7B,D1.100,\cellcolor{blue!10} 11.9
elyza-7B-instruct,D1.100,\cellcolor{blue!10} 11.9







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1.1000,\cellcolor{blue!20} 26.7
line-4B-instruct,D1.1000,\cellcolor{blue!15} 18.8
rinna-4B,D1.1000,\cellcolor{blue!14} 16.8
rinna-4B-instruct,D1.1000,\cellcolor{blue!18} 22.8
calm2-7B,D1.1000,\cellcolor{blue!25} 33.7
calm2-7B-instruct,D1.1000,\cellcolor{blue!8} 7.9
stablelm-7B,D1.1000,\cellcolor{blue!26} 35.6
stablelm-7B-instruct,D1.1000,\cellcolor{blue!24} 32.7
elyza-7B,D1.1000,\cellcolor{blue!37} 51.5
elyza-7B-instruct,D1.1000,\cellcolor{blue!40} 55.4







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1.10000,\cellcolor{blue!32} 43.6
line-4B-instruct,D1.10000,\cellcolor{blue!36} 49.5
rinna-4B,D1.10000,\cellcolor{blue!28} 38.6
rinna-4B-instruct,D1.10000,\cellcolor{blue!28} 38.6
calm2-7B,D1.10000,\cellcolor{blue!56} 79.2
stablelm-7B,D1.10000,\cellcolor{blue!54} 76.2
stablelm-7B-instruct,D1.10000,\cellcolor{blue!51} 72.3
elyza-7B,D1.10000,\cellcolor{blue!63} 90.1
elyza-7B-instruct,D1.10000,\cellcolor{blue!62} 88.1
weblab-10B,D1.10000,\cellcolor{blue!58} 83.2







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D1.30000,\cellcolor{blue!58} 83.2
line-4B-instruct,D1.30000,\cellcolor{blue!56} 80.2
rinna-4B,D1.30000,\cellcolor{blue!46} 65.3
rinna-4B-instruct,D1.30000,\cellcolor{blue!53} 75.2
calm2-7B,D1.30000,\cellcolor{blue!67} 97.0
calm2-7B-instruct,D1.30000,\cellcolor{blue!52} 74.3
stablelm-7B,D1.30000,\cellcolor{blue!68} 98.0
stablelm-7B-instruct,D1.30000,\cellcolor{blue!64} 92.1
elyza-7B,D1.30000,\cellcolor{blue!67} 97.0
elyza-7B-instruct,D1.30000,\cellcolor{blue!67} 97.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D3.10,\cellcolor{blue!3} 1.0
line-4B-instruct,D3.10,\cellcolor{blue!3} 1.0
rinna-4B,D3.10,\cellcolor{blue!6} 5.0
rinna-4B-instruct,D3.10,\cellcolor{blue!3} 0.0
calm2-7B,D3.10,\cellcolor{blue!3} 0.0
calm2-7B-instruct,D3.10,\cellcolor{blue!3} 0.0
stablelm-7B,D3.10,\cellcolor{blue!6} 5.0
stablelm-7B-instruct,D3.10,\cellcolor{blue!6} 5.9
elyza-7B,D3.10,\cellcolor{blue!9} 9.9
elyza-7B-instruct,D3.10,\cellcolor{blue!6} 5.9







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D3.100,\cellcolor{blue!9} 9.9
line-4B-instruct,D3.100,\cellcolor{blue!8} 7.9
rinna-4B,D3.100,\cellcolor{blue!7} 6.9
rinna-4B-instruct,D3.100,\cellcolor{blue!8} 8.9
calm2-7B,D3.100,\cellcolor{blue!6} 5.9
calm2-7B-instruct,D3.100,\cellcolor{blue!5} 4.0
stablelm-7B,D3.100,\cellcolor{blue!6} 5.9
stablelm-7B-instruct,D3.100,\cellcolor{blue!8} 7.9
elyza-7B,D3.100,\cellcolor{blue!8} 7.9
elyza-7B-instruct,D3.100,\cellcolor{blue!5} 4.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D3.1000,\cellcolor{blue!11} 12.9
line-4B-instruct,D3.1000,\cellcolor{blue!10} 11.9
rinna-4B,D3.1000,\cellcolor{blue!12} 13.9
rinna-4B-instruct,D3.1000,\cellcolor{blue!9} 9.9
calm2-7B,D3.1000,\cellcolor{blue!8} 8.9
calm2-7B-instruct,D3.1000,\cellcolor{blue!7} 6.9
stablelm-7B,D3.1000,\cellcolor{blue!12} 14.9
stablelm-7B-instruct,D3.1000,\cellcolor{blue!10} 11.9
elyza-7B,D3.1000,\cellcolor{blue!8} 8.9
elyza-7B-instruct,D3.1000,\cellcolor{blue!9} 9.9







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D3.10000,\cellcolor{blue!22} 29.7
line-4B-instruct,D3.10000,\cellcolor{blue!22} 29.7
rinna-4B,D3.10000,\cellcolor{blue!21} 27.7
rinna-4B-instruct,D3.10000,\cellcolor{blue!22} 29.7
calm2-7B,D3.10000,\cellcolor{blue!23} 30.7
calm2-7B-instruct,D3.10000,\cellcolor{blue!20} 25.7
stablelm-7B,D3.10000,\cellcolor{blue!34} 46.5
stablelm-7B-instruct,D3.10000,\cellcolor{blue!35} 48.5
elyza-7B,D3.10000,\cellcolor{blue!38} 53.5
elyza-7B-instruct,D3.10000,\cellcolor{blue!40} 55.4







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D3.30000,\cellcolor{blue!28} 38.6
line-4B-instruct,D3.30000,\cellcolor{blue!29} 39.6
rinna-4B,D3.30000,\cellcolor{blue!25} 33.7
rinna-4B-instruct,D3.30000,\cellcolor{blue!26} 35.6
calm2-7B,D3.30000,\cellcolor{blue!43} 60.4
calm2-7B-instruct,D3.30000,\cellcolor{blue!37} 51.5
stablelm-7B,D3.30000,\cellcolor{blue!47} 66.3
stablelm-7B-instruct,D3.30000,\cellcolor{blue!43} 60.4
elyza-7B,D3.30000,\cellcolor{blue!43} 60.4
elyza-7B-instruct,D3.30000,\cellcolor{blue!46} 64.4







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D8.10,\cellcolor{blue!3} 0.0
line-4B-instruct,D8.10,\cellcolor{blue!3} 0.0
rinna-4B,D8.10,\cellcolor{blue!7} 6.9
rinna-4B-instruct,D8.10,\cellcolor{blue!6} 5.9
calm2-7B,D8.10,\cellcolor{blue!3} 0.0
calm2-7B-instruct,D8.10,\cellcolor{blue!4} 2.0
stablelm-7B,D8.10,\cellcolor{blue!3} 0.0
stablelm-7B-instruct,D8.10,\cellcolor{blue!3} 0.0
elyza-7B,D8.10,\cellcolor{blue!3} 0.0
elyza-7B-instruct,D8.10,\cellcolor{blue!3} 0.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D8.100,\cellcolor{blue!5} 3.0
line-4B-instruct,D8.100,\cellcolor{blue!5} 3.0
rinna-4B,D8.100,\cellcolor{blue!5} 3.0
rinna-4B-instruct,D8.100,\cellcolor{blue!5} 4.0
calm2-7B,D8.100,\cellcolor{blue!5} 3.0
calm2-7B-instruct,D8.100,\cellcolor{blue!4} 2.0
stablelm-7B,D8.100,\cellcolor{blue!4} 2.0
stablelm-7B-instruct,D8.100,\cellcolor{blue!5} 3.0
elyza-7B,D8.100,\cellcolor{blue!3} 0.0
elyza-7B-instruct,D8.100,\cellcolor{blue!3} 0.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D8.1000,\cellcolor{blue!4} 2.0
line-4B-instruct,D8.1000,\cellcolor{blue!5} 4.0
rinna-4B,D8.1000,\cellcolor{blue!4} 2.0
rinna-4B-instruct,D8.1000,\cellcolor{blue!6} 5.0
calm2-7B,D8.1000,\cellcolor{blue!6} 5.9
calm2-7B-instruct,D8.1000,\cellcolor{blue!5} 4.0
stablelm-7B,D8.1000,\cellcolor{blue!6} 5.0
stablelm-7B-instruct,D8.1000,\cellcolor{blue!5} 3.0
elyza-7B,D8.1000,\cellcolor{blue!3} 0.0
elyza-7B-instruct,D8.1000,\cellcolor{blue!3} 0.0







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D8.10000,\cellcolor{blue!21} 27.7
line-4B-instruct,D8.10000,\cellcolor{blue!21} 27.7
rinna-4B,D8.10000,\cellcolor{blue!22} 28.7
rinna-4B-instruct,D8.10000,\cellcolor{blue!22} 28.7
calm2-7B,D8.10000,\cellcolor{blue!24} 31.7
calm2-7B-instruct,D8.10000,\cellcolor{blue!21} 27.7
stablelm-7B,D8.10000,\cellcolor{blue!24} 31.7
stablelm-7B-instruct,D8.10000,\cellcolor{blue!23} 30.7
elyza-7B,D8.10000,\cellcolor{blue!26} 35.6
elyza-7B-instruct,D8.10000,\cellcolor{blue!26} 34.7







Unnamed: 0_level_0,task,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1
line-4B,D8.30000,\cellcolor{blue!24} 31.7
line-4B-instruct,D8.30000,\cellcolor{blue!23} 30.7
rinna-4B,D8.30000,\cellcolor{blue!22} 28.7
rinna-4B-instruct,D8.30000,\cellcolor{blue!23} 30.7
calm2-7B,D8.30000,\cellcolor{blue!30} 40.6
calm2-7B-instruct,D8.30000,\cellcolor{blue!24} 32.7
stablelm-7B,D8.30000,\cellcolor{blue!26} 34.7
stablelm-7B-instruct,D8.30000,\cellcolor{blue!26} 35.6
elyza-7B,D8.30000,\cellcolor{blue!32} 44.6
elyza-7B-instruct,D8.30000,\cellcolor{blue!32} 43.6


In [13]:
def horizontal_concat(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    dfs = [df.copy() for df in dfs]
    # align index for horizontal concat
    # for df in dfs:
    #     df.index = range(len(df))
    return pd.concat(dfs, axis=1)

colored_concat_df = horizontal_concat(
    [pretty_df for task_name, pretty_df in pretty_dfs.items()]
)
colored_concat_df = colored_concat_df.drop(columns=[COL_TASK], axis=1)

print('    '.join([task_name for task_name in pretty_dfs.keys()]))
colored_concat_df

D1-.10    D1-.100    D1-.1000    D1-.10000    D1-.30000    D1.10    D1.100    D1.1000    D1.10000    D1.30000    D3.10    D3.100    D3.1000    D3.10000    D3.30000    D8.10    D8.100    D8.1000    D8.10000    D8.30000


Unnamed: 0_level_0,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct,prf.strct
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
line-4B,\cellcolor{blue!24} 32.7,\cellcolor{blue!44} 61.4,\cellcolor{blue!61} 87.1,\cellcolor{blue!67} 96.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!9} 9.9,\cellcolor{blue!10} 11.9,\cellcolor{blue!20} 26.7,\cellcolor{blue!32} 43.6,\cellcolor{blue!58} 83.2,\cellcolor{blue!3} 1.0,\cellcolor{blue!9} 9.9,\cellcolor{blue!11} 12.9,\cellcolor{blue!22} 29.7,\cellcolor{blue!28} 38.6,\cellcolor{blue!3} 0.0,\cellcolor{blue!5} 3.0,\cellcolor{blue!4} 2.0,\cellcolor{blue!21} 27.7,\cellcolor{blue!24} 31.7
line-4B-instruct,\cellcolor{blue!22} 29.7,\cellcolor{blue!42} 59.4,\cellcolor{blue!63} 90.1,\cellcolor{blue!67} 96.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!18} 22.8,\cellcolor{blue!8} 7.9,\cellcolor{blue!15} 18.8,\cellcolor{blue!36} 49.5,\cellcolor{blue!56} 80.2,\cellcolor{blue!3} 1.0,\cellcolor{blue!8} 7.9,\cellcolor{blue!10} 11.9,\cellcolor{blue!22} 29.7,\cellcolor{blue!29} 39.6,\cellcolor{blue!3} 0.0,\cellcolor{blue!5} 3.0,\cellcolor{blue!5} 4.0,\cellcolor{blue!21} 27.7,\cellcolor{blue!23} 30.7
rinna-4B,\cellcolor{blue!17} 21.8,\cellcolor{blue!40} 55.4,\cellcolor{blue!66} 95.0,\cellcolor{blue!67} 96.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!16} 19.8,\cellcolor{blue!9} 9.9,\cellcolor{blue!14} 16.8,\cellcolor{blue!28} 38.6,\cellcolor{blue!46} 65.3,\cellcolor{blue!6} 5.0,\cellcolor{blue!7} 6.9,\cellcolor{blue!12} 13.9,\cellcolor{blue!21} 27.7,\cellcolor{blue!25} 33.7,\cellcolor{blue!7} 6.9,\cellcolor{blue!5} 3.0,\cellcolor{blue!4} 2.0,\cellcolor{blue!22} 28.7,\cellcolor{blue!22} 28.7
rinna-4B-instruct,\cellcolor{blue!24} 32.7,\cellcolor{blue!48} 68.3,\cellcolor{blue!65} 93.1,\cellcolor{blue!66} 95.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!16} 20.8,\cellcolor{blue!8} 7.9,\cellcolor{blue!18} 22.8,\cellcolor{blue!28} 38.6,\cellcolor{blue!53} 75.2,\cellcolor{blue!3} 0.0,\cellcolor{blue!8} 8.9,\cellcolor{blue!9} 9.9,\cellcolor{blue!22} 29.7,\cellcolor{blue!26} 35.6,\cellcolor{blue!6} 5.9,\cellcolor{blue!5} 4.0,\cellcolor{blue!6} 5.0,\cellcolor{blue!22} 28.7,\cellcolor{blue!23} 30.7
calm2-7B,\cellcolor{blue!25} 33.7,\cellcolor{blue!45} 63.4,\cellcolor{blue!65} 93.1,\cellcolor{blue!67} 97.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!16} 19.8,\cellcolor{blue!8} 8.9,\cellcolor{blue!25} 33.7,\cellcolor{blue!56} 79.2,\cellcolor{blue!67} 97.0,\cellcolor{blue!3} 0.0,\cellcolor{blue!6} 5.9,\cellcolor{blue!8} 8.9,\cellcolor{blue!23} 30.7,\cellcolor{blue!43} 60.4,\cellcolor{blue!3} 0.0,\cellcolor{blue!5} 3.0,\cellcolor{blue!6} 5.9,\cellcolor{blue!24} 31.7,\cellcolor{blue!30} 40.6
calm2-7B-instruct,\cellcolor{blue!25} 33.7,\cellcolor{blue!40} 55.4,\cellcolor{blue!56} 80.2,\cellcolor{blue!67} 97.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!11} 12.9,\cellcolor{blue!6} 5.9,\cellcolor{blue!8} 7.9,,\cellcolor{blue!52} 74.3,\cellcolor{blue!3} 0.0,\cellcolor{blue!5} 4.0,\cellcolor{blue!7} 6.9,\cellcolor{blue!20} 25.7,\cellcolor{blue!37} 51.5,\cellcolor{blue!4} 2.0,\cellcolor{blue!4} 2.0,\cellcolor{blue!5} 4.0,\cellcolor{blue!21} 27.7,\cellcolor{blue!24} 32.7
stablelm-7B,\cellcolor{blue!28} 37.6,\cellcolor{blue!44} 62.4,\cellcolor{blue!66} 94.1,\cellcolor{blue!69} 99.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!22} 29.7,\cellcolor{blue!16} 19.8,\cellcolor{blue!26} 35.6,\cellcolor{blue!54} 76.2,\cellcolor{blue!68} 98.0,\cellcolor{blue!6} 5.0,\cellcolor{blue!6} 5.9,\cellcolor{blue!12} 14.9,\cellcolor{blue!34} 46.5,\cellcolor{blue!47} 66.3,\cellcolor{blue!3} 0.0,\cellcolor{blue!4} 2.0,\cellcolor{blue!6} 5.0,\cellcolor{blue!24} 31.7,\cellcolor{blue!26} 34.7
stablelm-7B-instruct,\cellcolor{blue!26} 34.7,\cellcolor{blue!42} 58.4,\cellcolor{blue!66} 94.1,\cellcolor{blue!69} 99.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!16} 20.8,\cellcolor{blue!11} 12.9,\cellcolor{blue!24} 32.7,\cellcolor{blue!51} 72.3,\cellcolor{blue!64} 92.1,\cellcolor{blue!6} 5.9,\cellcolor{blue!8} 7.9,\cellcolor{blue!10} 11.9,\cellcolor{blue!35} 48.5,\cellcolor{blue!43} 60.4,\cellcolor{blue!3} 0.0,\cellcolor{blue!5} 3.0,\cellcolor{blue!5} 3.0,\cellcolor{blue!23} 30.7,\cellcolor{blue!26} 35.6
elyza-7B,\cellcolor{blue!29} 39.6,\cellcolor{blue!54} 77.2,\cellcolor{blue!67} 97.0,\cellcolor{blue!69} 99.0,\cellcolor{blue!69} 99.0,\cellcolor{blue!24} 32.7,\cellcolor{blue!10} 11.9,\cellcolor{blue!37} 51.5,\cellcolor{blue!63} 90.1,\cellcolor{blue!67} 97.0,\cellcolor{blue!9} 9.9,\cellcolor{blue!8} 7.9,\cellcolor{blue!8} 8.9,\cellcolor{blue!38} 53.5,\cellcolor{blue!43} 60.4,\cellcolor{blue!3} 0.0,\cellcolor{blue!3} 0.0,\cellcolor{blue!3} 0.0,\cellcolor{blue!26} 35.6,\cellcolor{blue!32} 44.6
elyza-7B-instruct,\cellcolor{blue!26} 35.6,\cellcolor{blue!48} 67.3,\cellcolor{blue!68} 98.0,\cellcolor{blue!69} 99.0,\cellcolor{blue!70} 100.0,\cellcolor{blue!7} 6.9,\cellcolor{blue!10} 11.9,\cellcolor{blue!40} 55.4,\cellcolor{blue!62} 88.1,\cellcolor{blue!67} 97.0,\cellcolor{blue!6} 5.9,\cellcolor{blue!5} 4.0,\cellcolor{blue!9} 9.9,\cellcolor{blue!40} 55.4,\cellcolor{blue!46} 64.4,\cellcolor{blue!3} 0.0,\cellcolor{blue!3} 0.0,\cellcolor{blue!3} 0.0,\cellcolor{blue!26} 34.7,\cellcolor{blue!32} 43.6


# The latex outputs

## The upper part of the table

## The values of the table

In [None]:
def add_gpt_row(latex_str: str) -> str:
    lines: List[str] = []
    for line in latex_str.split('\n'):
        if re.match(f'^ *{COL_METHOD}.*', line):
            gpt_line = re.sub('\&[^\&]*', '& - ', line).replace(COL_METHOD, 'GPT-4') + '  \\\\'
            lines.append(line)
            lines.append(gpt_line)
        else:
            lines.append(line)
    return '\n'.join(lines)
    
latex_str = to_latex(colored_concat_df, with_index=True)
print(add_gpt_row(latex_str))

In [None]:
def task_name_to_shot(task_name: str) -> str:
    return re.sub('000([^0-9]*)$', ',000\g<1>', task_name.split('.')[-1])

def task_name_to_dataset_name(task_name: str) -> str:
    return '.'.join(task_name.split('.')[:-1])

num_metrics = len(METRIC_RENAMES)
num_task = len(pretty_dfs)

dataset_names = []
for task_name in pretty_dfs: 
    dataset_name = task_name_to_dataset_name(task_name)
    if dataset_name not in dataset_names:
        dataset_names.append(dataset_name)
num_datasets = len(dataset_names)
num_shot_settings = int(num_task / num_datasets)

dataset_row = '{}    &    ' + '  &  '.join([f'\multicolumn{{{num_shot_settings * num_metrics}}}{{c}}{{{dataset_name}}}'
                                            for dataset_name in dataset_names]) + '    \\\\'
dataset_underline = '    '.join(['\cmidrule(l{\\tabcolsep}r{\\tabcolsep})' + f'{{{2 + (num_shot_settings * num_metrics) * i_col}-{2 + (num_shot_settings * num_metrics) * (i_col + 1) - 1}}}'
                                 for i_col in range(num_datasets)])

shot_row = '{}    &    ' + '  &  '.join([f'\multicolumn{{{num_metrics}}}{{c}}{{\\scriptsize {"$n$=" if i == 0 else ""}{task_name_to_shot(task_name)}}}'
                                         for i, task_name in enumerate(pretty_dfs.keys())]) + '    \\\\'
shot_underline = '    '.join(['\cmidrule(l{\\tabcolsep}r{\\tabcolsep})' + f'{{{2 + num_metrics * i_col}-{2 + num_metrics * (i_col + 1) - 1}}}'
                              for i_col in range(num_task)])

print('\\toprule')
print(dataset_row.replace('_', '\_'))
print()
print(dataset_underline.replace('_', '\_'))
print()
print(shot_row.replace('_', '\_'))
print()
print(shot_underline.replace('_', '\_'))
