In [20]:
import collections
import json
import os
import re
from dataclasses import dataclass, field
from pprint import pprint
from typing import NamedTuple

import numpy as np
import pandas as pd
import statsmodels.stats.api as sms
import torch
from colorama import Fore, Style
from datasets import load_dataset
from fuzzywuzzy import fuzz
from transformers import TrainingArguments
from tqdm import tqdm

In [2]:
# Copied from train.py
@dataclass
class LeiaTrainingArguments(TrainingArguments):
    model_name_or_path: str | None = field(default=None)
    use_flash_attention_2: bool = field(default=False)

    wikipedia_dataset_dir: str | None = field(default=None)

    trans_insertion_strategy: str = field(default="none")
    trans_insertion_prob: float = field(default=1.0)
    trans_insertion_prob_decay: bool = field(default=False)
    trans_insertion_min_prob: float = field(default=0.0)
    disable_trans_token_loss: bool = field(default=False)

    max_length: int = field(default=1024)

    eval_tasks: str | None = field(default=None)
    max_eval_samples_for_tasks: int | None = field(default=None)
    num_fewshot_samples_for_tasks: str | None = field(default=None)
    use_dynamic_generation_length: bool = field(default=True)


class Task(NamedTuple):
    name: str
    language: str


class Result(NamedTuple):
    task: Task
    metric: str
    value: float


class Experiment(NamedTuple):
    name: str
    args: LeiaTrainingArguments | None
    results: list[Result]
    predictions: dict[Task, list[dict]]

In [3]:
TASK2LANGUAGE = {
    "jemhopqa": "ja",
    "jaqket": "ja",
    "jcommonsenseqa": "ja",
    "jsquad": "ja",
    "niilc": "ja",
}


def parse_file_name(file_name: str) -> Task | None:
    if file_name.endswith("_metrics.json"):
        task_name = file_name[: -len("_metrics.json")]
    elif file_name.endswith("_predictions.jsonl"):
        task_name = file_name[: -len("_predictions.jsonl")]

    match_obj = re.match(r"^([a-z_]+)_([a-z]{2})$", task_name)
    if match_obj:
        task_name, language = match_obj.group(1), match_obj.group(2)
        return Task(task_name, language)
    elif task_name in TASK2LANGUAGE:
        return Task(task_name, TASK2LANGUAGE[task_name])

    print(f"Could not parse task name: {task_name}")
    return None

In [4]:
def load_experiments(runs_dir: str) -> dict[str, Experiment]:
    experiments: dict[str, Experiment] = {}
    for run_dir in os.listdir(runs_dir):
        if not os.path.isdir(os.path.join(runs_dir, run_dir, "results")):
            tqdm.write(f"results directory not found. Skipping {run_dir}")
            continue

        args = None
        if os.path.exists(os.path.join(runs_dir, run_dir, "training_args.bin")):
            args = torch.load(os.path.join(runs_dir, run_dir, "training_args.bin"))

        results = []
        predictions = {}
        for result_file in os.listdir(os.path.join(runs_dir, run_dir, "results")):
            if result_file.endswith("_metrics.json"):
                with open(os.path.join(runs_dir, run_dir, "results", result_file)) as f:
                    task = parse_file_name(result_file)
                    if task is not None:
                        for metric, value in json.load(f).items():
                            if metric != "num_fewshot_samples":
                                results.append(Result(task, metric, value))

            elif result_file.endswith("_predictions.jsonl"):
                task = parse_file_name(result_file)
                if task is not None:
                    with open(os.path.join(runs_dir, run_dir, "results", result_file)) as f:
                        predictions[task] = [json.loads(line) for line in f]

        experiments[run_dir] = Experiment(run_dir, args, results, predictions)

    return experiments

In [5]:
experiments = load_experiments("../runs")

## Experiments with LLaMA 2 7B


### All Results


In [6]:
target_tasks = ["xcodah", "xcsqa"]
languages = ["ar", "es", "hi", "ja", "ru", "sw", "zh"]

In [7]:
data = collections.defaultdict(lambda: collections.defaultdict(dict))

for experiment_name, experiment in sorted(experiments.items(), key=lambda x: x[0]):
    if experiment.args is None:
        if experiment_name != "Llama-2-7b-hf":
            continue
    else:
        if experiment.args.seed != 42:  # default seed
            continue
        if experiment_name.startswith("swallow"):
            continue

        experiment_name = f"leia_{experiment.args.trans_insertion_strategy}_p{experiment.args.trans_insertion_prob}"
        if experiment.args.disable_trans_token_loss:
            experiment_name += "_no_token_loss"

    for result in experiment.results:
        data[experiment_name][result.task.name][result.task.language] = result.value

for task in target_tasks:
    rows = []
    row_names = []
    for model_name, results in sorted(data.items(), key=lambda x: x[0]):
        rows.append([results.get(task, {}).get(language) for language in languages])
        row_names.append(model_name)

    df = pd.DataFrame(rows, columns=languages, index=row_names)
    df = df.round(3)
    display(f"{task}:", df)

'xcodah:'

Unnamed: 0,ar,es,hi,ja,ru,sw,zh
Llama-2-7b-hf,0.303,0.453,0.297,0.303,0.343,0.287,0.367
leia_left_p0.5,0.33,0.467,0.303,0.35,0.377,0.307,0.393
leia_left_p1.0,0.32,0.467,0.303,0.34,0.383,0.3,0.38
leia_none_p1.0,0.3,0.453,0.27,0.3,0.347,0.29,0.383
leia_replace_p0.5,0.33,0.463,0.303,0.353,0.37,0.307,0.39
leia_replace_p1.0,0.32,0.467,0.3,0.347,0.38,0.3,0.39
leia_right_p0.5,0.333,0.467,0.307,0.35,0.377,0.303,0.393
leia_right_p0.5_no_token_loss,0.323,0.463,0.303,0.353,0.377,0.307,0.39
leia_right_p1.0,0.323,0.467,0.303,0.343,0.383,0.303,0.383


'xcsqa:'

Unnamed: 0,ar,es,hi,ja,ru,sw,zh
Llama-2-7b-hf,0.21,0.451,0.191,0.344,0.36,0.16,0.401
leia_left_p0.5,0.22,0.46,0.184,0.353,0.358,0.159,0.405
leia_left_p1.0,0.219,0.46,0.193,0.346,0.356,0.162,0.399
leia_none_p1.0,0.209,0.449,0.182,0.349,0.355,0.159,0.396
leia_replace_p0.5,0.218,0.458,0.184,0.355,0.36,0.159,0.403
leia_replace_p1.0,0.218,0.455,0.187,0.346,0.356,0.163,0.401
leia_right_p0.5,0.22,0.458,0.183,0.355,0.36,0.161,0.404
leia_right_p0.5_no_token_loss,0.216,0.461,0.185,0.355,0.361,0.161,0.403
leia_right_p1.0,0.218,0.456,0.193,0.346,0.358,0.164,0.399


### Significance tests


In [8]:
llama2_data = collections.defaultdict(dict)
baseline_data = collections.defaultdict(lambda: collections.defaultdict(list))
leia_data = collections.defaultdict(lambda: collections.defaultdict(list))

for experiment_name, experiment in experiments.items():
    if experiment_name.startswith("swallow"):
        continue

    if experiment.name == "Llama-2-7b-hf":
        for result in experiment.results:
            llama2_data[result.task.name][result.task.language] = result.value

    if experiment.args is not None:
        if experiment.args.trans_insertion_strategy == "none":
            for result in experiment.results:
                baseline_data[result.task.name][result.task.language].append(result.value)

        elif experiment.args.trans_insertion_strategy == "right" and experiment.args.trans_insertion_prob == 0.5:
            if not experiment.args.disable_trans_token_loss:
                for result in experiment.results:
                    leia_data[result.task.name][result.task.language].append(result.value)

for task_name in target_tasks:
    rows = []
    for language in languages:
        assert len(baseline_data[task_name][language]) == 5
        baseline_mean = np.mean(baseline_data[task_name][language])
        baseline_lower, baseline_higher = sms.DescrStatsW(baseline_data[task_name][language]).tconfint_mean()

        assert len(leia_data[task_name][language]) == 5
        leia_mean = np.mean(leia_data[task_name][language])
        leia_lower, leia_higher = sms.DescrStatsW(leia_data[task_name][language]).tconfint_mean()

        llama2_score = llama2_data[task_name][language]

        rows.append(
            [
                leia_mean,
                leia_higher - leia_mean,
                baseline_mean,
                baseline_higher - baseline_mean,
                llama2_score,
                leia_lower,
                baseline_higher,
            ]
        )
    col_names = [
        "leia_mean",
        "leia_interval",
        "baseline_mean",
        "baseline_interval",
        "llama2",
        "leia_lower",
        "baseline_higher",
    ]

    df = pd.DataFrame(rows, columns=col_names, index=languages)
    df = df.transpose()
    df = df.round(3)
    display(f"{task_name}:", df)

'xcodah:'

Unnamed: 0,ar,es,hi,ja,ru,sw,zh
leia_mean,0.328,0.466,0.306,0.349,0.375,0.304,0.391
leia_interval,0.005,0.002,0.002,0.004,0.002,0.002,0.002
baseline_mean,0.307,0.455,0.272,0.304,0.344,0.29,0.383
baseline_interval,0.006,0.004,0.002,0.003,0.009,0.0,0.003
llama2,0.303,0.453,0.297,0.303,0.343,0.287,0.367
leia_lower,0.323,0.464,0.304,0.345,0.372,0.302,0.389
baseline_higher,0.313,0.458,0.274,0.307,0.353,0.29,0.386


'xcsqa:'

Unnamed: 0,ar,es,hi,ja,ru,sw,zh
leia_mean,0.219,0.457,0.184,0.354,0.361,0.16,0.405
leia_interval,0.002,0.001,0.002,0.002,0.002,0.001,0.001
baseline_mean,0.213,0.448,0.182,0.345,0.357,0.159,0.397
baseline_interval,0.003,0.002,0.002,0.003,0.003,0.001,0.001
llama2,0.21,0.451,0.191,0.344,0.36,0.16,0.401
leia_lower,0.217,0.457,0.182,0.352,0.359,0.158,0.403
baseline_higher,0.215,0.45,0.184,0.348,0.359,0.16,0.398


## Experiments based on Swallow-7B


### Results


In [9]:
target_tasks = ["xcodah", "xcsqa", "jcommonsenseqa", "niilc", "jemhopqa", "jaqket"]

In [10]:
lm_eval_harness_results: dict[str, dict[str, list[float]]] = collections.defaultdict(
    lambda: collections.defaultdict(list)
)
lm_eval_harness_results_dir = "../jp-lm-evaluation-harness/results"

for result_dir in os.listdir(lm_eval_harness_results_dir):
    if not os.path.exists(os.path.join(lm_eval_harness_results_dir, result_dir, "results.json")):
        print("results.json not found. Skipping", result_dir)
        continue

    model_name = result_dir.split("_seed")[0]
    with open(os.path.join(lm_eval_harness_results_dir, result_dir, "results.json")) as f:
        for key, results in json.load(f)["results"].items():
            if key == "jcommonsenseqa-1.1-0.2.1":
                lm_eval_harness_results[model_name]["jcommonsenseqa"].append(results["acc"])
            elif key == "jaqket_v1-0.1-0.2":
                lm_eval_harness_results[model_name]["jaqket"].append(results["acc"])

In [11]:
llm_jp_eval_results: dict[str, dict[str, list[float]]] = collections.defaultdict(lambda: collections.defaultdict(list))
llm_jp_eval_results_dir = "../llm-jp-eval/results"

for result_dir in os.listdir(llm_jp_eval_results_dir):
    if not os.path.exists(os.path.join(llm_jp_eval_results_dir, result_dir, "score_eval.json")):
        print("score_eval.json not found. Skipping", result_dir)
        continue

    model_name = result_dir.split("_seed")[0]
    with open(os.path.join(llm_jp_eval_results_dir, result_dir, "score_eval.json")) as f:
        for key, score in json.load(f).items():
            task_name = key.split("_")[0]
            if task_name in ("jemhopqa", "niilc"):
                llm_jp_eval_results[model_name][task_name].append(float(score))

In [12]:
data = collections.defaultdict(lambda: collections.defaultdict(list))

for experiment_name, experiment in experiments.items():
    if experiment.name in ("Llama-2-7b-hf", "Swallow-7b-hf"):
        for result in experiment.results:
            data[experiment_name][result.task.name] = [result.value]

    if not experiment_name.startswith("swallow"):
        continue

    if experiment.args is not None:
        if not experiment.args.disable_trans_token_loss:
            experiment_name = experiment_name.split("_seed")[0]
            for result in experiment.results:
                data[experiment_name][result.task.name].append(result.value)

for experiment_name in experiments.keys():
    experiment_name = experiment_name.split("_seed")[0]
    for key, values in lm_eval_harness_results[experiment_name].items():
        data[experiment_name][key] = values
    for key, values in llm_jp_eval_results[experiment_name].items():
        data[experiment_name][key] = values

In [13]:
rows = []

for task_name in target_tasks:
    baseline_results = data["swallow_none_lr5e-6_step50"][task_name]
    assert len(baseline_results) == 5
    baseline_mean = np.mean(baseline_results)
    baseline_lower, baseline_higher = sms.DescrStatsW(baseline_results).tconfint_mean()

    leia_results = data["swallow_right_p0.5_lr5e-6_step50"][task_name]
    assert len(leia_results) == 5
    leia_mean = np.mean(leia_results)
    leia_lower, leia_higher = sms.DescrStatsW(leia_results).tconfint_mean()

    llama2_score = data["Llama-2-7b-hf"][task_name][0]
    swallow_score = data["Swallow-7b-hf"][task_name][0]

    rows.append(
        [
            leia_mean,
            leia_higher - leia_mean,
            baseline_mean,
            baseline_higher - baseline_mean,
            llama2_score,
            swallow_score,
            leia_lower,
            baseline_higher,
        ]
    )

col_names = [
    "leia_mean",
    "leia_interval",
    "baseline_mean",
    "baseline_interval",
    "llama2",
    "swallow",
    "leia_lower",
    "baseline_higher",
]

df = pd.DataFrame(rows, columns=col_names, index=target_tasks)
df = df.transpose()
df = df.round(3)
display(df)

Unnamed: 0,xcodah,xcsqa,jcommonsenseqa,niilc,jemhopqa,jaqket
leia_mean,0.425,0.421,0.806,0.603,0.545,0.413
leia_interval,0.002,0.001,0.002,0.002,0.001,0.006
baseline_mean,0.407,0.396,0.793,0.58,0.503,0.35
baseline_interval,0.003,0.002,0.001,0.003,0.008,0.008
llama2,0.287,0.21,0.678,0.328,0.448,0.314
swallow,0.42,0.41,0.803,0.595,0.508,0.391
leia_lower,0.423,0.42,0.804,0.602,0.544,0.407
baseline_higher,0.411,0.398,0.794,0.583,0.511,0.357


### Analysis


In [29]:
xcodah_en_dataset = load_dataset("xcsr", "X-CODAH-en", split="validation")
xcodah_en_dataset = {example["id"]: example for example in xcodah_en_dataset}

for item1, item2 in zip(
    experiments["ja_llama2_none_lr5e-6_step50"].predictions[Task("xcodah", "ja")],
    experiments["ja_llama2_right_p0.5_lr5e-6_step50"].predictions[Task("xcodah", "ja")],
):
    assert item1["example"]["id"] == item2["example"]["id"]

    if item1["prediction"] != item2["prediction"]:
        label = ["A", "B", "C", "D"].index(item1["example"]["answerKey"])
        # print(item1['example']['question']['choices']['text'])
        print(Fore.GREEN if item1["prediction"] == label else Fore.RED, end="")
        print("LEIAなし:", item1["example"]["question"]["choices"]["text"][item1["prediction"]])
        print(
            "LEIAなし (en):",
            xcodah_en_dataset[item1["example"]["id"]]["question"]["choices"]["text"][item1["prediction"]],
        )
        print(Fore.GREEN if item2["prediction"] == label else Fore.RED, end="")
        print("LEIAあり:", item1["example"]["question"]["choices"]["text"][item2["prediction"]])
        print(
            "LEIAなし (en):",
            xcodah_en_dataset[item2["example"]["id"]]["question"]["choices"]["text"][item2["prediction"]],
        )
        print(Style.RESET_ALL, end="")
        print("正解:", item1["example"]["question"]["choices"]["text"][label])
        print("正解 (en):", xcodah_en_dataset[item1["example"]["id"]]["question"]["choices"]["text"][label])
        print("---")

[32mLEIAなし: サッカーの試合が行われています。 サッカー選手がチームメイトにボールを渡す
LEIAなし (en): A football game is in progress. A football player practices on the field
[31mLEIAあり: サッカーの試合が行われています。 フィールドで練習するサッカー選手
LEIAなし (en): A football game is in progress. A football player passes the ball to a teammate
[0m正解: サッカーの試合が行われています。 サッカー選手がチームメイトにボールを渡す
正解 (en): A football game is in progress. A football player practices on the field
---
[31mLEIAなし: イレブンマイナステン
LEIAなし (en): eleven minus ten ten
[32mLEIAあり: イレブンマイナスワン
LEIAなし (en): eleven minus ten nine
[0m正解: イレブンマイナスワン
正解 (en): eleven minus ten nine
---
[31mLEIAなし: 電話が鳴る音がする。 音量を上げてみました。
LEIAなし (en): I hear my phone ring. I turn up the volume.
[32mLEIAあり: 電話が鳴る音がする。 私はそれに答える。
LEIAなし (en): I hear my phone ring. I answer it.
[0m正解: 電話が鳴る音がする。 私はそれに答える。
正解 (en): I hear my phone ring. I answer it.
---
[32mLEIAなし: 曲芸師は、ポリアモリーが彼らのためのものではないことを知っていた。 曲芸師は3人の恋人を扱えなくなった。
LEIAなし (en): The juggler knew that polyamory was not for them. The juggler juggled frequently at a loc

In [31]:
xcsqa_en_dataset = load_dataset("xcsr", "X-CSQA-en", split="validation")
xcsqa_en_dataset = {example["id"]: example for example in xcsqa_en_dataset}

for item1, item2 in zip(
    experiments["ja_llama2_none_lr5e-6_step50"].predictions[Task("xcsqa", "ja")],
    experiments["ja_llama2_right_p0.5_lr5e-6_step50"].predictions[Task("xcsqa", "ja")],
):
    assert item1["example"]["id"] == item2["example"]["id"]
    if item1["prediction"] != item2["prediction"]:
        label = ["A", "B", "C", "D", "E"].index(item1["example"]["answerKey"])
        # print(item1['example']['question']['choices']['text'])
        print(Style.RESET_ALL, end="")
        print("問題文:", item1["example"]["question"]["stem"])
        print("問題文 (en):", xcsqa_en_dataset[item1["example"]["id"]]["question"]["stem"])
        print(Fore.GREEN if item1["prediction"] == label else Fore.RED, end="")
        print(
            "LEIAなし:",
            item1["example"]["question"]["choices"]["text"][item1["prediction"]],
        )
        print(
            "LEIAなし (en):",
            xcsqa_en_dataset[item1["example"]["id"]]["question"]["choices"]["text"][item1["prediction"]],
        )
        print(Fore.GREEN if item2["prediction"] == label else Fore.RED, end="")
        print(
            "LEIAあり:",
            item2["example"]["question"]["choices"]["text"][item2["prediction"]],
        )
        print(
            "LEIAあり (en):",
            xcsqa_en_dataset[item2["example"]["id"]]["question"]["choices"]["text"][item2["prediction"]],
        )
        print("正解:", item2["example"]["question"]["choices"]["text"][label])
        print("正解 (en):", xcsqa_en_dataset[item2["example"]["id"]]["question"]["choices"]["text"][label])
        print("---")

[0m問題文: テニスコートでミトン型になっているとしたら、あなたはどの状態ですか？
問題文 (en): Which state are you in if you're at a tennis court in a mitten-shaped state?
[31mLEIAなし: タウン
LEIAなし (en): michigan
[31mLEIAあり: カントリークラブ
LEIAあり (en): country club
正解: ミシガン
正解 (en): florida
---
[0m問題文: 並んで何を達成するのか？
問題文 (en): What will you accomplish by standing in line?
[31mLEIAなし: 怒り
LEIAなし (en): anger
[31mLEIAあり: 疲労
LEIAあり (en): wait turn
正解: 順番待ち
正解 (en): fatigue
---
[0m問題文: なんで隣に座りたくない人がいるの？
問題文 (en): Why would someone not want to sit down next to you?
[32mLEIAなし: 屁
LEIAなし (en): relax
[31mLEIAあり: 飲む
LEIAあり (en): drink
正解: 屁
正解 (en): relax
---
[0m問題文: 誰かに助けてもらう気持ちとは？
問題文 (en): What can be the feeling of someone  giving assistance?
[31mLEIAなし: 死
LEIAなし (en): death
[31mLEIAあり: 幸福
LEIAあり (en): happiness
正解: きもちいい
正解 (en): trouble
---
[0m問題文: ロビーを歩いて戦略を語った幹部たちは、何を話し合っていたのだろうか？
問題文 (en): The executives talked strategy walking through the lobby, what were they likely discussing?
[31mLEIAなし: オペラ
LEIAなし (en): game plan
[31mLE

In [16]:
llm_jp_eval_predictions: dict[str, dict[list[dict]]] = {}
llm_jp_eval_results_dir = "../llm-jp-eval/results"

for result_dir in os.listdir(llm_jp_eval_results_dir):
    if not os.path.exists(os.path.join(llm_jp_eval_results_dir, result_dir, "output_eval.json")):
        print("output_eval.json not found. Skipping", result_dir)
        continue

    with open(os.path.join(llm_jp_eval_results_dir, result_dir, "output_eval.json")) as f:
        output_data = json.load(f)
        llm_jp_eval_predictions[result_dir] = output_data

In [17]:
for item1, item2 in zip(
    llm_jp_eval_predictions["swallow_none_lr5e-6_step50"]["niilc"],
    llm_jp_eval_predictions["swallow_right_p0.5_lr5e-6_step50"]["niilc"],
):
    if item1["pred"] != item2["pred"]:
        score1 = fuzz.token_sort_ratio(item1["pred"], item1["gold"]) / 100.0
        score2 = fuzz.token_sort_ratio(item2["pred"], item1["gold"]) / 100.0
        print(Style.RESET_ALL, end="")
        print("問題文:", item1["input"])

        print(Fore.GREEN if score1 > score2 else Fore.RED, end="")
        print("LEIAなし:", item1["pred"])
        print(Fore.GREEN if score1 <= score2 else Fore.RED, end="")
        print("LEIAあり:", item2["pred"])
        print(Style.RESET_ALL, end="")
        print("正解:", item1["gold"])
        print("---")

[0m問題文: 質問：初めてノート型パソコンを作ったメーカーは？
[31mLEIAなし: IBM
[32mLEIAあり: Apple
[0m正解: 東芝
---
[0m問題文: 質問：日本で採用されている携帯電話の通信方式は？
[31mLEIAなし: 第3世代携帯電話の方式はW-CDMA,第4世代携帯電話の方式はLTE
[32mLEIAあり: 第3世代移動通信システム(3G)のW-CDMA方式
[0m正解: TACS,HiCAP,PDC,cdmaOne,CDMA2001,W-CDMA,LTE,モバイルWiMAX,AXGP,LTE-Advanced,WiMAX 2.1,LTE-?x
---
[0m問題文: 質問：ガラスはどの化学物質から出来ているの？
[32mLEIAなし: 二酸化ケイ素
[31mLEIAあり: 二酸化ケイ素(SiO2)
[0m正解: ケイ酸塩
---
[0m問題文: 質問：大韓民国と朝鮮民主主義人民共和国を隔てている緯度は？
[32mLEIAなし: 北緯38度
[31mLEIAあり: 北緯38度線
[0m正解: 北緯38度
---
[0m問題文: 質問：獲った魚を蓄えて置く所の名前は？
[32mLEIAなし: 生簀
[31mLEIAあり: 冷蔵庫
[0m正解: 生け簀
---
[0m問題文: 質問：ジャンケンで「グー」は何に勝つ？
[31mLEIAなし: 「パー」
[32mLEIAあり: 「チョキ」
[0m正解: チョキ
---
[0m問題文: 質問：エジソンは何を発明した？
[31mLEIAなし: 電球,蓄音機,電話,映画,ラジオ,電気モーター,映写機,白熱電球,電信機,電気自動車,電気鉄道,電気洗濯機,電気掃除機,電気アイロン,電気冷蔵庫,電気炊飯器,電気扇風機,電気ストーブ,電気ヒーター,電気コンロ,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロン,電気掃除機,電気アイロ

In [18]:
for item1, item2 in zip(
    llm_jp_eval_predictions["swallow_none_lr5e-6_step50"]["jemhopqa"],
    llm_jp_eval_predictions["swallow_right_p0.5_lr5e-6_step50"]["jemhopqa"],
):
    if item1["pred"] != item2["pred"]:
        score1 = fuzz.token_sort_ratio(item1["pred"], item1["gold"]) / 100.0
        score2 = fuzz.token_sort_ratio(item2["pred"], item1["gold"]) / 100.0
        print(Style.RESET_ALL, end="")
        print("問題文:", item1["input"])

        print(Fore.GREEN if score1 > score2 else Fore.RED, end="")
        print("LEIAなし:", item1["pred"])
        print(Fore.GREEN if score1 <= score2 else Fore.RED, end="")
        print("LEIAあり:", item2["pred"])
        print(Style.RESET_ALL, end="")
        print("正解:", item1["gold"])
        print("---")

[0m問題文: 質問：IPodを製作している企業の本社所在地は？
[31mLEIAなし: アメリカ合衆国カリフォルニア州
[32mLEIAあり: アメリカ合衆国カリフォルニア州クパチーノ
[0m正解: カリフォルニア州クパチーノ
---
[0m問題文: 質問：東條英機が死没した施設は何という戦争の後に設置されましたか？
[31mLEIAなし: 東京裁判
[32mLEIAあり: 東京裁判所
[0m正解: 第二次世界大戦
---
[0m問題文: 質問：藤島ジュリー景子の母親が死亡した年月日はいつですか？
[32mLEIAなし: 2018年7月17日
[31mLEIAあり: 2013年7月27日
[0m正解: 2021年8月14日
---
[0m問題文: 質問：豊岡市と伊丹市の管轄区域は、どちらも但馬県民局ですか？
[31mLEIAなし: YES
[32mLEIAあり: NO
[0m正解: NO
---
[0m問題文: 質問：『男はつらいよ』の主演俳優の誕生日は何月何日？
[32mLEIAなし: 1月1日
[31mLEIAあり: 1937年1月1日
[0m正解: 3月10日
---
[0m問題文: 質問：エアバスA380とボーイング747はどちらも客室は2階建てですか？
[31mLEIAなし: NO
[32mLEIAあり: YES
[0m正解: YES
---
[0m問題文: 質問：犬飼貴丈が所属する事務所の代表者は誰ですか？
[31mLEIAなし: 犬飼貴丈
[32mLEIAあり: バーニングプロダクション
[0m正解: 周防郁雄
---
[0m問題文: 質問：香取神宮と日光二荒山神社、千葉県にあるのはどちらですか？
[31mLEIAなし: 日光二荒山神社
[32mLEIAあり: 香取神宮
[0m正解: 香取神宮
---
[0m問題文: 質問：YOSHIKIとPATA、誕生日が早いのはYOSHIKIですか？
[31mLEIAなし: YES
[32mLEIAあり: NO
[0m正解: NO
---
[0m問題文: 質問：佐々木主浩が所属する団体の現組織の設立年月日はいつですか？
[31mLEIAなし: 1999年12月1日
[32mLEIAあり: 1998年12月1日
[0m正解: 40466