In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [2]:
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import evallm
from automata.fa.dfa import DFA
import tqdm.auto as tqdm

In [3]:
import itertools
from permacache import stable_hash

In [4]:
from evallm.llm.llm import model_specs
from evallm.prompting.transducer_prompt import (
    ChainOfThoughtPromptRealExampleNoExplanation,
    BasicSequencePrompt,
    BasicSequencePromptSlightlyMoreExplanation,
    BasicSequencePromptNoChat,
    SequencePromptWithExplanation,
    SequencePromptWithExplanationChainOfThought,
    RedGreenRoomPrompt1,
    BasicInstructionTransducerPrompter,
)
from evallm.experiments.transducer_experiment import (
    current_transducer_experiments,
    compute_relative_to_null,
    compute_relative_to_ngram,
    print_example,
    bottom_quartile_outcome,
    current_dfa_sample_spec,
    run_transducer_experiment_just_stats,
    run_transducer_experiment,
)
from evallm.experiments.transducer_plotting import (
    plot_all_absolute_results_single_graph,
    plot_absolute_results_barchart,
)
from evallm.utils.bootstrap import boostrap_mean

In [5]:
# Experiment setting shared by every prompt and every experiment run below.
num_states = 3
num_sequence_symbols = 30
num_repeats_per_dfa = 30
sample_dfa_spec = current_dfa_sample_spec(num_states=num_states)

# Keyword bundle handed to each prompt class's `for_setting`.
setting_kwargs = {
    "num_sequence_symbols": num_sequence_symbols,
    "sample_dfa_spec": sample_dfa_spec,
    "num_states": num_states,
}

In [6]:
# Short model name -> model identifier.
# The short name is what appears as the row label in the results table; the
# identifier is used to index `model_specs` and is passed to
# `run_transducer_experiment` (see `for_model_and_prompt`).
# NOTE(review): the two mistral-nemo 12B entries are absolute paths on a local
# filesystem — presumably they must match entries registered in `model_specs`;
# confirm before changing them.
model_by_key = {
    "llama3-8B": "meta-llama/Meta-Llama-3-8B",
    "llama3-70B": "meta-llama/Meta-Llama-3-70B",
    "llama3.1-8B-Instruct": "meta-llama/Llama-3.1-8B-Instruct",
    "mistral-nemo-minitron-8B": "nvidia/Mistral-NeMo-Minitron-8B-Base",
    "mistral-nemo-base-12B": "/scratch/kavig/mistral_models/Nemo-Base",
    "mistral-nemo-instruct-12B": "/scratch/kavig/mistral_models/Nemo-Instruct",
    "gemma-7b": "google/gemma-7b",
    "falcon-7b": "tiiuae/falcon-7b",
    "gpt-3.5-instruct": "gpt-3.5-turbo-instruct",
    "gpt-3.5-chat": "gpt-3.5-turbo-0125",
    "gpt-4o-mini": "gpt-4o-mini-2024-07-18",
    "gpt-4o": "gpt-4o-2024-05-13",
    "claude-3.5": "claude-3-5-sonnet-20241022",
}
def _prompt_for_setting(prompt_class):
    """Instantiate `prompt_class` for the notebook-wide experiment setting."""
    return prompt_class.for_setting(setting_kwargs)


# Prompt name (table column) -> {"chat" / "non-chat": prompt instance}.
# Only "Basic" provides a "non-chat" variant; the other prompts are chat-only.
prompt_by_key = {
    "Basic": {
        "non-chat": _prompt_for_setting(BasicSequencePromptNoChat),
        "chat": _prompt_for_setting(BasicSequencePrompt),
    },
    "More-Expl": {
        "chat": _prompt_for_setting(BasicSequencePromptSlightlyMoreExplanation),
    },
    # "full-expl": {"chat": _prompt_for_setting(SequencePromptWithExplanation)},
    "COT": {
        "chat": _prompt_for_setting(SequencePromptWithExplanationChainOfThought),
    },
    "Red-Green": {
        "chat": _prompt_for_setting(RedGreenRoomPrompt1),
    },
}

In [7]:
def for_model_and_prompt(model, num_dfas, *prompts):
    """Run the transducer experiment for one model under each given prompt.

    Args:
        model: short model name, a key of `model_by_key`.
        num_dfas: forwarded to `run_transducer_experiment` as `num_dfas`.
        *prompts: prompt names, keys of `prompt_by_key`.

    Returns:
        dict mapping `(model, prompt)` to whatever `run_transducer_experiment`
        returns for that combination.

    Raises:
        KeyError: if `model` or a prompt name is unknown, or if the prompt has
            no variant for this model's kind ("chat" / "non-chat").
    """
    model_key = model_by_key[model]
    # Chat models get the chat-formatted prompt; everything else the plain one.
    prompt_kind = "chat" if model_specs[model_key].is_chat else "non-chat"

    outcomes = {}
    for prompt in prompts:
        outcomes[model, prompt] = run_transducer_experiment(
            model_key,
            sample_dfa_spec,
            prompt_by_key[prompt][prompt_kind],
            num_repeats_per_dfa=num_repeats_per_dfa,
            num_dfas=num_dfas,
        )
    return outcomes

In [8]:
# Model-free run (model "none"): only the per-DFA statistics are kept; they
# supply the null, n-gram and brute-force baseline rows of the table.
deterministic_baseline_outcomes = run_transducer_experiment_just_stats(
    "none",
    sample_dfa_spec,
    BasicInstructionTransducerPrompter(num_sequence_symbols, strip=True),
    num_repeats_per_dfa=num_repeats_per_dfa,
    num_dfas=1000,
)

# (model, prompt) -> experiment outcomes. The second argument is `num_dfas`;
# it differs per model (1000 / 100 / 30).
# Each model is listed exactly once: a repeated entry would only re-run the
# same experiment and overwrite the same (model, prompt) key.
model_outcomes = {
    **for_model_and_prompt("llama3-8B", 1000, "Basic"),
    **for_model_and_prompt("llama3-70B", 1000, "Basic"),
    **for_model_and_prompt("llama3.1-8B-Instruct", 1000, "Basic"),
    **for_model_and_prompt("mistral-nemo-minitron-8B", 1000, "Basic"),
    **for_model_and_prompt("mistral-nemo-base-12B", 1000, "Basic"),
    **for_model_and_prompt("mistral-nemo-instruct-12B", 1000, "Basic"),
    **for_model_and_prompt("gemma-7b", 1000, "Basic"),
    **for_model_and_prompt("falcon-7b", 1000, "Basic"),
    **for_model_and_prompt("gpt-3.5-instruct", 100, "Basic"),
    **for_model_and_prompt("gpt-3.5-chat", 100, "Basic"),
    **for_model_and_prompt(
        "gpt-4o-mini",
        100,
        "Basic",
        "More-Expl",
        # "full-expl",
        "COT",
        "Red-Green",
    ),
    **for_model_and_prompt("gpt-4o", 30, "Basic"),
    **for_model_and_prompt(
        "claude-3.5",
        30,
        "Basic",
        "More-Expl",
        "COT",
        "Red-Green",
    ),
}

In [9]:
# Baselines involve no prompt; they are filed under the "Basic" column so they
# line up with the models' basic-prompt results in the table.
no_prompt = "Basic"

# accuracies[row label][prompt name] -> list of per-outcome accuracies.
accuracies = defaultdict(dict)

accuracies[r"\textsc{Null}"][no_prompt] = [
    outcome.null_success_rate for outcome in deterministic_baseline_outcomes
]
for ngram in range(2, 7):
    # `kgram_success_rates_each` is indexed from the 2-gram upwards.
    kgram_index = ngram - 2
    accuracies[rf"{ngram}-\textsc{{Gram}}"][no_prompt] = [
        outcome.kgram_success_rates_each[kgram_index]
        for outcome in deterministic_baseline_outcomes
    ]
accuracies[r"\textsc{BruteForce}"][no_prompt] = [
    outcome.brute_force_inference for outcome in deterministic_baseline_outcomes
]

for (model, prompt), outcomes in model_outcomes.items():
    accuracies[model][prompt] = [
        outcome.success_rate_binary_ignore_na for outcome in outcomes
    ]

# Mean accuracy per (row, prompt), and each row's best mean over its prompts.
accuracies_mean = {
    mod: {prompt: np.mean(values) for prompt, values in by_prompt.items()}
    for mod, by_prompt in accuracies.items()
}
best_acc_mean_by_mod = {
    mod: max(means.values()) for mod, means in accuracies_mean.items()
}

In [10]:
# Table columns: prompt names in the order they are declared in `prompt_by_key`.
prompts = [*prompt_by_key]

In [11]:
def display_acc(acc, mod):
    """Format one table cell as ``mean (lo--hi)`` in percent, one decimal place.

    Args:
        acc: sequence of accuracies as fractions; scaled by 100 before formatting.
        mod: row label; its entry in `format_by_mod` (if any) is prepended.

    Returns:
        The cell text, with `lo` and `hi` taken from `boostrap_mean` of the
        scaled values.
    """
    percentages = np.array(acc) * 100
    mean_pct = np.mean(percentages)
    lo, hi = boostrap_mean(percentages)
    return f"{format_by_mod.get(mod, '')}{mean_pct:.1f} ({lo:.1f}--{hi:.1f})"

In [12]:
# Rows ordered best-first by each row's best mean accuracy.
# Sort ascending, then reverse (rather than `reverse=True`) so that rows with
# equal scores keep the same relative order as before.
models_sorted = list(reversed(sorted(accuracies, key=best_acc_mean_by_mod.get)))

In [13]:
# LaTeX prefix per row label, applied to the row name and to each of its cells.
format_by_mod = {}

# The brute-force baseline is expected to rank first; shade that row gray.
top_row = models_sorted[0]
assert "BruteForce" in top_row
format_by_mod[top_row] = r"\cellcolor{lightgray}"

# Bold the runner-up row.
runner_up = models_sorted[1]
format_by_mod[runner_up] = r"\bf "

In [14]:
# Render the results as a LaTeX tabular: one row per model/baseline (best
# first), one column per prompt, and "--" where a combination was not run.
#
# Baseline rows are stored under the "Basic" prompt (see `no_prompt`), so they
# go through the same per-prompt loop as the models. The former special case
# keyed on a "no-prompt" entry could never trigger and has been removed.

# Column spec: a right-aligned name column plus one centered column per
# prompt, with vertical rules between them, e.g. "|r|c|c|c|c|".
table_alignments = "|" + "|".join("r" + "c" * len(prompts)) + "|"
header_cells = ["Model"] + [rf"\textsc{{{x}}}" for x in prompts]

table = ""
table += r"\begin{tabular}{%s}" % table_alignments + "\n"
table += r"\hline" + "\n"
table += " & ".join(header_cells) + r"\\" + "\n"
table += r"\hline" + "\n"
for mod in models_sorted:
    table += format_by_mod.get(mod, "") + mod + " &"
    for prompt in prompts:
        if prompt in accuracies[mod]:
            table += display_acc(accuracies[mod][prompt], mod) + "&"
        else:
            table += "--&"
    # Every cell appended a trailing "&"; drop the one after the last cell.
    assert table[-1] == "&"
    table = table[:-1]
    table += r"\\" + "\n"
    table += r"\hline" + "\n"
table += r"\end{tabular}"

print(table)

\begin{tabular}{|r|c|c|c|c|}
\hline
Model & \textsc{Basic} & \textsc{More-Expl} & \textsc{COT} & \textsc{Red-Green}\\
\hline
\cellcolor{lightgray}\textsc{BruteForce} &\cellcolor{lightgray}96.0 (95.8--96.3)&--&--&--\\
\hline
\bf 6-\textsc{Gram} &\bf 93.5 (93.1--93.9)&--&--&--\\
\hline
5-\textsc{Gram} &93.4 (93.0--93.7)&--&--&--\\
\hline
4-\textsc{Gram} &91.1 (90.6--91.6)&--&--&--\\
\hline
mistral-nemo-minitron-8B &88.6 (88.0--89.1)&--&--&--\\
\hline
mistral-nemo-instruct-12B &88.0 (87.5--88.5)&--&--&--\\
\hline
mistral-nemo-base-12B &87.9 (87.4--88.4)&--&--&--\\
\hline
gpt-3.5-instruct &87.8 (85.9--89.6)&--&--&--\\
\hline
llama3-70B &87.7 (87.2--88.3)&--&--&--\\
\hline
llama3-8B &87.5 (86.9--88.0)&--&--&--\\
\hline
claude-3.5 &86.9 (83.3--90.0)&87.1 (83.9--90.2)&73.2 (68.8--77.4)&82.9 (78.9--86.9)\\
\hline
3-\textsc{Gram} &87.0 (86.4--87.6)&--&--&--\\
\hline
llama3.1-8B-Instruct &85.9 (85.3--86.5)&--&--&--\\
\hline
falcon-7b &84.9 (84.3--85.5)&--&--&--\\
\hline
gpt-4o &83.7 (80.1--86.9)