In [1]:
# Reload imported modules automatically before each cell executes (autoreload
# mode 2), so edits to the project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Format code cells with Black as they are run.
%load_ext jupyter_black

In [2]:
from evallm.experiments.transducer_experiment import (
    current_dfa_sample_spec,
)

from evallm.experiments.models_display import model_by_display_key

from evallm.experiments.transducer_plotting import produce_table

In [3]:
from evallm.experiments.sequence_completion.sequence_completion_prompt import (
    SequencePromptDirectAlien,
    SequencePromptDirectAlienWithSpaces,
)

from evallm.experiments.sequence_completion.sequence_completion_experiments import (
    compute_model_scores,
    compute_ngram_scores,
    compute_ngram_scores_with_prefix,
    compute_random_baseline_scores,
    compute_brute_force_scores,
)

In [4]:
# DFA sampling spec used for every run in this notebook (stored under the
# "dfa_spec" key of the settings dict).
# NOTE(review): the argument 3 is presumably the number of DFA states — confirm
# against current_dfa_sample_spec.
spec = current_dfa_sample_spec(3)

In [5]:
# Settings passed to every scoring call in this notebook (models and baselines).
num_sequence_symbols = 8
current_setting = {
    "dfa_spec": spec,
    "num_sequences": 30,
    "num_sequence_symbols": num_sequence_symbols,
    # The prompt-symbol count is half the sequence length (integer division).
    "num_sequence_symbols_prompt": num_sequence_symbols // 2,
    "num_instances": 100,
}

In [6]:
# Prompt constructors keyed by label. for_model looks prompts up by these labels,
# and the labels (in insertion order) are the list handed to produce_table.
prompts_by_key = {
    "Basic": SequencePromptDirectAlien.for_setting,
    "Basic-Space": SequencePromptDirectAlienWithSpaces.for_setting,
}

In [7]:
def for_model(model, count, *prompts):
    """Score one model under each of the requested prompt labels.

    Args:
        model: Key into ``model_by_display_key`` identifying the model.
        count: Forwarded unchanged as the first argument of
            ``compute_model_scores``.
        *prompts: Keys into ``prompts_by_key``; one scoring run is made per key.

    Returns:
        A dict mapping ``(model, prompt)`` to the value returned by
        ``compute_model_scores`` for that pair. Empty when no prompts are given.
    """
    scores_by_key = {}
    for prompt_key in prompts:
        # Lookups stay inside the loop so nothing is resolved when there are
        # no prompts to run.
        scores_by_key[model, prompt_key] = compute_model_scores(
            count,
            current_setting,
            model_by_display_key[model],
            prompts_by_key[prompt_key],
            na_mode="ignore",
        )
    return scores_by_key

In [8]:
# Score every enabled model. Each for_model call takes the model's display key,
# a count that is forwarded to compute_model_scores, and the prompt labels to run;
# the returned dicts are merged into one mapping keyed by (model, prompt label).
# Commented-out entries are models that are currently disabled.
# NOTE(review): counts vary per model (1000 / 100 / 30) — presumably lower for the
# API-backed models to limit cost; confirm.
results = {
    # **for_model("mistral-nemo-minitron-8B", 1000, "Basic"),
    **for_model("llama3-8B", 1000, "Basic"),
    # **for_model("llama3-70B", 1000, "Basic"),
    # **for_model("llama3.1-8B-Instruct", 1000, "Basic"),
    **for_model("mistral-nemo-minitron-8B", 1000, "Basic", "Basic-Space"),
    # **for_model("mistral-nemo-base-12B", 1000, "Basic"),
    # **for_model("mistral-nemo-instruct-12B", 1000, "Basic"),
    # **for_model("gemma-7b", 1000, "Basic"),
    # **for_model("falcon-7b", 1000, "Basic"),
    **for_model("gpt-3.5-instruct", 30, "Basic", "Basic-Space"),
    **for_model("gpt-3.5-chat", 30, "Basic", "Basic-Space"),
    **for_model("gpt-4o-mini", 1000, "Basic", "Basic-Space"),
    **for_model("gpt-4o", 100, "Basic"),
    **for_model("gpt-4o", 30, "Basic-Space"),
    **for_model("claude-3.5", 30, "Basic", "Basic-Space"),
}

In [9]:
# Reference baselines, added to `results` under the "Basic" prompt label only.
# The first element of each key is the LaTeX-formatted row label for the table.
results[r"\textsc{Random}", "Basic"] = compute_random_baseline_scores(
    1000, setting=current_setting
)

# Plain raw strings: these labels contain literal LaTeX braces and no
# interpolated fields, so no f-string (and no brace doubling) is needed.
results[r"$n$-\textsc{Gram-Heuristic-NoPrefix}", "Basic"] = compute_ngram_scores(
    1000, setting=current_setting
)
results[r"$n$-\textsc{Gram-Heuristic}", "Basic"] = compute_ngram_scores_with_prefix(
    1000, setting=current_setting
)

# NOTE(review): brute force is called with 100 rather than 1000, presumably
# because it is more expensive per instance — confirm.
results[r"\textsc{BruteForce}", "Basic"] = compute_brute_force_scores(
    100, current_setting
)

In [10]:
# Regroup the flat {(model, prompt): scores} mapping into {model: {prompt: scores}}.
# Models keep the order in which they first appear in `results`.
results_nested = {}
for (model_key, prompt_key), scores in results.items():
    results_nested.setdefault(model_key, {})[prompt_key] = scores

In [11]:
# Render the nested results as a LaTeX table with one column per prompt label.
produce_table(results_nested, list(prompts_by_key))

\begin{tabular}{|r|c|c|}
\hline
Model & \textsc{Basic} & \textsc{Basic-Space}\\
\hline
\cellcolor{lightgray}\textsc{BruteForce} &\cellcolor{lightgray}99.8 (99.7--99.9)&--\\
\hline
\bf $n$-\textsc{Gram-Heuristic} &\bf 90.9 (90.3--91.6)&--\\
\hline
claude-3.5 &85.5 (80.9--89.9)&invalid\\
\hline
$n$-\textsc{Gram-Heuristic-NoPrefix} &83.8 (82.8--84.8)&--\\
\hline
gpt-4o &82.3 (79.7--84.8)&70.6 (63.0--77.4)\\
\hline
gpt-4o-mini &68.2 (67.1--69.2)&66.8 (65.7--67.9)\\
\hline
\textsc{Random} &52.7 (51.2--54.0)&--\\
\hline
gpt-3.5-chat &invalid&invalid\\
\hline
gpt-3.5-instruct &invalid&59.1 (49.8--67.8)\\
\hline
mistral-nemo-minitron-8B &73.6 (72.5--74.7)&76.3 (75.4--77.2)\\
\hline
llama3-8B &70.4 (69.0--71.7)&--\\
\hline
\end{tabular}
