In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [2]:
from evallm.experiments.transducer_experiment import (
    current_dfa_sample_spec,
)

from evallm.experiments.models_display import model_by_display_key

from evallm.experiments.transducer_plotting import produce_table

In [3]:
from evallm.experiments.sequence_completion.sequence_completion_prompt import (
    SequencePromptDirectAlien,
    SequencePromptDirectAlienWithSpaces,
    SequencePromptDirectAlien2,
    SequencePromptDirectAlien3,
    SequencePromptDirectAlien2WithSpaces,
    SequencePromptDirectAlien3WithSpaces,
    MoreExplanationPrompt,
    MoreExplanationPromptCOT,
    RedGreenPrompt,
)

from evallm.experiments.sequence_completion.sequence_completion_experiments import (
    compute_model_scores,
    compute_ngram_scores,
    compute_ngram_scores_with_prefix,
    compute_random_baseline_scores,
    compute_brute_force_scores,
    get_examples,
    run_model,
)

In [4]:
# DFA sampling spec; stored in current_setting under the "dfa_spec" key.
# NOTE(review): the argument 3 is presumably the number of DFA states -- confirm
# against current_dfa_sample_spec.
spec = current_dfa_sample_spec(3)

In [5]:
# Experiment setting passed to get_examples, the prompt constructors and
# compute_model_scores. num_sequence_symbols_prompt is derived as half of
# num_sequence_symbols (integer division).
num_sequence_symbols = 8
current_setting = {
    "dfa_spec": spec,
    "num_sequences": 30,
    "num_sequence_symbols": num_sequence_symbols,
    "num_sequence_symbols_prompt": num_sequence_symbols // 2,
    "num_instances": 100,
}

In [6]:
# Example data for current_setting. Later cells unpack it into run_model and
# use example[1][0] to render a prompt.
# NOTE(review): the leading 0 is presumably a seed or instance index -- confirm
# in get_examples.
example = get_examples(0, current_setting)

In [7]:
# Maps each prompt display key to the `for_setting` constructor of its prompt
# class. Insertion order matters: list(prompts_by_key) is passed to
# produce_table as the column list.
prompts_by_key = {
    key: prompt_cls.for_setting
    for key, prompt_cls in (
        ("Basic", SequencePromptDirectAlien),
        ("Basic-Space", SequencePromptDirectAlienWithSpaces),
        ("Basic2", SequencePromptDirectAlien2),
        ("Basic2-Space", SequencePromptDirectAlien2WithSpaces),
        ("Basic3", SequencePromptDirectAlien3),
        ("Basic3-Space", SequencePromptDirectAlien3WithSpaces),
        ("More-Expl", MoreExplanationPrompt),
        ("COT", MoreExplanationPromptCOT),
        ("Red-Green", RedGreenPrompt),
    )
}

In [8]:
def for_model(model, count, *prompts):
    """Score one model on each of the requested prompts.

    Args:
        model: Display key looked up in ``model_by_display_key``.
        count: Forwarded unchanged as the first argument of
            ``compute_model_scores`` (presumably the number of evaluations
            to run -- confirm against that function).
        *prompts: Keys of ``prompts_by_key`` selecting the prompt styles.

    Returns:
        A dict keyed by ``(model, prompt)`` whose values are whatever
        ``compute_model_scores`` returns for ``current_setting`` with
        ``na_mode="count-na"``. Empty when no prompts are given.
    """
    scores_by_key = {}
    for prompt in prompts:
        # Lookups stay inside the loop so nothing is resolved when
        # `prompts` is empty.
        scores_by_key[model, prompt] = compute_model_scores(
            count,
            current_setting,
            model_by_display_key[model],
            prompts_by_key[prompt],
            na_mode="count-na",
        )
    return scores_by_key

In [9]:
# Flat mapping {(model display key, prompt key): scores}, built by merging the
# per-model dicts returned by for_model. The integer is the `count` that
# for_model forwards to compute_model_scores. Commented-out entries are models
# that are currently not evaluated in this cell.
results = {
    # **for_model("mistral-nemo-minitron-8B", 1000, "Basic"),
    **for_model("llama3-8B", 1000, "Basic"),
    # **for_model("llama3-70B", 1000, "Basic"),
    # **for_model("llama3.1-8B-Instruct", 1000, "Basic"),
    **for_model("mistral-nemo-minitron-8B", 1000, "Basic", "Basic-Space"),
    # **for_model("mistral-nemo-base-12B", 1000, "Basic"),
    # **for_model("mistral-nemo-instruct-12B", 1000, "Basic"),
    # **for_model("gemma-7b", 1000, "Basic"),
    # **for_model("falcon-7b", 1000, "Basic"),
    **for_model("gpt-3.5-instruct", 30, "Basic", "Basic-Space"),
    **for_model("gpt-3.5-chat", 30, "Basic", "Basic-Space"),
    **for_model(
        "gpt-3.5-instruct", 30, "Basic2", "Basic3", "Basic2-Space", "Basic3-Space"
    ),
    **for_model("gpt-4o-mini", 1000, "Basic", "Basic-Space"),
    **for_model("gpt-4o-mini", 30, "Basic2", "Basic3", "Basic2-Space", "Basic3-Space"),
    **for_model("gpt-4o-mini", 30, "More-Expl", "COT", "Red-Green"),
    **for_model("gpt-4o", 100, "Basic"),
    **for_model("gpt-4o", 30, "Basic-Space", "More-Expl"),
    **for_model("claude-3.5", 30, "Basic", "Basic-Space"),
    # NOTE(review): count=2 is far smaller than every other entry -- confirm
    # this is intentional.
    **for_model("claude-3.5", 2, "Basic2", "Basic2-Space"),
}

In [10]:
# results[r"\textsc{Random}", "Basic"] = compute_random_baseline_scores(
#     1000, setting=current_setting
# )

# results[rf"$n$-\textsc{{Gram-Heuristic-NoPrefix}}", "Basic"] = compute_ngram_scores(
#     1000, setting=current_setting
# )
# results[rf"$n$-\textsc{{Gram-Heuristic}}", "Basic"] = compute_ngram_scores_with_prefix(
#     1000, setting=current_setting
# )

# results[r"\textsc{BruteForce}", "Basic"] = compute_brute_force_scores(
#     100, current_setting
# )

In [11]:
# Regroup the flat {(model, prompt): scores} mapping into
# {model: {prompt: scores}}, keeping first-seen order for models and prompts.
results_nested = {}
for (m, p), scores in results.items():
    results_nested.setdefault(m, {})[p] = scores

In [12]:
# Render the per-model results as a table, with one column per prompt key in
# the insertion order of prompts_by_key.
produce_table(results_nested, list(prompts_by_key), handle_brute_force=False)

\begin{tabular}{|r|c|c|c|c|c|c|c|c|c|}
\hline
Model & \textsc{Basic} & \textsc{Basic-Space} & \textsc{Basic2} & \textsc{Basic2-Space} & \textsc{Basic3} & \textsc{Basic3-Space} & \textsc{More-Expl} & \textsc{COT} & \textsc{Red-Green}\\
\hline
\bf claude-3.5 &\bf 9.1 (7.6--10.7)&\bf 98.6 (97.9--99.2)&\bf 0.5 (0.0--1.0)&\bf 100.0 (100.0--100.0)&--&--&--&--&--\\
\hline
gpt-4o &0.0 (0.0--0.1)&0.0 (0.0--0.0)&--&--&--&--&100.0 (100.0--100.0)&--&--\\
\hline
gpt-3.5-chat &98.7 (98.2--99.3)&99.7 (99.5--99.9)&--&--&--&--&--&--&--\\
\hline
gpt-3.5-instruct &50.4 (47.5--53.1)&0.8 (0.5--1.0)&1.0 (0.6--1.3)&0.0 (0.0--0.0)&62.8 (60.3--65.2)&57.6 (54.3--60.7)&--&--&--\\
\hline
gpt-4o-mini &0.0 (0.0--0.0)&0.0 (0.0--0.0)&0.0 (0.0--0.0)&0.0 (0.0--0.0)&0.0 (0.0--0.0)&0.0 (0.0--0.0)&15.0 (13.7--16.3)&1.4 (0.9--1.9)&0.2 (0.0--0.3)\\
\hline
mistral-nemo-minitron-8B &0.3 (0.3--0.4)&0.0 (0.0--0.0)&--&--&--&--&--&--&--\\
\hline
llama3-8B &0.0 (0.0--0.1)&--&--&--&--&--&--&--&--\\
\hline
\end{tabular}


In [13]:
# Spot check: build the "Basic2-Space" prompter for the current setting and
# collect gpt-3.5-instruct's raw responses on the example data.
prompter = prompts_by_key["Basic2-Space"](current_setting)
responses = run_model(model_by_display_key["gpt-3.5-instruct"], prompter, *example)

In [14]:
# Print the "user" part of the chat-formatted prompt built from example[1][0].
# NOTE(review): the leading None is passed as display_prompt's first argument;
# its meaning is not visible here -- confirm in the prompter class.
print(prompter.display_prompt(None, *example[1][0], is_chat=True)["user"])

The following strings come from an alien language that follows a simple grammar. Infer the alien grammar using the example strings. Then, add a suffix to the final string using between 1 and 4 characters such that the full string follows the grammar. Output only the necessary suffix to complete the final string, and nothing else.

c a c b b c b c
c b c a a c b b
c c b a c a c a
c a b a a a c a
b a c b a b c a
a a c b a a a a
b c c b a b b b
b b b c c b b b
c a a a b a b a
c c b a a a a a
c b a c a c b a
a c b c c b a a
c b b a a c a b
c c b a a b a a
c a a b b b b b
a a b c b b b a
a c a a c b c c
a c c a c b a b
a b b a b c a b
c b c b c b c c
c c a c c c c a
a b b c c a c b
b a a b a c a b
c a c a a b a b
a c a c b a c a
c a c c a a c b
b c a c b b b b
c a b c b b b c
a c c c c c a b
c b c a b b c a
b c b b


In [15]:
# Display the raw model responses collected by run_model for manual inspection.
responses

[' c c c c\n\n\na',
 ' b a b c\n\n\nSuffix: a',
 ' c c b b\n\n\nSuffix: a',
 ' c b c b\n\n\nSuffix: b',
 ' c b c c\n\n\nSuffix: c',
 ' c a c c\n\n\nSuffix: c',
 ' c b c b\n\n\nSuffix: a',
 ' c c c c\n\n\nSuffix: b',
 ' b c c c\n\n\nSuffix: a',
 ' b c c c\n\n\na',
 ' c c c c\n\n\nSuffix: c',
 ' b b c b\n\n\nSuffix: c',
 ' c c c c\n\n\na',
 ' c c c b\nb c b b b b b c\n\n\na',
 ' c c c c\n\n\nSuffix: c',
 ' c c c c\n\n\nSuffix: c',
 ' c c c c\n\n\nSuffix: a',
 ' c b b c\n\n\nSuffix: a',
 ' b b c c\n\n\nSuffix: a',
 ' b c b c\n\n\na',
 ' c c c c\n\n\na',
 ' b c b a\n\n\nSuffix: a',
 ' b b b b\n\n\nSuffix: a',
 ' b c c b\n\n\nSuffix: a',
 ' c c c c\n\n\na',
 ' c c c c\n\n\nSuffix: b',
 ' c b c c\n\n\nSuffix: c',
 ' b c b a\n\n\na',
 ' c c a c\n\na',
 ' b b c c\n\n\nSuffix: a',
 ' c c c c\n\n\nSuffix: a',
 ' c c c c\n\n\nSuffix: a',
 ' c c c b\n\n\nSuffix: a',
 ' b c c b\n\n\nSuffix: a',
 ' c b c c\n\n\nSuffix: c',
 ' b c c c\n\n\na',
 ' c c b c\n\n\nSuffix: c',
 ' c c c c\n\n\na',
 ' b b c 