In [1]:
# Reload edited modules automatically before each cell runs, so changes to the
# imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the jupyter_black extension (formats code cells with Black).
%load_ext jupyter_black

In [2]:
import evallm
from evallm.experiments.transducer_experiment import (
    current_dfa_sample_spec,
)

from evallm.experiments.models_display import model_by_display_key

from evallm.experiments.transducer_plotting import produce_table

In [3]:
from evallm.experiments.sequence_completion.sequence_completion_prompt import (
    SequencePromptDirectAlien,
    SequencePromptDirectAlienWithSpaces,
    SequencePromptDirectAlien2,
    SequencePromptDirectAlien3,
    SequencePromptDirectAlien2WithSpaces,
    SequencePromptDirectAlien2WithCommas,
    SequencePromptDirectAlien3WithSpaces,
    SequencePromptDirectAlien2WithCommasSequence,
    MoreExplanationPrompt,
    MoreExplanationPrompt2,
    RedGreenPrompt,
    MoreExplanationPromptCOT,
)

from evallm.experiments.sequence_completion.sequence_completion_experiments import (
    compute_model_scores,
    compute_ngram_scores,
    compute_ngram_scores_with_prefix,
    compute_random_baseline_scores,
    compute_brute_force_scores,
    get_examples,
    run_model,
    compute_true_ngrams,
)

In [4]:
# DFA sampling spec that is placed into `current_setting` under "dfa_spec".
# NOTE(review): the argument 3 is presumably the number of DFA states — confirm
# against `current_dfa_sample_spec`.
spec = current_dfa_sample_spec(3)

In [5]:
# Settings dict handed to every scoring helper in this notebook.
# "num_sequence_symbols_prompt" is half of `num_sequence_symbols`
# (integer division).
num_sequence_symbols = 10
current_setting = {
    "dfa_spec": spec,
    "num_sequences": 30,
    "num_sequence_symbols": num_sequence_symbols,
    "num_sequence_symbols_prompt": num_sequence_symbols // 2,
    "num_instances": 30,
}

In [6]:
# Fetch examples for `current_setting`; in the visible cells this is only
# referenced by the commented-out debugging cells at the end of the notebook.
# NOTE(review): the first argument (0) is presumably a seed or instance index — confirm.
example = get_examples(0, current_setting)

In [7]:
# Prompt display name -> `for_setting` constructor of the matching prompt class.
# The keys are passed to `produce_table` as the column list, so insertion order
# here is the column order of the table.
prompts_by_key = {
    label: prompt_cls.for_setting
    for label, prompt_cls in {
        "Basic": SequencePromptDirectAlien2,
        "Basic-Commas": SequencePromptDirectAlien2WithCommas,
        "More-Expl": MoreExplanationPrompt2,
        "COT": MoreExplanationPromptCOT,
        "Red-Green": RedGreenPrompt,
    }.items()
}

In [8]:
def for_model(model, count, *prompts):
    """Score one model under each of the requested prompts.

    Args:
        model: Display key of the model; looked up in ``model_by_display_key``.
        count: Forwarded unchanged as the first argument of
            ``compute_model_scores``.
        *prompts: Keys of ``prompts_by_key`` selecting which prompts to run.

    Returns:
        A dict keyed by ``(model, prompt)`` whose values are whatever
        ``compute_model_scores`` returns for that pair, using the
        notebook-level ``current_setting``. Empty when no prompts are given.
    """
    scores = {}
    for prompt in prompts:
        # Lookups stay inside the loop so that a call without prompts never
        # touches the model or prompt registries.
        scores[model, prompt] = compute_model_scores(
            count,
            current_setting,
            model_by_display_key[model],
            prompts_by_key[prompt],
            na_mode="ignore",
        )
    return scores

In [9]:
# Scores for every (model, prompt) pair that appears in the table.
# Each run is (model display key, count passed to `for_model`, prompt keys);
# runs execute in the order listed and later duplicates override earlier ones.
results = {
    key: scores
    for model, count, prompts in [
        ("llama3-8B", 1000, ("Basic", "Basic-Commas")),
        ("llama3-70B", 1000, ("Basic",)),
        ("qwen-2.5-coder-instruct-7B", 1000, ("Basic",)),
        ("qwen-2.5-coder-instruct-32B", 1000, ("Basic", "Basic-Commas")),
        ("qwen-2.5-coder-7B", 1000, ("Basic", "Basic-Commas")),
        # ("llama3.1-8B-Instruct", 1000, ("Basic",)),
        ("mistral-nemo-minitron-8B", 1000, ("Basic",)),
        # ("mistral-nemo-base-12B", 1000, ("Basic",)),
        # ("mistral-nemo-instruct-12B", 1000, ("Basic",)),
        # ("gemma-7b", 1000, ("Basic",)),
        # ("falcon-7b", 1000, ("Basic",)),
        ("gpt-3.5-instruct", 100, ("Basic", "Basic-Commas")),
        ("gpt-3.5-chat", 100, ("Basic", "Basic-Commas")),
        (
            "gpt-4o-mini",
            100,
            ("Basic", "Basic-Commas", "More-Expl", "COT", "Red-Green"),
        ),
        # gpt-4o is deliberately kept as two separate runs, as in the original.
        ("gpt-4o", 30, ("Basic", "Basic-Commas", "More-Expl")),
        ("gpt-4o", 30, ("COT", "Red-Green")),
    ]
    for key, scores in for_model(model, count, *prompts).items()
}

In [10]:
# Non-LLM baselines; each is stored in `results` under the "Basic" prompt key.
amount_baselines = 100

results[(r"\textsc{Random}", "Basic")] = compute_random_baseline_scores(
    amount_baselines, setting=current_setting
)

results[(r"\textsc{Common-Suffix}", "Basic")] = compute_ngram_scores(
    amount_baselines, setting=current_setting
)

# NOTE(review): brute force is called with 44 rather than `amount_baselines`
# — confirm this count is intentional.
results[(r"$\textsc{BruteForce}_S$", "Basic")] = compute_brute_force_scores(
    44, current_setting
)

# One row per n-gram order from 2 through 6.
for ngram in range(2, 7):
    results[(rf"{ngram}-$\textsc{{Gram}}_S$", "Basic")] = compute_true_ngrams(
        ngram, amount_baselines, current_setting
    )

In [11]:
# Regroup the flat {(model, prompt): scores} mapping into
# {model: {prompt: scores}}, keeping first-seen order of models and prompts.
results_nested = {}
for m, p in results:
    results_nested.setdefault(m, {})[p] = results[m, p]

In [12]:
# Build the results table from the nested scores, passing the prompt keys
# (in `prompts_by_key` insertion order) as the list of columns.
produce_table(results_nested, list(prompts_by_key))

\begin{tabular}{|r|c|c|c|c|c|}
\hline
Model & \textsc{Basic} & \textsc{Basic-Commas} & \textsc{More-Expl} & \textsc{COT} & \textsc{Red-Green}\\
\hline
\cellcolor{lightgray}$\textsc{BruteForce}_S$ &\cellcolor{lightgray}100.0 (100.0--100.0)&--&--&--&--\\
\hline
\bf 6-$\textsc{Gram}_S$ &\bf 91.9 (89.3--94.0)&--&--&--&--\\
\hline
5-$\textsc{Gram}_S$ &91.4 (88.8--93.7)&--&--&--&--\\
\hline
4-$\textsc{Gram}_S$ &90.5 (87.8--92.9)&--&--&--&--\\
\hline
3-$\textsc{Gram}_S$ &86.3 (83.2--88.8)&--&--&--&--\\
\hline
\textsc{Common-Suffix} &83.4 (79.8--86.8)&--&--&--&--\\
\hline
2-$\textsc{Gram}_S$ &82.3 (78.8--85.4)&--&--&--&--\\
\hline
qwen-2.5-coder-7B &79.5 (78.4--80.5)&60.7 (59.3--62.1)&--&--&--\\
\hline
qwen-2.5-coder-instruct-7B &79.5 (78.3--80.5)&--&--&--&--\\
\hline
qwen-2.5-coder-instruct-32B &79.2 (78.0--80.3)&55.2 (53.7--56.7)&--&--&--\\
\hline
mistral-nemo-minitron-8B &78.7 (77.5--79.8)&--&--&--&--\\
\hline
gpt-4o &72.1 (65.9--78.2)&66.8 (58.5--74.8)&invalid&67.4 (60.8--73.8)&74.4 (69.9-

In [13]:
# prompter = prompts_by_key["Basic-Commas"](current_setting)
# responses = run_model(model_by_display_key["qwen-2.5-coder-7B"], prompter, *example)

In [14]:
# print(prompter.display_prompt(None, *example[1][0], is_chat=True)["user"])

In [15]:
# responses