In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Format code cells with Black as they are run.
%load_ext jupyter_black

In [2]:
import evallm
from evallm.experiments.transducer_experiment import (
    current_dfa_sample_spec,
)

from evallm.experiments.models_display import model_by_display_key

from evallm.experiments.transducer_plotting import produce_table

In [3]:
from evallm.experiments.sequence_completion.sequence_completion_prompt import (
    SequencePromptDirectAlien,
    SequencePromptDirectAlienWithSpaces,
    SequencePromptDirectAlien2,
    SequencePromptDirectAlien3,
    SequencePromptDirectAlien2WithSpaces,
    SequencePromptDirectAlien3WithSpaces,
    MoreExplanationPrompt,
    MoreExplanationPrompt2,
    RedGreenPrompt,
    MoreExplanationPromptCOT,
)

from evallm.experiments.sequence_completion.sequence_completion_experiments import (
    compute_model_scores,
    compute_ngram_scores,
    compute_ngram_scores_with_prefix,
    compute_random_baseline_scores,
    compute_brute_force_scores,
    get_examples,
    run_model,
    compute_true_ngrams,
)

In [4]:
# DFA sampling spec; stored as `dfa_spec` in `current_setting` below.
# NOTE(review): the argument 3 is presumably the number of DFA states -- confirm.
spec = current_dfa_sample_spec(3)

In [5]:
# Shared experiment configuration handed to every scoring helper below.
# `num_sequence_symbols_prompt` is half of the full sequence length
# (integer division), i.e. 5 of 10 symbols with the current values.
num_sequence_symbols = 10
current_setting = {
    "dfa_spec": spec,
    "num_sequences": 30,
    "num_sequence_symbols": num_sequence_symbols,
    "num_sequence_symbols_prompt": num_sequence_symbols // 2,
    "num_instances": 30,
}

In [6]:
# Fetch examples under the shared setting; they are reused further down to
# print one concrete prompt and to collect raw model responses.
# NOTE(review): the leading 0 is presumably a seed or instance index -- confirm.
example = get_examples(0, current_setting)

In [7]:
# Prompt styles under comparison. The keys are passed (in this order) to
# `produce_table` as the prompt columns; each value is a `for_setting` factory
# that is called with `current_setting` to build a prompter.
prompts_by_key = {
    "Basic": SequencePromptDirectAlien2.for_setting,
    "More-Expl": MoreExplanationPrompt2.for_setting,
    "COT": MoreExplanationPromptCOT.for_setting,
    "Red-Green": RedGreenPrompt.for_setting,
}

In [8]:
def for_model(model, count, *prompts):
    """Score one model under each of the requested prompt styles.

    Args:
        model: Display key used to look the model up in
            ``model_by_display_key``.
        count: Forwarded as the first argument to ``compute_model_scores``
            (presumably the number of instances to evaluate -- TODO confirm).
        *prompts: Keys into ``prompts_by_key`` naming the prompt styles to run.

    Returns:
        A dict mapping ``(model, prompt)`` to the value returned by
        ``compute_model_scores`` for that pair, computed against the
        notebook-wide ``current_setting`` with ``na_mode="ignore"``.
    """
    scores_by_key = {}
    for prompt_key in prompts:
        # The model lookup stays inside the loop so that an empty `prompts`
        # never touches `model_by_display_key`.
        scores_by_key[model, prompt_key] = compute_model_scores(
            count,
            current_setting,
            model_by_display_key[model],
            prompts_by_key[prompt_key],
            na_mode="ignore",
        )
    return scores_by_key

In [9]:
# One (model, count, *prompt keys) row per evaluation run; rows that are
# commented out are excluded from the current table.
results = {
    key: scores
    for model, count, *prompts in [
        # ("mistral-nemo-minitron-8B", 1000, "Basic"),
        # ("llama3-8B", 1000, "Basic"),
        ("llama3-70B", 1000, "Basic"),
        # ("llama3.1-8B-Instruct", 1000, "Basic"),
        # ("mistral-nemo-minitron-8B", 1000, "Basic"),
        # ("mistral-nemo-base-12B", 1000, "Basic"),
        # ("mistral-nemo-instruct-12B", 1000, "Basic"),
        # ("gemma-7b", 1000, "Basic"),
        # ("falcon-7b", 1000, "Basic"),
        ("gpt-3.5-instruct", 30, "Basic"),
        ("gpt-3.5-chat", 30, "Basic"),
        ("gpt-4o-mini", 30, "Basic", "More-Expl", "COT", "Red-Green"),
        ("gpt-4o", 30, "Basic", "More-Expl"),  # , "COT", "Red-Green"),
    ]
    for key, scores in for_model(model, count, *prompts).items()
}

In [10]:
# Passed as the first argument to each baseline scorer below
# (presumably the number of instances to score -- TODO confirm).
amount_baselines = 100

results[r"\textsc{Random}", "Basic"] = compute_random_baseline_scores(
    amount_baselines, setting=current_setting
)

# These two labels contain no replacement fields, so they are plain raw
# strings; an `f` prefix would only force the LaTeX braces to be doubled.
results[r"$n$-\textsc{Gram-Heuristic-NoPrefix}", "Basic"] = compute_ngram_scores(
    amount_baselines, setting=current_setting
)
results[r"$n$-\textsc{Gram-Heuristic}", "Basic"] = compute_ngram_scores_with_prefix(
    amount_baselines, setting=current_setting
)

# NOTE(review): brute force is called with 44 rather than `amount_baselines`
# -- presumably a deliberate cap because it is expensive; confirm.
results[r"\textsc{BruteForce}", "Basic"] = compute_brute_force_scores(
    44, current_setting
)

# n-gram baselines for n = 2..6. This label does interpolate `ngram`, so it
# stays an f-string and its literal LaTeX braces are doubled.
for ngram in (2, 3, 4, 5, 6):
    results[rf"{ngram}-\textsc{{Gram}}", "Basic"] = compute_true_ngrams(
        ngram, amount_baselines, current_setting
    )

In [11]:
# Regroup the flat {(model, prompt): scores} mapping into
# {model: {prompt: scores}}, keeping first-seen model order.
results_nested = {}
for m, p in results:
    results_nested.setdefault(m, {})[p] = results[m, p]

In [12]:
# Emit the LaTeX results table: one row per model/baseline and one column per
# prompt key (missing model/prompt combinations show up as "--").
produce_table(results_nested, list(prompts_by_key))

\begin{tabular}{|r|c|c|c|c|}
\hline
Model & \textsc{Basic} & \textsc{More-Expl} & \textsc{COT} & \textsc{Red-Green}\\
\hline
\cellcolor{lightgray}\textsc{BruteForce} &\cellcolor{lightgray}100.0 (100.0--100.0)&--&--&--\\
\hline
\bf 6-\textsc{Gram} &\bf 91.9 (89.3--94.0)&--&--&--\\
\hline
$n$-\textsc{Gram-Heuristic} &91.7 (89.1--94.0)&--&--&--\\
\hline
5-\textsc{Gram} &91.4 (88.8--93.7)&--&--&--\\
\hline
4-\textsc{Gram} &90.5 (87.8--92.9)&--&--&--\\
\hline
3-\textsc{Gram} &86.3 (83.2--88.8)&--&--&--\\
\hline
$n$-\textsc{Gram-Heuristic-NoPrefix} &83.4 (79.8--86.8)&--&--&--\\
\hline
2-\textsc{Gram} &82.3 (78.8--85.4)&--&--&--\\
\hline
gpt-4o-mini &72.6 (64.9--79.7)&67.9 (60.2--74.9)&56.9 (49.3--64.5)&56.3 (48.9--63.7)\\
\hline
gpt-4o &72.1 (65.9--78.2)&invalid&--&--\\
\hline
llama3-70B &71.4 (70.0--72.7)&--&--&--\\
\hline
gpt-3.5-instruct &66.6 (59.3--74.0)&--&--&--\\
\hline
\textsc{Random} &51.2 (46.6--55.8)&--&--&--\\
\hline
gpt-3.5-chat &invalid&--&--&--\\
\hline
\end{tabular}


In [13]:
# The table above lists gpt-3.5-chat as "invalid" for the Basic prompt, so run
# that model on the sample examples and keep its raw responses for inspection
# (presumably to see why its outputs are rejected -- intent not stated).
prompter = prompts_by_key["Basic"](current_setting)
responses = run_model(model_by_display_key["gpt-3.5-chat"], prompter, *example)

In [14]:
# Print the "user" part of the chat-format prompt built from the first entry
# of example[1].
print(prompter.display_prompt(None, *example[1][0], is_chat=True)["user"])

The following strings come from an alien language that follows a simple grammar. Infer the alien grammar using the example strings. Then, add a suffix to the final string using between 1 and 5 characters such that the full string follows the grammar. Output only the necessary suffix to complete the final string, and nothing else.

cbcbabbcca
abcaaacbaa
aabccbabbb
bbbccbbbca
aababaccba
aaaacbacac
baacbccbaa
cbbaacabcc
baabaacaab
bbbbbcacab
acaabcbbba
acaacbccac
cacbabcbba
abcbcbcbcc
ccaccccaba
bcbcabbcca
baabacabca
caababacac
bacacaccaa
bcacbbbbca
bcbbbcaccc
ccabbcccbb
bccbcabbca
baacbabcbc
ccacabccab
caacbcaaab
cacbaaccac
aaccbcaabb
abacabcaab
bacbcbcaca
caacb


In [15]:
# Display the raw gpt-3.5-chat completions collected by run_model.
responses

["The alien grammar seems to follow the pattern of alternating between the letters 'a', 'b', and 'c'. Based on",
 "The alien grammar seems to be that each string consists of a combination of the letters 'a', 'b', and 'c",
 "The alien grammar seems to follow the pattern of alternating between the letters 'a' and 'b', with 'c' appearing",
 "The alien grammar seems to follow the pattern of alternating between two different letters. Let's denote the two letters as A and B",
 "The alien grammar seems to follow the pattern of alternating between the letters 'a' and 'b', with 'c' inters",
 "The alien grammar seems to follow the pattern of alternating between two different characters. Let's denote the characters as A and B.",
 'Suffix: a',
 "The alien grammar seems to follow the pattern of alternating between the letters 'a' and 'b', with 'c' appearing",
 'The alien grammar seems to follow the pattern of alternating between three different types of characters: a, b, and c. The',
 'The inferred