In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [2]:
import numpy as np
import pandas as pd

from evallm.experiments.transducer_summary import sample_dfa_spec, prompt_by_key_default
from evallm.experiments.transducer_experiment import run_transducer_experiment

In [3]:
# Number of DFAs evaluated per model; captured as the default value of
# small_experiment's `num_dfas` parameter when that function is defined.
num_dfas = 10

In [4]:
def small_experiment(model_name, num_dfas=num_dfas, num_repeats_per_dfa=30):
    """Run the "Basic" chat-prompt transducer experiment for a single model.

    Args:
        model_name: Model identifier, forwarded to ``run_transducer_experiment``
            as ``model``.
        num_dfas: Number of DFAs to evaluate. The default is the value the
            notebook-level ``num_dfas`` had when this function was defined.
        num_repeats_per_dfa: Number of repeats run for each DFA. Defaults to
            30, the value that used to be hard-coded here. Code that converts
            a success rate back into a count must use the same value.

    Returns:
        Whatever ``run_transducer_experiment`` returns (elsewhere in this
        notebook it is iterated as one result per DFA).
    """
    return run_transducer_experiment(
        model=model_name,
        num_repeats_per_dfa=num_repeats_per_dfa,
        sample_dfa_spec=sample_dfa_spec,
        prompter=prompt_by_key_default["Basic"]["chat"],
        num_dfas=num_dfas,
    )

In [5]:
# gpt-4o results; these are also the source of the 6-gram baseline column
# in the summary table built further down.
res_4os = small_experiment("gpt-4o-2024-05-13")

In [6]:
# o1-preview results (same experiment settings as the other models).
res_o1s = small_experiment("o1-preview-2024-09-12")

In [7]:
# o3-mini results (same experiment settings as the other models).
res_o3s = small_experiment("o3-mini-2025-01-31")

In [8]:
# gpt-5 results (same experiment settings as the other models).
res_5 = small_experiment("gpt-5-2025-08-07")

In [9]:
def compute_frac(x):
    """Count successes and scored trials for one experiment result.

    Args:
        x: Object exposing ``success_rate_each``, a sequence of per-trial
            scores.

    Returns:
        A ``(numer, denom)`` pair: ``numer`` is the number of entries equal
        to 1 and ``denom`` is the number of entries that are not 0.5.
    """
    rates = np.asarray(x.success_rate_each)
    # Entries equal to 0.5 are left out of the denominator -- presumably they
    # mark trials that could not be scored; confirm against the experiment code.
    is_success = rates == 1
    is_counted = rates != 0.5
    return is_success.sum(), is_counted.sum()


def render_frac(x):
    """Format the success fraction of one result as ``"<numer>/<denom>"``.

    Both numbers come from ``compute_frac`` and are rendered without decimals.
    """
    successes, counted = compute_frac(x)
    return f"{successes:.0f}/{counted:.0f}"


def create_column_for_model(results):
    """Build one table column from a model's per-DFA results.

    Each element of ``results`` is reduced to a ``(numer, denom)`` pair with
    ``compute_frac``; the pairs are then labelled and totalled by
    ``create_column_for_fractions``.
    """
    per_dfa = list(map(compute_frac, results))
    return create_column_for_fractions(per_dfa)


def create_column_for_fractions(fractions):
    """Turn per-seed ``(numer, denom)`` pairs into a labelled table column.

    Args:
        fractions: Iterable of ``(numer, denom)`` pairs, one per seed. It is
            not modified.

    Returns:
        A dict mapping ``"Seed 0"``, ``"Seed 1"``, ... and a final
        ``"Overall"`` key to ``"<numer>/<denom>"`` strings (no decimals).
        ``"Overall"`` holds the element-wise sums of all pairs, or ``"0/0"``
        when ``fractions`` is empty.
    """
    # Work on a copy: the previous in-place `+=` appended the "Overall" row to
    # the caller's list.
    rows = list(fractions)
    labels = [f"Seed {i}" for i in range(len(rows))]
    if rows:
        overall = tuple(sum(parts) for parts in zip(*rows))
    else:
        # zip(*[]) yields nothing, which would leave an empty tuple that
        # cannot be unpacked into (numer, denom) below.
        overall = (0, 0)
    rows.append(overall)
    labels.append("Overall")
    return {label: f"{n:.0f}/{d:.0f}" for label, (n, d) in zip(labels, rows)}

In [10]:
# Trials per DFA. The k-gram baseline is read as a rate and multiplied back
# into a count, so this must equal the `num_repeats_per_dfa` that
# small_experiment passes to run_transducer_experiment (30).
NUM_TRIALS_PER_DFA = 30
# Order of the k-gram baseline shown in the table.
KGRAM_ORDER = 6
# `kgram_success_rates_each` is indexed with `k - 2` here -- presumably its
# first entry is the 2-gram baseline; confirm against the experiment code.
KGRAM_INDEX = KGRAM_ORDER - 2

# One column per model, plus a k-gram baseline column. The baseline is read
# from the gpt-4o results; NOTE(review): presumably it is the same for every
# model because it does not depend on the LLM -- confirm.
table = pd.DataFrame(
    {
        "o1-preview": create_column_for_model(res_o1s),
        "o3-mini": create_column_for_model(res_o3s),
        "gpt-4o": create_column_for_model(res_4os),
        "gpt-5": create_column_for_model(res_5),
        r"6-\textsc{Gram}": create_column_for_fractions(
            [
                (
                    x.kgram_success_rates_each[KGRAM_INDEX] * NUM_TRIALS_PER_DFA,
                    NUM_TRIALS_PER_DFA,
                )
                for x in res_4os
            ]
        ),
    }
)

In [11]:
# Display the summary table (rows are "Seed i" plus "Overall", one column per model).
table

Unnamed: 0,o1-preview,o3-mini,gpt-4o,gpt-5,6-\textsc{Gram}
Seed 0,25/30,24/30,27/30,28/30,26/30
Seed 1,23/29,21/30,24/30,20/30,25/30
Seed 2,19/30,19/30,23/30,22/30,28/30
Seed 3,22/30,23/30,23/30,28/30,28/30
Seed 4,29/29,30/30,30/30,30/30,30/30
Seed 5,19/30,18/30,24/30,22/30,30/30
Seed 6,17/29,15/30,23/30,21/30,25/30
Seed 7,23/30,22/29,25/30,27/30,26/30
Seed 8,21/30,20/30,28/30,24/30,30/30
Seed 9,29/30,30/30,29/30,30/30,30/30


In [12]:
# Render `table` as a LaTeX tabular with a horizontal rule around every row.
# The column spec is derived from the table: one right-aligned label column
# plus one centred column per data column. The previous fixed spec
# `{|r|c|c|c|}` declared 4 columns while every row has 1 + len(table.columns)
# cells, which LaTeX rejects.
column_spec = "|r|" + "c|" * len(table.columns)
row_end = r"\\"

latex_lines = [
    r"\begin{tabular}{" + column_spec + "}",
    r"\hline",
    "&".join(["DFA", *table.columns]) + row_end,
]
for row_label, row in table.iterrows():
    latex_lines.append(r"\hline")
    latex_lines.append("&".join([row_label, *row]) + row_end)
latex_lines.append(r"\hline")
latex_lines.append(r"\end{tabular}")

# No trailing newline after \end{tabular}, as before.
text = "\n".join(latex_lines)

In [13]:
# Print the generated LaTeX source as plain text.
print(text)

\begin{tabular}{|r|c|c|c|}
\hline
DFA&o1-preview&o3-mini&gpt-4o&gpt-5&6-\textsc{Gram}\\
\hline
Seed 0&25/30&24/30&27/30&28/30&26/30\\
\hline
Seed 1&23/29&21/30&24/30&20/30&25/30\\
\hline
Seed 2&19/30&19/30&23/30&22/30&28/30\\
\hline
Seed 3&22/30&23/30&23/30&28/30&28/30\\
\hline
Seed 4&29/29&30/30&30/30&30/30&30/30\\
\hline
Seed 5&19/30&18/30&24/30&22/30&30/30\\
\hline
Seed 6&17/29&15/30&23/30&21/30&25/30\\
\hline
Seed 7&23/30&22/29&25/30&27/30&26/30\\
\hline
Seed 8&21/30&20/30&28/30&24/30&30/30\\
\hline
Seed 9&29/30&30/30&29/30&30/30&30/30\\
\hline
Overall&227/297&222/299&256/300&252/300&278/300\\
\hline
\end{tabular}
