In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to the project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Auto-format code cells with black.
%load_ext jupyter_black

In [2]:
import matplotlib.pyplot as plt
from evallm.experiments.sequence_completion_summary import (
    sequence_completion_results,
    display_prompt as display_prompt_sc,
    sequence_completion_null_results,
)

from evallm.experiments.transducer_summary import (
    transducer_results,
    transducer_null_results,
)
from evallm.experiments.main_tables import (
    best_prompt,
    multi_prompts,
    main_table_of_results,
    multi_prompt_table_of_results,
    plot_transducer_vs_sequence_completion,
    best_prompt,
    all_p_values,
)
from evallm.experiments.main_tables import plot_significance

from evallm.experiments.transducer_plotting import produce_table
from evallm.experiments.models_display import model_by_display_key

In [None]:
# Load the "null" results and the regular results for both tasks.
# Suffix convention used throughout: `_t` = transducer, `_sc` = sequence completion.
# NOTE(review): the saved output shows a progress bar, so this cell is presumably
# slow on a cold run -- confirm whether the loaders cache their results.
null_t = transducer_null_results()
null_sc = sequence_completion_null_results()
results_t = transducer_results()
results_sc = sequence_completion_results()

  0%|          | 0/30 [00:00<?, ?it/s]

### Main Table

In [None]:
# Build the main table from the `best_prompt` reduction of each task's results
# (transducer first, sequence completion second).
main_table_of_results(best_prompt(results_t), best_prompt(results_sc))

### Comparison of Prompts

In [5]:
# Per-prompt comparison for both tasks; the saved output shows the table is
# emitted as a LaTeX `tabular` with one column per prompt.
multi_prompt_table_of_results(multi_prompts(results_t), multi_prompts(results_sc))

{\renewcommand{\arraystretch}{1.25}\begin{tabular}{l|ccccc}
\hline
\bf Model & \bf \textsc{Basic} & \bf \textsc{Basic-COT} & \bf \textsc{More-Expl} & \bf \textsc{More-Expl-COT} & \bf \textsc{Red-Green} \\
\hline
\multicolumn{6}{l}{ \bf Sequence Completion} \\
\hline
gpt-4o-mini & \bf 72.4 (68.1--76.3) & -- & 70.5 (66.4--74.6) & 58.0 (53.4--62.4) & 59.1 (54.9--63.2) \\
\hline
gpt-4o & 72.1 (65.9--78.2) & -- & N/A & 67.4 (60.8--73.8) & \bf 74.4 (69.9--78.6) \\
\hline
claude-3.5 & N/A & -- & N/A & \bf 84.0 (79.3--88.4) & 80.0 (74.9--85.2) \\
\hline
o3-mini & N/A & -- & N/A & 58.2 (49.6--66.8) & \bf 69.8 (64.4--75.0) \\
\hline
gpt-5 & 71.0 (58.3--82.7) & -- & -- & -- & \bf 86.0 (76.7--94.6) \\
\hline
\multicolumn{6}{l}{ \bf Transducer} \\
\hline
gpt-4o-mini & \bf 79.8 (77.3--82.2) & -- & 76.7 (74.2--79.3) & 65.2 (63.1--67.4) & 74.5 (72.0--77.0) \\
\hline
gpt-4o & \bf 83.7 (80.1--86.9) & -- & 82.6 (79.1--85.9) & 67.8 (63.1--72.3) & 82.6 (78.8--86.3) \\
\hline
claude-3.5 & 86.9 (83.3--90.0) 

### Commas

In [None]:
# Keep only the sequence-completion entries whose key is present in
# `model_by_display_key`, then tabulate the "Basic" prompt next to its
# "Basic-Commas" counterpart.
displayed_results_sc = {
    model_key: outcome
    for model_key, outcome in results_sc.items()
    if model_key in model_by_display_key
}
comma_prompt_columns = [
    display_prompt_sc(prompt_name) for prompt_name in ("Basic", "Basic-Commas")
]
produce_table(displayed_results_sc, comma_prompt_columns)

In [None]:
# Open a fresh white-background figure, let the project helper draw the
# sequence-completion vs. transducer comparison, and write the result to disk.
# NOTE(review): `plt.savefig` does not create missing directories, so
# `../output` must already exist relative to the notebook's working directory.
plt.figure(figsize=(8, 4), dpi=400, facecolor="white", tight_layout=True)
plot_transducer_vs_sequence_completion(results_sc, results_t)
plt.savefig("../output/sequence_completion_vs_transducer.png")

### Null

In [None]:
# Emit the null-results table wrapped in a LaTeX `{\scriptsize ... }` group.
# The opening brace is printed first so that anything the helpers print ends
# up inside the group.
print(r"{\scriptsize")
null_prompts_t = multi_prompts(null_t, minimum_number_prompts=1)
null_prompts_sc = multi_prompts(null_sc, minimum_number_prompts=1)
multi_prompt_table_of_results(null_prompts_t, null_prompts_sc, bold_best=False)
print("}")

In [None]:
from evallm.experiments.main_tables import flat_pandas_table, reorderings

In [None]:
import numpy as np


def flat(result):
    """Collapse a two-level mapping into a single-level one of means.

    Args:
        result: mapping of outer key -> mapping of inner key -> values that
            ``np.mean`` can reduce.

    Returns:
        A dict keyed by ``(outer_key, inner_key)`` tuples, in the original
        iteration order, whose values are ``np.mean`` of the inner values.
    """
    means = {}
    for outer_key, inner in result.items():
        for inner_key, samples in inner.items():
            means[(outer_key, inner_key)] = np.mean(samples)
    return means


# List every sequence-completion null entry whose mean is not below 0.05,
# leaving out prompts whose name contains "Commas".
# `not (x < 0.05)` is used rather than `x >= 0.05` so NaN means are still printed.
# NOTE(review): this cutoff is 0.05 while the check at the end of the cell uses
# 0.5 / 0.6 -- confirm that the difference is intentional.
for key, mean_value in flat(null_sc).items():
    prompt_label = key[1]
    if not (mean_value < 0.05) and "Commas" not in prompt_label:
        print(key, mean_value)

# Mean of every non-"Commas" null entry, transducer entries first.
all_vals = np.array(
    [
        np.mean(samples)
        for null_results in (null_t, null_sc)
        for per_prompt in null_results.values()
        for prompt_name, samples in per_prompt.items()
        if "Commas" not in prompt_name
    ]
)

# Displayed value: (largest mean below 0.5, smallest mean at or above 0.6).
# Raises ValueError if either selection is empty.
below_half = all_vals < 0.5
at_least_six_tenths = all_vals >= 0.6
all_vals[below_half].max(), all_vals[at_least_six_tenths].min()

In [None]:
# Combine each task's results with its null results into one flat table
# (presumably a pandas DataFrame, going by the helper's name -- TODO confirm).
table_sc = flat_pandas_table(results_sc, null_sc)
table_t = flat_pandas_table(results_t, null_t)

In [None]:
# Display the sequence-completion table for inspection.
table_sc

In [None]:
# Run `reorderings` with `same_model=True` on each task's flat table,
# sequence completion first. Each call is preceded by a printed label, and a
# blank line separates the two reports.
print("Sequence just same model")
reorderings(table_sc, same_model=True)
print()
print("Transducer just same model")
reorderings(table_t, same_model=True)

In [None]:
# Run `reorderings` with `only_best_prompt=True` on each task's flat table,
# sequence completion first. Each call is preceded by a printed label, and a
# blank line separates the two reports.
print("Sequence just best prompt")
reorderings(table_sc, only_best_prompt=True)
print()
print("Transducer just best prompt")
reorderings(table_t, only_best_prompt=True)

In [None]:
# Transducer: best-prompt reduction, then its p-values via `all_p_values`.
flat_t = best_prompt(results_t)
ps_t = all_p_values(flat_t)

# Sequence completion: same pipeline, but entries whose best-prompt value is a
# float NaN are dropped before the p-values are computed.
best_sc = best_prompt(results_sc)
flat_sc = {
    model_key: outcome
    for model_key, outcome in best_sc.items()
    if not (isinstance(outcome, float) and np.isnan(outcome))
}
ps_sc = all_p_values(flat_sc)

In [None]:
# Side-by-side significance plots: one square panel per task, sequence
# completion on the left and transducer on the right.
size = 6
_, axs = plt.subplots(1, 2, figsize=(size * 2, size), dpi=400, tight_layout=True)

# (panel title, best-prompt results, p-values) in left-to-right order.
panels = (
    ("Sequence Completion", flat_sc, ps_sc),
    ("Transducer", flat_t, ps_t),
)
for ax, (title, flat_results, p_values) in zip(axs, panels):
    plot_significance(ax, flat_results, p_values)
    ax.set_title(title)

# NOTE(review): `../output` must already exist; `plt.savefig` will not create it.
plt.savefig("../output/significance.png")
plt.show()