In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Auto-format code cells with black when they are executed.
%load_ext jupyter_black

In [2]:
from collections import defaultdict
import numpy as np

from evallm.experiments.transducer_summary import (
    for_model_and_prompt as for_model_t,
    compute_model_results as compute_model_results_t,
)

from evallm.experiments.sequence_completion_summary import for_model as for_model_sc
from evallm.experiments.sequence_completion.sequence_completion_prompt import (
    WithTemperatureSequenceCompletionPrompt,
)
from evallm.prompting.transducer_prompt import WithTemperatureTransducerPrompter

In [3]:
def produce_transducer_results(**kwargs):
    """Gather transducer results for every (model, prompt) pair under study.

    All keyword arguments are forwarded unchanged to each ``for_model_t`` call;
    ``both_conditions`` relies on this to pass ``wrapper=...`` through.

    Returns:
        A plain ``{model: {prompt: result}}`` dict, copied from the mapping
        produced by ``compute_model_results_t(..., accuracy_summary=True)``.
    """
    # The second positional argument (1000 / 30) is presumably the number of
    # samples to evaluate per model -- TODO confirm against for_model_and_prompt.
    minitron_raw = for_model_t("mistral-nemo-minitron-8B", 1000, "Basic", **kwargs)
    claude_raw = for_model_t(
        "claude-3.5", 30, "Basic", "More-Expl", "COT", "Red-Green", **kwargs
    )

    # Merge both models' raw results; on a key clash the later mapping wins.
    raw_results = {**minitron_raw, **claude_raw}
    summarised = compute_model_results_t(raw_results, accuracy_summary=True)

    # Shallow-copy each inner mapping into an ordinary dict.
    by_model = {}
    for mod, prompts_res in summarised.items():
        by_model[mod] = dict(prompts_res.items())
    return by_model


def produce_sc_results(**kwargs):
    """Gather sequence-completion results for every (model, prompt) pair.

    All keyword arguments are forwarded unchanged to each ``for_model_sc``
    call, always together with ``na_mode="ignore"``.

    Returns:
        A ``defaultdict(dict)`` laid out as ``{model: {prompt: result}}``,
        regrouped from the flat ``(model, prompt)``-keyed mappings returned by
        ``for_model_sc``.
    """
    minitron_raw = for_model_sc(
        "mistral-nemo-minitron-8B", 1000, "Basic", na_mode="ignore", **kwargs
    )
    # For claude-3.5 only the COT and Red-Green prompts are evaluated, each in
    # its own call. NOTE(review): earlier revisions listed "Basic",
    # "Basic-Commas" and "More-Expl" here as disabled options.
    claude_cot_raw = for_model_sc("claude-3.5", 30, "COT", na_mode="ignore", **kwargs)
    claude_red_green_raw = for_model_sc(
        "claude-3.5", 30, "Red-Green", na_mode="ignore", **kwargs
    )

    # Merge in call order; on a key clash the later mapping wins.
    flat = {**minitron_raw, **claude_cot_raw, **claude_red_green_raw}

    # Regroup the (model, prompt) tuple keys into a two-level mapping.
    nested = defaultdict(dict)
    for (mod, prompt), value in flat.items():
        nested[mod][prompt] = value
    return nested


def both_conditions(produce_results_fn, wrapper):
    """Run ``produce_results_fn`` twice: once bare, once with ``wrapper``.

    Args:
        produce_results_fn: callable that can be invoked with no arguments and
            with a ``wrapper=`` keyword argument.
        wrapper: value passed as ``wrapper=`` on the second invocation.

    Returns:
        A 2-tuple ``(result_without_wrapper, result_with_wrapper)``.
    """
    # Tuple elements evaluate left to right, so the bare call still runs first.
    return produce_results_fn(), produce_results_fn(wrapper=wrapper)


def summary(result_temp_0, result_temp_nonzero):
    """Print the mean result per (model, prompt) for both conditions.

    Iterates over the models and prompts present in ``result_temp_0`` and looks
    up the same keys in ``result_temp_nonzero`` (a missing key raises
    ``KeyError``). For each pair it prints the two means as percentages and the
    signed difference ``nonzero - zero``, followed by a blank line.

    Args:
        result_temp_0: ``{model: {prompt: values}}`` for the first condition.
        result_temp_nonzero: mapping with the same layout for the second one.
    """
    for mod, prompts_at_zero in result_temp_0.items():
        for prompt, values_at_zero in prompts_at_zero.items():
            zero_mean = np.mean(values_at_zero)
            nonzero_mean = np.mean(result_temp_nonzero[mod][prompt])
            print(mod, prompt)
            # The trailing empty string yields the blank separator line.
            print(
                f"Temp Zero   : {zero_mean:.2%}",
                f"Temp NonZero: {nonzero_mean:.2%}",
                f"Delta       : {nonzero_mean - zero_mean:+.2%}",
                "",
                sep="\n",
            )

In [4]:
# Value handed to the WithTemperature* wrappers for the "nonzero" condition
# (presumably the sampling temperature -- confirm against the wrapper classes).
# Defined once so the transducer and sequence-completion comparisons cannot
# drift apart.
NONZERO_TEMPERATURE = 0.1

# (baseline, nonzero-temperature) result pair for the transducer task.
t = both_conditions(
    produce_transducer_results,
    lambda prompt: WithTemperatureTransducerPrompter(prompt, NONZERO_TEMPERATURE),
)
# Same pair for sequence completion. Here the wrapper receives a prompt factory
# (called with `args`), so it returns a new factory that wraps the built prompt.
sc = both_conditions(
    produce_sc_results,
    wrapper=lambda prompt: lambda args: WithTemperatureSequenceCompletionPrompt(
        prompt(args), NONZERO_TEMPERATURE
    ),
)

In [5]:
# Print per-(model, prompt) means for both conditions of the transducer task.
summary(*t)

mistral-nemo-minitron-8B Basic
Temp Zero   : 88.56%
Temp NonZero: 88.17%
Delta       : -0.39%

claude-3.5 Basic
Temp Zero   : 86.89%
Temp NonZero: 87.00%
Delta       : +0.11%

claude-3.5 More-Expl
Temp Zero   : 87.11%
Temp NonZero: 86.89%
Delta       : -0.22%

claude-3.5 COT
Temp Zero   : 76.44%
Temp NonZero: 78.11%
Delta       : +1.67%

claude-3.5 Red-Green
Temp Zero   : 82.89%
Temp NonZero: 82.78%
Delta       : -0.11%



In [6]:
# Print per-(model, prompt) means for both conditions of sequence completion.
summary(*sc)

mistral-nemo-minitron-8B Basic
Temp Zero   : 78.70%
Temp NonZero: 77.67%
Delta       : -1.04%

claude-3.5 COT
Temp Zero   : 84.00%
Temp NonZero: 84.22%
Delta       : +0.22%

claude-3.5 Red-Green
Temp Zero   : 80.00%
Temp NonZero: 80.78%
Delta       : +0.78%



In [24]:
from evallm.experiments.nonzero_temperature import (
    temperature_comparison_subtable,
    emit_table,
    temperature_comparison_tables,
)

In [20]:
# Inspect the raw table cells for sequence completion: header strings followed
# by one list per (model, prompt) row, as shown in the output below.
temperature_comparison_subtable(*sc)

['\\textbf{Model}',
 '\\textbf{Prompt}',
 '\\textbf{Zero Temp}',
 '\\textbf{Nonzero Temp}',
 '\\textbf{Difference}',
 ['mistral-nemo-minitron-8B',
  '\\textsc{Basic}',
  '78.70% (77.49% -- 79.79%)',
  '77.67% (76.51% -- 78.76%)',
  '-1.04% (-1.49% -- -0.63%)'],
 ['claude-3.5',
  '\\textsc{COT}',
  '84.00% (79.33% -- 88.44%)',
  '84.22% (79.56% -- 89.00%)',
  '0.22% (-2.11% -- 2.33%)'],
 ['claude-3.5',
  '\\textsc{Red-Green}',
  '80.00% (74.89% -- 85.22%)',
  '80.78% (75.00% -- 86.11%)',
  '0.78% (-2.11% -- 3.44%)']]

In [23]:
# Render the sequence-completion subtable as a LaTeX tabular.
# NOTE(review): the captured output below contains unescaped `%`, which LaTeX
# treats as a comment start, while the combined table further down emits `\%`.
# This output may be stale or escaping may happen elsewhere -- confirm.
emit_table(temperature_comparison_subtable(*sc))

{\renewcommand{\arraystretch}{1.25}
\begin{tabular}{llrrr}
\hline
\textbf{Model} & \textbf{Prompt} & \textbf{Zero Temp} & \textbf{Nonzero Temp} & \textbf{Difference} \\
\hline
mistral-nemo-minitron-8B & \textsc{Basic} & 78.70% (77.49% -- 79.79%) & 77.67% (76.51% -- 78.76%) & -1.04% (-1.49% -- -0.63%) \\
\hline
claude-3.5 & \textsc{COT} & 84.00% (79.33% -- 88.44%) & 84.22% (79.56% -- 89.00%) & 0.22% (-2.11% -- 2.33%) \\
\hline
claude-3.5 & \textsc{Red-Green} & 80.00% (74.89% -- 85.22%) & 80.78% (75.00% -- 86.11%) & 0.78% (-2.11% -- 3.44%) \\
\hline
\end{tabular}
}



In [36]:
# Emit the combined LaTeX table: a "Sequence Completion" section built from `sc`
# followed by a "Transducer" section built from `t`.
temperature_comparison_tables(sc, t)

{\renewcommand{\arraystretch}{1.25}
\begin{tabular}{llrrr}
\hline
\textbf{Model} & \textbf{Prompt} & \textbf{Zero Temp} & \textbf{Nonzero Temp} & \textbf{Difference} \\
\hline
\multicolumn{5}{l}{\textbf{Sequence Completion}} \\
\hline
mistral-nemo-minitron-8B & \textsc{Basic} & 78.70\% (77.49\% -- 79.79\%) & 77.67\% (76.51\% -- 78.76\%) & -1.04\% (-1.49\% -- -0.63\%) \\
\hline
claude-3.5 & \textsc{COT} & 84.00\% (79.33\% -- 88.44\%) & 84.22\% (79.56\% -- 89.00\%) & 0.22\% (-2.11\% -- 2.33\%) \\
\hline
claude-3.5 & \textsc{Red-Green} & 80.00\% (74.89\% -- 85.22\%) & 80.78\% (75.00\% -- 86.11\%) & 0.78\% (-2.11\% -- 3.44\%) \\
\hline
\multicolumn{5}{l}{\textbf{Transducer}} \\
\hline
mistral-nemo-minitron-8B & \textsc{Basic} & 88.56\% (88.05\% -- 89.08\%) & 88.17\% (87.64\% -- 88.68\%) & -0.39\% (-0.58\% -- -0.22\%) \\
\hline
claude-3.5 & \textsc{Basic} & 86.89\% (83.33\% -- 90.00\%) & 87.00\% (83.33\% -- 90.11\%) & 0.11\% (-0.67\% -- 0.89\%) \\
\hline
claude-3.5 & \textsc{More-Expl} & 87