In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [2]:
from collections import defaultdict
import numpy as np

from evallm.experiments.transducer_summary import (
    for_model_and_prompt as for_model_t,
    compute_model_results as compute_model_results_t,
)

from evallm.experiments.sequence_completion_summary import for_model as for_model_sc
from evallm.experiments.sequence_completion.sequence_completion_prompt import (
    WithTemperatureSequenceCompletionPrompt,
)
from evallm.prompting.transducer_prompt import WithTemperatureTransducerPrompter

In [3]:
def produce_transducer_results(**kwargs):
    """Collect mean transducer results for the two fixed model/prompt configs.

    ``for_model_t`` is called once per configuration below with
    ``(model, count, prompt, **kwargs)``; the returned mappings are merged
    (later configurations win on key clashes) and handed to
    ``compute_model_results_t(..., accuracy_summary=True)``.

    Args:
        **kwargs: forwarded unchanged to every ``for_model_t`` call
            (e.g. ``wrapper=...`` as used by ``both_conditions``).

    Returns:
        dict: ``{model: {prompt: np.mean(values)}}`` built from the mapping
        returned by ``compute_model_results_t``.
    """
    # (model, second positional argument, prompt name).
    # NOTE(review): the second entry (1000 / 30) is presumably a sample
    # count -- confirm against for_model_and_prompt.
    # For claude-3.5 the prompt names "Basic", "More-Expl" and "COT" were
    # previously listed here as commented-out alternatives.
    configurations = (
        ("mistral-nemo-minitron-8B", 1000, "Basic"),
        ("claude-3.5", 30, "Red-Green"),
    )

    raw_results = {}
    for model_name, count, prompt_name in configurations:
        raw_results.update(for_model_t(model_name, count, prompt_name, **kwargs))

    summary = compute_model_results_t(raw_results, accuracy_summary=True)

    averaged = {}
    for mod, prompts_res in summary.items():
        averaged[mod] = {
            prompt: np.mean(res) for prompt, res in prompts_res.items()
        }
    return averaged


def produce_sc_results(**kwargs):
    """Collect mean sequence-completion results for the two fixed configs.

    ``for_model_sc`` is called for each model/prompt configuration with
    ``na_mode="ignore"`` plus ``**kwargs``. The returned mappings, keyed by
    ``(model, prompt)`` pairs, are merged and regrouped by model.

    Args:
        **kwargs: forwarded unchanged to every ``for_model_sc`` call
            (e.g. ``wrapper=...`` as used by ``both_conditions``). Passing
            ``na_mode`` here raises ``TypeError`` because it is already set.

    Returns:
        dict: ``{model: {prompt: np.mean(values)}}``. A plain ``dict`` is
        returned (matching ``produce_transducer_results``) so that looking
        up an unknown model raises ``KeyError`` instead of silently
        inserting an empty entry, as a ``defaultdict`` would.
    """
    # NOTE(review): the second positional argument (1000 / 30) is presumably
    # a sample count -- confirm against for_model.
    # For claude-3.5 the prompt names "Basic", "Basic-Commas", "More-Expl"
    # and "COT" were previously listed here as commented-out alternatives.
    r = {
        **for_model_sc(
            "mistral-nemo-minitron-8B",
            1000,
            "Basic",
            na_mode="ignore",
            **kwargs,
        ),
        **for_model_sc(
            "claude-3.5",
            30,
            "Red-Green",
            na_mode="ignore",
            **kwargs,
        ),
    }

    results = {}
    for (mod, prompt), values in r.items():
        # setdefault keeps the grouping local; no auto-vivifying container
        # leaks out to callers.
        results.setdefault(mod, {})[prompt] = np.mean(values)
    return results


def both_conditions(produce_results_fn, wrapper):
    """Print a per-(model, prompt) comparison without and with ``wrapper``.

    ``produce_results_fn`` is called twice: first with no arguments (printed
    under the label "Temp Zero"), then with ``wrapper=wrapper`` (printed
    under "Temp NonZero"). Both calls are expected to return a nested
    mapping ``{model: {prompt: value}}``. Every (model, prompt) pair of the
    first result is looked up in the second one.

    The "Delta" line is the plain difference ``second - first`` rendered with
    the ``%`` format, i.e. a difference in percentage points.

    Args:
        produce_results_fn: callable accepting an optional ``wrapper`` keyword.
        wrapper: value forwarded as ``wrapper=`` on the second call.

    Returns:
        None. The report is written to stdout.
    """
    baseline = produce_results_fn()
    wrapped = produce_results_fn(wrapper=wrapper)

    for mod, baseline_by_prompt in baseline.items():
        for prompt, a in baseline_by_prompt.items():
            b = wrapped[mod][prompt]
            print(mod, prompt)
            # One call, newline-separated; the trailing "" yields the blank
            # line that separates consecutive entries.
            print(
                f"Temp Zero   : {a:.2%}",
                f"Temp NonZero: {b:.2%}",
                f"Delta       : {b - a:+.2%}",
                "",
                sep="\n",
            )

In [4]:
# Transducer task: report results from a plain run next to a run whose
# prompter is re-wrapped by the lambda below. The lambda receives a prompter
# and wraps it in WithTemperatureTransducerPrompter with 0.1 as the second
# argument (presumably the sampling temperature -- confirm against the class).
both_conditions(
    produce_transducer_results,
    lambda prompt: WithTemperatureTransducerPrompter(prompt, 0.1),
)

mistral-nemo-minitron-8B Basic
Temp Zero   : 88.56%
Temp NonZero: 88.17%
Delta       : -0.39%

claude-3.5 Red-Green
Temp Zero   : 82.89%
Temp NonZero: 82.78%
Delta       : -0.11%



In [5]:
# Sequence-completion task: same comparison. Here the wrapper takes `prompt`
# and returns a new callable; that callable evaluates prompt(args) and wraps
# the result in WithTemperatureSequenceCompletionPrompt with 0.1 as the second
# argument (presumably the sampling temperature -- confirm against the class).
both_conditions(
    produce_sc_results,
    wrapper=lambda prompt: lambda args: WithTemperatureSequenceCompletionPrompt(
        prompt(args), 0.1
    ),
)

mistral-nemo-minitron-8B Basic
Temp Zero   : 78.70%
Temp NonZero: 77.67%
Delta       : -1.04%

claude-3.5 Red-Green
Temp Zero   : 80.00%
Temp NonZero: 80.78%
Delta       : +0.78%

