# Zero-shot MMOS-DeepSeekMath-7B with self-consistency and generated code reasoning evaluation

Self-consistency replaces the standard greedy decoding used in reasoning pipelines: several diverse answers are sampled and then aggregated, e.g., by taking the most common answer ([SC-CoT paper](https://arxiv.org/pdf/2203.11171.pdf)).

In this kernel, we will use the RL-tuned MMOS-DeepSeekMath-7B backbone. In my experiments, this model produces more consistent code reasoning, and executing the generated code blocks allows us to reduce arithmetic hallucinations.

## References

- https://www.kaggle.com/code/ironbar/autobots-roll-out/notebook
- https://www.kaggle.com/code/abdurrafae/improved-code-interpretation
- https://kaggle.com/code/xiaoz259/pure-rng/notebook
- https://www.kaggle.com/code/olyatsimboy/aimo-openmath-mistral-baseline
- https://www.kaggle.com/code/aatiffraz/prompt-prediction-w-mixtral-mistral7b-gemma-llama
- https://www.kaggle.com/code/thedrcat/aimo-mixtral-baseline

## Configuration

In [None]:
class CFG:
    """Notebook-wide configuration; every setting is a class attribute read as ``CFG.<name>``."""
    # Data parameters
    quick_save = False # If true it will set the time limit to 1 so the saving of the notebook is really quick (NOTE(review): in this file it is only read by is_quick_save_condition - confirm)
    submission_mode = False # If True it will use aimo.env otherwise a mock environment
    ## Data parameters only used when submission_mode is False
    dataset = '/mnt/hdd0/Kaggle/aimo/external_data/filtered_MATH_test_5.csv'
    problem_indices = None # If not None will restrict the evaluation to the given problem idx of the dataset
    # Model parameters
    model_path = "/home/gbarbadillo/data/deepseekmath"
    use_4bit_quantization = False
    balanced_device_map = True # If True and at least 2 GPUs are visible, load_model uses its hand-written two-GPU device map; otherwise 'sequential'
    cuda_visible_devices = None # If not None it is written to the CUDA_VISIBLE_DEVICES environment variable to limit the GPUs used
    context_window_size = 4096 # prompt tokens + generated tokens are kept within this number of tokens
    # Run parameters
    time_limit = 31500 # seconds, 31500 by default which is 8.75 hours
    verbose = True
    save_results = True
    result_priority = ['code_answer', 'text_answer'] #['code_answer', 'boxed_answer', 'text_answer'] # Select which answers will be used as result; the first non-None one in this order wins
    # few-shot parameters
    few_shot_dataset = '/mnt/hdd0/Kaggle/aimo/external_data/MathCodeInstruct/MATHCodeInstruct_curated.csv'
    few_shot_samples = 0
    max_sample_tokens = 512 # few-shot problems with this many tokens or more will be filtered out
    max_prompt_tokens = 1024 # 3072 # only prompts with less than this number of tokens will be used
    difficulty_levels = None # if not None, only few-shot problems whose level is in this collection are kept
    # Inference parameters
    confidence_level = 0.95 # this will be used to stop sampling solutions if the difference between the first and second most voted options is significant
    n_repetitions = 25
    random_seed = None # None or int
    max_new_tokens = 1024 #2048
    max_coding_errors = 2
    code_output_truncate_length = 125 # max number of characters kept from the code output
    default_answer = 0 # this will be the response when the system does not have a valid answer
    # https://community.openai.com/t/cheat-sheet-mastering-temperature-and-top-p-in-chatgpt-api/172683
    # temperature for text generation (a temperature of 0 switches to greedy decoding)
    temperature_text = 0.5
    top_p_text = 1.0
    # temperature for coding generation (a temperature of 0 switches to greedy decoding)
    temperature_code = 0.5
    top_p_code = 1.0

## Imports

In [None]:
import time
NOTEBOOK_START_TIME = time.time()

if CFG.use_4bit_quantization:
    !pip install -U /kaggle/input/accelerate-wheelwhl/accelerate-0.29.1-py3-none-any.whl -qq
    !pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq

import os
if CFG.cuda_visible_devices is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = str(CFG.cuda_visible_devices)
    
import sys
import subprocess
from IPython.display import display, Markdown
import pandas as pd
from tqdm.auto import tqdm
import torch
import gc
import re
import math
import random
import json
from collections import Counter
import numpy as np
import tempfile
from pydantic import BaseModel
from typing import Optional
import datetime
from scipy.stats import norm
import glob

# https://pytorch.org/docs/stable/backends.html#torch.backends.cuda.enable_mem_efficient_sdp
# Enables or disables memory efficient scaled dot product attention.
# If set to True I get this error: RuntimeError: cutlassF: no kernel found to launch!
torch.backends.cuda.enable_mem_efficient_sdp(False)

from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    AutoConfig,
    StoppingCriteria,
    StoppingCriteriaList,
    set_seed
)

import transformers
print(f"Transformers Version: {transformers.__version__}")
if CFG.random_seed is not None:
    set_seed(CFG.random_seed)

import logging

for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.info('Imported all libraries.')

import matplotlib.pyplot as plt
import matplotlib as mpl

plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

### Load data

In [None]:
class MockEnvWithDataframe:
    """
    Offline replacement for the competition environment.

    It offers the iter_test/predict methods this notebook calls on aimo's env,
    so the same inference loop can be used for submissions and for evaluating
    any dataframe of problems.
    """
    def __init__(self, df):
        """
        Store the dataframe of problems and start with no recorded submissions.
        """
        self.submissions = []
        self.df = df

    @staticmethod
    def _build_pair(problem_row):
        """
        Turn one dataframe row into a (problem, sample_submission) pair of single-row dataframes.
        """
        problem = pd.DataFrame([problem_row])
        blank_submission = pd.DataFrame({'id': problem.id, 'answer': [None]})
        return problem, blank_submission

    def iter_test(self):
        """
        Yield, for every row of the dataframe, the problem and a sample_submission whose answer is None.
        """
        yield from (self._build_pair(problem_row) for _, problem_row in self.df.iterrows())

    def predict(self, sample_submission):
        """
        Record a filled submission.
        """
        self.submissions.append(sample_submission)

    def get_all_submissions(self):
        """
        Return all the recorded submissions concatenated in a single dataframe.
        """
        return pd.concat(self.submissions)

In [None]:
# Create the environment that serves the problems: the competition one in
# submission mode, otherwise a mock built from the CSV given in CFG.dataset.
if CFG.submission_mode:
    import aimo
    env = aimo.make_env()
    # N_PROBLEMS is used later to estimate the time needed for the remaining problems
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        N_PROBLEMS = 50
    else:
        N_PROBLEMS = 3
else:
    df = pd.read_csv(CFG.dataset)
    if CFG.problem_indices is not None:
        # positional selection of a subset of problems, the index is renumbered from 0
        df = df.iloc[CFG.problem_indices].reset_index(drop=True)
    if 'answer' in df.columns:
        # keep a copy of the labels under another name
        df['ground_truth'] = df['answer']
    elif CFG.dataset == '/kaggle/input/ai-mathematical-olympiad-prize/test.csv': 
        # this file has no answer column, a placeholder ground truth of 0 is used
        df['ground_truth'] = 0
    N_PROBLEMS = len(df)
    display(df)
    env = MockEnvWithDataframe(df)
# generator of (problem, sample_submission) pairs
iter_test = env.iter_test()

### Model

In [None]:
class StoppingCriteriaSub(StoppingCriteria):
    """
    Stops the generation as soon as the first sequence of the batch ends with
    any of the given stop token sequences.

    Parameters
    ----------
    stops : list of torch tensors with the token ids of each stop word, optional.
        0-d tensors (what `.squeeze()` yields for a single-token stop word) are accepted.
    encounters : kept for backward compatibility, it is not used.
    """
    def __init__(self, stops=None, encounters=1):
        super().__init__()
        # Do not require a GPU just to build the criteria
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # reshape(-1) guarantees 1-d tensors so len() and slicing always work
        self.stops = [stop.reshape(-1).to(device) for stop in (stops if stops is not None else [])]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True if the first sequence of `input_ids` ends with one of the stops."""
        sequence = input_ids[0]
        for stop in self.stops:
            stop_length = len(stop)
            # An empty stop never matches, and a stop longer than the sequence cannot match yet
            if stop_length == 0 or stop_length > len(sequence):
                continue
            # .to() is a no-op when the stop is already on the device of the sequence
            if torch.equal(stop.to(sequence.device), sequence[-stop_length:]):
                return True
        return False

In [None]:
def load_model(model_path, use_4bit_quantization=False):
    """
    Load the causal language model stored in `model_path` and set it in eval mode.

    When CFG.balanced_device_map is enabled and at least two GPUs are visible
    the 30 decoder layers are split by hand: embeddings and layers 0-14 go to
    GPU 0, layers 15-29, the final norm and the lm_head go to GPU 1.
    Otherwise the 'sequential' device map is used.
    If `use_4bit_quantization` is True the model is loaded with a 4 bit nf4
    BitsAndBytes configuration.
    """
    logging.info(f'Loading model: {model_path}')
    config = AutoConfig.from_pretrained(model_path)
    config.gradient_checkpointing = True # we are not training, so I believe this is irrelevant

    if CFG.balanced_device_map and torch.cuda.device_count() >= 2:
        n_layers, first_layer_on_second_gpu = 30, 15
        device_map = {'model.embed_tokens': 0}
        for layer_idx in range(n_layers):
            device_map[f'model.layers.{layer_idx}'] = 0 if layer_idx < first_layer_on_second_gpu else 1
        device_map['model.norm'] = 1
        device_map['lm_head'] = 1
    else:
        device_map = 'sequential'

    quantization_config = None
    if use_4bit_quantization:
        from transformers import BitsAndBytesConfig
        quantization_config = BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
        )

    loading_kwargs = dict(
        device_map=device_map,
        torch_dtype="auto", #torch.bfloat16 does not show speed differences
        trust_remote_code=True,
        quantization_config=quantization_config,
        config=config,
    )
    model = AutoModelForCausalLM.from_pretrained(model_path, **loading_kwargs)
    model.eval()
    return model


def get_tokenizer(model_path):
    """Load the tokenizer stored in `model_path`, using the eos token also as padding token."""
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
    eos_id = loaded_tokenizer.eos_token_id
    loaded_tokenizer.pad_token_id = eos_id
    return loaded_tokenizer


def get_stopping_criteria(tokenizer, stop_words):
    """
    Build the StoppingCriteriaList that halts generation when one of `stop_words` is produced.

    Each stop word is tokenized without special tokens and the resulting ids
    are handed to StoppingCriteriaSub.
    """
    stop_words_ids = []
    for stop_word in stop_words:
        encoded = tokenizer(stop_word, return_tensors='pt', add_special_tokens=False)
        stop_words_ids.append(encoded['input_ids'].squeeze())
    return StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

In [None]:
# Module-level tokenizer: it is read as a global by TextGenerator.generate and
# by the few-shot prompt helpers, so it has to exist before they are called.
tokenizer = get_tokenizer(CFG.model_path)

In [None]:
class TextGenerator():
    """
    Abstraction that allows to generate text and code in different steps efficiently

    The instance keeps, between calls, the number of prompt tokens, the number
    of tokens generated so far and the past_key_values returned by the previous
    generation. `reset` has to be called before starting a new problem.
    It relies on the module-level globals `model`, `tokenizer`,
    `stopping_criteria` and on the function `clear_memory`.
    """
    def __init__(self, cfg):
        """Store the configuration (CFG-like object) and initialize the state."""
        self.cfg = cfg
        self.reset()
    
    def reset(self):
        """Forget everything about the previous problem: counters, cache, mode and token budget."""
        self.prompt_tokens = 0
        self.generated_tokens = 0
        self.past_key_values = None
        self.set_generation_mode('text')
        self.max_new_tokens = self.cfg.max_new_tokens
        
    def set_generation_mode(self, mode):
        """Select the sampling parameters for 'text' or 'code'; any other mode raises KeyError."""
        if mode == 'text':
            self.set_sampling_parameters(self.cfg.temperature_text, self.cfg.top_p_text)
        elif mode == 'code':
            self.set_sampling_parameters(self.cfg.temperature_code, self.cfg.top_p_code)
        else:
            raise KeyError(mode)
            
    def set_sampling_parameters(self, temperature, top_p):
        """Create the sampling kwargs for model.generate; a temperature of 0 means greedy decoding."""
        if temperature == 0:
            self.sampling_parameters = dict(do_sample=False)
        else:
            self.sampling_parameters = dict(do_sample=True, temperature=temperature, top_p=top_p)
            
    def are_generation_tokens_available(self):
        """Return True while fewer than max_new_tokens tokens have been added after the prompt."""
        return self.generated_tokens < self.max_new_tokens
    
    def verify_max_new_tokens(self):
        """Reduce max_new_tokens if needed so prompt + new tokens fit in the context window."""
        # NOTE(review): if the prompt alone fills the context window the budget becomes <= 0
        if self.max_new_tokens > self.cfg.context_window_size - self.prompt_tokens:
            self.max_new_tokens = self.cfg.context_window_size - self.prompt_tokens
            logging.warning(f'Reducing max_new_tokens to {self.max_new_tokens} to avoid exceeding the context window of {self.cfg.context_window_size}')
        
    def generate(self, prompt, mode='text'):
        """
        Continue `prompt` until a stop criterion is met or the token budget is exhausted.

        `prompt` is the whole text so far (initial prompt plus anything previously
        generated or appended). Returns the decoded full sequence, or `prompt`
        unchanged when no generation tokens are left.
        """
        self.set_generation_mode(mode)
        model_inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        # The first call after reset defines which part of the text is the prompt
        if self.prompt_tokens == 0:
            self.prompt_tokens = len(model_inputs['input_ids'][0])
            logging.info(f'Prompt has {self.prompt_tokens} tokens.')
            self.verify_max_new_tokens()
        # Recount from the input because text (e.g. code output) may have been appended between calls
        self.generated_tokens = len(model_inputs['input_ids'][0]) - self.prompt_tokens
        if not self.are_generation_tokens_available():
            logging.warning(f'Input text exceeded the available generation tokens. This is likely happening because a big code output.')
            return prompt
        
        t0 = time.time()
        clear_memory()
        generation_output = model.generate(
            **model_inputs, 
            max_new_tokens=self.max_new_tokens - self.generated_tokens, # remaining budget
            past_key_values=self.past_key_values, # cache returned by the previous call, None after reset
            return_dict_in_generate=True,
            num_return_sequences=1,
            stopping_criteria=stopping_criteria,
            pad_token_id=tokenizer.eos_token_id,
            **self.sampling_parameters
            )
        output_ids = generation_output.sequences[0]
        newly_generated_tokens = len(output_ids) - len(model_inputs['input_ids'][0])
        self.generated_tokens = len(output_ids) - self.prompt_tokens
        logging.info(f'Generating {mode} speed: {newly_generated_tokens/(time.time() - t0):.1f} tokens/s ({newly_generated_tokens}) ({self.generated_tokens}/{self.max_new_tokens})')
        # Saved so the next call can pass it back to model.generate
        self.past_key_values = generation_output.past_key_values
        decoded_output = tokenizer.decode(output_ids, skip_special_tokens=True)
        return decoded_output
    
    def __call__(self, prompt, mode):
        """Shortcut for `generate`."""
        return self.generate(prompt, mode)

In [None]:
def log_gpu_memory():
    """Log, for every visible GPU, the currently allocated memory and its maximum so far."""
    bytes_per_unit = 1024**3
    n_devices = torch.cuda.device_count()
    for device in range(n_devices):
        allocated = torch.cuda.memory_allocated(device)/bytes_per_unit
        max_allocated = torch.cuda.max_memory_allocated(device)/bytes_per_unit
        logging.info(f'GPU {device} memory allocated: {allocated:.1f} GB, max memory allocated: {max_allocated:.1f} GB')
        
def empty_gpu_vram():
    """Delete the global model and tokenizer, collect garbage, empty the CUDA cache and log the GPU memory."""
    global model, tokenizer
    logging.info('Emptying GPU VRAM...')
    del model, tokenizer
    for _ in range(2):
        gc.collect()
    torch.cuda.empty_cache()
    log_gpu_memory()

log_gpu_memory()

In [None]:
def create_model_and_inference_artifacts():
    """
    Create the globals needed for inference (model, stop_words, stopping_criteria and text_generator).

    It does nothing if a global named `model` already exists, so it is safe to
    call it more than once.
    """
    global model, text_generator, stop_words, stopping_criteria
    if 'model' not in globals():
        model = load_model(CFG.model_path, use_4bit_quantization=CFG.use_4bit_quantization)
        stop_words = ["```output", "```python", "```\nOutput" , ")\n```" , "``````output", 'Problem:']
        stopping_criteria = get_stopping_criteria(tokenizer, stop_words)
        text_generator = TextGenerator(cfg=CFG)
        log_gpu_memory()

### Utils

In [None]:
def clear_memory():
    """Run two passes of: empty the CUDA cache, collect garbage and sleep for 10 ms."""
    remaining_passes = 2
    while remaining_passes > 0:
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(0.01)
        remaining_passes -= 1

In [None]:
def is_ending_time(max_time=CFG.time_limit):
    """Return whether the notebook has been running for more than `max_time` seconds, logging a warning if so."""
    time_is_over = get_time_spent() > max_time
    if not time_is_over:
        return time_is_over
    logging.warning('Reached limit time, inference will be skipped.')
    return time_is_over

def get_time_spent():
    """Return the seconds elapsed since NOTEBOOK_START_TIME."""
    now = time.time()
    return now - NOTEBOOK_START_TIME

assert not is_ending_time(100)
assert is_ending_time(0)

In [None]:
def is_quick_save_condition(idx, test):
    """
    Return True when inference can be skipped to save the notebook quickly.

    That only happens with CFG.quick_save and CFG.submission_mode enabled, on
    the first problem (idx == 0), and when that problem is the one with id
    '000aaa' and text 'What is $1-1$?'.
    """
    if not (CFG.quick_save and idx == 0 and CFG.submission_mode):
        return False
    if test['id'].values[0] != '000aaa':
        return False
    if test['problem'].values[0] != 'What is $1-1$?':
        return False
    logging.info('Quick save condition reached. Skipping inference')
    return True

In [None]:
def get_timestamp():
    """Return the current local time formatted as YYYY-mm-dd_HH:MM:SS."""
    now = datetime.datetime.now()
    return f"{now:%Y-%m-%d_%H:%M:%S}"

print(get_timestamp())

In [None]:
N_REPETITIONS = CFG.n_repetitions

PROBLEM_REPETITIONS = []

def adjust_repetitions_to_meet_ending_time(answered_problems,
                                           max_time=CFG.time_limit,
                                           min_problem_threshold=5,
                                           hysteresis=0.975):
    """
    Adapt the global N_REPETITIONS so the remaining problems fit in `max_time` seconds.

    The current N_REPETITIONS is always appended to PROBLEM_REPETITIONS. Once
    `min_problem_threshold` problems have been answered, the elapsed time
    divided by the sum of PROBLEM_REPETITIONS is used to estimate the ending
    time with the current number of repetitions:

    - if the estimation exceeds `max_time` the repetitions are decreased by one (never below 1)
    - if it is below `max_time*hysteresis` they are increased by one (never above CFG.n_repetitions)
    """
    global N_REPETITIONS, PROBLEM_REPETITIONS
    PROBLEM_REPETITIONS.append(N_REPETITIONS)
    if answered_problems >= min_problem_threshold:
        elapsed = get_time_spent()
        mean_problem_time = elapsed/sum(PROBLEM_REPETITIONS)
        pending_problems = N_PROBLEMS - answered_problems
        estimated_ending_time = pending_problems*mean_problem_time*N_REPETITIONS + elapsed
        logging.info(f'Mean problem time: {mean_problem_time:.1f} seconds, estimated ending time {estimated_ending_time/3600:.1f} hours')
        running_late = estimated_ending_time > max_time
        if running_late and N_REPETITIONS > 1:
            N_REPETITIONS -= 1
            logging.warning(f'Decreasing the number of repetitions to {N_REPETITIONS} to try to meet ending time')
        elif estimated_ending_time < max_time*hysteresis and N_REPETITIONS < CFG.n_repetitions:
            N_REPETITIONS += 1
            logging.warning(f'Increasing the number of repetitions to {N_REPETITIONS} because it seems to be enough time to meet the ending time')

### Response parsing

In [None]:
def text_to_int_answer(text):
    """
    Convert `text` to a non negative integer answer.

    Returns the integer when `text` represents an integer value greater or
    equal to zero (e.g. '5', '5.0', '1e3'). Returns None for negative values,
    non integer values, text that is not a number and None.
    """
    try:
        answer = float(text)
    except (TypeError, ValueError, OverflowError):
        # TypeError covers inputs such as None
        return None
    if answer < 0 or not answer.is_integer():
        return None
    try:
        # Exact conversion for plain integer literals, float loses precision above 2**53
        return int(text)
    except (TypeError, ValueError):
        # Representations such as '5.0' or '1e3' are only accepted by float
        return int(answer)

assert 5 == text_to_int_answer('5')
assert 5 == text_to_int_answer('5.0')
assert text_to_int_answer('-1') is None
assert text_to_int_answer('0.5') is None
assert text_to_int_answer('pi') is None

In [None]:
def parse_boxed_answer(text):
    """Return the integer inside the last \\boxed{<digits>} of `text`, or None if there is none."""
    boxed_numbers = re.findall(r'\\boxed\{(\d+)\}', text)
    if not boxed_numbers:
        return None
    return text_to_int_answer(boxed_numbers[-1])

test_text = """

blah blah \\boxed{5} 7
"""
assert parse_boxed_answer(test_text) == 5

test_text = """

blah blah {5} 7
"""
assert parse_boxed_answer(test_text) == None

In [None]:
def parse_response_in_text(text):
    """Return the boxed answer of `text` if there is one, otherwise the answer stated with 'the (final) answer is'."""
    boxed = parse_boxed_answer(text)
    return parse_last_answer(text) if boxed is None else boxed

def parse_last_answer(text):
    """
    Return the number that follows the last 'the answer is' / 'the final answer is' in `text`.

    The search is case insensitive. None is returned when the sentence is not
    found or when the number is not a non negative integer.
    """
    pattern = r'(?:the answer is|the final answer is)\s*:?\s*\$?(\d+(\.\d+)?)\$?'
    last_match = None
    for last_match in re.finditer(pattern, text, re.IGNORECASE):
        pass
    if last_match is None:
        return None
    return text_to_int_answer(last_match.group(1))

test_cases = [
    ('The answer is: $651$', 651),
    ('The answer is: $5$.', 5),
    ('The answer is: 6.', 6),
    ('The final answer is 0.', 0),
    ('The final answer is 126.', 126),
    ('The final answer is: $2$.', 2),
    ('The answer is $\\boxed{3}$', 3),
    ('The answer is $\\boxed{-1}$', None),
    ('The answer is $\\boxed{1.5}$', None),
    ('The answer is: $-1$.', None),
    ('The answer is: $4.5$.', None),
    ('The final answer is 0.6', None),
]
for text, answer in test_cases:
    assert parse_response_in_text(text) == answer
    assert parse_response_in_text(text.lower()) == answer

In [None]:
def parse_response_in_code(code_output):
    """
    Convert the output of the code to a non negative integer answer, or None if that is not possible.

    An output wrapped in square brackets, such as '[0]', is parsed without the brackets.
    """
    if code_output is None:
        return None
    try:
        cleaned = code_output.strip()
        is_bracketed = cleaned.startswith('[') and cleaned.endswith(']')
        candidate = cleaned[1:-1] if is_bracketed else cleaned
        return text_to_int_answer(candidate)
    except Exception as e:
        print(f'Exception when trying to get a response from code: {e}')
        return None
    
assert parse_response_in_code('0') == 0
assert parse_response_in_code('[0]') == 0

### Code interpreter

In [None]:
def code_interpreter(code):
    """Preprocess and run `code`, returns the pair (output, run_success) given by execute_code."""
    return execute_code(preprocess_code(code))

def preprocess_code(code):
    """
    Prepare the generated code for execution.

    Symbols are forced to be real, prints are redirected to simplify_print and
    a star import of sympy is placed at the top.
    """
    prepared = add_simplify_to_print(ensure_symbols_are_real(code))
    return 'from sympy import *\n' + prepared

def add_simplify_to_print(code):
    """
    Make every call to print in `code` show the simplified version of its arguments.

    Calls to `print(` are renamed to `simplify_print(` and the definition of
    that helper is prepended to the code. The helper relies on `simplify`
    being available when the code is run (preprocess_code adds a star import
    of sympy).

    - Only calls to the builtin are renamed, names that just end with print
      (`pprint(`) or methods (`obj.print(`) are left untouched.
    - The helper accepts the same arguments as print: several values and
      keyword arguments such as `end` or `sep`.
    - Values that cannot be simplified (e.g. free text) are printed unchanged.
    """
    # The lookbehind rejects matches preceded by an identifier character or a dot
    code = re.sub(r'(?<![\w.])print\(', 'simplify_print(', code)
    new_code = """
def simplify_print(*args, **kwargs):
    print(*[recursive_simplify(x) for x in args], **kwargs)

def recursive_simplify(x):
    if isinstance(x, list):
        return [recursive_simplify(y) for y in x]
    try:
        return simplify(x)
    except Exception:
        # best effort: what sympy cannot handle is printed as it is
        return x
"""
    code = new_code + '\n' + code
    return code

def ensure_symbols_are_real(code):
    """
    Add `real=True` to every `symbols(...)` call of `code`.

    Calls whose text already contains "real" are left as they are.
    """
    def _with_real_flag(match):
        call_text = match.group()
        if "real" in call_text:
            return call_text
        # drop the closing parenthesis, append the flag and close again
        return f"{call_text[:-1]}, real=True)"

    return re.sub(r"symbols\([^)]+\)", _with_real_flag, code)

assert ensure_symbols_are_real("x, y, z = symbols('x y z')") == "x, y, z = symbols('x y z', real=True)"
assert ensure_symbols_are_real("x, y, z = symbols('x y z', real=True)") == "x, y, z = symbols('x y z', real=True)"

def execute_code(code, timeout_limit=7):
    """
    Run `code` with the current python interpreter in a separate process.

    Parameters
    ----------
    code : str with the python code to run
    timeout_limit : maximum number of seconds that the code is allowed to run

    Returns
    -------
    (output, run_success): if the code ends successfully the output is the
    last line of stdout (truncated), otherwise it is a timeout message or the
    last line of stderr (truncated).
    """
    with tempfile.NamedTemporaryFile(mode='w+', delete=False) as temp_file:
        temp_file.write(code)
        temp_filepath = temp_file.name
    try:
        # A list of arguments without shell is safe for any temporary path and
        # the timeout of subprocess removes the dependency on the `timeout` command
        ret = subprocess.run([sys.executable, temp_filepath],
                             capture_output=True, text=True, timeout=timeout_limit)
    except subprocess.TimeoutExpired:
        return f'The execution of the code timeout. The code needs to run in less than {timeout_limit} seconds.', False
    finally:
        # The file is removed even if the execution could not be launched
        os.remove(temp_filepath)
    if ret.returncode == 0:
        return truncate_text(get_last_line(ret.stdout)), True
    output = truncate_text(get_last_line(ret.stderr))
    return output, False
    
def remove_references_to_temp_code_file(output, filepath):
    """Remove from `output` every 'File "<filepath>", ' reference, as found in python tracebacks."""
    file_reference = f'File "{filepath}", '
    return ''.join(output.split(file_reference))

def get_last_line(text):
    """Return the last line of `text` once the surrounding whitespace is removed ('' if nothing is left)."""
    stripped = text.strip()
    lines = stripped.splitlines()
    return lines[-1] if lines else stripped

def truncate_text(text, max_length=CFG.code_output_truncate_length):
    """Keep the first `max_length` characters of `text` followed by '...' when it is longer than that."""
    return text if len(text) <= max_length else text[:max_length] + '...'

test_code = """
print('Hello')
"""
print(code_interpreter('print(0)'))
print(code_interpreter('foo'))

In [None]:
test_code = """
from sympy import symbols, Eq, solve

def solve_equation():
    x = symbols('x')
    equation = Eq(4 + x, 4)
    solution = solve(equation, x)

    return solution

result = solve_equation()
print(result)
"""
print(code_interpreter(test_code))

test_code = """
from sympy import symbols, Eq, solve

def solve_equation():
    x = symbols('x', real=True)
    equation = Eq(4 + x, 4)
    solution = solve(equation, x)

    return solution

result = solve_equation()
print(result)
"""
print(code_interpreter(test_code))

In [None]:
def parse_last_python_code_block(text):
    """Return what is between the last ```python of `text` and the next ``` (or the end of the text)."""
    after_last_opening = text.rpartition('```python')[2]
    code, _, _ = after_last_opening.partition("```")
    return code

test_text = """
```python
hello
``````output
"""
assert parse_last_python_code_block(test_text) == '\nhello\n'

test_text = """
```python
hello
```
"""
assert parse_last_python_code_block(test_text) == '\nhello\n'

In [None]:
def add_code_output_to_prompt(decoded_output, code_output):
    """
    Append the output of the code to the text generated so far.

    If the text ends just after closing a code block (")\\n```") a new
    ```output block is opened, otherwise the output is simply added in a new
    line. In both cases the block is closed with ```.
    """
    separator = '```output\n' if decoded_output.endswith(")\n```") else '\n'
    return ''.join([decoded_output, separator, str(code_output), '\n```\n'])

In [None]:
class CodeRunner():
    """
    Abstraction to run code that:
    
    - Accumulates the code if the runs are successful
    - Measures number of coding errors
    """
    def __init__(self, max_coding_errors=2):
        self.max_coding_errors = max_coding_errors
        self.n_coding_errors = 0
        self.code_interpreter_calls = 0
        self.accumulated_code = ''
        self.successful_code_output = None

    def run_code(self, code):
        """
        Run `code` preceded by all the code that was run successfully before and return its output.

        On success the code is accumulated and the output is saved in
        successful_code_output, on failure the error counter is increased and
        successful_code_output is set to None.
        """
        self.code_interpreter_calls += 1
        candidate_code = "\n".join([self.accumulated_code, code])
        code_output, run_success = code_interpreter(candidate_code)
        if not run_success:
            self.n_coding_errors += 1
            self.successful_code_output = None
            return code_output
        self.accumulated_code = candidate_code
        self.successful_code_output = code_output
        return code_output

    def max_coding_errors_reached(self):
        """Return whether the number of coding errors reached the maximum allowed, logging a warning if so."""
        limit_reached = self.n_coding_errors >= self.max_coding_errors
        if limit_reached:
            logging.warning(f'Stopping solution generation because {self.n_coding_errors} coding errors were done.')
        return limit_reached

### Prompts

#### Define problems

In [None]:
prompts_df = pd.read_csv(CFG.few_shot_dataset)
prompts_df.head()

In [None]:
# Filter the candidate problems for few-shot prompting, the number of problems
# is logged before and after the filtering.
logging.info(f'The number of problems for few-shot prompting is {len(prompts_df)} previous to filtering')
logging.info(f'Filtering problems longer than {CFG.max_sample_tokens} tokens and outside levels {CFG.difficulty_levels}')
# keep only the problems with fewer tokens than the limit
prompts_df = prompts_df[prompts_df.total_tokens < CFG.max_sample_tokens]
if CFG.difficulty_levels is not None:
    # the level column holds strings such as 'Level 3'
    prompts_df = prompts_df[prompts_df.level.isin([f'Level {i}' for i in CFG.difficulty_levels])]
# renumber the index from 0 so the problems can be selected by position with .loc
prompts_df.reset_index(drop=True, inplace=True)
logging.info(f'The number of problems for few-shot prompting is {len(prompts_df)} after filtering')

#### Create prompts

In [None]:
# https://github.com/deepseek-ai/DeepSeek-Math/tree/main
# this first template was used with the original MATH dataset
problem_prompt = """
Problem:

QUESTION_PLACEHOLDER

Please reason step by step, and always end with "The final answer is $\\boxed{}$".
The answer must be an integer greater or equal to zero.

ANSWER_PLACEHOLDER
"""

# this other template is designed to make the model solve the problem by writing a Python program; it replaces the template above
problem_prompt = """
Problem:

QUESTION_PLACEHOLDER

You are an expert mathematical programmer. Solve the above mathematical problem by writing a Python program.
Express your answer as a numeric type or a SymPy object. The answer must be an integer greater or equal to zero.
Please reason step by step, and always end with "The final answer is $\\boxed{}$".

ANSWER_PLACEHOLDER
"""


def create_random_few_shot_prompt(n=CFG.few_shot_samples):
    """
    Create a prompt with `n` solved problems chosen at random (without repetition) from prompts_df.

    Each solved problem is followed by its final answer in \\boxed format. The
    prompt ends with the template of the problem to solve, where
    PROBLEM_PLACEHOLDER has to be replaced by the problem.
    """
    sampled_indices = np.random.choice(np.arange(len(prompts_df)), n, replace=False)
    pieces = []
    for sample_idx in sampled_indices:
        sample = prompts_df.loc[sample_idx]
        solved_problem = problem_prompt.replace('QUESTION_PLACEHOLDER', sample['problem'])
        solved_problem = solved_problem.replace('ANSWER_PLACEHOLDER', sample['solution'])
        pieces.append(solved_problem)
        pieces.append(f'\nThe final answer is $\\boxed{{{sample["answer"]}}}$\n')
    pieces.append(problem_prompt.replace('QUESTION_PLACEHOLDER', 'PROBLEM_PLACEHOLDER').replace('ANSWER_PLACEHOLDER', ''))
    return ''.join(pieces).strip()

def create_random_few_shot_prompt_with_token_limit(token_limit=CFG.max_prompt_tokens):
    """Create random few-shot prompts until one has less than `token_limit` tokens, and return it."""
    while True:
        candidate = create_random_few_shot_prompt()
        n_tokens = len(tokenizer.tokenize(candidate))
        if n_tokens < token_limit:
            return candidate


prompt = create_random_few_shot_prompt_with_token_limit()
print(f'Number of tokens in prompt: {len(tokenizer.tokenize(prompt))}')
display(Markdown(prompt))

In [None]:
%%time
print('Create some random prompts to see token length distribution')
[len(tokenizer.tokenize(create_random_few_shot_prompt_with_token_limit())) for _ in range(10)]

In [None]:
def get_formatted_prompt(problem, repetition_idx):
    """
    Return a random few-shot prompt with `problem` as the problem to solve.

    `repetition_idx` is not used by this version of the function.
    """
    few_shot_template = create_random_few_shot_prompt()
    return few_shot_template.replace('PROBLEM_PLACEHOLDER', problem)

#### Simpler prompts

In [None]:
code_prompt = """Below is a math problem you are to solve (non negative integer answer):

\"PROBLEM_PLACEHOLDER\"

To accomplish this, first determine a sympy-based approach for solving the problem by listing each step to take and what functions need to be called in each step. Be clear so even an idiot can follow your instructions, and remember, your final answer should be a non negative integer, not an algebraic expression!
Write the entire script covering all the steps (use comments and document it well) and print the result. After solving the problem, output the final numerical answer within \\boxed{}.

Approach:

```python"""


cot_prompt = """Below is a math problem you are to solve (non negative integer answer):

\"PROBLEM_PLACEHOLDER\"

Analyze this problem and think step by step to come to a solution with programs. After solving the problem, output the final numerical answer within \\boxed{}.

```python"""

custom_prompt_1 = """
You are an expert mathematical programmer. Solve the mathematical problem below by writing a Python program.
Express your answer as a numeric type or a sympy object. The answer must be an integer greater or equal to zero.
Please reason step by step, and write clean and readable code.
You can use python libraries such as sympy, math or numpy to solve the problem.

PROBLEM_PLACEHOLDER

Sure, let's write a python script that solves the problem step by step.

```python"""

custom_prompt_2 = """PROBLEM_PLACEHOLDER

You are an expert mathematical programmer. Solve the above mathematical problem by writing a Python program.
Express your answer as a numeric type or a sympy object. The answer must be an integer greater or equal to zero.
Please reason step by step, and write clean and readable code.
You can use python libraries such as sympy, math or numpy to solve the problem.

```python"""

prompt_options = [custom_prompt_1, custom_prompt_2]

def get_formatted_prompt(problem, repetition_idx):
    """
    Return the prompt for `problem`, cycling over prompt_options with `repetition_idx`.
    """
    template = prompt_options[repetition_idx % len(prompt_options)]
    return template.replace('PROBLEM_PLACEHOLDER', problem)

### Results

In [None]:
# Outcome of one attempt to solve a problem. The three *_answer fields are
# the keys that Results reads when choosing the answer of each attempt.
class InferenceResult(BaseModel):
    # text
    prompt: str  # prompt given to the model
    response: Optional[str] = None  # text generated by the model
    # answers (None when that kind of answer was not found)
    boxed_answer: Optional[int] = None
    text_answer: Optional[int] = None
    code_answer: Optional[int] = None
    # output
    output_tokens: int = 0
    reached_max_tokens: bool = False
    # code
    coding_errors: int = 0
    code_interpreter_calls: int = 0

In [None]:
def is_difference_significative(n_first, n_second, n_tries, confidence_level=CFG.confidence_level):
    """
    Return whether the most voted option is significantly ahead of the second one.

    Parameters
    ----------
    n_first : votes of the most voted option
    n_second : votes of the second most voted option
    n_tries : total number of votes
    confidence_level : the z score of the difference between both proportions
        has to exceed the upper bound of norm.interval(confidence_level)

    When the second option has no votes it is given one: an additional
    imaginary try if the first option got all the votes, or one of the votes
    that went to other options otherwise. ValueError is raised if n_first is
    greater than n_tries.
    """
    if n_second == 0:
        if n_first > n_tries:
            raise ValueError()
        if n_first == n_tries:
            n_tries = n_tries + 1
        n_second = 1
    p_first = n_first/n_tries
    p_second = n_second/n_tries
    uncertainty = (p_first*(1-p_first)/n_tries + p_second*(1-p_second)/n_tries)**0.5
    z = (p_first - p_second)/uncertainty
    logging.info(f'p_first: {p_first*100:.1f}% p_second: {p_second*100:.1f}% Confidence level for the difference: {2*(norm.cdf(z) - 0.5)*100:.1f}%')
    z_threshold = norm.interval(confidence_level)[1]
    return z > z_threshold

is_difference_significative(3, 0, 3)

In [None]:
def log_ground_truth(idx):
    """Log the ground truth of problem `idx`, only when using the mock environment with a labeled dataframe."""
    if not isinstance(env, MockEnvWithDataframe):
        return
    if 'ground_truth' not in df.columns:
        return
    logging.info(f'Ground truth: {df["ground_truth"].loc[idx]}')

class Results():
    """
    Stores the InferenceResult of every repetition of every problem and
    aggregates them to choose the answer of each problem.
    """
    def __init__(self):
        self.results = {}

    def initialize(self, idx):
        """Start (or restart) the list of results of problem `idx`."""
        self.results[idx] = []

    def add_result(self, idx, result: InferenceResult):
        """Add a new result to problem `idx`, that has to be initialized."""
        self.results[idx].append(result)

    def log_results_distribution(self, idx):
        """Log the ground truth (if available) and all the answers of each kind for problem `idx`."""
        log_ground_truth(idx)
        for key in ('boxed_answer', 'text_answer', 'code_answer'):
            values = self.get_result_distribution(idx, key)
            logging.info(f'{key} distribution: {values}')

    def get_valid_results(self, idx, result_priority):
        """
        Return one answer for each result of problem `idx` that has any.

        The answer of a result is the first one that is not None following the
        order given by `result_priority`. NoValidResults is raised if no
        result has an answer.
        """
        answers = []
        for inference in self.results[idx]:
            fields = inference.dict()
            chosen = next((fields[key] for key in result_priority if fields[key] is not None), None)
            if chosen is not None:
                answers.append(chosen)
        if not answers:
            raise NoValidResults(idx)
        return answers

    def get_most_frequent_result(self, idx, result_priority=CFG.result_priority):
        """Return (answer, votes) for the most voted answer of problem `idx`, the smallest answer wins ties."""
        counts = Counter(self.get_valid_results(idx, result_priority)).most_common()
        logging.info(f'Result counts for {idx}: {counts}')
        return get_minimum_most_frequent_value(counts)

    def is_best_solution_found(self, idx, result_priority=CFG.result_priority):
        """
        Return whether the most voted answer of problem `idx` is significantly
        ahead of the second one. It is False if there are no valid results.
        """
        try:
            valid_results = self.get_valid_results(idx, result_priority)
        except NoValidResults:
            return False
        counts = Counter(valid_results).most_common()
        logging.info(f'Result counts for {idx}: {counts}')
        second_votes = 0 if len(counts) == 1 else counts[1][1]
        return is_difference_significative(counts[0][1], second_votes, len(valid_results))

    def get_result_distribution(self, idx, key):
        """Return an array with the field `key` of all the results of problem `idx`."""
        return np.array([inference.dict()[key] for inference in self.results[idx]])

    def save(self, filepath='results.json'):
        """Save all the results in a json file."""
        logging.info(f'Saving results in {filepath}')
        serializable = {idx: [inference.dict() for inference in inferences]
                        for idx, inferences in self.results.items()}
        with open(filepath, 'w') as f:
            json.dump(serializable, f, indent=4)

    def load(self, filepath):
        """Replace the results with the ones from a json file, the problem ids are converted to int."""
        logging.info(f'Loading results from {filepath}')
        with open(filepath, 'r') as f:
            loaded = json.load(f)
        self.results = {int(idx): [InferenceResult(**fields) for fields in inferences]
                        for idx, inferences in loaded.items()}

    def __repr__(self):
        return str(self.results)
    
def get_minimum_most_frequent_value(counter_ret):
    """
    Pick the smallest value among the ones tied for the highest count.

    `counter_ret` is a non-empty list of `(value, count)` pairs sorted by decreasing
    count, as returned by `Counter.most_common()`. Only the leading run of pairs that
    share the count of the first pair is considered.

    Returns a `(value, count)` tuple.
    """
    top_count = counter_ret[0][1]
    n_tied = 0
    while n_tied < len(counter_ret) and counter_ret[n_tied][1] == top_count:
        n_tied += 1
    return min(value for value, _ in counter_ret[:n_tied]), top_count

class NoValidResults(Exception):
    """Signals that no valid result is available for a problem."""

# Sanity checks for get_minimum_most_frequent_value: ties are resolved towards the
# smallest value (regardless of the order), otherwise the most frequent value wins
assert get_minimum_most_frequent_value([(2, 1), (3, 1)]) == (2, 1)
assert get_minimum_most_frequent_value([(3, 1), (2, 1)]) == (2, 1)
assert get_minimum_most_frequent_value([(3, 2), (2, 1)]) == (3, 2)

### Inference

In [None]:
def solve_problem_with_code_interpreter(prompt):
    """
    Generate a response for `prompt`, alternating text generation and code execution.

    The text generator is called repeatedly. Whenever a generation made in 'code' mode
    finishes, the last python code block is executed with a `CodeRunner` and its output
    is appended to the text before continuing with the generation.

    Returns an `InferenceResult` with the prompt, the response (generated text without
    the prompt) and the generation statistics. The boxed, text and code answers are only
    parsed when the token budget was not exhausted and the maximum number of coding
    errors was not reached; `reached_max_tokens` is set when the budget was exhausted.
    """
    text_generator.reset()
    clear_memory()
    code_runner = CodeRunner(CFG.max_coding_errors)
    decoded_output = prompt
    stop_word_cond = True
    generation_mode = 'text'
    # Keep generating while the previous generation ended on a stop word and there are
    # tokens left; a generation that does not end on a stop word finishes the loop
    while stop_word_cond and text_generator.are_generation_tokens_available():
        # presumably the model started to write a new problem statement - TODO confirm
        if decoded_output.endswith("Problem:"):
            break
        # The previous generation was made in code mode and did not stop at the opening
        # of a new code block, so there is a code block to execute
        is_code_block_finished = not decoded_output.endswith("```python") and generation_mode == 'code'
        if is_code_block_finished:
            code_text = parse_last_python_code_block(decoded_output)
            code_output = code_runner.run_code(code_text)
            if code_runner.max_coding_errors_reached():
                break
            decoded_output = add_code_output_to_prompt(decoded_output, code_output)

        # An open code fence at the end of the text switches the generation to code mode
        if decoded_output.endswith("```python"):
            decoded_output += '\n'
            generation_mode = 'code'
        else:
            generation_mode = 'text'

        decoded_output = text_generator(decoded_output, mode=generation_mode)
        stop_word_cond = any(decoded_output.endswith(stop_word) for stop_word in stop_words)

    log_gpu_memory()
    # Remove the prompt from the response. When the prompt itself ends with an open code
    # fence the fence is kept in the response and removed from the stored prompt.
    # NOTE: str.replace removes every occurrence of the prompt, not only the leading one
    if prompt.endswith("```python"):
        decoded_output = decoded_output.replace(prompt, '```python')
        prompt = prompt[:-len("```python")]
    else:
        decoded_output = decoded_output.replace(prompt, '')
    result = InferenceResult(
        prompt=prompt,
        response=decoded_output,
        output_tokens=text_generator.generated_tokens,
        coding_errors=code_runner.n_coding_errors,
        code_interpreter_calls=code_runner.code_interpreter_calls
    )
    if not text_generator.are_generation_tokens_available():
        # Solution was not achieved, it makes no sense to parse the response
        logging.warning(f'Max number of new tokens {CFG.max_new_tokens} was reached. Solution not found.')
        result.reached_max_tokens = True
    else:
        logging.info(f'Total generated tokens: {text_generator.generated_tokens}')
        # Answers are left unparsed when the generation was aborted due to coding errors
        if not code_runner.max_coding_errors_reached():
            result.boxed_answer = parse_boxed_answer(decoded_output)
            result.text_answer = parse_response_in_text(decoded_output)
            result.code_answer = parse_response_in_code(code_runner.successful_code_output)
    return result

### Show

In [None]:
def display_decoded_output(idx, text):
    """Render `text` as markdown under a 'Problem idx' heading, enclosed by horizontal rules."""
    rule = '---'
    display(Markdown(rule))
    heading = f'### Problem {idx}'
    display(Markdown(heading))
    # start the assistant turn on a new line
    body = text.replace('Assistant: ', 'Assistant: \n')
    display(Markdown(body))
    display(Markdown(rule))

### Results analysis

In [None]:
def show_inference_insights(results):
    """
    Summarize the inference statistics of every problem in a dataframe.

    There is one row per problem plus a final 'all' row. The columns are:
    - n_runs: number of runs of the problem (sum of runs for the 'all' row)
    - mean_/median_ of coding_errors, output_tokens and code_interpreter_calls
      (the 'all' row holds the average of the per-problem values)
    - unfinished_responses, boxed_answers, text_answers and code_answers: percentage
      of the runs that reached the token limit or that produced each kind of answer

    `results` needs a `results` mapping (problem idx -> runs) and a
    `get_result_distribution(idx, key)` method.
    """
    keys = ['coding_errors', 'output_tokens', 'code_interpreter_calls']
    answers = ['boxed_answer', 'text_answer', 'code_answer']
    # Columns that hold counts of runs. They are selected by name because selecting
    # the last columns by position also caught median_code_interpreter_calls
    count_columns = ['unfinished_responses'] + [f'{answer}s' for answer in answers]
    rows = []
    for idx in results.results:
        logging.info(f'Logging inference insights for problem {idx}')
        row = dict(n_runs=len(results.get_result_distribution(idx, keys[0])))
        for key in keys:
            values = results.get_result_distribution(idx, key)
            logging.info(f'{key} distribution: {values}')
            row[f'mean_{key}'] = round(np.mean(values), 1)
            row[f'median_{key}'] = round(np.median(values), 1)
        values = results.get_result_distribution(idx, 'reached_max_tokens')
        logging.info(f'reached_max_tokens distribution: {values}')
        row['unfinished_responses'] = np.sum(values)
        for answer in answers:
            values = results.get_result_distribution(idx, answer)
            logging.info(f'{answer} distribution: {values}')
            # elementwise comparison against None on purpose: values is a numpy array
            row[f'{answer}s'] = np.sum(values != None)
        rows.append(row)
        logging.info('')
    insights = pd.DataFrame(rows)
    summary = insights.sum()
    for column in insights.columns:
        if 'mean' in column or 'median' in column:
            # the sum of per-problem means is not meaningful, use the average instead
            summary[column] = round(summary[column] / len(insights), 1)
    insights.loc['all'] = summary
    # Convert the counts to percentage of runs
    for column in count_columns:
        if column in insights.columns:
            insights[column] = (insights[column]/insights['n_runs']*100).round(1)
    return insights

In [None]:
def get_accuracy_report(results, result_priority):
    """
    Compare the majority vote answer of each problem against the ground truth.

    The report is built from the `answer` and `ground_truth` columns of the module
    level dataframe `df`, so it has one row per problem of `df` plus the 'summary'
    row appended by `add_summary_to_report`. The columns added are:
    - answer: smallest of the most frequent valid results, None if there were none
    - n_runs: number of runs stored for the problem
    - correct_counts, wrong_counts: valid results equal/different to the ground truth
    - highest_wrong_counts: votes of the most frequent wrong answer
    - highest_correct_tokens: highest number of output tokens of a correct run
    - is_correct: 1/0 for the majority vote, NaN when the problem has no answer
    - pass: True if at least one valid result is correct

    Problems of `df` that are not in `results.results` keep the initial values
    (answer 0 and all the counts at 0).
    """
    report = df[['answer', 'ground_truth']].copy()
    report['answer'] = 0
    report['n_runs'] = 0
    report['correct_counts'] = 0
    report['highest_wrong_counts'] = 0
    report['wrong_counts'] = 0
    report['highest_correct_tokens'] = None

    for idx in results.results:
        try:
            report.loc[idx, 'n_runs'] = get_n_runs(idx, results)
            values = np.array(results.get_valid_results(idx, result_priority))
            counter_ret = Counter(values).most_common()
            report.loc[idx, 'answer'] = get_minimum_most_frequent_value(counter_ret)[0]
            ground_truth = df.loc[idx, 'ground_truth']
            for pred, count in counter_ret:
                if pred == ground_truth:
                    report.loc[idx, 'correct_counts'] = count
                    break
            report.loc[idx, 'highest_wrong_counts'] = get_highest_wrong_count(counter_ret, ground_truth)
            report.loc[idx, 'wrong_counts'] = get_wrong_counts(counter_ret, ground_truth)
        except NoValidResults:
            report.loc[idx, 'answer'] = None
        # correct_counts is only greater than zero when the try block above assigned
        # ground_truth for this problem, so the variable is never stale here
        if report.loc[idx, 'correct_counts'] > 0:
            report.loc[idx, 'highest_correct_tokens'] = get_highest_correct_tokens(idx, ground_truth, results)
    report['is_correct'] = (report['answer'] == report['ground_truth']).astype(int)
    report['pass'] = report['correct_counts'] > 0
    # Unanswered problems are neither correct nor wrong
    report.loc[report['answer'].isna(), 'is_correct'] = np.nan
    return add_summary_to_report(report)

def add_summary_to_report(report):
    """
    Append a 'summary' row to `report` (modified in place) and return it.

    The row holds the sum of every column, except for the first two columns that are
    filled with '-' and `highest_correct_tokens` that holds the maximum of the column.
    """
    totals = report.sum()
    placeholder_columns = report.columns[:2]
    for column in placeholder_columns:
        totals[column] = '-'
    totals['highest_correct_tokens'] = report['highest_correct_tokens'].max()
    report.loc['summary'] = totals
    return report

def get_highest_wrong_count(counter_ret, ground_truth):
    """
    Return the count of the first prediction of `counter_ret` that differs from `ground_truth`.

    `counter_ret` is a list of `(prediction, count)` pairs; when it is sorted by
    decreasing count this is the count of the most voted wrong answer. Returns 0 if
    there are no wrong predictions.
    """
    wrong_counts = (count for pred, count in counter_ret if pred != ground_truth)
    return next(wrong_counts, 0)

def get_wrong_counts(counter_ret, ground_truth):
    """Add up the counts of all the `(prediction, count)` pairs whose prediction differs from `ground_truth`."""
    return sum(count for pred, count in counter_ret if pred != ground_truth)


def get_n_runs(idx, results):
    """Number of runs stored in `results` for problem `idx`."""
    runs = results.results[idx]
    return len(runs)

def get_highest_correct_tokens(idx, ground_truth, results, result_priority=None):
    """
    Return the highest number of output tokens among the runs that answered correctly.

    A run is considered correct for an answer field when the value of that field is
    equal to `ground_truth`. All the fields of `result_priority` are checked.

    Parameters:
    - idx: problem index
    - ground_truth: correct answer of the problem
    - results: object with a `get_result_distribution(idx, key)` method
    - result_priority: answer fields to inspect, `CFG.result_priority` when None.
      Allows to use the same fields as the analysis that calls this function.

    Returns 0 if none of the runs is correct.
    """
    if result_priority is None:
        result_priority = CFG.result_priority
    highest_correct_tokens = 0
    tokens = results.get_result_distribution(idx, 'output_tokens')
    for answer in result_priority:
        values = results.get_result_distribution(idx, answer)
        # boolean mask that selects the runs with a correct answer in this field
        correct_answer_tokens = tokens[values == ground_truth]
        if len(correct_answer_tokens) > 0:
            highest_correct_tokens = max(highest_correct_tokens, max(correct_answer_tokens))
    return highest_correct_tokens

In [None]:
def analyze_MATH_results(result_priority):
    """
    Print the metrics and plot the results of the module level `results`.

    The metrics per run and per problem are printed, and the results are plotted
    grouped by the `level` and `type` columns of the module level dataframe `df`.
    """
    logging.info(f'Analyzing MATH results for {result_priority} priorities')
    report_with_summary = get_accuracy_report(results, result_priority)
    print_disaggregated_metrics(report_with_summary)
    # The last row of the report is the summary, the rest of the analysis is per problem
    per_problem_labels = report_with_summary.index[:-1]
    per_problem_report = report_with_summary.loc[per_problem_labels]
    print_relevant_metrics(per_problem_report)
    for key in ('level', 'type'):
        per_problem_report[key] = df[key]
        plot_grouped_results(per_problem_report, key)


def print_relevant_metrics(accuracy_report):
    """
    Print the metrics at problem level.

    First the proportion of correct, unanswered and wrong problems according to the
    `is_correct` column (majority vote), then the proportion of problems with the `pass`
    column set. All the proportions are shown with their uncertainty.
    """
    is_correct = accuracy_report['is_correct']
    outcome_counts = is_correct.value_counts()
    correct = outcome_counts.get(1, 0)
    unanswered = is_correct.isna().sum()
    wrong = outcome_counts.get(0, 0)
    total = correct + unanswered + wrong
    accuracy = correct/total
    print('\tAggregated metrics majority vote')
    print(f'Correct: {correct}/{total} ({accuracy:.2f} ± {estimate_uncertainty(accuracy, total):.2f})')
    for label, count in (('Unanswered', unanswered), ('Wrong', wrong)):
        fraction = count/total
        print(f'{label}: {count}/{total} ({fraction:.2f} ± {estimate_uncertainty(fraction, total):.2f})')
    print('\tAggregated metrics pass')
    passed = accuracy_report['pass'].sum()
    pass_rate = passed/total
    print(f'Correct: {passed}/{total} ({pass_rate:.2f} ± {estimate_uncertainty(pass_rate, total):.2f})')


def estimate_uncertainty(proportion, n):
    """
    Half-width of the interval `1.96 * sqrt(p * (1 - p) / n)` for a proportion.

    `proportion` is the observed proportion and `n` the number of samples.
    """
    variance_of_proportion = proportion * (1 - proportion) / n
    return 1.96 * np.sqrt(variance_of_proportion)


def print_disaggregated_metrics(accuracy_report):
    """
    Print the metrics at run level, using the 'summary' row of the report.

    Shows the proportion of runs with correct, missing and wrong answer along with
    their uncertainty. The runs that are neither correct nor wrong are unanswered.
    """
    summary = accuracy_report.loc['summary']
    correct = summary['correct_counts']
    wrong = summary['wrong_counts']
    total = summary['n_runs']
    unanswered = total - correct - wrong
    print('\tDisaggregated metrics')
    for label, count in (('Correct', correct), ('Unanswered', unanswered), ('Wrong', wrong)):
        fraction = count/total
        print(f'{label}: {count}/{total} ({fraction:.2f} ± {estimate_uncertainty(fraction, total):.2f})')


def plot_grouped_results(df, group):
    """
    Stacked bar plot with the proportion of correct, unanswered and wrong problems.

    There is one bar for each value of the column `group` of `df` plus an 'overall'
    bar that aggregates all of them. The outcome of each problem is taken from the
    `is_correct` column (1 correct, 0 wrong, NaN unanswered).
    """
    categories = sorted(df[group].unique().tolist())
    n_correct, n_unanswered, n_wrong = [], [], []
    for category in categories:
        outcomes = df[df[group] == category].is_correct
        tally = outcomes.value_counts()
        n_correct.append(tally.get(1, 0))
        n_unanswered.append(outcomes.isna().sum())
        n_wrong.append(tally.get(0, 0))

    # Extra bar with the aggregation of all the categories
    for counts in (n_correct, n_unanswered, n_wrong):
        counts.append(np.sum(counts))
    categories.append('overall')

    correct, unanswered, wrong = np.array(n_correct), np.array(n_unanswered), np.array(n_wrong)
    total = correct + unanswered + wrong
    correct = correct/total
    unanswered = unanswered/total
    wrong = wrong/total
    plt.bar(categories, correct, label='Correct', color='tab:green')
    plt.bar(categories, unanswered, bottom=correct, label='Unanswered', color='tab:orange')
    plt.bar(categories, wrong, bottom=correct + unanswered, label='Wrong', color='tab:red')
    # Write the percentage at the middle of each segment of the bars
    for position, category in enumerate(categories):
        plt.text(category, correct[position]/2, f'{correct[position]*100:.0f}%', ha='center', va='center')
        plt.text(category, correct[position] + unanswered[position]/2, f'{unanswered[position]*100:.0f}%', ha='center', va='center')
        plt.text(category, correct[position] + unanswered[position] + wrong[position]/2, f'{wrong[position]*100:.0f}%', ha='center', va='center')
    plt.ylim(0, 1)
    plt.grid(axis='y')
    plt.title(f'Results grouped by {group}')
    plt.show()

In [None]:
# A bare `raise` without an active exception fails with a RuntimeError. Presumably it is
# placed here on purpose to stop a "Run All" before the offline analysis cells - TODO confirm
raise

## Merge results

### Merge results

In [None]:
def merge_results(results_filepaths, new_filepath):
    """
    Merge result files that hold different problems into a single JSON file.

    The problem ids are interleaved to avoid collisions: problem `idx` of the file at
    position `i` is saved with the id `idx*n_files + i`.
    """
    n_files = len(results_filepaths)
    merged_results = dict()
    for file_position, filepath in enumerate(results_filepaths):
        with open(filepath, 'r') as f:
            file_results = json.load(f)
        for idx, results in file_results.items():
            merged_results[int(idx)*n_files + file_position] = results
    with open(new_filepath, 'w') as f:
        json.dump(merged_results, f, indent=4)


In [None]:
merge_results(['/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-05-23_08:19:21_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-05-23_08:48:16_results.json',],
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/02_public_notebook_prompts.json')

In [None]:
merge_results(['/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-05_03:09:43_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-05_02:08:38_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-05_09:47:32_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-05_10:02:20_results.json',
               ],
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/05_multi_prompt.json')

In [None]:
merge_results(['/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-05_18:44:19_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-06_01:24:28_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-06_03:37:41_results.json',
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/2024-06-06_08:37:22_results.json',
               ],
               '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/06_multi_prompt_conf09.json')

### Concat results

In [None]:
def concat_results(results_filepaths, new_filepath):
    """
    Concatenate the runs of several result files of the same problems in a JSON file.

    The runs of each problem are gathered from all the files and shuffled, so a prefix
    of the runs is a random sample of all the experiments. The files are not required
    to have the same problems: a problem that is missing in the first files is added
    when it is found.
    """
    results_concat = dict()
    for filepath in results_filepaths:
        with open(filepath, 'r') as f:
            new_results = json.load(f)
        for idx, results in new_results.items():
            # setdefault avoids a KeyError for problems not seen in the previous files
            results_concat.setdefault(idx, []).extend(results)
    # let's shuffle the results
    for idx in results_concat:
        np.random.shuffle(results_concat[idx])
    with open(new_filepath, 'w') as f:
        json.dump(results_concat, f, indent=4)

In [None]:
filepaths = [
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-04_08:19:02_results.json', # v10 2 prompts
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-04_08:11:25_results.json', # v10 2 prompts
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_05:50:06_results.json', # v10 2 prompts
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-02_19:53:43_results.json', # v8
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-27_16:39:12_results.json', # v5 assistant
]
concat_results(filepaths, '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/03_2_prompts.json')

In [None]:
filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/*.json'))
concat_results(filepaths, '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/04_all_evaluations.json')

In [None]:
concat_results([
    '/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/2024-06-12_08:46:54_results.json', # 200
    '/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/2024-06-09_11:46:47_results.json', # 25
    '/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/2024-06-10_04:47:21_results.json', # 100
    '/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/2024-06-10_23:50:22_results.json', # 200 but with incorrect time limit
    ],
    '/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/400_repetitions.json')

## Results reanalysis

### Define experiments

In [None]:
# Maps the name of each experiment to the path of the JSON file with its results.
# The names can be used instead of the paths in the analysis functions below.
# The inner comments are the experiment folders where the files are stored.
experiment_to_results = {
    # 09_evaluate_MATH5
    'MATHCodeInstruct_10_4rep_2shot': '/mnt/hdd0/Kaggle/aimo/experiments/09_evaluate_MATH5/2024-05-15_08:35:58_results.json',
    '08_original_prompts': '/mnt/hdd0/Kaggle/aimo/experiments/09_evaluate_MATH5/2024-05-14_21:11:18_results.json',
    '11_original_code_prompt': '/mnt/hdd0/Kaggle/aimo/experiments/09_evaluate_MATH5/2024-05-15_11:53:30_results.json',
    '12_original_cot_prompt': '/mnt/hdd0/Kaggle/aimo/experiments/09_evaluate_MATH5/2024-05-15_11:33:15_results.json',
    # 10_temperature
    'AIMO_train_02_temperature_025': '/mnt/hdd0/Kaggle/aimo/experiments/10_temperature/2024-05-17_05:07:49_results.json',
    # 12_prompt_variations
    '01_original_code_with_python': '/mnt/hdd0/Kaggle/aimo/experiments/12_prompt_variations/2024-05-18_10:06:31_results.json',
    '02_original_cot_with_python': '/mnt/hdd0/Kaggle/aimo/experiments/12_prompt_variations/2024-05-18_10:02:56_results.json',
    '05_custom_prompt': '/mnt/hdd0/Kaggle/aimo/experiments/12_prompt_variations/2024-05-18_16:31:45_results.json',
    # 13_full_evaluation
    '01_3_python_prompts': '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/01_3_python_prompts.json',
    '02_public_notebook_prompts': '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/02_public_notebook_prompts.json',
    '03_2_prompts': '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/03_2_prompts.json',
    '04_all_evaluations': '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/04_all_evaluations.json',
    '05_multi_prompt': '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/05_multi_prompt.json',
    '06_multi_prompt_conf09': '/mnt/hdd0/Kaggle/aimo/experiments/13_full_evaluation/06_multi_prompt_conf09.json',
    # 17_vllm
    'vll_400_repetitions': '/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/400_repetitions.json',
}

### Reanalysis

In [None]:
def reanalyze_results(experiment):
    """
    Load the results of `experiment` and print the run level metrics for different priorities.

    The text answers are parsed again from the saved responses, so changes in
    `parse_response_in_text` are taken into account. The loaded results are left in
    the module level variable `results`.
    """
    global results
    results = Results()
    results.load(experiment_to_results[experiment])
    # reparse the text results with the new parsing functions
    for problem_runs in results.results.values():
        for run in problem_runs:
            run.text_answer = parse_response_in_text(run.response)
    priorities_to_analyze = (
        ['code_answer'],
        ['text_answer'],
        ['code_answer', 'text_answer'],
        ['text_answer', 'code_answer'],
    )
    for result_priority in priorities_to_analyze:
        print(f'Analyzing results for {result_priority} priorities')
        print_disaggregated_metrics(get_accuracy_report(results, result_priority))

In [None]:
reanalyze_results('03_2_prompts')

In [None]:
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
accuracy_report_code = get_accuracy_report(results, ['code_answer'])
accuracy_report_text = get_accuracy_report(results, ['text_answer'])
(accuracy_report_code.is_correct - accuracy_report_text.is_correct).value_counts()

In [None]:
accuracy_report_code.tail()

In [None]:
accuracy_report_code.loc['summary', 'highest_correct_tokens']

## Pairwise comparison

In [None]:
def pairwise_comparison(experiment_1, experiment_2, result_priority=['code_answer', 'text_answer']):
    """
    Compare two experiments problem by problem and print which one is better.

    The experiments can be given as the path to the results file or as a key of
    `experiment_to_results`. Each problem is scored 1 if correct, 0 if wrong and 0.5 if
    unanswered; only the problems with different score are used in the comparison.
    """
    def load_experiment(experiment):
        loaded = Results()
        if not os.path.exists(experiment):
            experiment = experiment_to_results[experiment]
        loaded.load(experiment)
        return loaded

    results_1 = load_experiment(experiment_1)
    results_2 = load_experiment(experiment_2)
    # the last row of the report is the summary, so it is discarded
    experiment_1_is_correct = get_accuracy_report(results_1, result_priority).is_correct.values[:-1]
    experiment_2_is_correct = get_accuracy_report(results_2, result_priority).is_correct.values[:-1]
    for is_correct in (experiment_1_is_correct, experiment_2_is_correct):
        is_correct[np.isnan(is_correct)] = 0.5
    print_summarized_results('Experiment 1', experiment_1_is_correct)
    print_summarized_results('Experiment 2', experiment_2_is_correct)
    score_difference = experiment_1_is_correct - experiment_2_is_correct
    experiment_1_is_better = np.sum(score_difference > 0)
    experiment_2_is_better = np.sum(score_difference < 0)
    print(f'Experiment 1 is better in {experiment_1_is_better} problems, while experiment 2 is better in {experiment_2_is_better} problems')
    fast_pairwise_comparison_experiment(experiment_1_is_better, experiment_2_is_better)

def print_summarized_results(prefix, is_correct):
    """Print the number of correct (1), unanswered (0.5) and wrong (0) entries of the array `is_correct`."""
    n_correct = np.sum(is_correct == 1)
    n_unanswered = np.sum(is_correct == 0.5)
    n_wrong = np.sum(is_correct == 0)
    print(f'{prefix} Correct: {n_correct}, Unanswered: {n_unanswered}, Wrong: {n_wrong}')

def run_pairwise_comparison_experiment(experiment_1_is_better, experiment_2_is_better, n_runs=1e5):
    """
    Bootstrap the mean difference between two experiments and plot its histogram.

    Each problem where experiment 1 is better counts as +1 and each problem where
    experiment 2 is better as -1. The outcomes are resampled with replacement `n_runs`
    times and the mean of each resample is collected.

    Returns the list with the mean of each resample.
    """
    outcomes = [1]*experiment_1_is_better + [-1]*experiment_2_is_better
    sample_size = len(outcomes)
    diff_distribution = [
        np.mean(np.random.choice(outcomes, sample_size, replace=True))
        for _ in range(int(n_runs))
    ]
    plt.title('Experiment 1 - Experiment 2 bootstrapped difference distribution')
    plt.hist(diff_distribution, density=True, bins=20)
    # vertical line at zero, the value expected when there is no difference
    plt.plot([0, 0], plt.ylim(), 'r')
    return diff_distribution

def fast_pairwise_comparison_experiment(experiment_1_is_better, experiment_2_is_better):
    """
    Print whether the difference between two experiments is significant.

    Each problem where experiment 1 is better counts as +1 and each problem where
    experiment 2 is better as -1. The mean of those outcomes is compared against
    1.96 times its standard error: an experiment is declared better only if the mean
    is further from zero than that uncertainty.

    If the experiments do not differ on any problem there is nothing to estimate, so
    the lack of difference is reported directly instead of printing nan values.
    """
    distribution = [1]*experiment_1_is_better + [-1]*experiment_2_is_better
    if not distribution:
        # np.mean and np.std of an empty list are nan and emit warnings
        print('Experiments do not differ on any problem')
        print('No significant difference between experiments')
        return
    distribution_mean = np.mean(distribution)
    distribution_uncertainty = np.std(distribution)/np.sqrt(len(distribution))*1.96
    print(f'Experiment diff mean: {distribution_mean:.2f} ± {distribution_uncertainty:.2f}')
    if distribution_mean > distribution_uncertainty:
        print('Experiment 1 is better')
    elif distribution_mean < -distribution_uncertainty:
        print('Experiment 2 is better')
    else:
        print('No significant difference between experiments')

In [None]:
pairwise_comparison(
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_05:50:06_results.json',
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_06:01:48_results.json')

In [None]:
pairwise_comparison(
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_05:50:06_results.json',
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-27_04:01:40_results.json')

In [None]:
pairwise_comparison(
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_05:50:06_results.json',
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-25_00:28:30_results.json')

In [None]:
pairwise_comparison(
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_05:50:06_results.json',
    '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_06:01:48_results.json')

In [None]:
pairwise_comparison('01_3_python_prompts', '02_public_notebook_prompts')

In [None]:
pairwise_comparison('01_original_code_with_python', '11_original_code_prompt')

In [None]:
pairwise_comparison('01_3_python_prompts', '03_2_prompts')

In [None]:
pairwise_comparison('05_multi_prompt', '03_2_prompts')

In [None]:
pairwise_comparison('05_multi_prompt', '01_3_python_prompts')

In [None]:
pairwise_comparison('05_multi_prompt', '06_multi_prompt_conf09')

## Math results

In [None]:
results = Results()
results.load(experiment_to_results['01_3_python_prompts'])
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
results = Results()
results.load(experiment_to_results['03_2_prompts'])
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
results = Results()
results.load(experiment_to_results['04_all_evaluations'])
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
results = Results()
results.load(experiment_to_results['02_public_notebook_prompts'])
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
results = Results()
results.load(experiment_to_results['05_multi_prompt'])
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
results = Results()
results.load(experiment_to_results['06_multi_prompt_conf09'])
analyze_MATH_results(['code_answer', 'text_answer'])

In [None]:
results = Results()
results.load(experiment_to_results['vll_400_repetitions'])
analyze_MATH_results(['code_answer', 'text_answer'])

## Simulate effect of number of repetitions

In [None]:
def simulate_effect_of_number_of_repetitions(experiment, result_priority, max_repetitions=None):
    """
    Plot how the proportion of correct, unanswered and wrong problems changes with the number of runs.

    The saved runs of each problem are truncated to simulate experiments with fewer
    repetitions, and the accuracy report is computed again for each number of repetitions.

    Parameters:
    - experiment: path to a results file or key of `experiment_to_results`
    - result_priority: answer fields used to choose the result of each run
    - max_repetitions: highest number of repetitions to simulate, by default the
      maximum number of runs of a problem

    The module level variable `results` is overwritten, and at the end it only holds
    the runs of the smallest number of repetitions.
    """
    global results
    results = Results()
    if not os.path.exists(experiment): experiment = experiment_to_results[experiment]
    results.load(experiment)
    accuracy_report = get_accuracy_report(results, result_priority)
    # the last row of the report is the summary, it is not a problem
    max_repetitions = max_repetitions or accuracy_report.n_runs.values[:-1].max()
    n_repetitions_range = np.linspace(1, max_repetitions, 40).astype(int)
    # Decreasing order, so each step truncates the runs kept by the previous one
    n_repetitions_range = sorted(np.unique(n_repetitions_range))[::-1]
    correct, wrong, unanswered = [], [], []
    for n_repetitions in tqdm(n_repetitions_range):
        results.results = {idx: results.results[idx][:n_repetitions] for idx in results.results}
        accuracy_report = get_accuracy_report(results, result_priority)
        # Leave out the 'summary' row, otherwise it is counted as one more problem
        # and all the proportions are underestimated
        is_correct = accuracy_report.is_correct.iloc[:-1]
        correct.append(np.mean(is_correct == 1))
        wrong.append(np.mean(is_correct == 0))
        unanswered.append(np.mean(is_correct.isna()))

    correct, wrong, unanswered = np.array(correct), np.array(wrong), np.array(unanswered)
    plt.fill_between(n_repetitions_range, 0, correct, label='Correct', color='tab:green', alpha=0.5)
    plt.fill_between(n_repetitions_range, correct, correct + unanswered, label='Unanswered', color='tab:orange', alpha=0.5)
    plt.fill_between(n_repetitions_range, correct + unanswered, correct + unanswered + wrong, label='Wrong', color='tab:red', alpha=0.5)

    for x, y in zip(n_repetitions_range, correct):
        plt.text(x, y - 0.01, f'{y*100:.0f}%', ha='center', va='top')

    plt.legend(loc='lower center')
    plt.grid()
    offset = 0.02
    # zoom on the region where the correct and unanswered areas change
    plt.ylim(np.min(correct) - offset, np.max(correct + unanswered) + offset)
    plt.xlim(n_repetitions_range[-1], n_repetitions_range[0])
    plt.xlabel('Number of repetitions')
    plt.ylabel('Results')
    plt.title(f'Effect of the number of repetitions on {experiment} for {result_priority} priorities')

In [None]:
simulate_effect_of_number_of_repetitions('vll_400_repetitions', result_priority=['code_answer', 'text_answer'], max_repetitions=400)

In [None]:
simulate_effect_of_number_of_repetitions('/mnt/hdd0/Kaggle/aimo/experiments/17_vllm/2024-06-12_08:46:54_results.json', result_priority=['code_answer', 'text_answer'])

In [None]:
simulate_effect_of_number_of_repetitions('01_3_python_prompts', result_priority=['code_answer', 'text_answer'])

In [None]:
simulate_effect_of_number_of_repetitions('02_public_notebook_prompts', result_priority=['code_answer', 'text_answer'])

In [None]:
simulate_effect_of_number_of_repetitions('03_2_prompts', result_priority=['code_answer', 'text_answer'], max_repetitions=25)

In [None]:
simulate_effect_of_number_of_repetitions('04_all_evaluations', result_priority=['code_answer', 'text_answer'])

In [None]:
simulate_effect_of_number_of_repetitions('05_multi_prompt', result_priority=['code_answer', 'text_answer'])

In [None]:
simulate_effect_of_number_of_repetitions('06_multi_prompt_conf09', result_priority=['code_answer', 'text_answer'])

Maybe having more prompts is helpful?

## Simulate effect of the number of tokens

### Effect of the number of tokens

In [None]:
def simulate_effect_of_number_of_tokens(experiment, result_priority):
    """
    Plot how the proportion of correct, unanswered and wrong problems changes with the output token limit.

    A lower limit of output tokens is simulated by discarding the runs that generated
    at least that number of tokens. The limits go from the highest number of tokens of
    a correct run of this experiment down to 128.

    Parameters:
    - experiment: key of `experiment_to_results`
    - result_priority: answer fields used to choose the result of each run

    The module level variable `results` is overwritten, and at the end it only holds
    the runs below the smallest limit.
    """
    global results
    results = Results()
    results.load(experiment_to_results[experiment])
    accuracy_report = get_accuracy_report(results, result_priority)
    # The upper limit is taken from the report of the experiment being analyzed, not
    # from the global accuracy_report_code that may belong to a different experiment
    max_tokens = accuracy_report.loc['summary', 'highest_correct_tokens']
    # Decreasing order, so each step filters the runs kept by the previous one
    n_tokens_range = np.linspace(max_tokens, 128, 20)
    correct, wrong, unanswered = [], [], []
    for n_tokens in n_tokens_range:
        results.results = {idx: [result for result in inference_results if result.output_tokens < n_tokens] for idx, inference_results in results.results.items()}
        accuracy_report = get_accuracy_report(results, result_priority)
        # Leave out the 'summary' row, otherwise it is counted as one more problem
        # and all the proportions are underestimated
        is_correct = accuracy_report.is_correct.iloc[:-1]
        correct.append(np.mean(is_correct == 1))
        wrong.append(np.mean(is_correct == 0))
        unanswered.append(np.mean(is_correct.isna()))

    correct, wrong, unanswered = np.array(correct), np.array(wrong), np.array(unanswered)
    plt.fill_between(n_tokens_range, 0, correct, label='Correct', color='tab:green', alpha=0.5)
    plt.fill_between(n_tokens_range, correct, correct + unanswered, label='Unanswered', color='tab:orange', alpha=0.5)
    plt.fill_between(n_tokens_range, correct + unanswered, correct + unanswered + wrong, label='Wrong', color='tab:red', alpha=0.5)

    for x, y in zip(n_tokens_range, correct):
        plt.text(x, y - 0.01, f'{y*100:.0f}%', ha='center', va='top')

    plt.legend(loc='lower center')
    plt.grid()
    offset = 0.02
    # zoom on the region where the correct and unanswered areas change
    plt.ylim(np.min(correct) - offset, np.max(correct + unanswered) + offset)
    plt.xlim(n_tokens_range[-1], n_tokens_range[0])
    plt.xlabel('Number of output tokens')
    plt.ylabel('Results')
    plt.title(f'Effect of the number of output tokens on {experiment} for {result_priority} priorities')

In [None]:
simulate_effect_of_number_of_tokens('01_3_python_prompts', result_priority=['code_answer', 'text_answer'])

In [None]:
simulate_effect_of_number_of_tokens('02_public_notebook_prompts', result_priority=['code_answer', 'text_answer'])

### Token distribution

In [None]:
def plot_output_tokens_distribution(experiment, result_priority):
    """Plot the output-token histograms of correct and wrong answers.

    Loads the saved results of ``experiment``, resolves one answer per
    inference run from ``result_priority`` and compares it with the
    ``ground_truth`` column of the global ``df``. The output-token counts of
    the runs are split into "correct" and "wrong" and drawn as two overlaid
    density histograms on the current matplotlib figure. Runs whose resolved
    answer is ``None`` land in the "wrong" group because they differ from the
    ground truth.

    Args:
        experiment: key of ``experiment_to_results`` selecting the results file.
        result_priority: list of answer names read with
            ``results.get_result_distribution``.

    Returns:
        list: the output-token counts of all runs, correct ones first.

    Side effects:
        Rebinds the global ``results`` to the loaded experiment.
    """
    global results
    results = Results()
    results.load(experiment_to_results[experiment])
    correct_answer_tokens, wrong_answer_tokens = [], []
    for problem_idx in results.results:
        ground_truth = df.loc[problem_idx, 'ground_truth']
        tokens = results.get_result_distribution(problem_idx, 'output_tokens')
        values = [None] * len(tokens)
        # NOTE(review): a non-None value of a later entry of result_priority
        # overwrites the value of an earlier entry, so the LAST entry wins.
        # Confirm this matches how get_accuracy_report treats the priority.
        for answer in result_priority:
            for idx, value in enumerate(results.get_result_distribution(problem_idx, answer)):
                if value is not None:
                    values[idx] = value
        values = np.array(values)
        correct_answer_tokens.extend(tokens[values == ground_truth].tolist())
        wrong_answer_tokens.extend(tokens[values != ground_truth].tolist())

    all_tokens = correct_answer_tokens + wrong_answer_tokens
    # Bins span every collected run. Using only the wrong answers would drop
    # correct runs longer than the longest wrong one from the histogram, and
    # would fail when there are no wrong answers at all.
    bins = np.linspace(0, np.max(all_tokens), 20)
    plt.hist(correct_answer_tokens, alpha=0.5, label='Correct', bins=bins, density=True)
    plt.hist(wrong_answer_tokens, alpha=0.5, label='Wrong', bins=bins, density=True)
    plt.legend(loc=0)
    plt.title(f'Output token distribution for {experiment} for {result_priority} priorities')

    return all_tokens

In [None]:
# Plot the token histograms for '01_3_python_prompts' and keep the per-run output token counts.
output_tokens = plot_output_tokens_distribution('01_3_python_prompts', result_priority=['code_answer', 'text_answer'])
# Total output tokens as generated vs. the total if every run were capped at 512 tokens.
np.sum(output_tokens), np.sum(np.clip(output_tokens, 0, 512))

In [None]:
# Scale 25 repetitions by 44/32 — presumably the total vs. 512-capped token sums
# shown by the previous cell (units not visible here; TODO confirm).
25*44/32

In [None]:
# Plot the token histograms for '02_public_notebook_prompts' and keep the per-run output token counts.
output_tokens = plot_output_tokens_distribution('02_public_notebook_prompts', result_priority=['code_answer', 'text_answer'])
# Total output tokens as generated vs. the total if every run were capped at 512 tokens.
np.sum(output_tokens), np.sum(np.clip(output_tokens, 0, 512))

In [None]:
# Scale 25 repetitions by 52/37 — presumably the total vs. 512-capped token sums
# shown by the previous cell (units not visible here; TODO confirm).
25*52/37

We could increase the number of repetitions from 25 to about 35 if we reduce the maximum number of output tokens from 1024 to 512.

## Results analysis


- Distribution of correct answers per problem
- Distribution of the number of runs

In [None]:
# Load the results of the '04_all_evaluations' experiment and build its accuracy report.
results = Results()
results.load(experiment_to_results['04_all_evaluations'])
accuracy_report = get_accuracy_report(results, ['code_answer', 'text_answer'])
# Drop the last row of the report — presumably the 'summary' row, so that only
# per-problem rows remain (TODO confirm the summary is always last).
accuracy_report = accuracy_report.loc[accuracy_report.index[:-1]]
accuracy_report.head()

In [None]:
# Unit-wide bins centred on the integers 0..25 (edges at -0.5, 0.5, ..., 25.5).
bins = np.arange(-.5, 26, 1)
plt.hist(accuracy_report['n_runs'], bins=bins)
# The trailing semicolon suppresses the textual output of the cell.
plt.title('Distribution of the number of runs per problem');

In [None]:
# Unit-wide bins centred on the integers 0..19 (edges at -0.5, 0.5, ..., 19.5).
# NOTE(review): counts of 20 or more fall outside these bins and are not drawn;
# the n_runs histogram uses bins up to 25 — confirm 19 is a sufficient upper bound.
bins = np.arange(-0.5, 20, 1)
plt.hist(accuracy_report['correct_counts'], bins=bins)
# The trailing semicolon suppresses the textual output of the cell.
plt.title('Distribution of the number of correct counts per problem');

In [None]:
# Fraction of report rows whose correct_counts is 0, i.e. with no correct answer at all.
(accuracy_report['correct_counts'] == 0).mean()

## What if I remove answers with more than one code execution?

In [None]:
# Baseline: load the '03_2_prompts' experiment and analyse it using only the code answers.
# NOTE(review): analyze_MATH_results presumably reads the global `results` — confirm.
results = Results()
results.load(experiment_to_results['03_2_prompts'])
analyze_MATH_results(['code_answer'])

In [None]:
# Keep only the inference results with exactly one code interpreter call and repeat the analysis.
# NOTE(review): `== 1` also drops runs with zero interpreter calls, not just those
# with more than one — confirm this is intended.
results.results = {idx: [inference_result for inference_result in inference_results if inference_result.code_interpreter_calls == 1] for idx, inference_results in results.results.items()}
analyze_MATH_results(['code_answer'])

The accuracy decreases from 57% to 54% when only one code execution is allowed. However, the estimate is not entirely fair, because the inference itself would have been different under that restriction (maybe allowing more runs).

## Can I find a better combination of prompts?

Let's do a random search that selects a small subset of all the evaluations (e.g. 5; the cell below samples `n_samples` result files per trial) and maximizes the accuracy on the MATH dataset.

The baseline is 332 correct answers.

In [None]:
# Random search over combinations of saved result files.
# Candidate pool: every JSON result file of the prompt-engineering experiments.
filepaths = sorted(glob.glob('/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/*.json'))
n_runs = 6000  # number of random combinations to evaluate
n_samples = 7  # number of result files per combination
search_results = []  # one [number of correct answers, file paths] entry per evaluated combination
for _ in tqdm(range(n_runs)):
    # Draw n_samples distinct files; sorting makes the combination order-independent.
    sample_filepaths = sorted(np.random.choice(filepaths, n_samples, replace=False))
    try:
        # Presumably merges the sampled files into the scratch file 'delete.json' (TODO confirm).
        concat_results(sample_filepaths, 'delete.json')
    except KeyError:
        # Combinations for which concat_results raises KeyError are skipped.
        continue
    results = Results()
    results.load('delete.json')
    accuracy_report = get_accuracy_report(results, ['code_answer', 'text_answer'])
    # Number of report rows with is_correct == 1 (0 when there are none).
    correct = accuracy_report.is_correct.value_counts().get(1, 0)
    print(correct)
    search_results.append([correct, sample_filepaths])


In [None]:
# Histogram of the number of correct answers obtained by each sampled combination.
plt.hist([result[0] for result in search_results])

```python

[361,
  ['/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-27_04:01:40_results.json', # 09 AIMO train 2 shots
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-28_05:18:13_results.json', # custom prompt v7
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-30_22:22:00_results.json',
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-31_09:27:26_results.json',
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-01_04:42:58_results.json', # AIMO 2 shots assistant t09
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-01_04:49:02_results.json', # AIMO 2 shots assistant t20
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-02_05:28:41_results.json'  # 20 AIMO 2 shots assistant 
   ]
],

[[360,
  ['/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-29_05:42:08_results.json', # 16 AIMO 2 shots assistant
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-30_05:40:33_results.json', # 19 AIMO 2 shots assistant t02
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-31_03:51:32_results.json', # 20 AIMO 2 shots assistant t07
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-31_09:27:26_results.json', 
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-02_05:28:41_results.json', # 20 AIMO 2 shots assistant 
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-03_05:50:06_results.json', # 2 assistant
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-04_08:11:25_results.json' # 2 assistant
]],


[
    357,
'/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-01_04:42:58_results.json', # AIMO 2 shots assistant t09
'/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-04_08:11:25_results.json', # 2 assistant
'/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-01_04:49:02_results.json', # AIMO 2 shots assistant t20
'/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-26_02:53:20_results.json', # custom prompt v3 list
'/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-25_00:28:30_results.json', # 01 public notebook prompts
]

[[356,
  ['/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-26_02:53:20_results.json', # custom prompt v3 list
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-05-27_04:01:40_results.json', # 09 AIMO train 2 shots
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-02_05:21:38_results.json', # 20 AIMO 2 shots assistant t09 top_p 0.5
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-02_05:28:41_results.json', # 20 AIMO 2 shots assistant t07 top_p 0.5
   '/mnt/hdd0/Kaggle/aimo/experiments/15_prompt_engineering/2024-06-04_08:11:25_results.json' # 2 assistant
]],
```

In [None]:
# Accuracy for 357 correct answers; presumably 580 is the number of evaluated problems (TODO confirm).
357/580

In [None]:
# Five best combinations in search_results, highest number of correct answers first.
sorted(search_results, key=lambda x: x[0], reverse=True)[:5]

In [None]:
# Five best combinations in search_results, highest number of correct answers first.
sorted(search_results, key=lambda x: x[0], reverse=True)[:5]

In [None]:
# Five best combinations in search_results, highest number of correct answers first.
sorted(search_results, key=lambda x: x[0], reverse=True)[:5]

In [None]:
# Accuracy for 357 correct answers; presumably 580 is the number of evaluated problems (TODO confirm).
357/580

In [None]:
# Five best combinations in search_results, highest number of correct answers first.
sorted(search_results, key=lambda x: x[0], reverse=True)[:5]