In [1]:
import os

# Order GPUs by PCI bus id and make only devices 0-3 visible to this process.
# NOTE(review): these are assigned before the project imports below, presumably
# so they are in place before any CUDA-using library is loaded — keep them first.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

from pathlib import Path
import json
from utils import load_arguments, get_boxed_answer
from latex_formater import latex_deformat
from model_generate import load_datasets, prompt_generator, load_model
from blocks import Batch, Block
import pandas as pd

# Evaluation outputs are read from below this directory.
root_dir = Path('~/model_evals').expanduser()

# Read the name maps file once, then pull the two lookup tables out of it.
with open('name_maps.json', 'r') as f:
    name_maps = json.load(f)
DATASET_MAP = name_maps['DATASET_MAP']
MODEL_MAP = name_maps['MODEL_MAP']

In [15]:
def load_predictions(path):
    """Load the predictions stored in ``output.json`` inside a run directory.

    Args:
        path: Directory containing ``output.json``. A ``str`` or any
            ``os.PathLike`` is accepted, not only a ``pathlib.Path``.

    Returns:
        The deserialized JSON content of ``<path>/output.json``.

    Raises:
        FileNotFoundError: If ``output.json`` does not exist in ``path``.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Coerce first so plain string paths work with the ``/`` operator.
    output_file = Path(path) / 'output.json'
    # Read as UTF-8 explicitly instead of depending on the locale encoding.
    with open(output_file, 'r', encoding='utf-8') as f:
        return json.load(f)


# Collect every <dataset>/<model>/<method> run directory under root_dir.
# Stray files (e.g. .DS_Store) are skipped, and the result is sorted because
# Path.iterdir() yields entries in arbitrary order; without sorting, which run
# ends up first (and therefore `args`) would vary between machines.
method_dirs = sorted(
    method
    for dataset in root_dir.iterdir() if dataset.is_dir()
    for model in dataset.iterdir() if model.is_dir()
    for method in model.iterdir() if method.is_dir()
)
if not method_dirs:
    raise FileNotFoundError(f'no <dataset>/<model>/<method> runs found under {root_dir}')

# One JSON-serializable record per run; directory names provide the keys.
predictions = [
    {
        'dataset': method.parent.parent.name,
        'model': method.parent.name,
        'method': method.name,
        'predictions': load_predictions(method),
        'generation_args': vars(load_arguments(method / 'args.json')),
    }
    for method in method_dirs
]
# Keep the first run's arguments as the loaded object rather than its __dict__:
# later cells use attribute access such as `args.split`, which a dict lacks.
args = load_arguments(method_dirs[0] / 'args.json')
# Alternative (currently disabled): load the runs into a pandas DataFrame indexed by (dataset, model, method)
# predictions = pd.DataFrame(
#     [
#         {
#             'dataset': dataset.name,
#             'model': model.name,
#             'method': method.name,
#             'predictions': load_predictions(method),
#             'args': load_arguments(method / 'args.json')
#         }
#         for dataset in root_dir.iterdir()
#         for model in dataset.iterdir()
#         for method in model.iterdir()
#     ]
# )

# predictions.set_index(['dataset', 'model', 'method'], inplace=True)
# args = predictions.iloc[0].args

In [16]:
save_path = Path('~/GitHub/gold-ai-olympiad/data/MATH/Predictions/predictions.json').expanduser()
# Serialize before touching the file: opening with 'w' truncates it, so a value
# json cannot encode would otherwise destroy an existing predictions.json.
serialized = json.dumps(predictions, indent=4)
# Create the target directory if this is the first time predictions are saved.
save_path.parent.mkdir(parents=True, exist_ok=True)
with open(save_path, 'w') as f:
    f.write(serialized)

In [3]:
# args.model = 'mistralai/Mistral-7B-Instruct-v0.2'
# args.use_chat = True
# args.model = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
# args.use_chat = True
# args.model = 'tiiuae/falcon-7b'  # Don't use?
# args.use_chat = False
# args.model = 'meta-llama/Llama-2-7b-chat-hf'
# args.use_chat = True
# args.model = 'meta-llama/Llama-2-13b-chat-hf'
# args.use_chat = True
# args.model = 'meta-llama/Llama-2-70b-chat-hf'
# args.use_chat = True
# args.model = 'meta-llama/Meta-Llama-3-8B-Instruct'
# args.use_chat = True
# args.model = 'meta-llama/Meta-Llama-3-70B-Instruct'
# args.use_chat = True
# args.model = 'deepseek-ai/deepseek-math-7b-instruct'
# args.use_chat = True
# args.model = 'deepseek-ai/deepseek-llm-67b-chat'
# args.use_chat = True

In [3]:
%%capture
# Load the datasets and key each one as "<mapped dataset name>_<split>".
datasets = {
    f'{DATASET_MAP[raw_name]}_{args.split}': data
    for raw_name, data in load_datasets(args).items()
}

In [7]:
# Build the prompt generator, then run it over the chosen dataset as a Batch.
prompt_gen = prompt_generator(args)
dataset_name = 'Number_Theory_train'
prompts = prompt_gen(Batch(datasets[dataset_name]))

In [18]:
# Show the first record of the selected dataset.
datasets[dataset_name][0]

{'problem': 'If $AAA_4$ can be expressed as $33_b$, where $A$ is a digit in base 4 and $b$ is a base greater than 5, what is the smallest possible sum $A+b$?',
 'level': 'Level 4',
 'type': 'Number Theory',
 'solution': 'We can rewrite $AAA_4$ and $33_b$ to get \\begin{align*}\n16A+4A+A&=3b+3\\quad\\Rightarrow\\\\\n21A&=3b+3.\n\\end{align*}The smallest possible value for $A$ is 1, which gives us $21=3b+3$ and $b=6$. So the smallest sum $A+b=\\boxed{7}$. While there are other values for $A$ and $b$ that work, increasing $A$ will increase $b$, resulting in a larger sum.'}

In [8]:
i = 4
print(prompts[i])
print('=' * 80)
# `predictions` is a list of per-run dicts (it has no `.loc`), so the run is
# located by matching its dataset / model / method keys.
run_key = (dataset_name, 'Deepseek-7b-RL', 'autoregressive')
run = next(
    (p for p in predictions if (p['dataset'], p['model'], p['method']) == run_key),
    None,
)
if run is None:
    # Same exception type a failed label lookup would raise.
    raise KeyError(run_key)
print(run['predictions'][i])

[{'role': 'user', 'content': 'Marcus has two numbers, $a$ and $b$.  When he divides $a$ by 45 he gets a remainder of 37.  When he divides $b$ by 30 he gets a remainder of $9$.  What remainder does he get when he divides $a+b$ by 15?\nPlease reason step by step, and put your final answer within \\boxed{}.'}, {'role': 'assistant', 'content': 'Marcus has two equations: \\[a=45n+37\\]and \\[b=30m+9.\\]When he adds these he gets \\[a+b=45n+30m+37+9=15(3n+2m)+46=15(3n+2m+3)+1.\\]The remainder when $a+b$ is divided by 15 is $\\boxed{1}$.'}, {'role': 'user', 'content': 'If $AAA_4$ can be expressed as $33_b$, where $A$ is a digit in base 4 and $b$ is a base greater than 5, what is the smallest possible sum $A+b$?\nPlease reason step by step, and put your final answer within \\boxed{}.'}, {'role': 'assistant', 'content': 'We can rewrite $AAA_4$ and $33_b$ to get \\begin{align*}\n16A+4A+A&=3b+3\\quad\\Rightarrow\\\\\n21A&=3b+3.\n\\end{align*}The smallest possible value for $A$ is 1, which gives u

In [20]:
class Metric(Block):
    """Base class for metrics that are evaluated against a reference dataset.

    Subclasses override ``process``; the implementation here computes nothing.
    """

    def __init__(self, dataset):
        """Initialise the parent ``Block`` and keep ``dataset`` on the instance."""
        super().__init__()
        self.dataset = dataset

    def process(self, predictions):
        """Placeholder hook: ignores ``predictions`` and returns ``None``."""
        return None

In [37]:
def get_answer(solution: str):
    """Return the boxed answer of ``solution`` after ``latex_deformat``.

    The text is first handed to ``get_boxed_answer``; whatever that returns is
    then passed through ``latex_deformat`` and returned.
    """
    boxed = get_boxed_answer(solution)
    return latex_deformat(boxed)


class BoxedMatch(Metric):
    """Metric comparing each prediction's answer with the reference answer.

    Construction is inherited unchanged from ``Metric`` (takes ``dataset``).
    """

    def process(self, predictions):
        """Return one boolean per (dataset record, prediction) pair.

        Each flag is ``True`` when ``get_answer`` of the prediction equals
        ``get_answer`` of the record's ``'solution'`` field. Pairs are formed
        with ``zip``, so iteration stops at the shorter of the two inputs.
        """
        return [
            get_answer(prediction) == get_answer(data['solution'])
            for data, prediction in tqdm(zip(self.dataset, predictions))
        ]

In [35]:
def apply_metric(predictions, metric, datasets):
    """Evaluate ``metric`` on every row of ``predictions`` and store the result.

    Args:
        predictions: Frame-like object supporting ``.apply(func, axis=1)`` and
            column assignment. Each row exposes ``.predictions``, and the first
            element of ``row.name`` is used as the key into ``datasets``.
        metric: Metric class; it is instantiated with the row's dataset and
            the instance is called on ``Batch(row.predictions)``.
        datasets: Mapping from dataset name to dataset.

    Returns:
        None. A column named ``metric.__name__`` is added to ``predictions``.
    """
    column = metric.__name__

    def compute_metric(row):
        key = row.name[0]
        print(row.name)
        scorer = metric(datasets[key])
        return scorer(Batch(row.predictions))

    predictions[column] = predictions.apply(compute_metric, axis=1)

In [None]:
from tqdm.auto import tqdm

# `predictions` is a list of per-run dicts, while apply_metric needs a frame
# indexed by (dataset, model, method): it calls `.apply(axis=1)` and reads the
# dataset name from `row.name[0]`. Build that frame under a new name so the
# list itself is left untouched and this cell can be re-run safely.
predictions_df = pd.DataFrame(predictions).set_index(['dataset', 'model', 'method'])
apply_metric(predictions_df, BoxedMatch, datasets)

In [20]:
# Print the i-th prompt next to the i-th entry of `outputs`.
# NOTE(review): `outputs` is not assigned in any cell visible here — confirm
# where it is defined, otherwise this cell fails on a fresh kernel.
i = 5
print('Question:')
print(prompts[i])
print('Answer:')
print(outputs[i])

Question:
[{'role': 'user', 'content': 'Let $A$ be the set of all numbers which can be represented as the sum of three consecutive positive integers. What is the greatest common divisor of all numbers in $A$?\nPlease reason step by step, and put your final answer within \\boxed{}.'}, {'role': 'assistant', 'content': 'Each member in the set is of the form $(x-1)+x+(x+1)=3x$. Since $x$ can be any positive integer, the greatest common divisor of all these members is $\\boxed{3}$.'}, {'role': 'user', 'content': 'How many integers between 100 and 300 have both 11 and 8 as factors?\nPlease reason step by step, and put your final answer within \\boxed{}.'}, {'role': 'assistant', 'content': 'The only numbers that have 11 and 8 as a factor are multiples of 88.  If we list the first few multiples of 88: $$88,176,264,352,...$$ we can see that there are exactly $\\boxed{2}$ between 100 and 300.'}, {'role': 'user', 'content': 'Express .$\\overline{28}$ as a common fraction.\nPlease reason step by s