In [54]:
import pickle

with open('/lfs/ampere8/0/kaif/model_evals/Algebra_test/Deepseek-7b-Instruct/teacher_forcing/output.pkl', 'rb') as f:
    t = pickle.load(f)

In [55]:
len(t)

6

In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

from pathlib import Path
import json
from utils import load_arguments, get_boxed_answer
from latex_formater import get_final_answer, process_docs, process_results, is_equiv
# from model_generate import load_datasets, prompt_generator, load_model
# from blocks import Batch, Block
import itertools
import pandas as pd

root_dir = Path('~/model_evals/').expanduser()

# load the name maps
with open('name_maps.json', 'r') as f:
    name_maps = json.load(f)
    DATASET_MAP = name_maps['DATASET_MAP']
    MODEL_MAP = name_maps['MODEL_MAP']

In [2]:
def load_predictions(path):
    print(path)
    if not (path / 'output.json').exists():
        return []
    with open(path / 'output.json', 'r') as f:
        return json.load(f)


predictions = pd.DataFrame([
    {
        'dataset': dataset.name,
        'model': model.name,
        'method': method.name,
        'prediction': prediction,
        'i': i,
        'generation_args': load_arguments(method / 'args.json')
    }
    for dataset in root_dir.iterdir()
    for model in dataset.iterdir()
    for method in model.iterdir()
    for i, prediction in enumerate(load_predictions(method))
])
args = predictions.iloc[0]['generation_args']

/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Mixtral-8x22b-Instruct-v0.1/teacher_forcing
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Gemma-1.1-7b-Instruct/teacher_forcing
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Gemma-1.1-7b-Instruct/autoregressive
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/LLeMMA-34b/teacher_forcing
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Abel-7b/teacher_forcing
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Abel-7b/autoregressive
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Llama-2-13b/teacher_forcing
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/Llama-2-13b/autoregressive
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/LLeMMA-7b/teacher_forcing
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_test/LLeMMA-7b/autoregressive
/lfs/skampere1/0/kaif/model_evals/Counting_and_Probability_te

In [3]:
predictions.groupby(['dataset', 'model', 'method']).size()

dataset            model                       method         
Algebra_test       Abel-7b                     autoregressive     1187
                                               teacher_forcing    1187
                   Deepseek-67b                autoregressive     1187
                                               teacher_forcing    1187
                   Deepseek-7b-Instruct        autoregressive     1187
                                                                  ... 
Precalculus_train  Meta-Llama-3-8b-Instruct    teacher_forcing     746
                   Mistral-7b-Instruct-v0.2    autoregressive      746
                                               teacher_forcing      11
                   Mixtral-8x7b-Instruct-v0.1  autoregressive      746
                                               teacher_forcing      11
Length: 343, dtype: int64

In [4]:
%%capture
DATASET_CONFIGS = {
    'EleutherAI/hendrycks_math': {
        'subsets': ['algebra', 'counting_and_probability', 'geometry', 'intermediate_algebra',
                    'number_theory', 'prealgebra', 'precalculus'],
        'aliases': ['hendrycks_math', 'math', 'hendrycks'],
    }
}

DATASET_ALIASES = {alias: dataset_name
                   for dataset_name, config in DATASET_CONFIGS.items()
                   for alias in config['aliases']}
DATASET_ALIASES.update({dataset_name: dataset_name for dataset_name in DATASET_CONFIGS})
import datasets as hf_datasets


def load_datasets(args):
    datasets = {}
    for dataset_name in args.dataset.split(','):
        dataset_name = dataset_name.strip()
        assert dataset_name in DATASET_ALIASES, f'Unknown dataset {dataset_name}'
        dataset_name = DATASET_ALIASES[dataset_name]

        for subset in DATASET_CONFIGS[dataset_name]['subsets']:
            try:
                dataset = hf_datasets.load_dataset(dataset_name, subset)[args.split]
            except ValueError as e:  # @TODO: get right exception type
                print(e)
                print(f'Error while loading {dataset_name}:{subset}. Redownloading...')
                dataset = hf_datasets.load_dataset(dataset_name, subset, download_mode='force_redownload',
                                                   verification_mode='no_checks')[args.split]
            datasets[f'{dataset_name}:{subset}'] = dataset

    return datasets

train_args = args
train_args.split = 'train'
test_args = args
test_args.split = 'test'
datasets = {
    'train': load_datasets(train_args),
    'test': load_datasets(test_args)
}
datasets = pd.DataFrame([
    {
        'dataset': f'{DATASET_MAP[dataset_name]}_{split}',
        'i': i,
        **datapoint
    }
    
    for split in ['train', 'test']
    for dataset_name, dataset in datasets[split].items()
    for i, datapoint in enumerate(process_docs(dataset))
])


# def groupby(datasets, key):
#     return {k: list(g) for k, g in itertools.groupby(datasets, key=lambda x: x[key])}

In [5]:
# merge predictions and datasets on ['dataset', 'i']
results = datasets.merge(predictions, on=['dataset', 'i'])

save_path = Path('~/GitHub/gold-ai-olympiad/data/MATH/Predictions/').expanduser()
save_path.mkdir(exist_ok=True, parents=True)
import json

results_copy = results.copy()
results_copy['generation_args'] = results_copy['generation_args'].apply(lambda x: x.__dict__)

# save each dataset in a different file
for dataset_name, dataset in results_copy.groupby('dataset'):
    with open(save_path / f'{dataset_name}.json', 'w') as f:
        json.dump(dataset.to_dict('records'), f)

In [6]:
# load the results and combine them back into a single dataframe
results = pd.concat([
    pd.read_json(save_path / f)
    for f in save_path.iterdir()
    if f.suffix == '.json'
])

In [7]:
results.head()

Unnamed: 0,dataset,i,problem,level,type,solution,boxed,model,method,prediction,generation_args
0,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Mixtral-8x22b-Instruct-v0.1,teacher_forcing,"{'tfa': 0.7857142686843871, 'tfce': 0.83525991...","{'debug': False, 'seed': None, 'model': 'mistr..."
1,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Gemma-1.1-7b-Instruct,teacher_forcing,"{'tfa': 0.7358490824699401, 'tfce': 1.66470706...","{'debug': False, 'seed': None, 'model': 'googl..."
2,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Gemma-1.1-7b-Instruct,autoregressive,\nThe positive multiples of 45 less than 1000 ...,"{'debug': False, 'seed': None, 'model': 'googl..."
3,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},LLeMMA-34b,teacher_forcing,"{'tfa': 0.7361963391304011, 'tfce': 0.98589032...","{'debug': False, 'seed': None, 'model': 'Eleut..."
4,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Abel-7b,teacher_forcing,"{'tfa': 0.7857142686843871, 'tfce': 1.07728362...","{'debug': False, 'seed': None, 'model': 'GAIR/..."


In [8]:
class Metric():
    def __init__(self):
        super().__init__()

    def process(self, results):
        pass

    def __call__(self, results):
        return self.process(results)


def apply_metric(predictions, metric, datasets):
    m = metric(datasets)
    results = m(predictions)
    return results


from latex_formater import get_final_answer


class BoxedMatch(Metric):
    def process(self, results):
        results['boxed_pred'] = results['prediction'].apply(get_final_answer)
        results['boxed_true'] = results['boxed']
        results['boxed_match'] = results['boxed_true'] == results['boxed_pred']

        # very slow, probably don't use
        # results['match'] = results.apply(lambda x: is_equiv(x['boxed_true'], x['boxed_pred']), axis=1)

        # set first columns to be ['dataset', 'i', 'model', 'method', 'boxed_true', 'boxed_pred', 'match', ...]
        # cols = ['dataset', 'i', 'model', 'method', 'boxed_true', 'boxed_pred', 'boxed_match']
        # cols.extend([col for col in results.columns if col not in cols])
        return results  #[cols]

In [9]:
results['method'].unique()

array(['teacher_forcing', 'autoregressive'], dtype=object)

In [12]:
boxed_match = BoxedMatch()
ar_df = results.query('method == "autoregressive"').copy()
boxed_results = boxed_match(ar_df)

save_path = Path('~/GitHub/gold-ai-olympiad/data/MATH/Predictions/BoxedMatch').expanduser()
import pickle
with open(save_path / 'results.pkl', 'wb') as f:
    pickle.dump(boxed_results, f)

In [16]:
tf_df.head()

Unnamed: 0,dataset,i,problem,level,type,solution,boxed,model,method,prediction,generation_args
0,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Mixtral-8x22b-Instruct-v0.1,teacher_forcing,"{'tfa': 0.7857142686843871, 'tfce': 0.83525991...","{'debug': False, 'seed': None, 'model': 'mistr..."
1,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Gemma-1.1-7b-Instruct,teacher_forcing,"{'tfa': 0.7358490824699401, 'tfce': 1.66470706...","{'debug': False, 'seed': None, 'model': 'googl..."
3,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},LLeMMA-34b,teacher_forcing,"{'tfa': 0.7361963391304011, 'tfce': 0.98589032...","{'debug': False, 'seed': None, 'model': 'Eleut..."
4,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Abel-7b,teacher_forcing,"{'tfa': 0.7857142686843871, 'tfce': 1.07728362...","{'debug': False, 'seed': None, 'model': 'GAIR/..."
6,Number_Theory_test,0,A positive multiple of 45 less than 1000 is ra...,Level 2,Number Theory,"The positive multiples of 45 are \[45,90,135,...",\frac{1}{11},Llama-2-13b,teacher_forcing,"{'tfa': 0.7116564512252801, 'tfce': 1.41562521...","{'debug': False, 'seed': None, 'model': 'meta-..."


In [31]:
tf_df = results.query('method == "teacher_forcing"').copy()

# save to
save_path = Path('~/GitHub/gold-ai-olympiad/data/MATH/Predictions/TF2').expanduser()
save_path.mkdir(exist_ok=True, parents=True)

for dataset_name, dataset in tf_df.groupby('dataset'):
    if 'test' not in dataset_name:
        continue
    with open(save_path / f'{dataset_name}.json', 'w') as f:
        json_data = dataset.to_dict('records')
        assert isinstance(json_data[0]['prediction'], dict), f'Expected dict, got {type(json_data[0]["prediction"])} {json_data[0]["prediction"]}'
        json.dump(json_data, f)

In [28]:
test = json.loads(Path('~/GitHub/gold-ai-olympiad/data/MATH/Predictions/TF2/Algebra_train.json').expanduser().read_text())
test[0]

JSONDecodeError: Expecting value: line 1 column 1 (char 0)