# In [None]:
import matplotlib.pyplot as plt
from pathlib import Path

# Locate the per-model output directories.
data_dir = Path('~/model_outputs').expanduser()
# Sort both listings: iterdir() yields entries in arbitrary order and set
# iteration order for strings changes between interpreter runs, which would
# otherwise reshuffle the model/dataset subplot grid drawn further down.
model_dirs = sorted(x for x in data_dir.iterdir() if x.is_dir())
model_names = [x.name for x in model_dirs]
# Union of the entry names found inside every model directory.
dataset_names = sorted({x.name for model_dir in model_dirs for x in model_dir.iterdir()})
model_names, dataset_names
import json

# Read the two lookup tables stored in name_maps.json.
with open('name_maps.json', 'r') as maps_file:
    name_maps = json.load(maps_file)

# The file only needs to stay open while parsing; the lookups happen afterwards.
DATASET_MAP = name_maps['DATASET_MAP']
MODEL_MAP = name_maps['MODEL_MAP']
# %%capture
from dataset import from_name
from prompt import generate_nshot_prompts
from dataset import load_datasets, BOXED_ANSWERS_DATASETS

import numpy as np

# Load every dataset listed in BOXED_ANSWERS_DATASETS and key its 'train'
# split by the name that DATASET_MAP assigns to the raw dataset name.
datasets_raw = load_datasets(BOXED_ANSWERS_DATASETS)
datasets = {
    DATASET_MAP[dataset['name']]: dataset['data']['train']
    for dataset in datasets_raw
}

# NOTE(review): the seed and num_samples are only consumed by the sub-sampling
# line below, which is commented out, so they currently have no effect here.
np.random.seed(0)
num_samples = 100
# datasets_subset = {k: np.random.choice(v, num_samples, replace=False) for k, v in datasets.items()}
# With sub-sampling disabled the "subset" is the full mapping. It is bound
# BEFORE `datasets` is rebound below, so it keeps referring to the original
# split objects rather than to the list copies.
datasets_subset = datasets

# Rebind `datasets` to plain-list copies of each split.
datasets = {k: list(v) for k, v in datasets.items()}

# prepend datasets_subset with 3 examples from datasets
# datasets_subset = {k: datasets[k][:3] + list(v) for k, v in datasets_subset.items()}

# Generate the prompts (n=3) for every dataset in the subset mapping.
datas = {k: generate_nshot_prompts(v, n=3) for k, v in datasets_subset.items()}
import os

# Pin the process to one GPU. These variables are set before `import torch`
# below; presumably this is so CUDA initialisation only sees device 4 —
# keep this ordering unless that assumption is checked.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
import torch

torch.set_float32_matmul_precision('medium')

from comet import download_model, load_from_checkpoint

# Download the COMETKiwi checkpoint and load it into the module-level `model`,
# which is the object `comet()` calls `.predict` on.
# model_path = download_model("Unbabel/XCOMET-XL")
model_path = download_model("Unbabel/wmt23-cometkiwi-da-xl")
model = load_from_checkpoint(model_path)
import json
import os
from typing import List

from nltk.tokenize import sent_tokenize

from roscoe.score import (
    SEQ_EMB_MODEL_TYPES,
    Chain,
    Evaluator,
    REASONING_SCORES,
    UNSUPERVISED_SCORES,
    SENT_TRANS,
    SIMSCE
)
from roscoe.util import (
    print_and_reset_max_gpu_memory,
    save_scores,
    split_gsm8k_gpt3_generations_to_steps,
)


class ReasoningSteps(Chain):
    """A reasoning chain held as a list of step strings in ``self.chain``."""

    def __init__(self, line: str, type="regular") -> None:
        # Split the raw text into steps once, at construction time.
        self.chain = self.parse_chain(line, type=type)

    def parse_chain(self, chain: str, type: str) -> List[str]:
        """
        Split ``chain`` into a list of steps.

        ``type`` selects the splitting rule:
          * ``"regular"``    - sentence tokenisation via ``sent_tokenize``;
          * ``"gsm8k_hypo"`` - ``split_gsm8k_gpt3_generations_to_steps``;
          * ``"gsm8k_ref"``  - the text after the ground-truth marker,
            split on newlines.

        Raises NotImplementedError for any other ``type``.
        """
        if type == "regular":
            return sent_tokenize(chain)

        if type == "gsm8k_hypo":
            return split_gsm8k_gpt3_generations_to_steps(reasoning=chain)

        if type == "gsm8k_ref":
            marker = "IGNORE THIS. Ground truth here for reference. "
            segments = chain.split(marker)
            # Index 1 is the text right after the first marker occurrence.
            return segments[1].split('\n')

        raise NotImplementedError(f"{type} chain type is not supported")


from roscoe.score import Evaluator

# Build the evaluator first: the memory report below reads `evaluator`, so it
# cannot run before this assignment when the file is executed top to bottom.
evaluator = Evaluator(
    score_types=REASONING_SCORES,
    model_type=SIMSCE,
    transformer_model="facebook/roscoe-512-roberta-base",
    ppl_model="gpt2-large",
    discourse_batch=64,
    coherence_batch=16,
    hypos=[],
    context=[],
)
len(evaluator.hypos), len(evaluator.context), len(evaluator.references)

# Memory footprint of evaluator.grmr_model: bytes held by its parameters plus
# bytes held by its buffers.
param_size = sum(
    param.nelement() * param.element_size()
    for param in evaluator.grmr_model.parameters()
)
buffer_size = sum(
    buffer.nelement() * buffer.element_size()
    for buffer in evaluator.grmr_model.buffers()
)

# Dividing by 1024 ** 3 converts bytes to GiB, matching the "GB" label printed.
size_all_gb = (param_size + buffer_size) / 1024 ** 3
# Legacy alias: the value was previously stored under this (misleading) name.
size_all_mb = size_all_gb
print('model size: {:.3f}GB'.format(size_all_gb))


def get_nshot_base_question(question):
    """Return the text that follows the last ``'4.'`` in ``question``, stripped.

    If ``'4.'`` does not occur, the whole (stripped) string is returned.

    NOTE(review): the split is purely textual, so a ``'4.'`` that occurs inside
    the question itself (e.g. ``"14.50"``) also cuts the text - confirm that
    the prompt format makes this impossible.
    """
    _, _, tail = question.rpartition('4.')
    return tail.strip()


def compute_roscoe(outputs, solutions, max_samples=50):
    """Score generated reasoning chains with the module-level ``evaluator``.

    Args:
        outputs: generations; each item exposes ``.prompt`` and
            ``.outputs[0].text``.
        solutions: reference solutions, aligned with ``outputs``. An item is
            either a dict with an ``'answer'`` entry or the reference itself.
        max_samples: only the first ``max_samples`` examples are scored
            (default 50, the previously hard-coded limit). Pass ``None`` to
            score everything.

    Returns:
        ``dict(scores)`` built from the result of ``evaluator.evaluate()``.
    """
    # hypos is the predicted reasoning chain
    hypos = [ReasoningSteps(o.outputs[0].text) for o in outputs]
    # refs is the ground truth reasoning chain
    refs = [
        ReasoningSteps(s['answer'] if isinstance(s, dict) else s)
        for s in solutions
    ]
    # context is the prompt, reduced to the final question
    context = [
        ReasoningSteps(get_nshot_base_question(o.prompt))
        for o in outputs
    ]

    # The same prefix is taken from all three lists so they stay aligned.
    evaluator.set_hypos(hypos[:max_samples])
    evaluator.set_references(refs[:max_samples])
    evaluator.set_context(context[:max_samples])
    scores = evaluator.evaluate()
    return dict(scores)


# roscoe_res = {
#     dataset: compute_roscoe(pickle.load(open(subdir / "deepseek-ai_deepseek-math-7b-instruct_autoregressive.pkl", 'rb'))[0], datas[dataset.replace(' ', '_')])
#     for dataset, subdir in zip(dataset_names, subdirs)
# }
import pickle

# The computation above is disabled, so `roscoe_res` only exists if it was
# created earlier in the same session. Check BEFORE opening the file: opening
# with 'wb' truncates it, and a NameError afterwards would leave it empty.
if 'roscoe_res' in globals():
    with open('/lfs/skampere1/0/kaif/generated_outputs/roscoe_temp.pkl', 'wb') as f:
        pickle.dump(roscoe_res, f)
else:
    print('roscoe_res is not defined; leaving roscoe_temp.pkl untouched')


# data = [
#     # {
#     #     "src": datas['Hendrycks Algebra'][0]['question'],
#     #     "mt": predictions[0].outputs[0].text,
#     #     "ref": datas['Hendrycks Algebra'][0]['answer']
#     # }
#     {
#         "src": d['question'],
#         "mt": o.outputs[0].text,
#         "ref": d['answer']
#     } for d, o in zip(datas['Hendrycks Algebra'], predictions)
# ]
# model_output = model.predict(data, batch_size=64, gpus=1)
# # Segment-level scores
# print (model_output.scores)
#
# # System-level score
# print (model_output.system_score)
#
# # Score explanation (error spans)
# print (model_output.metadata.error_spans)

def comet(outputs, solutions, use_ref=True):
    """Run the module-level COMET ``model`` over a set of generations.

    Each generation contributes its prompt as ``"src"`` and its first output
    text as ``"mt"``. With ``use_ref=True`` the paired entry of ``solutions``
    is added as ``"ref"`` (pairs are formed with ``zip``, so the shorter of
    the two inputs decides how many items are scored).

    Returns whatever ``model.predict`` returns.
    """
    if not use_ref:
        data = [
            {"src": generation.prompt, "mt": generation.outputs[0].text}
            for generation in outputs
        ]
    else:
        data = []
        for reference, generation in zip(solutions, outputs):
            data.append({
                "src": generation.prompt,
                "mt": generation.outputs[0].text,
                "ref": reference,
            })
    return model.predict(data, batch_size=16, gpus=1)


import pickle

# xcomet_res = {
#     dataset: xcomet(pickle.load(open(subdir / "deepseek-ai_deepseek-math-7b-instruct_autoregressive.pkl", 'rb'))[0], datas[dataset])
#     for dataset, subdir in zip(dataset_names, subdirs)
# }
# # save
# with open('/lfs/skampere1/0/kaif/generated_outputs/xcom_temp.pkl', 'wb') as f:
#     pickle.dump(xcomet_res, f)
# Load the previously saved COMET results and peek at one entry.
with open('/lfs/skampere1/0/kaif/generated_outputs/xcom_temp.pkl', 'rb') as f:
    xcomet_res = pickle.load(f)
xcomet_res['Hendrycks Number Theory'][0]
import pickle

# Load one raw generation file. The context manager closes the handle, which
# the former `pickle.load(open(...))` left to the garbage collector.
with open(
        "/lfs/skampere1/0/kaif/generated_outputs/EleutherAI_hendrycks_math_algebra/deepseek-ai_deepseek-math-7b-instruct_autoregressive.pkl",
        'rb') as f:
    x = pickle.load(f)
# NOTE(review): `subdirs` is not assigned anywhere in the visible part of this
# file, so this path comparison raises NameError unless it is defined elsewhere.
(subdirs[0] / "deepseek-ai_deepseek-math-7b-instruct_autoregressive.pkl",
 "/lfs/skampere1/0/kaif/generated_outputs/EleutherAI_hendrycks_math_algebra/deepseek-ai_deepseek-math-7b-instruct_autoregressive.pkl")
from grader import ExactMatchGrader, NextTokenAccuracyGrader
from dataset import get_boxed_answer
from latex_formater import latex_deformat


def grade_predictions(outputs, data):
    """Grade the boxed answer of every generation against its reference.

    Args:
        outputs: generations; the text is read from ``o.outputs[0].text``.
        data: reference solutions, aligned with ``outputs``; each is passed
            to ``get_boxed_answer``.

    Returns:
        The result of ``ExactMatchGrader().grade(predictions, answers)``.

    A generation whose boxed answer cannot be extracted is kept as ``None``
    instead of being dropped. Dropping it would shift every later prediction
    one position and compare it against the wrong reference.
    NOTE(review): assumes the grader treats ``None`` as a non-matching
    prediction - confirm against ExactMatchGrader.
    """
    grader = ExactMatchGrader()

    boxed_predictions = []
    for o in outputs:
        text = o.outputs[0].text
        try:
            prediction = latex_deformat(get_boxed_answer(text))
        except Exception as exc:
            # Report the failure without calling the failing helper again.
            print(f"Could not extract a boxed answer ({exc!r}); treating as unanswered")
            prediction = None
        boxed_predictions.append(prediction)

    boxed_answers = [latex_deformat(get_boxed_answer(d)) for d in data]
    grades = grader.grade(boxed_predictions, boxed_answers)

    return grades


def teacher_forcing_accuracy(predictions, solutions):
    """Grade ``predictions`` against ``solutions`` with a fresh NextTokenAccuracyGrader."""
    return NextTokenAccuracyGrader().grade(predictions, solutions)


model_names


# examine some of the predictions
def get_predictions(model, dataset, data_type):
    """Load ``<data_dir>/<model>/<dataset>/<data_type>.pkl``.

    The pickle is expected to unpack into exactly two values, which are
    returned as ``(predictions, solutions)``.
    """
    pickle_path = data_dir / model / dataset / f"{data_type}.pkl"
    with pickle_path.open('rb') as handle:
        loaded = pickle.load(handle)
    predictions, solutions = loaded
    return predictions, solutions


# Load one model/dataset pair; the cells below print a single example from it.
predictions, solutions = get_predictions('LLeMMA-7b', 'Algebra', 'autoregressive')
# Llama-2-13b tends to have a bunch of whitespace and ORs in the predictions.
# Tora-13b-v1.0: lots of code and repeated sections. Probably hard to parse
# into steps.
# Print the question, the reference solution and the generation for one example.
i = 0
example = predictions[i]
divider = '=' * 20
print(get_nshot_base_question(example.prompt))
print(divider)
print(solutions[i])
print(divider)
print(example.outputs[0].text)
root_dir = Path('/lfs/skampere1/0/kaif/model_outputs')
# Report every model directory in which no dataset sub-directory contains
# anything. Non-directory entries are skipped at both levels, because calling
# iterdir() on a plain file raises NotADirectoryError.
for model_dir in root_dir.iterdir():
    if not model_dir.is_dir():
        continue
    has_outputs = any(
        dataset_dir.is_dir() and any(dataset_dir.iterdir())
        for dataset_dir in model_dir.iterdir()
    )
    if not has_outputs:
        print(f"Empty directory: {model_dir}")
import pickle
import numpy as np
from tqdm.auto import tqdm

# Metric registry used by eval_data: the outer key is the data type (the
# stem of the `<data_type>.pkl` file that gets loaded), the inner mapping is
# metric name -> callable invoked as `metric(predictions, solutions)`.
HOOKS = {
    'autoregressive': {
        'Accuracy': grade_predictions,
        'ROSCOE': compute_roscoe,
        # 'XCOMET': comet
    }
    # 'teacher_forcing': {}
}


def ensure_dict_path(d, path):
    """
    Create any missing nested dictionaries along ``path`` inside ``d``.

    Existing entries are left untouched; ``d`` is modified in place and
    nothing is returned.
    """
    node = d
    for key in path:
        # setdefault only inserts a new dict when the key is absent.
        node = node.setdefault(key, {})


def eval_data(data_dir):
    """Run every metric in HOOKS over every model/dataset output file.

    Layout read: ``data_dir/<model>/<dataset>/<data_type>.pkl``, where each
    pickle unpacks into ``(predictions, solutions)``.

    Returns:
        Nested dict ``evals[metric_name][dataset_name][model_name]``. A metric
        that returns a dict is flattened into one entry per key, named
        ``f"{metric_name}_{key}"``.

    NOTE: pickle files are only safe to load from trusted locations.
    """
    evals = {}
    # Only directories count as models/datasets; calling iterdir() on a stray
    # file would raise NotADirectoryError.
    model_dirs = [p for p in data_dir.iterdir() if p.is_dir()]
    number_of_metrics = sum(len(metrics) for metrics in HOOKS.values())
    with (
        tqdm(total=len(model_dirs), desc='Models') as model_tqdm,
        tqdm(total=0, desc='Dataset') as data_tqdm,
        tqdm(total=number_of_metrics, desc='Metric') as metric_tqdm
    ):
        for model_dir in model_dirs:
            model_name = model_dir.name
            model_tqdm.set_description(model_name)
            dataset_dirs = [p for p in model_dir.iterdir() if p.is_dir()]
            data_tqdm.reset(total=len(dataset_dirs))

            for dataset_dir in dataset_dirs:
                dataset_name = dataset_dir.name
                data_tqdm.set_description(dataset_name)
                metric_tqdm.reset()

                for data_type, metrics in HOOKS.items():
                    file = dataset_dir / f"{data_type}.pkl"
                    if not file.exists():
                        # Keep the progress bar consistent for skipped metrics.
                        metric_tqdm.update(len(metrics))
                        continue

                    # Load once per data type (not once per metric) and close
                    # the file before the potentially long metric runs.
                    with open(file, 'rb') as f:
                        predictions, solutions = pickle.load(f)

                    for metric_name, metric in metrics.items():
                        metric_tqdm.set_description(metric_name)
                        metric_value = metric(predictions, solutions)

                        if isinstance(metric_value, dict):
                            for k, v in metric_value.items():
                                sub_metric_name = f"{metric_name}_{k}"
                                ensure_dict_path(evals, [sub_metric_name, dataset_name, model_name])
                                evals[sub_metric_name][dataset_name][model_name] = v
                        else:
                            ensure_dict_path(evals, [metric_name, dataset_name, model_name])
                            evals[metric_name][dataset_name][model_name] = metric_value
                        metric_tqdm.update(1)
                data_tqdm.update(1)
            model_tqdm.update(1)

    return evals


# Compute all metrics for every model/dataset under root_dir.
evals = eval_data(root_dir)
evals.keys()
import pickle

save_path = Path('/lfs/skampere1/0/kaif/generated_outputs/metrics/evals_roscoe.pkl')
# save_path.parent.mkdir(parents=True, exist_ok=True)
# with open(save_path, 'wb') as f:
#     pickle.dump(evals, f)
# NOTE(review): with the save above commented out, this load REPLACES the
# `evals` just computed with whatever was last written to save_path.
with open(save_path, 'rb') as f:
    evals = pickle.load(f)
# Sanity check: report every (metric, dataset, model) entry whose values
# cannot be averaged directly, along with how many plain floats it contains.
# The loop variables are deliberately not called `model` / `datasets`: those
# module-level names hold the COMET model and the loaded datasets.
for metric_name, per_dataset in evals.items():
    for dataset_name, per_model in per_dataset.items():
        for model_name, values in per_model.items():
            try:
                np.mean(values)
            except Exception:
                # Filter once and reuse for the count, the mean and the check.
                float_values = [v for v in values if isinstance(v, float)]
                print(len(float_values), np.mean(float_values),
                      metric_name, dataset_name, model_name)
                if not float_values:
                    print(values)
                    print('=' * 20)
# NOTE(review): `roscoe_avg` is only assigned further down in this file, so
# this display raises NameError when the file is run from top to bottom.
roscoe_avg
import pandas as pd


def sum_values(a, b):
    """Add ``a`` and ``b`` element by element.

    Any element that is not a ``float`` or ``bool`` counts as 0. The result
    has the length of the shorter input.
    """
    def _numeric_or_zero(value):
        return value if isinstance(value, (float, bool)) else 0

    return [_numeric_or_zero(left) + _numeric_or_zero(right)
            for left, right in zip(a, b)]


# Average, per dataset/model and per example, all metrics with ROSCOE in
# their name. The aggregate key 'ROSCOE' itself is excluded so that running
# this block again does not fold the previous average into the new one.
roscoe_sums = {}
n = 0
for metric_name, per_dataset in evals.items():
    if 'ROSCOE' not in metric_name or metric_name == 'ROSCOE':
        continue
    n += 1
    # Accumulate in place: entries seen for an earlier metric are kept even if
    # the current metric has no values for that dataset/model.
    for dataset_name, per_model in per_dataset.items():
        dataset_sums = roscoe_sums.setdefault(dataset_name, {})
        for model_name, values in per_model.items():
            running = dataset_sums.get(model_name, [0] * len(values))
            dataset_sums[model_name] = sum_values(values, running)

# Divide by the number of ROSCOE metrics (non-numeric scores counted as 0).
roscoe_avg = {
    dataset_name: {
        model_name: [total / n for total in totals]
        for model_name, totals in per_model.items()
    }
    for dataset_name, per_model in roscoe_sums.items()
}
evals['ROSCOE'] = roscoe_avg

def _mean_of_scores(values):
    """Mean over the entries of ``values`` that are ``float`` or ``bool``."""
    return np.mean([v for v in values if isinstance(v, (float, bool))])


def _to_long_frame(metric_name, averages):
    """Turn one metric's nested averages into a long model/dataset/value table."""
    wide = pd.DataFrame(averages)  # columns: outer keys, index: inner keys
    long = wide.melt(value_name=metric_name, var_name='dataset', ignore_index=False)
    return long.reset_index(names='model')


# evals is nested as metric -> dataset -> model -> per-example values;
# collapse the innermost level to a single mean.
evals_avg = {
    metric_name: {
        dataset_name: {
            model_name: _mean_of_scores(scores)
            for model_name, scores in per_model.items()
        }
        for dataset_name, per_model in per_dataset.items()
    }
    for metric_name, per_dataset in evals.items()
}

# One long-format DataFrame per metric.
evals_dfs = {
    metric_name: _to_long_frame(metric_name, averages)
    for metric_name, averages in evals_avg.items()
}
plt.close()


# plot histograms for each metric, model, dataset conditioned on accuracy
def plot_histograms(evals, metric, model, dataset, ax):
    """Draw two overlaid histograms of ``metric`` on ``ax``: correct vs. incorrect.

    The axis title is always set. If ``evals[metric]`` has no entry for this
    dataset/model pair, nothing else is drawn. Metric values that are not
    ``float``/``bool`` are plotted at -0.5. Correctness is taken from
    ``evals['Accuracy'][dataset][model]``.
    """
    ax.set_title(f"{model} {dataset}")

    by_dataset = evals[metric]
    if dataset not in by_dataset:
        return
    by_model = by_dataset[dataset]
    if model not in by_model:
        return

    scores = [s if isinstance(s, (float, bool)) else -0.5 for s in by_model[model]]
    is_correct = evals['Accuracy'][dataset][model]

    # Split the scores by the correctness flag of the same example.
    correct, incorrect = [], []
    for score, ok in zip(scores, is_correct):
        (correct if ok else incorrect).append(score)

    ax.hist(correct, bins=20, alpha=0.5, label='Correct')
    ax.hist(incorrect, bins=20, alpha=0.5, label='Incorrect')
    # Same ticks on every subplot, starting at the -0.5 placeholder value.
    ax.set_xticks(np.linspace(-0.5, 1, 5))
    ax.legend()


# plot_histograms(evals, 'ROSCOE', 'Deepseek-7b-Instruct', 'Algebra', plt.gca())
# plt.show()


# One figure per metric: rows are models, columns are datasets.
# savefig does not create missing directories, so make sure 'plots' exists.
Path('plots').mkdir(parents=True, exist_ok=True)
for metric_name in evals_dfs:
    # squeeze=False keeps `axs` two-dimensional even with a single model or a
    # single dataset, so axs[row, col] is always valid.
    # NOTE(review): figsize is (width, height); width currently scales with
    # the number of models (rows) and height with datasets (columns) - confirm
    # that this is the intended aspect.
    fig, axs = plt.subplots(len(model_names), len(dataset_names),
                            figsize=(len(model_names) * 4, len(dataset_names) * 8),
                            squeeze=False)
    # Loop names avoid `model`, which is the module-level COMET model.
    for row, model_name in enumerate(model_names):
        for col, dataset_name in enumerate(dataset_names):
            plot_histograms(evals, metric_name, model_name, dataset_name, axs[row, col])
    fig.savefig(f'plots/hist_{metric_name}.png')
    plt.close(fig)
# merge all the dataframes in evals_dfs
from functools import reduce

def _r2_vs_accuracy(frame, group_col, metric_name):
    """Squared correlation between Accuracy and ``metric_name`` within each group."""
    return frame.groupby(group_col).apply(
        lambda group: group[['Accuracy', metric_name]].corr().iloc[0, 1] ** 2
    )


# Join the per-metric tables on their shared (dataset, model) key.
df = reduce(
    lambda left, right: pd.merge(left, right, on=['dataset', 'model']),
    evals_dfs.values(),
)
# The join must not have dropped or duplicated any row of any metric table.
for metric_frame in evals_dfs.values():
    assert df.shape[0] == metric_frame.shape[0]

# compute R^2 of each metric against Accuracy, per model and per dataset
R2_model = {name: _r2_vs_accuracy(df, 'model', name) for name in evals_dfs.keys()}
R2_dataset = {name: _r2_vs_accuracy(df, 'dataset', name) for name in evals_dfs.keys()}
import seaborn as sns

sns.set_theme(style="whitegrid")

# g = sns.lmplot(
#     data=df,
#     x='Teacher Forced Cross Entropy',
#     y='Accuracy',
#     hue='model',
#     col='dataset',
#     col_wrap=3,
#     height=4,
#     aspect=1,
#     scatter_kws=dict(s=50, linewidths=1, edgecolor='w'),
#     palette='tab10',
# )
# g.set(xscale="log", yscale="log")
# # put R2 next to model name in legend
# # df['Log Accuracy'] = np.log(df['Accuracy'])
# xcom_corr = df.groupby('model').apply(lambda x: x[['COMETKIWI', 'Accuracy']].corr().iloc[0, 1] ** 2)
#
# g = sns.lmplot(
#     data=df,
#     x='COMETKIWI',
#     y='Accuracy',
#     hue='model',
#     height=6,
#     aspect=1.5,
#     scatter_kws=dict(s=50, linewidths=1, edgecolor='w'),
#     palette='tab10',
#     # ci=None,
#     # legend=False,  # turn off the original legend
# )
#
# import matplotlib.patches as mpatches
#
# # Fetching the color palette based on the number of unique models
# color_palette = sns.color_palette('tab10', n_colors=len(df['model'].unique()))
#
# legend_patches = []
# for i, model in enumerate([l.get_text() for l in g.legend.texts]):
#     correlation = xcom_corr[model]
#     legend_patches.append(mpatches.Patch(color=color_palette[i],
#                                          label=f'({correlation:0.2f}) {model}'))
#
# g.legend.set_visible(False)
# plt.legend(handles=legend_patches)
# plt.show()
# # g.set(xscale="log", yscale="log")

# repeat above but for each df in evals_dfs
import matplotlib.patches as mpatches

# One regression plot per metric (metric on x, Accuracy on y, one colour per
# model). The legend is rebuilt so each model name is prefixed with its R^2.
# The loop names avoid `model`: that module-level name is the COMET model
# used by `comet()`, and rebinding it to a string here would break that call.
for metric_name in evals_dfs:
    g = sns.lmplot(
        data=df,
        x=metric_name,
        y='Accuracy',
        hue='model',
        height=6,
        aspect=1.5,
        scatter_kws=dict(s=50, linewidths=1, edgecolor='w'),
        palette='tab10',
        ci=None,
    )

    # Same palette as the plot, one colour per unique model; colours are
    # paired with the legend entries in the order the legend lists them.
    color_palette = sns.color_palette('tab10', n_colors=len(df['model'].unique()))
    legend_labels = [text.get_text() for text in g.legend.texts]

    legend_patches = [
        mpatches.Patch(color=color,
                       label=f'({R2_model[metric_name][label]:0.2f}) {label}')
        for color, label in zip(color_palette, legend_labels)
    ]

    # Swap seaborn's legend for the annotated one.
    g.legend.set_visible(False)
    plt.legend(handles=legend_patches)
    plt.show()