In [1]:
import os
import re
import json
import csv
import ast
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import glob

In [2]:
# Per-model column labels ("<model_name>_chosen"), listed from the 1B model up
# to the 70B models. Later cells use this list to order and to scan the
# per-model columns.
model_order = [
    "meta-llama__Llama-3.2-1B-Instruct_chosen",
    "meta-llama__Llama-3.2-3B-Instruct_chosen",
    "meta-llama__Llama-3.1-8B-Instruct_chosen",
    "meta-llama__Llama-3.1-70B-Instruct_chosen",
    "meta-llama__Llama-3.3-70B-Instruct_chosen",
]

INPUT_FOLDER = Path("datasets")
OUTPUT_FOLDER = Path("datasets/csv")
# parents=True: without it a fresh checkout that has no `datasets/` directory
# fails here with FileNotFoundError before any cell can run.
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)

# Captures the dataset name between the "samples_" prefix and the first
# "_<digit>" that follows it in a file name.
FILE_SELECTION_PATTERN = re.compile(r"samples_(.*?)_\d")


In [10]:
# Build one wide table: one row per question (`input_text`) and one 0/1
# correctness column per model directory found under INPUT_FOLDER.
DF = pd.DataFrame()
for model_dir in INPUT_FOLDER.iterdir():
    if not model_dir.is_dir():
        continue

    correct_col = f"{model_dir.name}_correct"
    model_frames = []  # collected and concatenated once (avoids quadratic concat)

    for jsonl_file in model_dir.glob("*.jsonl"):
        match = FILE_SELECTION_PATTERN.match(jsonl_file.name)

        if not match:
            # This message used to interpolate `filename`, an undefined name,
            # so any non-matching file raised NameError instead of being skipped.
            print(f"Skipping file (doesn't match pattern): {jsonl_file.name}")
            continue

        dataset_name = match.group(1)

        samples = pd.read_json(jsonl_file, lines=True)
        docs = samples["doc"]
        samples["input_text"] = docs.apply(lambda doc: doc["question"])

        try:
            samples["choices"] = docs.apply(lambda doc: doc["choices"])
        except KeyError:
            # Docs without a "choices" key get the fixed yes/no pair.
            samples["choices"] = docs.apply(lambda doc: ["yes", "no"])

        try:
            samples["correct_response"] = docs.apply(lambda doc: doc["answer"])
        except KeyError:
            try:
                samples["correct_response"] = docs.apply(lambda doc: doc["label"])
            except KeyError:
                # Neither key exists: show the first doc so the schema is
                # visible, then fail.
                display(docs[0])
                raise

        try:
            samples["subject"] = docs.apply(lambda doc: doc["subject"])
        except KeyError:
            samples["subject"] = "N/A"

        samples["dataset"] = dataset_name
        # Anything other than an `acc` of exactly 0 counts as correct (1).
        samples[correct_col] = samples["acc"].ne(0).astype(int)

        model_frames.append(
            samples[["input_text", "dataset", "choices", "correct_response", "subject", correct_col]]
        )

    if not model_frames:
        continue

    model_df = pd.concat(model_frames)
    if len(model_df) == 0:
        continue

    if len(DF) > 0:
        # NOTE(review): an inner merge on `input_text` multiplies rows when a
        # question text occurs more than once -- confirm texts are unique.
        DF = DF.merge(model_df[["input_text", correct_col]], on="input_text", how="inner")
    else:
        DF = model_df.copy(deep=True)

# Shorten "meta-llama__Llama-<version>-<size>-..." column names to
# "llama_<size>_<version>", left-padding two-character sizes with "0"
# (e.g. "8B" -> "08B") so that plain string sorting orders them by size.
COLUMNS = DF.columns
MODEL_COLS = [col for col in COLUMNS if "meta-llama" in col]
COLS = []
for col in MODEL_COLS:
    parts = col.split("-")
    param_count = parts[3] if len(parts[3]) == 3 else f"0{parts[3]}"
    COLS.append(f"llama_{param_count}_{parts[2].replace('.', '')}")

NON_MODEL_COLS = [col for col in COLUMNS if "meta-llama" not in col]

# Rename by name rather than by position so the result does not depend on the
# model columns happening to sit after all the non-model columns.
short_names = dict(zip(MODEL_COLS, COLS))
DF.columns = [short_names.get(col, col) for col in DF.columns]
COLUMNS = NON_MODEL_COLS + COLS

MODEL_COLS = [col for col in COLUMNS if "llama_" in col]
SORTED_MODEL_COLS = sorted(MODEL_COLS)
# .copy() so the `label` assignment below writes to an independent frame.
DF = DF[NON_MODEL_COLS + SORTED_MODEL_COLS].copy()

# `label` is the list of per-model 0/1 outcomes, in sorted model-column order.
DF["label"] = DF.apply(lambda row: [row[col] for col in SORTED_MODEL_COLS], axis=1)

DF.to_csv(f"{OUTPUT_FOLDER}/all_data.csv")

In [None]:
def jsonl_to_csv(jsonl_path, csv_path):
    """
    Convert a JSON-lines file into a CSV file.

    Every non-blank line of ``jsonl_path`` is parsed as one JSON object. The
    CSV header is the sorted union of the keys seen across all objects; an
    object that lacks a key gets an empty cell in that column.
    """
    with open(jsonl_path, 'r', encoding='utf-8') as src:
        stripped_lines = (raw.strip() for raw in src)
        records = [json.loads(text) for text in stripped_lines if text]

    header = sorted(set().union(*(record.keys() for record in records)))

    with open(csv_path, 'w', newline='', encoding='utf-8') as dst:
        table = csv.DictWriter(dst, fieldnames=header)
        table.writeheader()
        table.writerows(records)

In [None]:
# Write a sibling "<name>.csv" next to every "<name>.jsonl" in OUTPUT_FOLDER.
for jsonl_file in OUTPUT_FOLDER.glob("*.jsonl"):
    dataset_name = jsonl_file.stem
    csv_file = jsonl_file.with_suffix(".csv")
    jsonl_to_csv(jsonl_file, csv_file)

In [None]:
# Load the per-dataset CSVs produced by the JSONL conversion step.
BOOLQ = pd.read_csv(f"{OUTPUT_FOLDER}/boolq.csv")
LOGIQA2 = pd.read_csv(f"{OUTPUT_FOLDER}/logiqa2.csv")

# sorted(): glob order is OS-dependent, so sorting keeps the MMLU row order
# reproducible between runs and machines.
MMLU_FILES = sorted(glob.glob(f"{OUTPUT_FOLDER}/mmlu_*.csv"))
# Concatenate once instead of growing the frame inside a loop (quadratic
# copying); fall back to an empty frame when no MMLU file exists, because
# pd.concat([]) raises ValueError.
MMLU = (
    pd.concat([pd.read_csv(mmlu_path) for mmlu_path in MMLU_FILES])
    if MMLU_FILES
    else pd.DataFrame()
)
    

In [8]:
# Row counts of the three loaded tables, one output per table.
display(len(BOOLQ), len(LOGIQA2), len(MMLU))

65400

31440

280840

In [14]:
# Show the first BOOLQ row to inspect the available columns.
display(BOOLQ.iloc[:1])

Unnamed: 0,acc,arguments,doc,doc_hash,doc_id,filter,filtered_resps,metrics,model_name,prompt_hash,resps,target,target_hash
0,0.0,"{'gen_args_0': {'arg_0': ""Ethanol fuel -- All ...",{'question': 'does ethanol take more energy ma...,34c12031b4c7298fa36fd4bd2d8f2a482ec2fa5b8849bf...,0,none,"[['-3.78125', 'False'], ['-3.28125', 'False']]",['acc'],meta-llama__Llama-3.1-70B-Instruct,99815785e839d379cc9330a0be56ce32e4956ac9aae467...,"[[['-3.78125', 'False']], [['-3.28125', 'False...",0,5feceb66ffc86f38d952786c6d696c79c2dbc239dd4e91...


In [13]:
# Raw `arguments` string of the row labelled 0.
BOOLQ.loc[0, "arguments"]

'{\'gen_args_0\': {\'arg_0\': "Ethanol fuel -- All biomass goes through at least some of these steps: it needs to be grown, collected, dried, fermented, distilled, and burned. All of these steps require resources and an infrastructure. The total amount of energy input into the process compared to the energy released by burning the resulting ethanol fuel is known as the energy balance (or ``energy returned on energy invested\'\'). Figures compiled in a 2007 report by National Geographic Magazine point to modest results for corn ethanol produced in the US: one unit of fossil-fuel energy is required to create 1.3 energy units from the resulting ethanol. The energy balance for sugarcane ethanol produced in Brazil is more favorable, with one unit of fossil-fuel energy required to create 8 from the ethanol. Energy balance estimates are not easily produced, thus numerous such reports have been generated that are contradictory. For instance, a separate survey reports that production of ethanol

In [None]:
# Uses BOOLQ, the frame loaded above. The lowercase `boolq` is only assigned
# much further down, so on a fresh kernel this cell raised NameError.
BOOLQ.info()

In [None]:
# Mean `acc` per model over the BOOLQ rows. Reads BOOLQ (which carries the
# `model_name` and `acc` columns); the lowercase `boolq` is not defined yet at
# this point on a fresh kernel.
accuracy_per_model = BOOLQ.groupby('model_name')['acc'].mean()

print(accuracy_per_model)

In [None]:
# Mean accuracy per (dataset, model), drawn as a horizontal grouped bar chart.
datasets_dir = Path("data/lm_eval_harness_output")

results = {}

for dataset_file in datasets_dir.glob("*.csv"):
    dataset_name = dataset_file.stem
    df = pd.read_csv(dataset_file)
    accuracy = df.groupby("model_name")["acc"].mean()
    results[dataset_name] = accuracy

# Rows are datasets, columns are `model_name` values; a model missing from a
# dataset is shown as 0.
summary_df = pd.DataFrame(results).T.fillna(0)

# `model_order` holds "<model_name>_chosen" labels, while the columns here are
# raw `model_name` values, so indexing with `model_order` directly raised
# KeyError. Use the label as-is when such a column exists, otherwise strip the
# suffix. A model that is missing altogether still fails loudly below.
chosen_suffix = "_chosen"
plot_order = [
    label
    if label in summary_df.columns or not label.endswith(chosen_suffix)
    else label[: -len(chosen_suffix)]
    for label in model_order
]
summary_df = summary_df[plot_order]

# Create one figure and draw into it. Calling plt.figure() and then
# DataFrame.plot(figsize=...) opened a second figure and left the first empty.
fig, ax = plt.subplots(figsize=(12, max(2, len(summary_df) * 0.5)))
summary_df.plot(kind="barh", ax=ax, width=0.8)
plt.title("Model Accuracy per Dataset", fontsize=16)
plt.xlabel("Accuracy", fontsize=14)
plt.ylabel("Dataset", fontsize=14)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=10)
plt.tight_layout()
plt.show()

In [None]:
def extract_input_text(doc_str):
    """
    Build the model input text from a stringified ``doc`` dict.

    ``doc_str`` is parsed with ``ast.literal_eval``. The "question" and
    "passage" entries (each defaulting to an empty string) are joined with a
    single space, and surrounding whitespace is stripped from the result.
    """
    parsed = ast.literal_eval(doc_str)
    question, passage = (parsed.get(key, "") for key in ("question", "passage"))
    combined = "{} {}".format(question, passage)
    return combined.strip()

In [None]:
# Derive `boolq` explicitly from BOOLQ instead of relying on a `boolq` left
# over from an earlier kernel session (NameError on a fresh run). Working on a
# copy leaves BOOLQ unmodified, so this cell can be re-run safely.
boolq = BOOLQ.copy()
boolq["input_text"] = boolq["doc"].apply(extract_input_text)
boolq.info()

In [None]:
# Inspect the extracted input text of the row labelled 0.
boolq.loc[0, "input_text"]

In [None]:
# Add an `input_text` column to every CSV in the harness output folder,
# overwriting each file in place. Files without a `doc` column are left alone.
datasets_dir = Path("data/lm_eval_harness_output")

for csv_file in datasets_dir.glob("*.csv"):
    print(f"Processing {csv_file.name}...")
    df = pd.read_csv(csv_file)

    if "doc" not in df.columns:
        print(f"Skipping {csv_file.name}: No 'doc' column.")
        continue

    df["input_text"] = df["doc"].apply(extract_input_text)
    df.to_csv(csv_file, index=False)
    print(f"Updated {csv_file.name}.")

print("All files processed.")

In [None]:
# Load the MMLU medical-genetics table from the embeddings folder.
mmlu_medical_genetics = pd.read_csv(
    "data/lm_eval_harness_output/modern-bert-embeddings/mmlu_medical_genetics_embeds.csv"
)

In [None]:
# Input text of the row labelled 1.
mmlu_medical_genetics.loc[1, "input_text"]

In [None]:
# Count how many rows carry exactly this question text.
input_text_to_find = 'If the frequency of males affected with an X-linked recessive condition in a human population is .10 (one in ten), what will be the expected frequency of affected females?'

is_target = mmlu_medical_genetics["input_text"].eq(input_text_to_find)
count = is_target.sum()
print(f"The input text appears {count} times in the dataset.")


In [None]:
def combine_rows(group):
    """
    Collapse all rows of one group into a single wide row.

    Shared fields are taken from the group's first row, minus ``acc``,
    ``model_name`` and ``arguments``. ``arguments`` becomes the space-joined
    values of the whole group, and each row contributes one
    ``"<model_name>_chosen"`` entry holding that row's ``acc``.
    """
    first_row = group.iloc[0]
    merged = first_row.drop(labels=["acc", "model_name", "arguments"]).to_dict()
    merged["arguments"] = " ".join(group["arguments"])
    merged.update(
        (f"{name}_chosen", score)
        for name, score in zip(group["model_name"], group["acc"])
    )
    return pd.Series(merged)

# One row per distinct input text, with the per-model columns moved to the
# end in `model_order`.
grouped_df = mmlu_medical_genetics.groupby("input_text", as_index=False).apply(combine_rows)

existing_columns = list(filter(lambda col: col not in model_order, grouped_df.columns))
grouped_df = grouped_df.loc[:, existing_columns + model_order]

In [None]:
# Preview the first five combined rows.
grouped_df.head(n=5)

In [None]:
# Collapse every embeddings CSV to one row per input text (see combine_rows)
# and overwrite the file in place.
datasets_dir = Path("data/lm_eval_harness_output/modern-bert-embeddings/")

# Columns combine_rows needs. A file rewritten by an earlier run of this cell
# no longer has `acc`/`model_name`, so re-running used to crash with KeyError;
# such files are now skipped, which makes the cell safe to re-run.
required_columns = {"input_text", "acc", "model_name", "arguments"}

for csv_file in datasets_dir.glob("*.csv"):
    print(f"Processing {csv_file}...")
    df = pd.read_csv(csv_file)

    missing_columns = required_columns - set(df.columns)
    if missing_columns:
        print(f"Skipping {csv_file}: missing columns {sorted(missing_columns)} (already combined?)")
        continue

    grouped_df = (
        df.groupby("input_text", as_index=False)
        .apply(combine_rows)
    )
    # Put the per-model columns last, in `model_order`.
    existing_columns = [col for col in grouped_df.columns if col not in model_order]
    grouped_df = grouped_df[existing_columns + model_order]
    grouped_df.to_csv(csv_file, index=False)
    print(f"Saved updated file: {csv_file}")

In [None]:
# Load the LogiQA2 table from the embeddings folder.
logiqa2 = pd.read_csv(
    "data/lm_eval_harness_output/modern-bert-embeddings/logiqa2_embeds.csv"
)

In [None]:
def determine_chosen_model_numeric(row, model_order):
    """
    Return the 1-based position of the first model in ``model_order`` whose
    entry in ``row`` equals 1.

    When no model has a 1, ``len(model_order)`` is returned, i.e. the position
    of the last model in the list.
    """
    first_hit = (
        position
        for position, model in enumerate(model_order, start=1)
        if row[model] == 1
    )
    return next(first_hit, len(model_order))

# Label each LogiQA2 row with its chosen-model index and write the file back.
logiqa2["chosen_model"] = logiqa2.apply(
    determine_chosen_model_numeric, axis=1, args=(model_order,)
).astype(int)
logiqa2.to_csv("data/lm_eval_harness_output/modern-bert-embeddings/logiqa2_embeds.csv", index=False)
print("Added 'chosen_model' column with integer labels.")

In [None]:
# Preview the first five labelled LogiQA2 rows.
logiqa2.head(n=5)

In [None]:
# Add the integer `chosen_model` label to every CSV in the embeddings folder,
# rewriting each file in place.
# NOTE(review): this folder ("datasets/...") differs from the
# "data/lm_eval_harness_output/..." path used by earlier cells -- confirm
# which location is intended.
datasets_dir = Path("datasets/modern-bert-embeddings")

for csv_file in datasets_dir.glob("*.csv"):
    df = pd.read_csv(csv_file)
    chosen = df.apply(determine_chosen_model_numeric, axis=1, args=(model_order,))
    df["chosen_model"] = chosen.astype(int)
    df.to_csv(csv_file, index=False)

In [None]:
# Load the BoolQ table from the embeddings folder.
boolq = pd.read_csv(
    "datasets/modern-bert-embeddings/boolq_embeds.csv"
)

In [None]:
# Preview the first five BoolQ rows.
boolq.head(n=5)