In [18]:
import os
import re
import json
import csv
from pathlib import Path
import pandas as pd

In [12]:
# Merge the per-model lm-eval sample logs into one JSONL file per dataset.
# Input layout:  BASE_DIR/<model_dir>/samples_<dataset>_<timestamp>.jsonl
# Output layout: OUTPUT_DIR/<dataset>.jsonl (samples of all models concatenated)
BASE_DIR = Path("data/lm_eval_results")

OUTPUT_DIR = Path("datasets")
# parents=True so a nested output location can be configured without a manual mkdir.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Capture the dataset name from filenames like:
# "samples_boolq_2024-12-29T14-39-43.293973.jsonl"
# The name is everything between "samples_" and the "_YYYY-MM-DDT" timestamp.
# Anchoring on the full date (instead of the first "_<digit>") keeps dataset
# names that themselves contain "_<digit>" intact.
pattern = re.compile(r"samples_(.*?)_\d{4}-\d{2}-\d{2}T")

# Output files already (re)created during this run. The first write to each
# output truncates it, so re-running the cell rebuilds the merged files
# instead of appending a second copy of every sample.
started_outputs = set()

# Sorted traversal makes the row order of the merged files reproducible.
for model_dir in sorted(BASE_DIR.iterdir()):
    if not model_dir.is_dir():
        continue
    for jsonl_file in sorted(model_dir.glob("*.jsonl")):
        filename = jsonl_file.name

        match = pattern.match(filename)
        if not match:
            # If it doesn't match "samples_<dataset>_<timestamp>.jsonl", skip
            print(f"Skipping file (doesn't match pattern): {filename}")
            continue

        dataset_name = match.group(1)

        out_path = OUTPUT_DIR / f"{dataset_name}.jsonl"

        # Truncate on the first write of this run, append afterwards.
        mode = "a" if out_path in started_outputs else "w"
        started_outputs.add(out_path)

        with open(out_path, mode, encoding="utf-8") as out_f, \
             open(jsonl_file, "r", encoding="utf-8") as in_f:
            for line in in_f:
                # A source file without a trailing newline would otherwise glue
                # its last record to the first record of the next file.
                out_f.write(line if line.endswith("\n") else line + "\n")

        print(f"Appended {jsonl_file} -> {out_path}")

Appended data/lm_eval_results/meta-llama__Llama-3.2-3B-Instruct/samples_mmlu_college_biology_2024-12-29T14-34-19.612445.jsonl -> datasets/mmlu_college_biology.jsonl
Appended data/lm_eval_results/meta-llama__Llama-3.2-3B-Instruct/samples_mmlu_international_law_2024-12-29T14-34-19.612445.jsonl -> datasets/mmlu_international_law.jsonl
Appended data/lm_eval_results/meta-llama__Llama-3.2-3B-Instruct/samples_mmlu_conceptual_physics_2024-12-29T14-34-19.612445.jsonl -> datasets/mmlu_conceptual_physics.jsonl
Appended data/lm_eval_results/meta-llama__Llama-3.2-3B-Instruct/samples_mmlu_moral_disputes_2024-12-29T14-34-19.612445.jsonl -> datasets/mmlu_moral_disputes.jsonl
Appended data/lm_eval_results/meta-llama__Llama-3.2-3B-Instruct/samples_mmlu_astronomy_2024-12-29T14-34-19.612445.jsonl -> datasets/mmlu_astronomy.jsonl
Appended data/lm_eval_results/meta-llama__Llama-3.2-3B-Instruct/samples_mmlu_machine_learning_2024-12-29T14-34-19.612445.jsonl -> datasets/mmlu_machine_learning.jsonl
Appended dat

In [14]:
def jsonl_to_csv(jsonl_path, csv_path):
    """
    Convert a JSON Lines file into a CSV file.

    Every non-blank line of ``jsonl_path`` is parsed as one JSON object.
    The CSV header is the alphabetically sorted union of the keys seen
    across all objects; an object that lacks a column gets an empty cell.
    Non-string values are stringified by the csv module when written.

    Args:
        jsonl_path: Path of the .jsonl file to read (UTF-8).
        csv_path: Path of the CSV file to create or overwrite (UTF-8).
    """
    records = []
    columns = set()

    # Pass 1: parse every record and collect the full set of column names.
    with open(jsonl_path, 'r', encoding='utf-8') as src:
        for raw in src:
            text = raw.strip()
            if text:
                record = json.loads(text)
                records.append(record)
                columns.update(record.keys())

    # Pass 2: write the header followed by all rows in their original order.
    header = sorted(columns)
    with open(csv_path, 'w', newline='', encoding='utf-8') as dst:
        writer = csv.DictWriter(dst, fieldnames=header)
        writer.writeheader()
        writer.writerows(records)

In [17]:
# Convert each merged per-dataset JSONL file into a CSV stored next to it
# (same directory, same stem, ".csv" extension).
jsonl_dir = OUTPUT_DIR

for jsonl_file in jsonl_dir.glob("*.jsonl"):
    dataset_name = jsonl_file.stem
    # Sibling path of the source file, i.e. still inside jsonl_dir.
    csv_file = jsonl_file.with_name(f"{dataset_name}.csv")

    print(f"Converting {jsonl_file} -> {csv_file} ...")
    jsonl_to_csv(jsonl_file, csv_file)
    print(f"Saved {csv_file}")

Converting datasets/mmlu_college_computer_science.jsonl -> datasets/mmlu_college_computer_science.csv ...
Saved datasets/mmlu_college_computer_science.csv
Converting datasets/mmlu_college_medicine.jsonl -> datasets/mmlu_college_medicine.csv ...
Saved datasets/mmlu_college_medicine.csv
Converting datasets/mmlu_philosophy.jsonl -> datasets/mmlu_philosophy.csv ...
Saved datasets/mmlu_philosophy.csv
Converting datasets/mmlu_high_school_physics.jsonl -> datasets/mmlu_high_school_physics.csv ...
Saved datasets/mmlu_high_school_physics.csv
Converting datasets/mmlu_abstract_algebra.jsonl -> datasets/mmlu_abstract_algebra.csv ...
Saved datasets/mmlu_abstract_algebra.csv
Converting datasets/mmlu_econometrics.jsonl -> datasets/mmlu_econometrics.csv ...
Saved datasets/mmlu_econometrics.csv
Converting datasets/mmlu_conceptual_physics.jsonl -> datasets/mmlu_conceptual_physics.csv ...
Saved datasets/mmlu_conceptual_physics.csv
Converting datasets/mmlu_management.jsonl -> datasets/mmlu_management.csv 

In [None]:
# Preview one of the CSVs written by the conversion cell. An empty path always
# raises, so point at a concrete file; change the filename to inspect another dataset.
preview_csv = OUTPUT_DIR / "mmlu_college_computer_science.csv"
pd.read_csv(preview_csv).head()