In [None]:
import json
import uuid
from pathlib import Path

import pandas as pd

In [None]:
# Configuration
# File names are relative to the run directory.
INPUT_FILENAME = 'dataset_full_metadata.jsonl'
OUTPUT_FILENAME = 'dataset_normalized.jsonl'

# Dataset root and the run directory inside it that is read from / written to.
OUTPUT_BASE_DIR = Path('/home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/')
RUN_DIR = OUTPUT_BASE_DIR.joinpath('bucketed')

In [None]:
# Resolve the run directory, then locate the input JSONL inside it.
if RUN_DIR is not None:
    print(f"Using specified run directory: {RUN_DIR.name}")
else:
    # No run directory configured: pick the sub-directory whose path sorts last.
    candidate_dirs = [entry for entry in OUTPUT_BASE_DIR.iterdir() if entry.is_dir()]
    candidate_dirs.sort(reverse=True)
    if len(candidate_dirs) == 0:
        raise FileNotFoundError(f"No run directories found in {OUTPUT_BASE_DIR}")
    RUN_DIR = candidate_dirs[0]
    print(f"Using most recent run directory: {RUN_DIR.name}")

jsonl_file = RUN_DIR / INPUT_FILENAME

# Fail early, before any parsing is attempted, when the input is missing.
if not jsonl_file.exists():
    raise FileNotFoundError(f"JSONL file not found: {jsonl_file}")

print(f"Reading from: {jsonl_file}")

In [None]:
# Read and parse JSONL file: one JSON document per non-blank line.
# Malformed lines are reported, counted and skipped so a single bad line
# does not abort the whole load.
datapoints = []
parse_errors = 0
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f, 1):
        if not line.strip():  # Skip empty lines
            continue
        try:
            datapoints.append(json.loads(line))
        except json.JSONDecodeError as e:
            parse_errors += 1
            print(f"Error parsing line {line_num}: {e}")
            print(f"Line content: {line[:200]}...")

NUM_ITEMS_TO_DISPLAY = 10
PREVIEW_MAX_CHARS = 200  # same truncation length as the error preview above

print(f"\nTotal datapoints loaded: {len(datapoints)}")
if parse_errors:
    print(f"Lines skipped due to parse errors: {parse_errors}")
print(f"Displaying first {min(NUM_ITEMS_TO_DISPLAY, len(datapoints))} items:\n")

# Actually show the announced preview; each record is truncated so that one
# large record cannot flood the cell output.
for item_num, item in enumerate(datapoints[:NUM_ITEMS_TO_DISPLAY], 1):
    preview = json.dumps(item, ensure_ascii=False)
    if len(preview) > PREVIEW_MAX_CHARS:
        preview = preview[:PREVIEW_MAX_CHARS] + "..."
    print(f"[{item_num}] {preview}")

In [None]:
# ──────────────────────────────────────────────────────────────
# Normalized output schema  (field -> source path)
# ──────────────────────────────────────────────────────────────
#
# Identity
#   id                          -> uuid4()
#   source_license              -> record.license
#   timestamp                   -> record.timestamp
#
# Problem definition
#   domain                      -> math_structure.from_text.domain
#   level                       -> lowest computation_buckets[].level where passes >= 1
#   text                        -> problem.text
#   solution                    -> problem.original_solution
#   expected_answer             -> problem.expected_answer
#
# Math structure (from text)
#   objects                     -> math_structure.from_text.objects
#   constraints                 -> math_structure.from_text.constraints
#
# Math structure (from solution)
#   reasoning_depth             -> math_structure.from_solution.reasoning_depth
#   technique_transitions       -> math_structure.from_solution.technique_transitions
#   reasoning_scope             -> math_structure.from_solution.reasoning_scope
#   intermediate_reuse          -> math_structure.from_solution.intermediate_reuse
#
# Code artifact & telemetry
#   code                          -> code of the first attempt with exec.status == "success"
#   code_attempts                 -> outcome.execution_attempts
#   code_runtime_ms               -> that attempt's exec.duration (seconds -> ms)
#   code_generated_tokens         -> tokenizer_fn(code), i.e. the token count of code
#   code_predicted_correct_answer -> that attempt's result.correct

In [None]:
def _successful_attempt(record):
    """First attempt with exec.status == 'success'."""
    for a in record.get("attempts", []):
        if (a.get("exec") or {}).get("status") == "success":
            return a
    return None


def _lowest_passing_level(record):
    """Lowest computation_buckets level where passes >= 1, or None."""
    buckets = record.get("computation_buckets") or []
    passing = [b["level"] for b in buckets if b.get("passes", 0) >= 1]
    return min(passing) if passing else None


def record_to_flat(record, tokenizer_fn=None):
    """
    Convert one source record to the normalized flat format.

    Args:
        record: Source record (dict). Nested sections ("problem", "outcome",
            "math_structure", and the attempt's "exec" / "result") may be
            missing or None; their fields then come out as None.
        tokenizer_fn: Optional callable applied to the code string of the
            successful attempt; its return value is stored as
            "code_generated_tokens". Not called when the code is empty.

    Returns:
        A dict with the normalized fields and a fresh random UUID as "id",
        or None if there is no successful attempt.
    """
    import math  # local import: only needed for the finiteness check below

    attempt = _successful_attempt(record)
    if attempt is None:
        return None

    problem = record.get("problem") or {}
    outcome = record.get("outcome") or {}
    math_struct = record.get("math_structure") or {}
    from_text = math_struct.get("from_text") or {}
    from_solution = math_struct.get("from_solution") or {}
    exec_ = attempt.get("exec") or {}
    result = attempt.get("result") or {}

    code = attempt.get("code") or ""

    # code_runtime_ms: exec.duration is in seconds -> convert to ms.
    # - bool is excluded explicitly because it is a subclass of int.
    # - NaN / infinity (both accepted by json.loads) cannot be converted to int.
    # - round() instead of int(): truncation turns e.g. 1.001 s into 1000 ms
    #   because 1.001 * 1000 evaluates to 1000.9999999999999 in binary floats.
    duration_sec = exec_.get("duration")
    if (
        isinstance(duration_sec, (int, float))
        and not isinstance(duration_sec, bool)
        and math.isfinite(duration_sec)
    ):
        code_runtime_ms = round(duration_sec * 1000)
    else:
        code_runtime_ms = None

    code_generated_tokens = tokenizer_fn(code) if tokenizer_fn and code else None

    flat = {
        # ── Identity ──────────────────────────────────
        "id": str(uuid.uuid4()),
        "source_license": record.get("license"),
        "timestamp": record.get("timestamp"),

        # ── Problem definition ────────────────────────
        "domain": from_text.get("domain"),
        "level": _lowest_passing_level(record),
        "text": problem.get("text"),
        "solution": problem.get("original_solution"),
        "expected_answer": problem.get("expected_answer"),

        # ── Math structure (from text) ────────────────
        "objects": from_text.get("objects"),
        "constraints": from_text.get("constraints"),

        # ── Math structure (from solution) ────────────
        "reasoning_depth": from_solution.get("reasoning_depth"),
        "technique_transitions": from_solution.get("technique_transitions"),
        "reasoning_scope": from_solution.get("reasoning_scope"),
        "intermediate_reuse": from_solution.get("intermediate_reuse"),

        # ── Code artifact & telemetry ─────────────────
        # An empty code string is normalized to None.
        "code": code or None,
        "code_attempts": outcome.get("execution_attempts"),
        "code_runtime_ms": code_runtime_ms,
        "code_generated_tokens": code_generated_tokens,
        "code_predicted_correct_answer": result.get("correct"),
    }
    return flat

In [None]:
import tiktoken
_tokenizer_fn = None
enc = tiktoken.get_encoding("cl100k_base")
_tokenizer_fn = lambda code: len(enc.encode(code, disallowed_special=())) if code else 0

In [None]:
# Convert and write normalized output (read-only: original dataset is not modified)
out_path = RUN_DIR / OUTPUT_FILENAME

# Flatten every source record, then keep only the ones that produced a row
# (record_to_flat returns None when there is no successful attempt).
converted = [record_to_flat(source_rec, tokenizer_fn=_tokenizer_fn) for source_rec in datapoints]
normalized = [row for row in converted if row is not None]

# One JSON document per line; non-ASCII characters are written as-is.
with open(out_path, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in normalized)

skipped_count = len(datapoints) - len(normalized)
print(f"Read: {len(datapoints)} records from {jsonl_file}")
print(f"Written: {len(normalized)} records to {out_path}")
print(f"Skipped (no successful attempt): {skipped_count}")

In [None]:
# Preview first normalized record
# Index 0 is the first record; index 1 would show the second one and raise
# IndexError when exactly one record was written.
if normalized:
    print(json.dumps(normalized[0], indent=2, ensure_ascii=False))