# 6 - Effective Difficulty Levels

Resolves the definitive computational bucket for every record and computes a
continuous mixed-difficulty score, which is then mapped to a quintile (1-5).

**Pipeline**:
1. Load dataset (same JSONL as other pipeline notebooks).
2. For each record, find the **definitive bucket level** from `computation_buckets`:
   loop over the array, find the lowest level where `passes >= 1`.
3. Compute a **mixed difficulty** score combining bucket level, text structure,
   and solution structure signals using the specified mixed_difficulty function.
4. Build a quantile mapper using quintile buckets (1-5) from the score distribution.
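5. Write `effective_difficulty: { level, score }` into each record, where `level` is the
   definitive bucket level (`null` if no bucket passed) and `score` is the quintile (1-5)
   of the mixed difficulty score, then save.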

In [36]:
import json
import numpy as np
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional
from tqdm.notebook import tqdm

In [37]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Source dataset (one JSON object per line).
INPUT_JSONL_PATH = Path("/home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/dataset_1_5_complete.jsonl")

# Tier filter: a collection of tier names to process, or None to process
# every record. Tier names are lower-cased before comparison (see get_tier).
PROCESSABLE_TIERS = None

# The enriched dataset is written next to the input file.
OUTPUT_JSONL_PATH = INPUT_JSONL_PATH.with_name("dataset_1_5_complete_effective_difficulty.jsonl")

print(f"Input:  {INPUT_JSONL_PATH}")
print(f"Output: {OUTPUT_JSONL_PATH}")

Input:  /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/dataset_1_5_complete.jsonl
Output: /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/dataset_1_5_complete_effective_difficulty.jsonl


In [38]:
# ============================================================================
# LOAD DATASET
# ============================================================================

# Read the JSONL input into memory, one parsed object per non-blank line.
# Lines that fail to parse are reported and skipped so that a single corrupt
# line does not abort the whole load.
all_records: List[Dict[str, Any]] = []
with open(INPUT_JSONL_PATH, "r", encoding="utf-8") as f:
    for line_num, line in enumerate(f, start=1):
        if not line.strip():
            continue  # ignore blank / whitespace-only lines
        try:
            parsed = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error parsing line {line_num}: {e}")
        else:
            all_records.append(parsed)

print(f"Loaded {len(all_records)} records")

Loaded 71832 records


In [39]:
# ============================================================================
# DEFINITIVE BUCKET LEVEL
# ============================================================================

def get_definitive_bucket_level(record: Dict[str, Any]) -> Optional[int]:
    """Return the lowest ``computation_buckets`` level with ``passes >= 1``.

    Args:
        record: A dataset record. Its ``computation_buckets`` value is
            expected to be a list of dicts carrying ``level`` and ``passes``.

    Returns:
        The smallest numeric ``level`` among entries whose ``passes`` count is
        at least 1, or ``None`` when the array is missing, is not a non-empty
        list, or contains no passing entry.

    Malformed entries (non-dict items, or a missing / non-numeric ``level``)
    are skipped instead of raising, so one bad entry cannot abort the run.
    """
    buckets = record.get("computation_buckets")
    if not isinstance(buckets, list) or not buckets:
        return None

    best = None
    for entry in buckets:
        if not isinstance(entry, dict):
            continue  # tolerate null / garbage items in the array
        level = entry.get("level")
        # bool is an int subclass; a True/False "level" is never meaningful.
        if isinstance(level, bool) or not isinstance(level, (int, float)):
            continue
        passes = entry.get("passes", 0)
        # passes is an integer count (0 = fail, >= 1 = success)
        if isinstance(passes, (int, float)) and passes >= 1:
            if best is None or level < best:
                best = level
    return best


# Sanity check: histogram of the definitive bucket level over the whole dataset.
from collections import Counter

levels = list(map(get_definitive_bucket_level, all_records))
level_counts = Counter(levels)

print("Definitive bucket level distribution:")
# Numeric levels are listed in ascending order; the None bucket goes last.
ordered_levels = sorted(level_counts, key=lambda x: (x is None, x))
for lvl in ordered_levels:
    print(f"  level={lvl}: {level_counts[lvl]}")

Definitive bucket level distribution:
  level=0: 571
  level=1: 2152
  level=2: 10516
  level=3: 7295
  level=4: 2363
  level=5: 1080
  level=6: 6
  level=None: 47849


In [40]:
# ============================================================================
# MIXED DIFFICULTY SCORE
# ============================================================================

def _safe_len(value) -> int:
    """Return ``len(value)``, or 0 when the value is falsy or has no length."""
    try:
        return len(value or [])
    except TypeError:
        return 0


def _weight(table, key) -> float:
    """Look up a categorical weight; unknown or unhashable keys weigh 0.0."""
    try:
        return table.get(key, 0.0)
    except TypeError:
        return 0.0


def mixed_difficulty(B, from_text, from_solution):
    """
    Compute the mixed difficulty score from the bucket level, the problem-text
    structure and the solution structure.

    The score is ``B + delta_text + delta_sol`` clipped to ``[0, 6]``:

    * ``delta_text`` = 0.30 per mechanism, 0.10 per constraint beyond the
      first, 0.05 per object beyond the first.
    * ``delta_sol`` = sum of categorical weights for reasoning depth, case
      split, reasoning shape, invariant, scope and intermediate reuse, plus
      0.10 per technique transition (counted between 0 and 3) and 0.10 when
      dead-end pruning is present.

    Args:
        B: bucket level (number or None). None falls back to 2.
        from_text: dict with ``mechanisms``, ``constraints``, ``objects``
            (or None).
        from_solution: dict with the reasoning features listed above
            (or None).

    Returns:
        float in [0, 6]

    Missing, unknown or malformed feature values contribute 0 rather than
    raising: non-dict inputs are treated as empty, and a negative or
    non-finite ``technique_transitions`` counts as 0.
    """
    # Default bucket level to 2 if None
    if B is None:
        B = 2

    # Text structure contribution
    if not isinstance(from_text, dict):
        from_text = {}
    m = _safe_len(from_text.get("mechanisms"))
    c = _safe_len(from_text.get("constraints"))
    o = _safe_len(from_text.get("objects"))

    delta_text = 0.30 * m + 0.10 * max(0, c - 1) + 0.05 * max(0, o - 1)

    # Solution structure contribution
    if not isinstance(from_solution, dict):
        from_solution = {}

    depth = from_solution.get("reasoning_depth") or "medium"
    w_d = _weight({"shallow": -0.15, "medium": 0.0, "deep": 0.30}, depth)

    cs = from_solution.get("case_split") or "none"
    w_cs = _weight({"none": 0.0, "binary": 0.15, "multi": 0.30}, cs)

    rs = from_solution.get("reasoning_shape") or "linear"
    w_rs = _weight({"linear": 0.0, "branching": 0.20}, rs)

    tt_raw = from_solution.get("technique_transitions")
    try:
        # Clamp to [0, 3]: a negative count must never lower the score.
        w_t = 0.10 * max(0, min(int(tt_raw), 3))
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, NaN and +/-inf all count as "no signal".
        w_t = 0.0

    inv = from_solution.get("invariant") or "none"
    w_inv = _weight({"none": 0.0, "implicit": 0.15, "explicit": 0.30}, inv)

    scope = from_solution.get("reasoning_scope") or "local"
    w_sc = _weight({"local": 0.0, "global": 0.15}, scope)

    w_pr = 0.10 if bool(from_solution.get("dead_end_pruning")) else 0.0

    reuse = from_solution.get("intermediate_reuse") or "single"
    w_reuse = _weight({"single": 0.0, "multiple": 0.10}, reuse)

    delta_sol = w_d + w_cs + w_rs + w_t + w_inv + w_sc + w_pr + w_reuse

    # Combine and clip to [0, 6]
    D = B + delta_text + delta_sol
    if D < 0:
        D = 0.0
    if D > 6:
        D = 6.0

    return D


# Notebook cell marker: prints a confirmation once the definition above has been evaluated.
print("mixed_difficulty defined")

mixed_difficulty defined


In [41]:
# ============================================================================
# BUILD QUANTILE MAPPER
# ============================================================================

def build_quantile_mapper(D_values, n_levels: int = 5):
    """
    Build a quantile mapper that maps difficulty scores to levels 1..n_levels.

    Args:
        D_values: array-like of difficulty scores used to fit the thresholds.
        n_levels: number of equal-mass buckets. The default of 5 gives
            quintiles with cut points at the 20th/40th/60th/80th percentiles.

    Returns:
        tuple of (quantiles, mapper_function)
        - quantiles: numpy array holding the ``n_levels - 1`` thresholds
        - mapper_function: maps a score to an int level in 1..n_levels. A
          score equal to a threshold falls into the lower level, and a score
          above every threshold (or NaN) maps to ``n_levels``.

    Raises:
        ValueError: if ``n_levels < 2`` or ``D_values`` is empty.
    """
    if n_levels < 2:
        raise ValueError("n_levels must be at least 2")

    scores = np.asarray(D_values, dtype=float).ravel()
    if scores.size == 0:
        # Fail loudly here instead of producing NaN thresholds that would
        # silently send every record to the top level.
        raise ValueError("D_values must contain at least one score")

    # k / n_levels reproduces the literals 0.2, 0.4, 0.6, 0.8 exactly for n_levels=5.
    cut_points = [k / n_levels for k in range(1, n_levels)]
    qs = np.quantile(scores, cut_points)
    # Plain floats keep the per-record comparison cheap; the returned level is
    # a built-in int, so it stays JSON-serializable.
    thresholds = [float(q) for q in qs]

    def map_D(D):
        for level, threshold in enumerate(thresholds, start=1):
            if D <= threshold:
                return level
        return n_levels

    return qs, map_D


# Notebook cell marker: prints a confirmation once the definition above has been evaluated.
print("build_quantile_mapper defined")

build_quantile_mapper defined


In [42]:
# ============================================================================
# COMPUTE EFFECTIVE DIFFICULTY
# ============================================================================

def get_tier(record: Dict[str, Any]) -> Optional[str]:
    """Return the record's tier in lower case, or None when it has none.

    ``record["audit"]["tier"]`` takes precedence over the top-level
    ``record["tier"]``. A null or non-dict ``audit`` and null, empty or
    non-string tier values are ignored instead of raising.
    """
    audit = record.get("audit")
    audit_tier = audit.get("tier") if isinstance(audit, dict) else None
    for candidate in (audit_tier, record.get("tier")):
        if isinstance(candidate, str) and candidate:
            return candidate.lower()
    return None


# --- Pass 1: compute raw mixed_difficulty for eligible records ---
# raw_scores holds the eligible scores only (input to the quantile fit).
# record_scores / record_levels stay index-aligned with all_records and hold
# None for records excluded by the tier filter.
raw_scores: List[float] = []
record_scores: List[Optional[float]] = []  # parallel to all_records
record_levels: List[Optional[int]] = []

for record in tqdm(all_records, desc="Computing raw scores"):
    tier = get_tier(record)
    eligible = not PROCESSABLE_TIERS or tier in PROCESSABLE_TIERS

    if eligible:
        bucket_level = get_definitive_bucket_level(record)
        structure = record.get("math_structure") or {}
        score = mixed_difficulty(
            bucket_level,
            structure.get("from_text"),
            structure.get("from_solution"),
        )
        raw_scores.append(score)
        record_scores.append(score)
        record_levels.append(bucket_level)
    else:
        record_scores.append(None)
        record_levels.append(None)

print(f"\nComputed raw scores for {len(raw_scores)} eligible records")
if raw_scores:
    print(f"Score range: [{min(raw_scores):.3f}, {max(raw_scores):.3f}]")
    print(f"Mean: {np.mean(raw_scores):.3f}, Median: {np.median(raw_scores):.3f}")

Computing raw scores:   0%|          | 0/71832 [00:00<?, ?it/s]


Computed raw scores for 71832 eligible records
Score range: [0.000, 6.000]
Mean: 2.418, Median: 2.250


In [43]:
# --- Pass 2: build quantile mapper and write effective_difficulty ---
# The thresholds are fitted on the eligible raw scores, so there is nothing
# meaningful to do when pass 1 produced none.
if not raw_scores:
    raise RuntimeError("No eligible records found — check input data and filters.")

# Pass 1 fills record_scores / record_levels in lockstep with all_records.
# A length mismatch means the cells were run out of order or on stale state.
if not (len(all_records) == len(record_scores) == len(record_levels)):
    raise RuntimeError(
        "record_scores/record_levels are out of sync with all_records; re-run pass 1."
    )

quantiles, mapper = build_quantile_mapper(np.array(raw_scores))

print(f"Quantile thresholds: {quantiles}")

written = 0
skipped = 0

for record, score_raw, level in tqdm(
    zip(all_records, record_scores, record_levels),
    total=len(all_records),
    desc="Writing effective_difficulty",
):
    # A None score marks a record that pass 1 excluded via the tier filter.
    if score_raw is None:
        skipped += 1
        continue

    record["effective_difficulty"] = {
        # Definitive bucket level; None when no bucket passed.
        "level": level,
        # Raw mixed score mapped to its quantile bucket by the mapper.
        "score": mapper(score_raw),
    }
    written += 1

print(f"\nWritten: {written}, Skipped: {skipped}")

Quantile thresholds: [2.05 2.2  2.35 2.75]


Writing effective_difficulty:   0%|          | 0/71832 [00:00<?, ?it/s]


Written: 71832, Skipped: 0


In [44]:
# ============================================================================
# SAVE OUTPUT
# ============================================================================

# Write to a sibling temp file first and move it over the final path only
# after every record has been written, so an interrupted run never leaves a
# truncated output file behind.
OUTPUT_JSONL_PATH.parent.mkdir(parents=True, exist_ok=True)
tmp = OUTPUT_JSONL_PATH.with_suffix(".tmp")
try:
    with open(tmp, "w", encoding="utf-8") as f:
        for record in all_records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    # Path.replace overwrites an existing target on every platform, whereas
    # Path.rename raises on Windows when the target already exists.
    tmp.replace(OUTPUT_JSONL_PATH)
finally:
    # After a successful replace the temp file is gone; this only removes a
    # partial file left over from a failed write.
    if tmp.exists():
        tmp.unlink()

print(f"Saved {len(all_records)} records to {OUTPUT_JSONL_PATH}")

Saved 71832 records to /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/dataset_1_5_complete_effective_difficulty.jsonl


In [45]:
# ============================================================================
# SUMMARY STATISTICS
# ============================================================================

from collections import Counter

# Collect every annotated record together with its two effective_difficulty fields.
ed_records = []
levels_out = []
scores_out = []
for rec in all_records:
    if "effective_difficulty" not in rec:
        continue
    ed = rec["effective_difficulty"]
    ed_records.append(rec)
    levels_out.append(ed["level"])
    scores_out.append(ed["score"])

print(f"Records with effective_difficulty: {len(ed_records)}")

# Bucket level distribution (numeric levels ascending, None last)
print("\nDefinitive bucket level distribution:")
level_hist = Counter(levels_out)
for lvl, cnt in sorted(level_hist.items(), key=lambda x: (x[0] is None, x[0])):
    print(f"  level={lvl}: {cnt} ({cnt/len(ed_records)*100:.1f}%)")

# Mapped difficulty score distribution (1-5)
print("\nMapped difficulty score distribution (1-5):")
score_hist = Counter(scores_out)
for score_level, cnt in sorted(score_hist.items()):
    print(f"  score={score_level}: {cnt} ({cnt/len(ed_records)*100:.1f}%)")

# Last expression of the cell: the notebook displays the first record.
all_records[0]

Records with effective_difficulty: 71832

Definitive bucket level distribution:
  level=0: 571 (0.8%)
  level=1: 2152 (3.0%)
  level=2: 10516 (14.6%)
  level=3: 7295 (10.2%)
  level=4: 2363 (3.3%)
  level=5: 1080 (1.5%)
  level=6: 6 (0.0%)
  level=None: 47849 (66.6%)

Mapped difficulty score distribution (1-5):
  score=1: 15991 (22.3%)
  score=2: 15547 (21.6%)
  score=3: 11829 (16.5%)
  score=4: 14563 (20.3%)
  score=5: 13902 (19.4%)


{'id': 'gsm8k_0_20260206_000',
 'dataset': 'gsm8k',
 'problem_id': '0',
 'timestamp': '2026-02-06T20:39:18.551213',
 'problem': {'text': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'expected_answer': '72',
  'original_solution': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.'},
 'prompt_ids': {'base': 'base_prompt_v1',
  'timeout_repair': 'timeout_repair_prompt_v1',
  'error_repair': 'error_repair_prompt_v1',
  'wrong_answer_repair': 'wrong_answer_repair_prompt_v1',
  'reasoning_summary': None},
 'models': {'generating': 'gpt-oss', 'fixing': 'gpt-oss'},
 'generation_config': {'temperature': 1,
  'max_tokens': 3072,
  'reasoning_effort': 'low'},
 'reasoning_summary_config': {'enabled': False,
  'reasoning_effort': None,
  'max_tokens': None},
 'attempts': [{'n': 1,
   'stage': 'initial',
   'model':

In [46]:
# ============================================================================
# SAMPLE INSPECTION
# ============================================================================

import random
random.seed(42)  # fixed seed so the same samples are shown on every run

# Group the annotated records by their stored effective_difficulty level.
by_level: Dict[Optional[int], List] = {}
for rec in ed_records:
    key = rec["effective_difficulty"]["level"]
    if key not in by_level:
        by_level[key] = []
    by_level[key].append(rec)

# Print one random example per level: numeric levels ascending, None last.
ordered_keys = sorted(by_level, key=lambda x: (x is None, x))
for lvl in ordered_keys:
    sample = random.choice(by_level[lvl])
    print(f"--- level={lvl} | score={sample['effective_difficulty']['score']:.4f} ---")
    print(f"  id: {sample.get('id', 'N/A')}")
    print(f"  dataset: {sample.get('dataset', 'N/A')}")
    # Only the first 120 characters of the problem statement are shown.
    text = sample.get('problem', {}).get('text', '')[:120]
    print(f"  problem: {text}...")
    print()

--- level=0 | score=1.0000 ---
  id: numina1.5_2369_20260202_2370
  dataset: numina1.5
  problem: 6 books are aligned in a library. How many ways are there to arrange them....

--- level=1 | score=1.0000 ---
  id: gsm8k_795_20260206_795
  dataset: gsm8k
  problem: A movie theater has 6 screens which show movies back-to-back all day. If the movie theater is open for 8 hours, and each...

--- level=2 | score=3.0000 ---
  id: numina1.5_3606_20260202_3607
  dataset: numina1.5
  problem: 2. Find the smallest natural number that has exactly 12 different natural divisors, the largest prime divisor of which i...

--- level=3 | score=5.0000 ---
  id: numina1.5_14_20260203_015
  dataset: numina1.5:cn_contest
  problem: Example 21. How many three-digit numbers can be formed using $0,1,2,3,4,5$ without repeating any digit?...

--- level=4 | score=5.0000 ---
  id: numina1.5_3566_20260202_3567
  dataset: numina1.5
  problem: 13.437 What whole positive number should 180 be divided by, so that the rem