# Dataset Analysis

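Analyze the training dataset of preference pairs for distribution (sources and dimension groups), quality (confidence gaps and weights), and coverage (abstraction levels and scopes).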

In [1]:
import sys
from pathlib import Path
import json
from collections import Counter, defaultdict

sys.path.append(str(Path('../evaluation').resolve()))
from quality_metrics import QualityMetrics

## Load Dataset

In [2]:
# Relative path (resolved against the current working directory) of the
# JSONL preference-pair file that this notebook loads and analyzes.
# NOTE(review): presumably produced by 04_dataset_building.ipynb — confirm.
DATASET_PATH = Path('../data/training_dataset/synthetic_preferences.jsonl')

def load_dataset(filepath: Path) -> list:
    """Load records from a JSON Lines file.

    Each non-blank line is parsed as one JSON document. A missing file is
    treated as "no data yet" and produces an empty list instead of an
    error, so callers can simply check whether anything was loaded.

    Args:
        filepath: Location of the ``.jsonl`` file to read.

    Returns:
        The parsed records in file order; an empty list if ``filepath``
        does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not filepath.exists():
        return []
    # Pin the encoding: without it, decoding follows the platform locale
    # (e.g. cp1252 on Windows) and non-ASCII text in the UTF-8 file is
    # either garbled or raises UnicodeDecodeError.
    with open(filepath, 'r', encoding='utf-8') as f:
        # Whitespace-only lines carry no record and are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Read all preference pairs once; the analysis cells below guard on `examples`.
examples = load_dataset(DATASET_PATH)
loaded_pairs = len(examples)
print(f"Loaded {loaded_pairs} preference pairs")

# Nothing was loaded: point the reader at the notebook to run first.
if not loaded_pairs:
    print("Run 04_dataset_building.ipynb first")

Loaded 0 preference pairs
Run 04_dataset_building.ipynb first


## Distribution Analysis

In [3]:
if examples:
    n_pairs = len(examples)
    divider = "=" * 50

    # Count pairs per 'source'; records without the key fall under 'unknown'.
    source_dist = Counter(ex.get('source', 'unknown') for ex in examples)
    print("Source Distribution:")
    print(divider)
    # most_common() lists entries by descending count, ties in first-seen order.
    for label, n in source_dist.most_common():
        share = n / n_pairs * 100
        # One '#' per two percentage points gives a quick text histogram.
        print(f"  {label:15} {n:5} ({share:5.1f}%) {'#' * int(share / 2)}")

    # Same tally keyed on 'dimension_group'.
    dim_dist = Counter(ex.get('dimension_group', 'unknown') for ex in examples)
    print("\nDimension Distribution:")
    print(divider)
    for label, n in dim_dist.most_common():
        share = n / n_pairs * 100
        print(f"  {label:20} {n:5} ({share:5.1f}%)")

## Quality Analysis

In [4]:
if examples:
    # Per-pair difference between the preferred and rejected rule's
    # 'confidence'; a side missing the field (or missing entirely) counts as 0.5.
    confidence_gaps = [
        ex.get('preferred', {}).get('confidence', 0.5)
        - ex.get('rejected', {}).get('confidence', 0.5)
        for ex in examples
    ]
    mean_gap = sum(confidence_gaps) / len(confidence_gaps)

    print("Confidence Gap Analysis:")
    print("=" * 50)
    print(f"  Mean gap: {mean_gap:.3f}")
    print(f"  Min gap:  {min(confidence_gaps):.3f}")
    print(f"  Max gap:  {max(confidence_gaps):.3f}")

    # Pairs without an explicit 'weight' are treated as weight 1.0.
    weights = [ex.get('weight', 1.0) for ex in examples]
    mean_weight = sum(weights) / len(weights)
    print("\nWeight Statistics:")
    print(f"  Mean weight: {mean_weight:.3f}")

## Abstraction Level Analysis

In [5]:
if examples:
    n_pairs = len(examples)
    divider = "=" * 50
    # Only the preferred side of each pair is inspected in this cell.
    preferred_rules = [ex.get('preferred', {}) for ex in examples]

    # Tally 'abstraction_level'; rules without the field count as 'unknown'.
    pref_abstractions = Counter(
        rule.get('abstraction_level', 'unknown') for rule in preferred_rules
    )
    print("Preferred Rule Abstraction Levels:")
    print(divider)
    # most_common() lists entries by descending count, ties in first-seen order.
    for label, n in pref_abstractions.most_common():
        share = n / n_pairs * 100
        print(f"  {label:15} {n:5} ({share:5.1f}%)")

    # Same tally keyed on 'scope'.
    pref_scopes = Counter(
        rule.get('scope', 'unknown') for rule in preferred_rules
    )
    print("\nPreferred Rule Scopes:")
    print(divider)
    for label, n in pref_scopes.most_common():
        share = n / n_pairs * 100
        print(f"  {label:20} {n:5} ({share:5.1f}%)")

## Constitutional Signals

In [6]:
if examples:
    # Pairs whose 'source' field is exactly 'constitutional'.
    constitutional_pairs = []
    for ex in examples:
        if ex.get('source') == 'constitutional':
            constitutional_pairs.append(ex)
    print(f"Constitutional Strategy Pairs: {len(constitutional_pairs)}")

    # Count preferred rules whose description contains any of these keywords
    # (case-insensitive substring match).
    iteration_keywords = ['iterative', 'refine', 'adjust', 'return', 'revisit']
    iteration_count = 0
    for ex in examples:
        text = ex.get('preferred', {}).get('description', '').lower()
        if any(kw in text for kw in iteration_keywords):
            iteration_count += 1
    iteration_pct = iteration_count / len(examples) * 100
    print(f"Rules with iteration language: {iteration_count} ({iteration_pct:.1f}%)")

## Sample Examples

In [7]:
if examples:
    # Maximum number of description characters shown per rule.
    preview_len = 60

    print("Sample Preference Pairs:")
    print("=" * 60)
    # Show the first three pairs, numbered from 1.
    for i, ex in enumerate(examples[:3], start=1):
        print(f"\n[{i}] Source: {ex.get('source', 'unknown')}")
        # The padded labels keep both descriptions starting in the same column.
        for label, side in (("Preferred:", 'preferred'), ("Rejected: ", 'rejected')):
            text = ex.get(side, {}).get('description', 'N/A')
            # Add the ellipsis only when text was actually cut off, so short
            # descriptions and the 'N/A' placeholder are not shown as truncated.
            marker = "..." if len(text) > preview_len else ""
            print(f"  {label} {text[:preview_len]}{marker}")

## Summary

In [8]:
if examples:
    # Relies on source_dist, confidence_gaps, constitutional_pairs and
    # iteration_count from the earlier analysis cells.
    n_pairs = len(examples)
    n_constitutional = len(constitutional_pairs)
    mean_gap = sum(confidence_gaps) / len(confidence_gaps)

    print("Dataset Summary")
    print("=" * 60)
    print(f"Total pairs:          {n_pairs}")
    print(f"Unique sources:       {len(source_dist)}")
    print(f"Mean confidence gap:  {mean_gap:.3f}")
    print(f"Constitutional pairs: {n_constitutional}")

    # Weighted blend: 30% share of constitutional pairs, 30% share of rules
    # with iteration language, 40% mean confidence gap. The max(..., 1)
    # denominators guard against division by zero.
    pair_denominator = max(n_pairs, 1)
    constitutional_share = n_constitutional / pair_denominator
    iteration_share = iteration_count / pair_denominator
    gap_component = sum(confidence_gaps) / max(len(confidence_gaps), 1)
    quality_score = (
        constitutional_share * 0.3 +
        iteration_share * 0.3 +
        gap_component * 0.4
    )
    print(f"\nQuality score: {quality_score:.2f}")

## Next Steps

- 06_pre_training_validation.ipynb: Validate before training