# Dataset Building

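Build the training dataset from previously generated rules: load the rules, generate synthetic preference pairs from them, format the pairs for DPO, and save the result.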

In [1]:
import sys
from pathlib import Path
import json
from datetime import datetime
from collections import Counter

# Make the project's local modules importable: add ../core and ../evaluation
# (resolved to absolute paths, relative to the current working directory) to
# the module search path. Presumably these directories hold
# synthetic_preference_generator and quality_metrics, which are imported
# below — confirm against the repository layout.
sys.path.append(str(Path('../core').resolve()))
sys.path.append(str(Path('../evaluation').resolve()))

from synthetic_preference_generator import (
    SyntheticPreferenceGenerator,
    IntentRule,
    load_rules_from_generated_file
)
from quality_metrics import QualityMetrics

## Configuration

In [2]:
# Locations of this notebook's input (generated rules) and outputs (dataset files).
RULES_PATH = Path('../data/generated_rules/rules.jsonl')
DATASET_PATH = Path('../data/training_dataset')
OUTPUT_FILE = DATASET_PATH / 'synthetic_preferences.jsonl'
# Create the output directory up front so the save step cannot fail on a missing folder.
DATASET_PATH.mkdir(parents=True, exist_ok=True)

config_path = Path('../config/training_config.json')
# Read the config as UTF-8 explicitly: without `encoding`, open() falls back to
# the platform's locale encoding, which can mis-decode non-ASCII JSON content.
with open(config_path, 'r', encoding='utf-8') as f:
    training_config = json.load(f)

# Echo the configured split so the reader can confirm which config was loaded.
data_config = training_config['data_config']
print(f"Train/Val/Test: {data_config['train_split']}/{data_config['val_split']}/{data_config['test_split']}")

Train/Val/Test: 0.8/0.1/0.1


## Load Generated Rules

In [3]:
# Load the per-trace rules produced by the rule-generation notebook. A missing
# file is reported rather than raised; the empty mapping lets the later cells
# skip their work.
if not RULES_PATH.exists():
    print(f"No rules file at {RULES_PATH}")
    print("Run 03_rule_generation.ipynb first")
    trace_rules = {}
else:
    trace_rules = load_rules_from_generated_file(str(RULES_PATH))
    # Each value is a sized collection of rules; total them across all traces.
    total_rules = sum(map(len, trace_rules.values()))
    print(f"Loaded {total_rules} rules from {len(trace_rules)} traces")

No rules file at ../data/generated_rules/rules.jsonl
Run 03_rule_generation.ipynb first


## Initialize Generator

In [4]:
# Settings for the preference generator, gathered in one mapping so they are
# easy to scan and tweak. Their exact semantics are defined by
# SyntheticPreferenceGenerator (imported from synthetic_preference_generator).
generator_settings = dict(
    confidence_threshold_high=0.7,
    confidence_threshold_low=0.4,
    min_novelty_score=0.3,
    require_platform_context=True,
    synthetic_weight=0.3,
    group_by_dimension=True,
    group_by_platform=True,
)
generator = SyntheticPreferenceGenerator(**generator_settings)
print("Generator initialized")

Generator initialized


## Generate Preferences

In [5]:
# Strategies handed to the generator when building preference pairs.
STRATEGIES = ['confidence', 'quality', 'completeness', 'novelty', 'frequency', 'constitutional']

# Bind both results before branching so that re-running this cell always
# resets them: later cells read `all_pairs` and `source_counts`, and must never
# see an undefined name or a stale value left over from an earlier run.
all_pairs = []
source_counts = Counter()

if trace_rules:
    all_pairs = generator.generate_from_trace_batch(trace_rules, strategies=STRATEGIES)
    print(f"Generated {len(all_pairs)} preference pairs")

    # Tally pairs by their `source` attribute and list the most frequent first.
    source_counts = Counter(p.source for p in all_pairs)
    print("\nBy strategy:")
    for source, count in source_counts.most_common():
        print(f"  {source}: {count}")

## Format for DPO

In [6]:
# Convert the preference pairs into DPO training examples. `dpo_examples` is
# bound on every path because the save cell reads it.
dpo_examples = []
if all_pairs:
    dpo_examples = generator.format_for_dpo(all_pairs, include_weights=True, include_grouping=True)
    print(f"Formatted {len(dpo_examples)} DPO examples")
    # format_for_dpo is defined elsewhere; in case it returns nothing for a
    # non-empty input, only preview a sample when one exists instead of
    # failing on dpo_examples[0].
    if dpo_examples:
        sample_json = json.dumps(dpo_examples[0], indent=2)
        print("\nSample:")
        # Show at most 500 characters and mark the cut only when text was dropped.
        print(sample_json[:500] + ("..." if len(sample_json) > 500 else ""))
else:
    # Report the empty case explicitly, like the save cell does.
    print("No preference pairs generated")

No preference pairs generated


## Save Dataset

In [7]:
# Persist the dataset as JSON Lines plus a small metadata sidecar file; nothing
# is written when there are no examples.
if not dpo_examples:
    print("No examples to save")
else:
    # One JSON document per line.
    with OUTPUT_FILE.open('w') as dataset_file:
        dataset_file.writelines(json.dumps(example) + '\n' for example in dpo_examples)
    print(f"Saved {len(dpo_examples)} examples to {OUTPUT_FILE}")

    # Record when and how the dataset was built, next to the dataset itself.
    metadata = {
        'created_at': datetime.now().isoformat(),
        'total_examples': len(dpo_examples),
        'source_distribution': dict(source_counts),
        'strategies_used': ['confidence', 'quality', 'completeness', 'novelty', 'frequency', 'constitutional']
    }
    metadata_path = DATASET_PATH / 'metadata.json'
    with metadata_path.open('w') as metadata_file:
        json.dump(metadata, metadata_file, indent=2)
    print("Saved metadata")

No examples to save


## Next Steps

- 05_dataset_analysis.ipynb: Analyze distribution
- 06_pre_training_validation.ipynb: Validate before training