## Импорты и настройка

In [None]:
import json
import os
import random
from typing import List, Dict

from data_structures import (
    Example,
    OptimizationConfig
)
from hierarchical_optimizer import HierarchicalOptimizer

print("✓ Imports successful")

# Credentials are read from the environment so that no secret is stored in the
# notebook file or in its saved outputs. Export OPENAI_API_KEY before starting
# the kernel; when it is unset the key falls back to an empty string, which is
# the value the placeholder used to hold.
api_config = {
    "provider": "openai",
    "api_key": os.environ.get("OPENAI_API_KEY", ""),
    "model": "gpt-4o-mini"
}

# Alternative provider settings (Gemini); uncomment to use them instead.
# api_config = {
#     "provider": "gemini",
#     "gemini_api_key": os.environ.get("GEMINI_API_KEY", ""),
#     "gemini_model": "gemini-2.5-flash"
# }

✓ Imports successful


## Подготовка датасета
Загружаем данные из JSONL-файлов, берём небольшую случайную выборку и делим обучающую часть на train/validation (задача классификации тональности).

In [2]:
# Maps the integer "label" field of a dataset record to the string that
# to_examples() stores as the expected output.
LABEL_MAP = {0: "negative", 1: "positive"}

def load_jsonl(path: str) -> List[Dict]:
    """Read a JSON Lines file into a list of parsed records.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        The parsed documents, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        # Blank or whitespace-only lines (typically a trailing empty line) are
        # not JSON documents; skip them instead of failing in json.loads.
        return [json.loads(line) for line in f if line.strip()]

def to_examples(data):
    """Convert raw dataset records into ``Example`` objects.

    Each record's "text" value becomes the example input, and its "label"
    value is translated through ``LABEL_MAP`` to form the expected output.
    """
    examples = []
    for record in data:
        examples.append(
            Example(
                input_text=record["text"],
                expected_output=LABEL_MAP[record["label"]],
            )
        )
    return examples

def train_val_split(data, val_ratio=0.1, seed=42):
    """Shuffle a copy of ``data`` with a fixed seed and split it in two.

    Args:
        data: Records to split. The caller's object is left unmodified.
        val_ratio: Fraction of the records assigned to the validation part.
        seed: Seed of the private ``random.Random`` used for shuffling.

    Returns:
        A ``(train_data, val_data)`` tuple of lists. The first
        ``int(len(data) * (1 - val_ratio))`` shuffled records form the
        training part and the remainder forms the validation part.
    """
    # Shuffle a copy: shuffling ``data`` itself would silently reorder the
    # caller's list as a side effect of asking for a split.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    split_idx = int(len(shuffled) * (1 - val_ratio))
    return shuffled[:split_idx], shuffled[split_idx:]

def sample_n(data, n=100, seed=42):
    """Return a seeded random sample of at most ``n`` items from ``data``.

    When ``data`` holds fewer than ``n`` items, every item is returned (in
    sampled order). The same ``seed`` always yields the same sample.
    """
    size = n if n < len(data) else len(data)
    return random.Random(seed).sample(data, size)

# Keep the demo cheap: draw a seeded sample of 20 records from each file.
train_data = sample_n(load_jsonl("data/train.jsonl"), 20)
test_data = sample_n(load_jsonl("data/test.jsonl"), 20)

# Hold out 10% of the sampled training records for validation.
train_split, val_split = train_val_split(train_data, val_ratio=0.1, seed=42)

train_examples, validation_examples, test_examples = (
    to_examples(part) for part in (train_split, val_split, test_data)
)

print("Dataset prepared:")
for split_name, split_examples in (
    ("Train", train_examples),
    ("Validation", validation_examples),
    ("Test", test_examples),
):
    print(f"  {split_name}: {len(split_examples)} examples")

Dataset prepared:
  Train: 18 examples
  Validation: 2 examples
  Test: 20 examples


## Конфигурация оптимизации

In [3]:
# Per-metric weights passed to the optimizer configuration. They presumably
# drive the composite score: the recorded initial score (0.680) equals this
# weighted sum of the recorded metric values. "f1" is listed with zero weight.
metric_weights = {
    "accuracy": 0.6,
    "safety": 0.2,
    "robustness": 0.1,
    "efficiency": 0.1,
    "f1": 0.0,
}

config = OptimizationConfig(
    # Core parameters
    max_generations=5,
    population_size=3,
    # Local optimization
    local_iterations_per_generation=3,
    local_candidates_per_iteration=2,
    local_batch_size=5,
    local_max_examples=20,
    # Global optimization
    global_trigger_interval=2,
    global_candidates=2,
    global_history_window=15,
    # Early stopping
    patience=2,
    min_improvement=0.005,
    # Diversity
    diversity_bonus=0.03,
    similarity_threshold=0.85,
    # Metrics
    metric_weights=metric_weights,
    # API parameters
    temperature=0.7,
    max_tokens=2000,
)

# Echo the settings that shape the run, as a sanity check.
print("Configuration created")
print("Key parameters:")
generations_limit = config.max_generations
print(f"  Max generations: {generations_limit}")
population = config.population_size
print(f"  Population size: {population}")
local_iterations = config.local_iterations_per_generation
print(f"  Local iterations: {local_iterations}")
trigger_every = config.global_trigger_interval
print(f"  Global trigger: every {trigger_every} generations")

Configuration created
Key parameters:
  Max generations: 5
  Population size: 3
  Local iterations: 3
  Global trigger: every 2 generations


## Создание начального промпта

In [4]:
# Seed prompt handed to the optimizer as its starting point.
# NOTE(review): this prompt asks for Yes/No answers, while LABEL_MAP in this
# notebook produces "negative"/"positive" expected outputs - confirm that the
# mismatch is intentional.
initial_prompt = (
    "Determine whether the Statement is a lie (Yes) or not (No) "
    "based on the Context and other information."
)

separator = "-" * 60
print("Initial prompt:")
print(separator)
print(initial_prompt)
print(separator)

Initial prompt:
------------------------------------------------------------
Determine whether the Statement is a lie (Yes) or not (No) based on the Context and other information.
------------------------------------------------------------


## Инициализация оптимизатора

In [5]:
# Build the optimizer from the optimization settings (config) and the LLM
# provider settings (api_config).
optimizer = HierarchicalOptimizer(config=config, api_config=api_config)

Initializing Hierarchical Optimizer...
✓ Initialization complete



## Запуск оптимизации

In [6]:
# Run the optimization starting from initial_prompt; the returned node is used
# by the analysis cells as the best prompt found.
# Long-running: the recorded report for this run shows about 4780 s in total.
best_node = optimizer.optimize(
    initial_prompt=initial_prompt,
    train_examples=train_examples,
    validation_examples=validation_examples,
    test_examples=test_examples,
    save_dir="./optimization_results",
)

HIERARCHICAL PROMPT OPTIMIZATION
Configuration:
  Max generations: 5
  Local iterations per generation: 3
  Global trigger interval: 2
  Population size: 3
  Patience: 2

Dataset sizes:
  Train: 18
  Validation: 2
  Test: 20

Evaluating initial prompt...
Initial score: 0.680
  Accuracy: 0.500
  Safety: 1.000
  Robustness: 0.800
  Efficiency: 1.000
  F1: 0.000


GENERATION 1/5

Phase 1: Local Optimization
  Population size: 1

  Optimizing node 1/1 (score: 0.680)

Starting Local Optimization
Starting node: 48ed0494-4401-442b-9651-381147af4c62 (gen 0)
Max iterations: 3


--- Iteration 1 ---
Failures: 2, Successes: 0
Generating text gradients...
Generated 1 gradients
  Generating variants from gradient 1/1
  Generated 2 variants, 2 unique
Generated 2 candidate prompts
  Evaluating candidate 1/2... Score: 0.732
  Evaluating candidate 2/2... Score: 0.133
Evaluated 2 candidates

  Top candidates (by score):
    1. Score: 0.732
    2. Score: 0.133
Best candidate score: 0.732 (Δ +0.052)
✓ Impr

## Анализ результатов

In [8]:
report = optimizer.get_optimization_report()

# Time and best score logged for every generation.
print('Optimization generations summary:')
for gen_entry in report['optimization_log']:
    gen_no = gen_entry['generation']
    gen_time = gen_entry['time']
    gen_best = gen_entry['best_score']
    print(f"  Generation {gen_no}: time {gen_time:.2f}s, best_score {gen_best:.3f}")

local_stats = report['component_statistics']['local_optimizer']
print('Local optimizer summary:')
recorded_iterations = local_stats.get('total_iterations', 0)
print(f"  Total iterations recorded: {recorded_iterations}")

# A missing average is reported as N/A rather than formatted as a number.
avg_it = local_stats.get('avg_iteration_time')
avg_text = 'N/A' if avg_it is None else f"{avg_it:.2f}s"
print(f"  Avg iteration time: {avg_text}")

local_llm_calls = local_stats.get('total_llm_calls_by_local', 0)
print(f"  Total LLM calls attributed to local iterations: {local_llm_calls}")

print('Per-iteration breakdown:')
for iter_stat in local_stats.get('iteration_stats', []):
    iter_no = iter_stat['iteration']
    iter_time = iter_stat['time']
    iter_calls = iter_stat['llm_calls']
    print(f"  Iter {iter_no}: time {iter_time:.2f}s, llm_calls {iter_calls}")

Optimization generations summary:
  Generation 1: time 282.12s, best_score 0.782
  Generation 2: time 825.15s, best_score 0.985
  Generation 3: time 1124.28s, best_score 0.990
  Generation 4: time 1466.81s, best_score 0.990
  Generation 5: time 1066.01s, best_score 0.990
Local optimizer summary:
  Total iterations recorded: 26
  Avg iteration time: 138.25s
  Total LLM calls attributed to local iterations: 195
Per-iteration breakdown:
  Iter 1: time 76.25s, llm_calls 5
  Iter 2: time 102.96s, llm_calls 5
  Iter 3: time 102.91s, llm_calls 5
  Iter 1: time 131.18s, llm_calls 5
  Iter 2: time 191.98s, llm_calls 8
  Iter 3: time 175.26s, llm_calls 8
  Iter 1: time 167.40s, llm_calls 10
  Iter 2: time 123.53s, llm_calls 7
  Iter 1: time 69.94s, llm_calls 3
  Iter 2: time 99.92s, llm_calls 5
  Iter 3: time 155.41s, llm_calls 10
  Iter 1: time 222.68s, llm_calls 14
  Iter 2: time 63.65s, llm_calls 3
  Iter 3: time 214.55s, llm_calls 14
  Iter 1: time 205.44s, llm_calls 14
  Iter 2: time 218.18

In [9]:
# Show the winning prompt between two rulers, followed by its score and origin.
ruler = "=" * 80
print("BEST PROMPT FOUND:")
print(ruler)
print(best_node.prompt_text)
print(ruler)
best_score = best_node.metrics.composite_score()
print(f"Score: {best_score:.3f}")
best_generation = best_node.generation
print(f"Generation: {best_generation}")
best_source = best_node.source.value
print(f"Source: {best_source}")

BEST PROMPT FOUND:
**Task:** Evaluate the truthfulness of the following statement and categorize it as either "True" or "False." To facilitate this task, follow the structured process outlined below, emphasizing rigorous verification of claims.

**Statement:** [Insert Statement Here]

**Verification Process:**

1. **Context Exploration:** 
   - What is the broader context surrounding this statement? Consider its implications and the specific setting, particularly regarding legal or political narratives.

2. **Claim Identification:** 
   - Identify three key claims made within the statement. Clearly articulate what assertions are being made.

3. **Evidence Verification:** 
   - For each key claim, reference primary legal documents, historical records, or reputable sources. List specific types of documents or data that could confirm or refute the statement (e.g., Supreme Court rulings, official military records).

4. **Counterarguments:** 
   - Identify any well-known counterarguments or

In [10]:
metrics = best_node.metrics

# Composite score first, then the individual metric values it is reported with.
print("METRICS:")
composite = metrics.composite_score()
print(f"  Composite Score: {composite:.3f}")
per_metric = metrics.metrics
print(f"  Accuracy:        {per_metric['accuracy']:.3f}")
print(f"  Safety:          {per_metric['safety']:.3f}")
print(f"  Robustness:      {per_metric['robustness']:.3f}")
print(f"  Efficiency:      {per_metric['efficiency']:.3f}")
print(f"  F1 Score:        {per_metric['f1']:.3f}")

METRICS:
  Composite Score: 0.990
  Accuracy:        1.000
  Safety:          1.000
  Robustness:      0.900
  Efficiency:      1.000
  F1 Score:        0.000


In [11]:
# Print the text rendering of the optimization trajectory returned by the optimizer.
print(optimizer.visualize_optimization_trajectory())


OPTIMIZATION TRAJECTORY

Generation | Best Score | Overall Best | Improvement
------------------------------------------------------------
   1       | 0.782      | 0.782       | +0.102 ███████████████████████████████████████
   2       | 0.985      | 0.985       | +0.203 █████████████████████████████████████████████████
   3       | 0.990      | 0.990       | +0.005 █████████████████████████████████████████████████
   4       | 0.990      | 0.990       | +0.000 █████████████████████████████████████████████████
   5       | 0.990      | 0.990       | +0.000 █████████████████████████████████████████████████




In [12]:
report = optimizer.get_optimization_report()

print("OPTIMIZATION REPORT:")
print("Overall Statistics:")
run_info = report['optimization_info']
print(f"   Total time: {run_info['total_time_seconds']:.2f}s")
print(f"   Generations: {run_info['generations']}")
component_stats = report['component_statistics']
print(f"   Total nodes explored: {component_stats['history']['total_nodes']}")

print("Component Statistics:")
local_part = component_stats['local_optimizer']
print(f"   Local optimizer iterations: {local_part['total_iterations']}")
print(f"   Local improvements: {local_part['improvements_count']}")
global_part = component_stats['global_optimizer']
print(f"   Global optimizer steps: {global_part['total_global_steps']}")
print(f"   Successful global changes: {global_part['successful_global_changes']}")

# Only the first three reported strategies are shown, with a shortened description.
print("Best Global Strategies:")
top_strategies = report['best_global_strategies'][:3]
for rank, ranked in enumerate(top_strategies, 1):
    strategy_type = ranked['strategy']['type']
    strategy_score = ranked['score']
    print(f"   {rank}. {strategy_type}: Score {strategy_score:.3f}")
    summary = ranked['strategy']['description'][:70]
    print(f"      {summary}...")

OPTIMIZATION REPORT:
Overall Statistics:
   Total time: 4780.14s
   Generations: 5
   Total nodes explored: 75
Component Statistics:
   Local optimizer iterations: 26
   Local improvements: 6
   Global optimizer steps: 3
   Successful global changes: 0
Best Global Strategies:
   1. SIMPLIFY: Score 0.985
      Distill complex prompts into more straightforward formats that maintai...
   2. COMBINE: Score 0.985
      Merge insights from successful prompts to create hybrid prompts that l...
   3. EXPAND: Score 0.983
      Broaden the scope of the tasks by integrating additional instruction t...


In [13]:
lineage = optimizer.history.get_lineage(best_node.id)

print("EVOLUTION OF BEST PROMPT:")
print("="*80)

# Walk the lineage in the order returned by the history, one step per node.
for step, ancestor in enumerate(lineage):
    print(f"Step {step}: Generation {ancestor.generation}, Source: {ancestor.source.value}")
    if ancestor.is_evaluated:
        ancestor_score = ancestor.metrics.composite_score()
        print(f"  Score: {ancestor_score:.3f}")

    if ancestor.operations:
        print("  Operations:")
        for applied in ancestor.operations:
            kind = applied.operation_type.value
            print(f"    - {kind}: {applied.description[:60]}...")

    # Draw an arrow between consecutive steps, but not after the last one.
    if step + 1 < len(lineage):
        print("  ↓")

EVOLUTION OF BEST PROMPT:
Step 0: Generation 2, Source: global
  Score: 0.682
  Operations:
    - restructure: DIVERSIFY: Introduce a wider variety of prompt styles and st...
  ↓
Step 1: Generation 3, Source: local
  Score: 0.990
  Operations:
    - modify_instruction: Edited based on gradient...


In [14]:
comparison = optimizer.compare_with_baseline(
    baseline_prompt=initial_prompt,
    test_examples=test_examples
)

print("COMPARISON WITH BASELINE:")
print("="*80)

# The baseline and optimized sections share one layout.
for heading, section_key in (("Baseline:", 'baseline'), ("Optimized:", 'optimized')):
    print(heading)
    for metric, value in comparison[section_key].items():
        print(f"  {metric:20s}: {value:.3f}")

# Each delta gets an arrow: up for a gain, down for a loss, right for no change.
print("Improvements:")
for metric, delta in comparison['improvements'].items():
    if delta > 0:
        arrow = "↑"
    elif delta < 0:
        arrow = "↓"
    else:
        arrow = "→"
    print(f"  {metric:20s}: {delta:+.3f} {arrow}")


Comparing with baseline...

Comparison Results:
  Baseline score: 0.445
  Optimized score: 0.867
  Improvement: +0.422
COMPARISON WITH BASELINE:
Baseline:
  composite_score     : 0.445
  accuracy            : 0.200
  safety              : 0.850
  robustness          : 0.750
  efficiency          : 0.800
  f1                  : 0.000
Optimized:
  composite_score     : 0.867
  accuracy            : 0.800
  safety              : 1.000
  robustness          : 0.872
  efficiency          : 1.000
  f1                  : 0.000
Improvements:
  composite_score     : +0.422 ↑
  accuracy            : +0.600 ↑
  safety              : +0.150 ↑
  robustness          : +0.122 ↑
  efficiency          : +0.200 ↑
  f1                  : +0.000 →


In [16]:
print("FINAL SUMMARY")
print("="*80)

# Collect the statistics of the three components before reporting on them.
history_stats = optimizer.history.get_statistics()
local_stats = optimizer.local_optimizer.get_statistics()
global_stats = optimizer.global_optimizer.get_statistics()

print("Overall Statistics:")
nodes_explored = history_stats['total_nodes']
print(f"  Total nodes explored: {nodes_explored}")
nodes_evaluated = history_stats['evaluated_nodes']
print(f"  Evaluations performed: {nodes_evaluated}")
generations_done = history_stats['max_generation']
print(f"  Generations completed: {generations_done}")
top_score = history_stats['best_score']
print(f"  Best score achieved: {top_score:.3f}")
mean_score = history_stats['avg_score']
print(f"  Average score: {mean_score:.3f}")

print("Local Optimization:")
local_iterations_total = local_stats['total_iterations']
print(f"  Total iterations: {local_iterations_total}")
local_improvements = local_stats['improvements_count']
print(f"  Improvements found: {local_improvements}")
local_rate = local_stats['improvement_rate']
print(f"  Success rate: {local_rate:.1%}")

print("Global Optimization:")
global_steps = global_stats['total_global_steps']
print(f"  Total global steps: {global_steps}")
global_candidates_total = global_stats['total_candidates_generated']
print(f"  Candidates generated: {global_candidates_total}")
global_successes = global_stats['successful_global_changes']
print(f"  Successful changes: {global_successes}")
global_rate = global_stats['success_rate']
print(f"  Success rate: {global_rate:.1%}")

print("Optimization complete!")
print("   Results saved to: ./optimization_results/")

FINAL SUMMARY
Overall Statistics:
  Total nodes explored: 75
  Evaluations performed: 75
  Generations completed: 5
  Best score achieved: 0.990
  Average score: 0.754
Local Optimization:
  Total iterations: 26
  Improvements found: 6
  Success rate: 23.1%
Global Optimization:
  Total global steps: 3
  Candidates generated: 15
  Successful changes: 0
  Success rate: 0.0%
Optimization complete!
   Results saved to: ./optimization_results/
