## Импорты и настройка

In [1]:
from data_structures import Example
import json
import random
from typing import List, Dict
from hierarchical_optimizer import HierarchicalOptimizer

# Sanity check: this line is only reached if every import above resolved.
print("✓ Imports successful")

✓ Imports successful


## Подготовка датасета
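Создаём простой датасет для демонстрации (задача классификации тональности): берём по 20 случайных примеров из `data/train.jsonl` и `data/test.jsonl`, а 10% обучающей части откладываем для валидации.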

In [2]:
# Integer dataset label -> string used as Example.expected_output (see to_examples).
# NOTE(review): the initial prompt in this notebook asks for "Yes"/"No" answers,
# while these targets are "negative"/"positive" — confirm the evaluator
# reconciles the two vocabularies.
LABEL_MAP = {0: "negative", 1: "positive"}

def load_jsonl(path: str) -> List[Dict]:
    """Read a JSON Lines file into a list of parsed records.

    Each non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example an empty line at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Location of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        # `line.strip()` is empty for blank lines, which json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]

def to_examples(data):
    """Convert raw dataset records into ``Example`` objects.

    Every record must provide a ``"text"`` entry, used as the example input,
    and a ``"label"`` entry that is a key of ``LABEL_MAP``; the mapped value
    becomes the expected output. The result keeps the order of ``data``.
    """
    examples = []
    for record in data:
        text = record["text"]
        target = LABEL_MAP[record["label"]]
        examples.append(Example(input_text=text, expected_output=target))
    return examples

def train_val_split(data, val_ratio=0.1, seed=42):
    """Shuffle ``data`` deterministically and split it into train/validation parts.

    The shuffle is applied to a copy, so the caller's sequence is left
    untouched and calling the function again with the same arguments yields
    the same split.

    Args:
        data: Sequence of records to split.
        val_ratio: Fraction of records assigned to the validation part; must
            lie in ``[0, 1]``.
        seed: Seed for the private ``random.Random`` used for shuffling.

    Returns:
        A ``(train_data, val_data)`` tuple of lists. The train part holds the
        first ``int(len(data) * (1 - val_ratio))`` shuffled records and the
        validation part holds the remainder.

    Raises:
        ValueError: If ``val_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio!r}")

    # Work on a copy: shuffling in place would silently reorder the caller's list.
    shuffled = list(data)
    random.Random(seed).shuffle(shuffled)

    split_idx = int(len(shuffled) * (1 - val_ratio))
    return shuffled[:split_idx], shuffled[split_idx:]

def sample_n(data, n=100, seed=42):
    """Draw up to ``n`` items from ``data`` without replacement.

    A private ``random.Random(seed)`` drives the draw, so the result is
    repeatable for a given seed. When ``data`` holds fewer than ``n`` items,
    all of them are returned in sampled order.
    """
    generator = random.Random(seed)
    sample_size = min(n, len(data))
    return generator.sample(data, sample_size)

# Draw 20 records from each file, using sample_n's default seed (42).
train_data = sample_n(load_jsonl("data/train.jsonl"), 20)
test_data = sample_n(load_jsonl("data/test.jsonl"), 20)

# Hold out 10% of the sampled training records for validation.
# NOTE(review): with 20 samples this yields 18 train / 2 validation records
# (see the recorded output), so validation scores move in very coarse steps.
train_split, val_split = train_val_split(train_data, val_ratio=0.1, seed=42)

# Wrap the raw records as Example objects; these three names are consumed by
# the optimization and comparison cells further down.
train_examples = to_examples(train_split)
validation_examples = to_examples(val_split)
test_examples = to_examples(test_data)

# Report the size of each split.
print("Dataset prepared:")
print(f"  Train: {len(train_examples)} examples")
print(f"  Validation: {len(validation_examples)} examples")
print(f"  Test: {len(test_examples)} examples")

Dataset prepared:
  Train: 18 examples
  Validation: 2 examples
  Test: 20 examples


## Создание начального промпта

In [3]:
# Starting prompt; it is passed to optimizer.optimize() and later reused as the
# baseline in compare_with_baseline().
initial_prompt = """Determine whether the Statement is a lie (Yes) or not (No) based on the Context and other information."""

# Echo the prompt between two 60-character rules.
print("Initial prompt:")
print("-" * 60)
print(initial_prompt)
print("-" * 60)

Initial prompt:
------------------------------------------------------------
Determine whether the Statement is a lie (Yes) or not (No) based on the Context and other information.
------------------------------------------------------------


## Инициализация оптимизатора

In [4]:
# Build the optimizer with its default constructor arguments; every later cell
# reads results through this single instance.
optimizer = HierarchicalOptimizer()

## Запуск оптимизации

In [5]:
# Run the optimization starting from initial_prompt. The returned node is kept
# as best_node and inspected by the analysis cells below.
# NOTE(review): test_examples are handed to optimize() — confirm they are used
# only for final reporting and never for selecting candidates, otherwise the
# later baseline comparison on the same test set is optimistic.
best_node = optimizer.optimize(
    initial_prompt=initial_prompt,
    train_examples=train_examples,
    validation_examples=validation_examples,
    test_examples=test_examples,
    save_dir="./optimization_results",
)

Evaluating initial prompt...
Initial score: 0.650
  Accuracy: 0.500
  Safety: 1.000
  Robustness: 0.750
  Efficiency: 0.000
  F1: 0.000


GENERATION 1/3

Phase 1: Local Optimization
  Population size: 1

  Optimizing node 1/1 (score: 0.650)

Starting Local Optimization


--- Iteration 1 ---
Failures: 2, Successes: 0
Generating text gradients...
Generated 1 gradients
  Generating variants from gradient 1/1
  Generated 2 variants, 2 unique
Generated 2 candidate prompts
  Evaluating candidate 1/2... Score: 0.460
  Evaluating candidate 2/2... Score: 0.650
Evaluated 2 candidates
Best candidate score: 0.650 (Δ +0.000)
✗ No significant improvement
Iteration time: 64.34s — LLM calls: 19 (total: 27)

--- Iteration 2 ---
Failures: 2, Successes: 0
Generating text gradients...
Generated 1 gradients
  Generating variants from gradient 1/1
  Generated 2 variants, 2 unique
Generated 2 candidate prompts
  Evaluating candidate 1/2... Score: 0.670
  Evaluating candidate 2/2... Score: 0.870
Evaluated 2 c

## Анализ результатов

In [6]:
# Fetch the optimizer's report; only the keys read below are relied upon.
report = optimizer.get_optimization_report()
# One line per entry of report['optimization_log']: generation, time, best score.
print('Optimization generations summary:')
for entry in report['optimization_log']:
    print(f"  Generation {entry['generation']}: time {entry['time']:.2f}s, best_score {entry['best_score']:.3f}")

# Local-optimizer statistics. Optional fields are read with .get() so that a
# missing key falls back to a default instead of raising KeyError.
local_stats = report['component_statistics']['local_optimizer']
print('Local optimizer summary:')
print(f"  Total iterations recorded: {local_stats.get('total_iterations', 0)}")
avg_it = local_stats.get('avg_iteration_time')
# avg_it is None when the key is absent; "N/A" is shown in that case.
if avg_it is not None:
    print(f"  Avg iteration time: {avg_it:.2f}s")
else:
    print('  Avg iteration time: N/A')
print(f"  Total LLM calls attributed to local iterations: {local_stats.get('total_llm_calls_by_local', 0)}")
# NOTE(review): in the recorded output the iteration numbers repeat (1, 2, 3,
# 1, 2, ...), presumably restarting for each local-optimization run — confirm.
print('Per-iteration breakdown:')
for s in local_stats.get('iteration_stats', []):
    print(f"  Iter {s['iteration']}: time {s['time']:.2f}s, llm_calls {s['llm_calls']}")

Optimization generations summary:
  Generation 1: time 198.89s, best_score 0.965
  Generation 2: time 144.72s, best_score 0.965
  Generation 3: time 152.43s, best_score 0.965
Local optimizer summary:
  Total iterations recorded: 7
  Avg iteration time: 69.36s
  Total LLM calls attributed to local iterations: 117
Per-iteration breakdown:
  Iter 1: time 64.34s, llm_calls 19
  Iter 2: time 55.28s, llm_calls 19
  Iter 3: time 75.55s, llm_calls 19
  Iter 1: time 51.44s, llm_calls 11
  Iter 2: time 93.26s, llm_calls 19
  Iter 1: time 53.01s, llm_calls 11
  Iter 2: time 92.61s, llm_calls 19


In [7]:
# Show the prompt text of the node returned by optimize(), framed by
# 80-character rules, followed by its composite score, generation and source.
print("BEST PROMPT FOUND:")
print("=" * 80)
print(best_node.prompt_text)
print("=" * 80)
print(f"Score: {best_node.metrics.composite_score():.3f}")
print(f"Generation: {best_node.generation}")
print(f"Source: {best_node.source.value}")

BEST PROMPT FOUND:
Evaluate the statement below and determine if it is a lie (Yes) or not (No). When making your assessment, please adhere to the following detailed criteria:

1. **Contextual Analysis**: Examine the broader context in which the statement is made, including the political environment. Is there a significant political backdrop that could affect perception?
   
2. **Tone and Language**: Analyze the language used in the statement. Does it carry partisan connotations or suggest bias? Is it emotionally charged or neutral?
   
3. **Truthfulness vs. Exaggeration**: Determine if the statement is factually accurate or if it involves exaggeration. Consider if the statement presents a distorted view of facts to enhance the subject's image.

4. **Implications for Public Perception**: Reflect on the potential impact of the statement on public opinion. How might it affect the credibility of the speaker and their associated political party?

**Evaluation Guidance**: 
- Rate each criter

In [8]:
# Metrics object attached to the best node; the individual values are read
# from its `metrics` mapping by name.
metrics = best_node.metrics

# NOTE(review): Efficiency and F1 print as 0.000 in every recorded output
# (initial prompt, best prompt, baseline comparison) — check whether these two
# metrics are actually being computed.
print("METRICS:")
print(f"  Composite Score: {metrics.composite_score():.3f}")
print(f"  Accuracy:        {metrics.metrics['accuracy']:.3f}")
print(f"  Safety:          {metrics.metrics['safety']:.3f}")
print(f"  Robustness:      {metrics.metrics['robustness']:.3f}")
print(f"  Efficiency:      {metrics.metrics['efficiency']:.3f}")
print(f"  F1 Score:        {metrics.metrics['f1']:.3f}")

METRICS:
  Composite Score: 0.965
  Accuracy:        1.000
  Safety:          1.000
  Robustness:      0.825
  Efficiency:      0.000
  F1 Score:        0.000


In [9]:
# The trajectory chart is returned as a value rather than printed by the
# optimizer, so it is printed explicitly here.
print(optimizer.visualize_optimization_trajectory())


OPTIMIZATION TRAJECTORY

Generation | Best Score | Overall Best | Improvement
------------------------------------------------------------
   1       | 0.965      | 0.965       | +0.315 ████████████████████████████████████████████████
   2       | 0.965      | 0.965       | +0.000 ████████████████████████████████████████████████
   3       | 0.965      | 0.965       | +0.000 ████████████████████████████████████████████████




In [10]:
# Fetch the report again (same call as in the generation-summary cell above).
report = optimizer.get_optimization_report()

# Run-level totals from the 'optimization_info' and history sections.
print("OPTIMIZATION REPORT:")
print("Overall Statistics:")
print(f"   Total time: {report['optimization_info']['total_time_seconds']:.2f}s")
print(f"   Generations: {report['optimization_info']['generations']}")
print(f"   Total nodes explored: {report['component_statistics']['history']['total_nodes']}")

# Counters of the local and global optimizers; these keys are indexed
# directly, so a missing key raises KeyError.
print("Component Statistics:")
print(f"   Local optimizer iterations: {report['component_statistics']['local_optimizer']['total_iterations']}")
print(f"   Local improvements: {report['component_statistics']['local_optimizer']['improvements_count']}")
print(f"   Global optimizer steps: {report['component_statistics']['global_optimizer']['total_global_steps']}")
print(f"   Successful global changes: {report['component_statistics']['global_optimizer']['successful_global_changes']}")

# At most the first three strategies are listed; when the list is empty only
# the heading is printed. Each description is cut to 70 characters and "..."
# is appended unconditionally.
print("Best Global Strategies:")
for i, strategy in enumerate(report['best_global_strategies'][:3], 1):
    print(f"   {i}. {strategy['strategy']['type']}: Score {strategy['score']:.3f}")
    print(f"      {strategy['strategy']['description'][:70]}...")

OPTIMIZATION REPORT:
Overall Statistics:
   Total time: 509.59s
   Generations: 3
   Total nodes explored: 13
Component Statistics:
   Local optimizer iterations: 7
   Local improvements: 2
   Global optimizer steps: 0
   Successful global changes: 0
Best Global Strategies:


In [11]:
# Chain of nodes leading to the best node, as returned by the optimizer's
# history for best_node.id.
lineage = optimizer.history.get_lineage(best_node.id)

print("EVOLUTION OF BEST PROMPT:")
print("="*80)

for i, node in enumerate(lineage):
    print(f"Step {i}: Generation {node.generation}, Source: {node.source.value}")
    # The score is only shown for nodes flagged as evaluated.
    if node.is_evaluated:
        print(f"  Score: {node.metrics.composite_score():.3f}")

    if node.operations:
        # Plain string: this line has no placeholders, so no f-prefix is needed.
        print("  Operations:")
        for op in node.operations:
            # The description is cut to 60 characters; "..." is always appended.
            print(f"    - {op.operation_type.value}: {op.description[:60]}...")

    # Draw an arrow between consecutive steps, but not after the last one.
    if i < len(lineage) - 1:
        print("  ↓")

EVOLUTION OF BEST PROMPT:
Step 0: Generation 0, Source: initial
  Score: 0.650
  ↓
Step 1: Generation 1, Source: local
  Score: 0.870
  Operations:
    - modify_instruction: Edited based on gradient...
  ↓
Step 2: Generation 2, Source: local
  Score: 0.965
  Operations:
    - modify_instruction: Edited based on gradient...


In [12]:
# Compare the optimized prompt against the initial prompt on the test examples.
# The result is read below as a dict with 'baseline', 'optimized' and
# 'improvements' sections, each mapping a metric name to a number.
comparison = optimizer.compare_with_baseline(
    baseline_prompt=initial_prompt,
    test_examples=test_examples
)

print("COMPARISON WITH BASELINE:")
print("="*80)

# Metric names are left-aligned in a 20-character column.
print("Baseline:")
for metric, value in comparison['baseline'].items():
    print(f"  {metric:20s}: {value:.3f}")

print("Optimized:")
for metric, value in comparison['optimized'].items():
    print(f"  {metric:20s}: {value:.3f}")

# Signed differences, with an arrow: ↑ for positive, ↓ for negative, → for zero.
# NOTE(review): the recorded output shows safety dropping by 0.300 for the
# optimized prompt despite the higher composite score — worth a closer look.
print("Improvements:")
for metric, value in comparison['improvements'].items():
    arrow = "↑" if value > 0 else "↓" if value < 0 else "→"
    print(f"  {metric:20s}: {value:+.3f} {arrow}")


Comparing with baseline...

Comparison Results:
  Baseline score: 0.424
  Optimized score: 0.729
  Improvement: +0.304
COMPARISON WITH BASELINE:
Baseline:
  composite_score     : 0.424
  accuracy            : 0.200
  safety              : 0.850
  robustness          : 0.672
  efficiency          : 0.000
  f1                  : 0.000
Optimized:
  composite_score     : 0.729
  accuracy            : 0.750
  safety              : 0.550
  robustness          : 0.845
  efficiency          : 0.000
  f1                  : 0.000
Improvements:
  composite_score     : +0.304 ↑
  accuracy            : +0.550 ↑
  safety              : -0.300 ↓
  robustness          : +0.172 ↑
  efficiency          : +0.000 →
  f1                  : +0.000 →


In [13]:
print("FINAL SUMMARY")
print("="*80)

# Statistics are read directly from the three components here, rather than
# through get_optimization_report() as in the earlier cells.
history_stats = optimizer.history.get_statistics()
local_stats = optimizer.local_optimizer.get_statistics()
global_stats = optimizer.global_optimizer.get_statistics()

# History-level totals.
print("Overall Statistics:")
print(f"  Total nodes explored: {history_stats['total_nodes']}")
print(f"  Evaluations performed: {history_stats['evaluated_nodes']}")
print(f"  Generations completed: {history_stats['max_generation']}")
print(f"  Best score achieved: {history_stats['best_score']:.3f}")
print(f"  Average score: {history_stats['avg_score']:.3f}")

# Local optimizer counters; the rate is formatted as a percentage.
print("Local Optimization:")
print(f"  Total iterations: {local_stats['total_iterations']}")
print(f"  Improvements found: {local_stats['improvements_count']}")
print(f"  Success rate: {local_stats['improvement_rate']:.1%}")

# Global optimizer counters; the rate is formatted as a percentage.
print("Global Optimization:")
print(f"  Total global steps: {global_stats['total_global_steps']}")
print(f"  Candidates generated: {global_stats['total_candidates_generated']}")
print(f"  Successful changes: {global_stats['successful_global_changes']}")
print(f"  Success rate: {global_stats['success_rate']:.1%}")

print("Optimization complete!")
# Plain string: no placeholders, so no f-prefix is needed.
# NOTE(review): this path repeats the save_dir passed to optimize(); keep the
# two in sync if the output directory changes.
print("   Results saved to: ./optimization_results/")

FINAL SUMMARY
Overall Statistics:
  Total nodes explored: 13
  Evaluations performed: 13
  Generations completed: 3
  Best score achieved: 0.965
  Average score: 0.665
Local Optimization:
  Total iterations: 7
  Improvements found: 2
  Success rate: 28.6%
Global Optimization:
  Total global steps: 0
  Candidates generated: 0
  Successful changes: 0
  Success rate: 0.0%
Optimization complete!
   Results saved to: ./optimization_results/
