## Импорты и настройка

In [None]:
from datasets import load_dataset
from data_structures import Example
from hierarchical_optimizer import HierarchicalOptimizer

### Функции для подготовки данных

In [2]:
def squad_v2_to_examples(data):
    """Convert SQuAD v2 records into ``Example`` objects.

    Every record is read through its ``question``, ``context`` and
    ``answers['text']`` fields. Question and context are rendered into one
    prompt-style input string; the expected output is the first listed answer
    text, or the literal ``'No answer'`` when the answer list is empty.

    Args:
        data: Iterable of mapping-like records exposing the fields above.

    Returns:
        A list holding one ``Example`` per record, in iteration order.
    """

    def _to_example(record):
        # Build the model input first, then resolve the reference answer.
        prompt = (f'# Question: \n {record["question"]} \n'
                  f'# Context: \n {record["context"]} \n')
        answer_texts = record['answers']['text']
        if answer_texts:
            target = answer_texts[0]
        else:
            # Records with an empty answer list map to this sentinel.
            target = 'No answer'
        return Example(input_text=prompt, expected_output=target)

    return [_to_example(record) for record in data]

def get_squad_v2_data(train_num: int, val_ratio: float, test_num: int,
                      seed: "int | None" = None):
    """Sample train / validation / test examples from SQuAD v2.

    The dataset's ``train`` split is divided into a training part and a
    held-out validation part; the dataset's ``validation`` split is used as
    the pool for test examples. The requested number of rows is then taken
    from each part and converted with ``squad_v2_to_examples``.

    Args:
        train_num: Number of training examples to return (must be >= 0).
        val_ratio: Value strictly between 0 and 1. It is used twice: as the
            fraction of the ``train`` split held out for validation, and as
            the size of the returned validation set relative to ``train_num``
            (``int(train_num * val_ratio)`` examples).
        test_num: Number of test examples to return (must be >= 0).
        seed: Optional seed forwarded to the split and to the test shuffle so
            that repeated runs draw the same examples. ``None`` (the default)
            keeps the sampling non-deterministic, as before.

    Returns:
        A ``(train_examples, validation_examples, test_examples)`` tuple of
        lists.

    Raises:
        ValueError: If a requested count is negative or ``val_ratio`` is not
            strictly between 0 and 1. Raised before anything is downloaded.
    """
    # Validate up front so that bad arguments fail fast, before any dataset
    # download or split is attempted.
    if train_num < 0 or test_num < 0:
        raise ValueError(
            "train_num and test_num must be non-negative, "
            f"got train_num={train_num}, test_num={test_num}"
        )
    if not 0 < val_ratio < 1:
        raise ValueError(
            f"val_ratio must be strictly between 0 and 1, got {val_ratio}"
        )

    ds_train = load_dataset('rajpurkar/squad_v2', split='train')
    ds_test = load_dataset('rajpurkar/squad_v2', split='validation')

    # train_test_split shuffles before splitting by default, so both resulting
    # parts are already in random order and need no additional shuffle.
    split = ds_train.train_test_split(test_size=val_ratio, seed=seed)
    ds_train = split['train']
    ds_val = split['test']

    # The test pool is not touched by the split above and is shuffled here.
    ds_test = ds_test.shuffle(seed=seed)

    train_split = ds_train.select(range(train_num))
    val_split = ds_val.select(range(int(train_num * val_ratio)))
    test_split = ds_test.select(range(test_num))

    train_examples = squad_v2_to_examples(train_split)
    validation_examples = squad_v2_to_examples(val_split)
    test_examples = squad_v2_to_examples(test_split)

    return train_examples, validation_examples, test_examples

def data_fabric(dataset: str = 'squad_v2', train_num: int = 50, val_ratio: float = 0.4, test_num: int = 50):
    """Return example splits and the baseline prompt for a named dataset.

    Only ``'squad_v2'`` is implemented. Any other name is rejected explicitly
    rather than silently falling back to SQuAD v2 data.

    Args:
        dataset: Dataset identifier; must be ``'squad_v2'``.
        train_num: Number of training examples, passed to
            ``get_squad_v2_data``.
        val_ratio: Validation ratio, passed to ``get_squad_v2_data``.
        test_num: Number of test examples, passed to ``get_squad_v2_data``.

    Returns:
        A ``(train_examples, validation_examples, test_examples,
        initial_prompt)`` tuple.

    Raises:
        ValueError: If ``dataset`` is not a supported dataset name.
    """
    if dataset != 'squad_v2':
        raise ValueError(
            f"Unsupported dataset {dataset!r}; only 'squad_v2' is available"
        )

    squad_v2_initial_prompt = """Answer the question based on the context. If there is no answer in the context then just return 'No answer'.\n"""

    train_examples, validation_examples, test_examples = get_squad_v2_data(train_num, val_ratio, test_num)
    initial_prompt = squad_v2_initial_prompt

    return train_examples, validation_examples, test_examples, initial_prompt

## Подготовка датасета
Загружаем небольшую подвыборку SQuAD v2 для демонстрации (задача ответов на вопросы по контексту)

In [3]:
# NOTE(review): LABEL_MAP is not referenced anywhere else in the visible
# notebook; it looks like a leftover from another task — confirm before removing.
LABEL_MAP = {0: "truth", 1: "lie"}

# Build the three example splits and the baseline prompt in a single call.
(train_examples,
 validation_examples,
 test_examples,
 initial_prompt) = data_fabric('squad_v2')

# Report how many examples ended up in each split.
print(
    "Dataset prepared:",
    f"  Train: {len(train_examples)} examples",
    f"  Validation: {len(validation_examples)} examples",
    f"  Test: {len(test_examples)} examples",
    sep="\n",
)

Dataset prepared:
  Train: 50 examples
  Validation: 20 examples
  Test: 50 examples


## Создание начального промпта

In [4]:
# Show the baseline prompt framed by two 60-dash rules.
print(
    "Initial prompt:",
    "-" * 60,
    initial_prompt,
    "-" * 60,
    sep="\n",
)

Initial prompt:
------------------------------------------------------------
Answer the question based on the context. If there is no answer in the context then just return 'No answer'.

------------------------------------------------------------


## Инициализация оптимизатора

In [5]:
# Instantiate the optimizer with no arguments, so whatever defaults its
# constructor defines are used.
optimizer = HierarchicalOptimizer()

## Запуск оптимизации

In [6]:
# Run the optimization starting from the baseline prompt. The three example
# splits and the output directory are passed through as keyword arguments, and
# the returned object is kept as `best_node` for the analysis cells below.
# NOTE(review): how each split is used internally (training vs. candidate
# selection vs. final scoring) is defined by HierarchicalOptimizer — confirm there.
best_node = optimizer.optimize(
    initial_prompt=initial_prompt,
    train_examples=train_examples,
    validation_examples=validation_examples,
    test_examples=test_examples,
    save_dir="./optimization_results",
)

Evaluating initial prompt...
Initial score: 0.842
  Accuracy: 0.733
  Safety: 1.000
  Robustness: 0.773
  Efficiency: 1.000
  F1: 0.702


GENERATION 1/3

Phase 1: Local Optimization
  Population size: 1

  Optimizing node 1/1 (score: 0.842)

Starting Local Optimization


--- Iteration 1 ---
Failures: 11, Successes: 19
Generating text gradients...
Clustering failures by error type...
  Cluster 'Missing Information': 7 failures
  Cluster 'Incorrect Answer': 10 failures
  Cluster 'Overly Specific Answer': 6 failures
  Cluster 'Misunderstanding Context': 2 failures
  Cluster 'Misinterpretation of Question': 2 failures
Generating contrastive gradient...
Generated 6 gradients
  Generating variants from gradient 1/6
  Generating variants from gradient 2/6
  Generating variants from gradient 3/6
  Generating variants from gradient 4/6
  Generating variants from gradient 5/6
  Generating variants from gradient 6/6
  Generated 30 variants, 24 unique
Generated 24 candidate prompts
  Evaluating ca

## Анализ результатов

In [7]:
report = optimizer.get_optimization_report()

# One line per logged generation: elapsed time and best score.
print('Optimization generations summary:')
for gen_entry in report['optimization_log']:
    print(
        f"  Generation {gen_entry['generation']}: "
        f"time {gen_entry['time']:.2f}s, "
        f"best_score {gen_entry['best_score']:.3f}"
    )

local_stats = report['component_statistics']['local_optimizer']
print('Local optimizer summary:')
print(f"  Total iterations recorded: {local_stats.get('total_iterations', 0)}")

# The average may be missing from the stats; show a placeholder in that case.
avg_it = local_stats.get('avg_iteration_time')
avg_it_text = 'N/A' if avg_it is None else f"{avg_it:.2f}s"
print(f"  Avg iteration time: {avg_it_text}")

print(
    "  Total LLM calls attributed to local iterations: "
    f"{local_stats.get('total_llm_calls_by_local', 0)}"
)

# Per-iteration timing and LLM call counts; an absent list prints nothing.
print('Per-iteration breakdown:')
for iter_entry in local_stats.get('iteration_stats', []):
    print(
        f"  Iter {iter_entry['iteration']}: "
        f"time {iter_entry['time']:.2f}s, "
        f"llm_calls {iter_entry['llm_calls']}"
    )

Optimization generations summary:
  Generation 1: time 2281.16s, best_score 0.853
  Generation 2: time 748.64s, best_score 0.853
  Generation 3: time 0.00s, best_score 0.853
Local optimizer summary:
  Total iterations recorded: 3
  Avg iteration time: 1000.20s
  Total LLM calls attributed to local iterations: 3255
Per-iteration breakdown:
  Iter 1: time 2276.31s, llm_calls 2551
  Iter 1: time 724.30s, llm_calls 704
  Iter 1: time 0.00s, llm_calls 0


In [8]:
# Print the winning prompt between two 80-character rules, followed by its
# composite score and the generation/source recorded on the node.
heavy_rule = "=" * 80

print("BEST PROMPT FOUND:")
print(heavy_rule)
print(best_node.prompt_text)
print(heavy_rule)

best_score = best_node.metrics.composite_score()
print(f"Score: {best_score:.3f}")
print(f"Generation: {best_node.generation}")
print(f"Source: {best_node.source.value}")

BEST PROMPT FOUND:
Answer the question based on the context. Only provide answers that are explicitly stated in the context. If there is no answer found, return 'No answer'.
Score: 0.853
Generation: 1
Source: local


In [9]:
metrics = best_node.metrics

print("METRICS:")
print(f"  Composite Score: {metrics.composite_score():.3f}")

# Each remaining row reads one entry of the metrics mapping; labels are padded
# to a common width so the values line up in one column.
for row_label, metric_key in (
    ("Accuracy:", 'accuracy'),
    ("Safety:", 'safety'),
    ("Robustness:", 'robustness'),
    ("Efficiency:", 'efficiency'),
    ("F1 Score:", 'f1'),
):
    print(f"  {row_label:<17}{metrics.metrics[metric_key]:.3f}")

METRICS:
  Composite Score: 0.853
  Accuracy:        0.733
  Safety:          1.000
  Robustness:      0.823
  Efficiency:      1.000
  F1 Score:        0.710


In [10]:
# Print whatever the optimizer's trajectory visualization helper returns.
print(optimizer.visualize_optimization_trajectory())


OPTIMIZATION TRAJECTORY

Generation | Best Score | Overall Best | Improvement
------------------------------------------------------------
   1       | 0.853      | 0.853       | +0.012 ██████████████████████████████████████████
   2       | 0.853      | 0.853       | +0.000 ██████████████████████████████████████████
   3       | 0.853      | 0.853       | +0.000 ██████████████████████████████████████████




In [11]:
report = optimizer.get_optimization_report()

print("OPTIMIZATION REPORT:")
print("Overall Statistics:")
run_info = report['optimization_info']
print(f"   Total time: {run_info['total_time_seconds']:.2f}s")
print(f"   Generations: {run_info['generations']}")
component_report = report['component_statistics']
print(f"   Total nodes explored: {component_report['history']['total_nodes']}")

# Counters reported by the local and global optimizer components.
print("Component Statistics:")
local_report = component_report['local_optimizer']
print(f"   Local optimizer iterations: {local_report['total_iterations']}")
print(f"   Local improvements: {local_report['improvements_count']}")
global_report = component_report['global_optimizer']
print(f"   Global optimizer steps: {global_report['total_global_steps']}")
print(f"   Successful global changes: {global_report['successful_global_changes']}")

# At most the first three listed strategies are shown, numbered from 1; the
# description is cut to 70 characters and always followed by an ellipsis.
print("Best Global Strategies:")
for rank, strategy_entry in enumerate(report['best_global_strategies'][:3], 1):
    strategy_info = strategy_entry['strategy']
    print(f"   {rank}. {strategy_info['type']}: Score {strategy_entry['score']:.3f}")
    print(f"      {strategy_info['description'][:70]}...")

OPTIMIZATION REPORT:
Overall Statistics:
   Total time: 3101.10s
   Generations: 3
   Total nodes explored: 33
Component Statistics:
   Local optimizer iterations: 3
   Local improvements: 1
   Global optimizer steps: 0
   Successful global changes: 0
Best Global Strategies:


In [12]:
lineage = optimizer.history.get_lineage(best_node.id)

print("EVOLUTION OF BEST PROMPT:")
print("=" * 80)

# Walk the lineage in the order returned, printing one block per node.
for step, ancestor in enumerate(lineage):
    print(f"Step {step}: Generation {ancestor.generation}, Source: {ancestor.source.value}")

    # The score line is printed only for nodes flagged as evaluated.
    if ancestor.is_evaluated:
        print(f"  Score: {ancestor.metrics.composite_score():.3f}")

    # List the operations recorded on the node, if any; each description is
    # cut to 60 characters and always followed by an ellipsis.
    if ancestor.operations:
        print("  Operations:")
        for operation in ancestor.operations:
            print(f"    - {operation.operation_type.value}: {operation.description[:60]}...")

    # Draw a connector after every node except the last one.
    if step + 1 < len(lineage):
        print("  ↓")

EVOLUTION OF BEST PROMPT:
Step 0: Generation 0, Source: initial
  Score: 0.842
  ↓
Step 1: Generation 1, Source: local
  Score: 0.853
  Operations:
    - modify_instruction: Edited based on gradient...


In [13]:
comparison = optimizer.compare_with_baseline(
    baseline_prompt=initial_prompt,
    test_examples=test_examples,
)

print("COMPARISON WITH BASELINE:")
print("=" * 80)

# The baseline and optimized sections share one layout: a title followed by
# one "name: value" row per metric.
for section_title, section_key in (("Baseline:", 'baseline'), ("Optimized:", 'optimized')):
    print(section_title)
    for metric, value in comparison[section_key].items():
        print(f"  {metric:20s}: {value:.3f}")

# Signed deltas, each with an arrow showing the direction of change.
print("Improvements:")
for metric, delta in comparison['improvements'].items():
    if delta > 0:
        arrow = "↑"
    elif delta < 0:
        arrow = "↓"
    else:
        arrow = "→"
    print(f"  {metric:20s}: {delta:+.3f} {arrow}")


Comparing with baseline...



Comparison Results:
  Baseline score: 0.677
  Optimized score: 0.719
  Improvement: +0.042
COMPARISON WITH BASELINE:
Baseline:
  composite_score     : 0.677
  accuracy            : 0.433
  safety              : 0.967
  robustness          : 0.563
  efficiency          : 1.000
  f1                  : 0.420
Optimized:
  composite_score     : 0.719
  accuracy            : 0.467
  safety              : 1.000
  robustness          : 0.677
  efficiency          : 1.000
  f1                  : 0.452
Improvements:
  composite_score     : +0.042 ↑
  accuracy            : +0.033 ↑
  safety              : +0.033 ↑
  robustness          : +0.113 ↑
  efficiency          : +0.000 →
  f1                  : +0.032 ↑


In [14]:
print("FINAL SUMMARY")
print("=" * 80)

# Collect statistics from the history and from both optimizer components.
history_stats = optimizer.history.get_statistics()
local_stats = optimizer.local_optimizer.get_statistics()
global_stats = optimizer.global_optimizer.get_statistics()

print("Overall Statistics:")
for row_label, stat_key in (
    ("Total nodes explored", 'total_nodes'),
    ("Evaluations performed", 'evaluated_nodes'),
    ("Generations completed", 'max_generation'),
):
    print(f"  {row_label}: {history_stats[stat_key]}")
print(f"  Best score achieved: {history_stats['best_score']:.3f}")
print(f"  Average score: {history_stats['avg_score']:.3f}")

print("Local Optimization:")
for row_label, stat_key in (
    ("Total iterations", 'total_iterations'),
    ("Improvements found", 'improvements_count'),
):
    print(f"  {row_label}: {local_stats[stat_key]}")
print(f"  Success rate: {local_stats['improvement_rate']:.1%}")

print("Global Optimization:")
for row_label, stat_key in (
    ("Total global steps", 'total_global_steps'),
    ("Candidates generated", 'total_candidates_generated'),
    ("Successful changes", 'successful_global_changes'),
):
    print(f"  {row_label}: {global_stats[stat_key]}")
print(f"  Success rate: {global_stats['success_rate']:.1%}")

print("Optimization complete!")
# The path below is a fixed string, not read back from the optimizer.
print("   Results saved to: ./optimization_results/")

FINAL SUMMARY
Overall Statistics:
  Total nodes explored: 33
  Evaluations performed: 33
  Generations completed: 2
  Best score achieved: 0.853
  Average score: 0.797
Local Optimization:
  Total iterations: 3
  Improvements found: 1
  Success rate: 33.3%
Global Optimization:
  Total global steps: 0
  Candidates generated: 0
  Successful changes: 0
  Success rate: 0.0%
Optimization complete!
   Results saved to: ./optimization_results/
