In [2]:
%pip install pydantic-ai python-dotenv nest_asyncio
from IPython.display import clear_output ; clear_output()

from dotenv import load_dotenv ; load_dotenv(override=True)
import nest_asyncio ; nest_asyncio.apply()

import os
import random
from pprint import pprint
from typing import List, Dict
from pprint import pprint, pformat

from pydantic import BaseModel, Field
from pydantic_ai import Agent

# Candidate model identifiers per provider, keyed by the environment variable
# that must hold that provider's API key. Insertion order fixes the order in
# which the available models are listed.
_MODELS_BY_API_KEY = {
    'OPENAI_API_KEY': ['openai:gpt-4o', 'openai:gpt-4o-mini'],
    'GEMINI_API_KEY': ['gemini-1.5-pro', 'gemini-2.0-flash-exp'],
    'ANTHROPIC_API_KEY': ['claude-3-5-haiku-latest', 'claude-3-5-sonnet-latest'],
}

# Only offer models whose provider key is set (and non-empty) in the environment.
_ai_models = [
    model
    for key_name, models in _MODELS_BY_API_KEY.items()
    if os.environ.get(key_name)
    for model in models
]
print(f"Available AI models: {pformat(_ai_models)}\n")

# A non-empty AI_MODEL environment variable overrides the random pick.
AI_MODEL = os.environ.get('AI_MODEL') or None
if AI_MODEL is None:
    if not _ai_models:
        # random.choice([]) would raise a bare IndexError ("Cannot choose from
        # an empty sequence"); fail with a message that says what to configure.
        raise RuntimeError(
            "No AI model available: set AI_MODEL, or at least one of "
            + ", ".join(_MODELS_BY_API_KEY)
            + "."
        )
    AI_MODEL = random.choice(_ai_models)
print(f"Using AI model: {AI_MODEL}")

Available AI models: ['openai:gpt-4o',
 'openai:gpt-4o-mini',
 'gemini-1.5-pro',
 'gemini-2.0-flash-exp',
 'claude-3-5-haiku-latest',
 'claude-3-5-sonnet-latest']

Using AI model: claude-3-5-sonnet-latest


In [3]:
# Structured output requested from the generator agent (used as the agent's
# result_type in `generate`). Documented with comments rather than a class
# docstring so the schema handed to the model is left exactly as it was.
class GeneratorResponse(BaseModel):
    # The model's own reading of the task/feedback and its plan.
    thoughts: str = Field(
        ...,
        description="Your understanding of the task and feedback and how you plan to improve",
    )
    # The solution text itself.
    response: str = Field(
        ...,
        description="The generated solution.",
    )


async def generate(prompt: str, task: str, context: str = "") -> tuple[str, str]:
    """Ask the generator agent for a solution to ``task``.

    Args:
        prompt: Instructions placed at the start of the request.
        task: The task description, appended as ``Task: ...``.
        context: Optional extra text (e.g. earlier attempts and feedback);
            when non-empty it is inserted on its own line(s) between the
            prompt and the task.

    Returns:
        A ``(thoughts, response)`` pair taken from the agent's structured reply.

    Side effects:
        Prints the thoughts and the generated solution between
        GENERATION START / END markers.
    """
    # An empty context contributes nothing, not even a blank line.
    context_block = f"{context}\n" if context else ""
    full_prompt = f"{prompt}\n{context_block}Task: {task}"

    agent = Agent(AI_MODEL, result_type=GeneratorResponse)
    run_result = await agent.run(full_prompt)
    reply = run_result.data
    thoughts, solution = reply.thoughts, reply.response

    print(
        "\n=== GENERATION START ===",
        f"Thoughts:\n{thoughts}\n",
        f"Generated:\n{solution}",
        "=== GENERATION END ===\n",
        sep="\n",
    )

    return thoughts, solution


# Structured output requested from the evaluator agent (used as the agent's
# result_type in `evaluate`). Documented with comments rather than a class
# docstring so the schema handed to the model is left exactly as it was.
class EvaluatorResponse(BaseModel):
    # Verdict label; typed as free-form text, so the value is not validated here.
    evaluation: str = Field(
        ...,
        description="PASS, NEEDS_IMPROVEMENT, or FAIL",
    )
    # Reviewer notes explaining the verdict.
    feedback: str = Field(
        ...,
        description="What needs improvement and why.",
    )


async def evaluate(prompt: str, content: str, task: str) -> tuple[str, str]:
    """Ask the evaluator agent to judge ``content`` against ``task``.

    Args:
        prompt: Evaluation instructions placed at the start of the request.
        content: The candidate solution to be judged.
        task: The original task the solution was written for.

    Returns:
        An ``(evaluation, feedback)`` pair taken from the agent's structured reply.

    Side effects:
        Prints the verdict and feedback between EVALUATION START / END markers.
    """
    request_lines = (
        f"{prompt}",
        f"Original task: {task}",
        f"Content to evaluate: {content}",
    )
    full_prompt = "\n".join(request_lines)

    agent = Agent(AI_MODEL, result_type=EvaluatorResponse)
    run_result = await agent.run(full_prompt)
    verdict = run_result.data
    evaluation, feedback = verdict.evaluation, verdict.feedback

    print(
        "=== EVALUATION START ===",
        f"Status: {evaluation}",
        f"Feedback: {feedback}",
        "=== EVALUATION END ===\n",
        sep="\n",
    )

    return evaluation, feedback


async def loop(
    task: str,
    evaluator_prompt: str,
    generator_prompt: str,
    max_iterations: "int | None" = None,
) -> tuple[str, list[dict]]:
    """Generate a solution, then refine it until the evaluator passes it.

    Each round the latest solution is evaluated; on anything other than a
    PASS verdict, every earlier attempt plus the newest feedback is fed back
    to the generator for another try.

    Args:
        task: The task description given to both generator and evaluator.
        evaluator_prompt: Instructions for the evaluator.
        generator_prompt: Instructions for the generator.
        max_iterations: Optional cap on the number of generation attempts.
            ``None`` (the default) keeps iterating until a PASS verdict.
            At least one attempt is always made.

    Returns:
        ``(result, chain_of_thought)``: the last generated solution and a list
        of ``{"thoughts": ..., "result": ...}`` dicts, one per attempt in
        order. If the cap is reached, the last attempt is returned even
        though it did not pass.
    """
    memory = []
    chain_of_thought = []

    thoughts, result = await generate(generator_prompt, task)
    memory.append(result)
    chain_of_thought.append({"thoughts": thoughts, "result": result})

    while True:
        evaluation, feedback = await evaluate(evaluator_prompt, result, task)
        # The verdict is free-form model text: tolerate stray whitespace and
        # casing so that "pass" or "PASS\n" does not trigger needless rounds.
        if evaluation.strip().upper() == "PASS":
            return result, chain_of_thought

        # Optional safety valve against an evaluator that never passes.
        if max_iterations is not None and len(memory) >= max_iterations:
            print(f"Stopping after {len(memory)} attempt(s) without a PASS verdict.")
            return result, chain_of_thought

        context = "\n".join([
            "Previous attempts:",
            *[f"- {m}" for m in memory],
            f"\nFeedback: {feedback}"
        ])

        thoughts, result = await generate(generator_prompt, task, context)
        memory.append(result)
        chain_of_thought.append({"thoughts": thoughts, "result": result})

In [4]:
# Instructions for the evaluator agent. The prompt text is sent to the model
# as-is, so its wording and whitespace are part of the program's behavior.
evaluator_prompt = """
Evaluate this following code implementation for:
1. code correctness
2. time complexity
3. style and best practices

You should be evaluating only and not attemping to solve the task.
Only output "PASS" if all criteria are met and you have no further suggestions for improvements."""

# Instructions for the generator agent; `loop` passes this to every
# generation call, adding earlier attempts and feedback on retries.
generator_prompt = """
Your goal is to complete the task based on <user input>. If there are feedback 
from your previous generations, you should reflect on them to improve your solution."""

# The task itself, wrapped in the <user input> tags the generator prompt refers to.
task = """
<user input>
Implement a Stack with:
1. push(x)
2. pop()
3. getMin()
All operations should be O(1).
</user input>
"""

# Top-level await: this relies on the notebook's async-aware cell execution.
# Runs the generate/evaluate loop and collects the final solution plus the
# per-attempt thoughts/results.
result, chain_of_thought = await loop(task, evaluator_prompt, generator_prompt)

print("=== FINAL RESULT ===")
print(result)

print("\n=== CHAIN OF THOUGHT ===")
pprint(chain_of_thought)


=== GENERATION START ===
Thoughts:
For this task, I need to implement a Stack data structure with push, pop, and getMin operations, all with O(1) time complexity. The challenging part is maintaining the minimum value in O(1) time.

To achieve O(1) for all operations including getMin():
1. We'll use two stacks:
   - main_stack: stores all elements
   - min_stack: keeps track of minimums
2. The min_stack will store elements in a way that its top always has the current minimum
3. Push: When pushing, we'll compare with min_stack's top and push to min_stack if needed
4. Pop: When popping, we'll also pop from min_stack if the popped value equals min_stack's top
5. GetMin: Simply return min_stack's top

This approach ensures all operations remain O(1) while correctly maintaining the minimum value.

Generated:
class MinStack:
    def __init__(self):
        self.main_stack = []  # Main stack to store all elements
        self.min_stack = []   # Stack to track minimums

    def push(self, x: i