In [10]:
%pip install -r requirements.txt
from IPython.display import clear_output ; clear_output()

In [11]:
# Project-local helpers from util: `show` is called throughout the notebook to
# display a value under a title; `initialize` supplies the model handed to every
# Agent(...) below.
# NOTE(review): the "Available AI models" listing printed under this cell
# presumably comes from initialize() -- confirm in util.py.
from util import initialize, show
AI_MODEL = initialize()

from typing import Dict, List, Literal

from pydantic import BaseModel, Field
from pydantic_ai import Agent

Attempting to instrument while already instrumented


Available AI models:
['openai:gpt-4o',
 'openai:gpt-4o-mini',
 'gemini-1.5-pro',
 'gemini-2.0-flash-exp',
 'claude-3-5-haiku-latest',
 'claude-3-5-sonnet-latest']
Using AI model: openai:gpt-4o


In [12]:
class GeneratorResponse(BaseModel):
    # Structured reply requested from the generator agent (used as its result_type).
    # Kept as `#` comments on purpose: a class docstring would presumably be
    # picked up by pydantic as the schema description and alter what the model sees.

    # The model's reading of the task/feedback and its plan for this attempt.
    thoughts: str = Field(
        description="Your understanding of the task and feedback and how you plan to improve",
    )
    # The candidate solution itself.
    response: str = Field(description="The generated solution.")


async def generate(prompt: str, task: str, context: str = "") -> tuple[str, str]:
    """Ask the generator agent for a solution to ``task``.

    Args:
        prompt: Instructions placed at the top of the request.
        task: The task description, appended as a ``Task: ...`` line.
        context: Optional extra text (earlier attempts, feedback) inserted
            between the instructions and the task; skipped when empty.

    Returns:
        A ``(thoughts, result)`` pair taken from the structured reply.
    """
    # The context line is only emitted when there is something to say.
    preamble = f"{prompt}\n{context}\n" if context else f"{prompt}\n"
    request = preamble + f"Task: {task}"

    agent = Agent(AI_MODEL, result_type=GeneratorResponse)
    run_result = await agent.run(request)
    reply = run_result.data
    thoughts, result = reply.thoughts, reply.response

    # Display a section banner followed by the two parts of the reply.
    for body, heading in (
        ('', 'Generation'),
        (thoughts, 'Thoughts'),
        (result, 'Generated'),
    ):
        show(body, title=heading)

    return thoughts, result


class EvaluatorResponse(BaseModel):
    # Structured reply requested from the evaluator agent (used as its result_type).
    # Kept as `#` comments on purpose: a class docstring would presumably be
    # picked up by pydantic as the schema description and alter what the model sees.

    # Verdict, restricted to the three statuses the workflow understands so that
    # an off-vocabulary value is rejected by validation instead of silently
    # failing the `== "PASS"` check performed by the caller.
    evaluation: Literal["PASS", "NEEDS_IMPROVEMENT", "FAIL"] = Field(..., description='PASS, NEEDS_IMPROVEMENT, or FAIL')
    # Free-text explanation that is fed back to the generator on the next round.
    feedback: str = Field(..., description='What needs improvement and why.')


async def evaluate(prompt: str, content: str, task: str) -> tuple[str, str]:
    """Have the evaluator agent judge ``content`` against ``task``.

    Args:
        prompt: Evaluation instructions placed at the top of the request.
        content: The candidate solution to be judged.
        task: The original task the solution was written for.

    Returns:
        An ``(evaluation, feedback)`` pair taken from the structured reply.
    """
    request = "\n".join((
        f"{prompt}",
        f"Original task: {task}",
        f"Content to evaluate: {content}",
    ))

    agent = Agent(AI_MODEL, result_type=EvaluatorResponse)
    verdict = (await agent.run(request)).data

    # Display a section banner followed by the status and the feedback.
    show('', title='Evaluation')
    for body, heading in ((verdict.evaluation, 'Status'), (verdict.feedback, 'Feedback')):
        show(body, title=heading)

    return verdict.evaluation, verdict.feedback


async def loop(
        task: str,
        evaluator_prompt: str,
        generator_prompt: str,
        max_iterations: int = 10,
    ) -> tuple[str, list[dict]]:
    """Generate a solution and refine it until the evaluator reports PASS.

    Each round generates a candidate, has it evaluated and, unless the verdict
    is PASS, feeds every earlier attempt plus the latest feedback into the
    next generation.

    Args:
        task: The task description given to both generator and evaluator.
        evaluator_prompt: Instructions for the evaluator.
        generator_prompt: Instructions for the generator.
        max_iterations: Upper bound on generate/evaluate rounds. It prevents
            an evaluator that never answers PASS from triggering an unbounded
            number of model calls.

    Returns:
        ``(result, chain_of_thought)``: the accepted solution (or, when the
        round limit is hit, the last attempt) and one
        ``{"thoughts": ..., "result": ...}`` entry per generation.

    Raises:
        ValueError: If ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    memory = []
    chain_of_thought = []
    context = ""  # the first round has no earlier attempts or feedback

    for _ in range(max_iterations):
        thoughts, result = await generate(generator_prompt, task, context)
        memory.append(result)
        chain_of_thought.append({"thoughts": thoughts, "result": result})

        evaluation, feedback = await evaluate(evaluator_prompt, result, task)
        # Tolerate stray whitespace or casing in the verdict (" pass\n").
        if evaluation.strip().upper() == "PASS":
            return result, chain_of_thought

        context = "\n".join([
            "Previous attempts:",
            *[f"- {m}" for m in memory],
            f"\nFeedback: {feedback}"
        ])

    # Round limit reached without a PASS: say so, then hand back the last attempt.
    show(
        f"No PASS after {max_iterations} iterations; returning the last attempt.",
        title='Stopped',
    )
    return result, chain_of_thought

In [13]:
# Instructions for the evaluator; passed to loop() below as `evaluator_prompt`.
# NOTE(review): the prompt texts in this cell contain small typos ("attemping",
# "this following code", "If there are feedback"). They are left untouched
# because these strings are passed verbatim into the workflow.
evaluator_prompt = """
Evaluate this following code implementation for:
1. code correctness
2. time complexity
3. style and best practices
4. documentation and formatting

You should be evaluating only and not attemping to solve the task.
Evaluate the code critically and make sure you don't miss any opportunities for improvement.
Only output "PASS" if all criteria are met and you have no further suggestions for improvements,
otherwise output "NEEDS_IMPROVEMENT" or "FAIL".
"""

# Instructions for the generator; passed to loop() below as `generator_prompt`.
generator_prompt = """
Your goal is to complete the task based on <user input>. If there are feedback 
from your previous generations, you should reflect on them to improve your solution."""

# The task itself, wrapped in the <user input> tags the generator prompt refers to.
task = """
<user input>
Implement a Stack with:
1. push(x)
2. pop()
3. getMin()
All operations should be O(1).
</user input>
"""

# Run the generate/evaluate workflow (top-level await, as used in a notebook cell).
result, chain_of_thought = await loop(task, evaluator_prompt, generator_prompt)

# Display the returned solution and the recorded thoughts/result of every attempt.
show(result, title='Final Result')
show(chain_of_thought, title='Chain of Thought')


Generation
----------


Thoughts
--------

The challenge here is to implement a Stack with constant time operations for push, pop, and retrieving the minimum element. This can be achieved by maintaining an additional stack that keeps track of the minimum elements.


Generated
---------

Here's an implementation of a Stack that supports `push`, `pop`, and `getMin` operations, all in constant time:

```python
class MinStack:
    def __init__(self):
        self.main_stack = []
        self.min_stack = []

    def push(self, x):
        # Push the value onto the main stack
        self.main_stack.append(x)
        # Push onto min_stack the minimum value
        if not self.min_stack or x <= self.min_stack[-1]:
            self.min_stack.append(x)

    def pop(self):
        # Pop from the main stack
        popped_value = self.main_stack.pop()
        # If the popped value is the same as the last value in the min_stack, pop it from the min_stack
        if popped_value == self.min_stack[