In [1]:
# Load env variables and create client
from dotenv import load_dotenv
from anthropic import Anthropic

# Load environment variables (python-dotenv; presumably from a local .env file — confirm).
load_dotenv()

# No credentials are passed explicitly — presumably the SDK reads the API key
# from the environment populated above; confirm against the Anthropic SDK docs.
client = Anthropic()
# Model identifier used for requests sent through this client.
model = "claude-haiku-4-5"

In [None]:
# Helper functions
def add_user_message(messages, text):
    """Append a user-role turn holding `text` to `messages` (mutated in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role turn holding `text` to `messages` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(messages, system=None, temperature=0.5, stop_sequences=None, max_tokens=1000):
    """Send a conversation to the model and return the first content block's text.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts forming the
            conversation so far.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings at which generation stops.
            ``None`` (the default) sends an empty list, matching the previous
            behaviour without sharing one mutable default across calls.
        max_tokens: Maximum number of tokens to generate (defaults to the
            previously hard-coded 1000).

    Returns:
        The ``text`` attribute of the first block in the response content.
    """
    params = {
        "model": model,
        "max_tokens": max_tokens,
        "messages": messages,
        "temperature": temperature,
        # A fresh list per call: a `[]` default in the signature would be one
        # shared object that any mutation would leak into later calls.
        "stop_sequences": stop_sequences if stop_sequences is not None else [],
    }

    # Only send a system prompt when one was actually supplied.
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [3]:
import json


def generate_dataset(num_cases=3):
    """Ask the model for an evaluation dataset and return it as raw JSON text.

    The assistant turn is pre-filled with an opening ```json fence and the
    closing fence is used as a stop sequence, so the returned string is the
    text generated between the two fences.

    Args:
        num_cases: How many test-case objects to request (default 3, the
            value that was previously hard-coded into the prompt).

    Returns:
        The model's reply as a string; callers are expected to parse it
        (e.g. with ``json.loads``).

    Raises:
        ValueError: If ``num_cases`` is less than 1.
    """
    if num_cases < 1:
        raise ValueError(f"num_cases must be at least 1, got {num_cases}")

    # Literal braces of the JSON example are doubled because this is an f-string.
    prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task",
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""

    messages = []
    add_user_message(messages, prompt)
    # Pre-fill the assistant turn so the reply starts inside a JSON code fence.
    add_assistant_message(messages, "```json")
    return chat(messages, stop_sequences=["```"])

In [6]:
# Generate the dataset as JSON text and parse it *before* opening the output
# file: open(..., "w") truncates immediately, so a malformed model response
# would otherwise leave eval_dataset.json empty and destroy any earlier copy.
dataset = generate_dataset()
parsed_dataset = json.loads(dataset)

with open("eval_dataset.json", "w") as f:
    json.dump(parsed_dataset, f, indent=2)

In [7]:
def grade_by_model(test_case, output):
    """Have the model grade `output` against the task and return the parsed verdict.

    The grading prompt embeds ``test_case["task"]`` and the candidate
    ``output``; the assistant turn is pre-filled with a ```json fence and the
    closing fence is the stop sequence, so the reply is parsed directly as JSON.
    """
    grading_prompt = f"""
You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

Original Task:
<task>
{test_case["task"]}
</task>

Solution to Evaluate:
<solution>
{output}
</solution>

Output Format
Provide your evaluation as a structured JSON object with the following fields, in this specific order:
- "strengths": An array of 1-3 key strengths
- "weaknesses": An array of 1-3 key areas for improvement
- "reasoning": A concise explanation of your overall assessment
- "score": A number between 1-10

Respond with JSON. Keep your response concise and direct.
Example response shape:
{{
    "strengths": string[],
    "weaknesses": string[],
    "reasoning": string,
    "score": number
}}
    """

    # User turn carries the prompt; the assistant turn is the JSON-fence pre-fill.
    conversation = [
        {"role": "user", "content": grading_prompt},
        {"role": "assistant", "content": "```json"},
    ]
    raw_verdict = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

In [8]:
from statistics import mean


def run_prompt(test_case):
    """Wrap the test case's task in the solving prompt and return the model's reply."""
    solve_request = f"""
    Please solve the following task:
    {test_case['task']}
    """
    # Single-turn conversation: just the user request, no pre-fill.
    return chat([{"role": "user", "content": solve_request}])

def run_test_case(test_case):
    """Solve one test case via run_prompt, grade it, and bundle the outcome."""
    solution = run_prompt(test_case)
    verdict = grade_by_model(test_case, solution)

    # Only the score and reasoning are kept from the grader's verdict.
    return dict(
        output=solution,
        test_case=test_case,
        score=verdict["score"],
        reasoning=verdict["reasoning"],
    )

def run_eval(dataset):
    """Run every test case in `dataset`, print the mean score, and return the results.

    Args:
        dataset: Iterable of test cases, each handed to ``run_test_case``.

    Returns:
        A list with one result dict per test case, in dataset order. An empty
        dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input, so an empty
    # dataset is reported explicitly instead of crashing here.
    if not results:
        print("Average Score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average Score: {average_score}")

    return results

In [9]:
# Read the saved test cases back from disk, then evaluate all of them.
with open("eval_dataset.json", "r") as dataset_file:
    dataset = json.load(dataset_file)

results = run_eval(dataset)

Average Score: 6.666666666666667


In [10]:
# Dump every graded result as indented JSON for manual inspection.
results_json = json.dumps(results, indent=2)
print(results_json)

[
  {
    "output": "# AWS S3 Bucket Name Parser\n\nHere's a robust solution with multiple approaches:\n\n## Solution 1: Simple and Clean (Recommended)\n\n```python\ndef parse_s3_bucket_name(s3_uri: str) -> str:\n    \"\"\"\n    Parse the bucket name from an S3 URI.\n    \n    Args:\n        s3_uri: S3 URI in the format 's3://bucket-name/key/path'\n        \n    Returns:\n        The S3 bucket name\n        \n    Raises:\n        ValueError: If the URI is invalid\n    \"\"\"\n    if not s3_uri.startswith('s3://'):\n        raise ValueError(f\"Invalid S3 URI: {s3_uri}. Must start with 's3://'\")\n    \n    # Remove 's3://' prefix and split by '/'\n    parts = s3_uri[5:].split('/')\n    bucket_name = parts[0]\n    \n    if not bucket_name:\n        raise ValueError(f\"Invalid S3 URI: {s3_uri}. No bucket name found\")\n    \n    return bucket_name\n\n\n# Test cases\nif __name__ == \"__main__\":\n    # Valid cases\n    assert parse_s3_bucket_name('s3://my-bucket/key/path') == 'my-bucket'\n