In [5]:
# SETUP
from dotenv import load_dotenv
from anthropic import Anthropic

# Load env variables (python-dotenv) before the client is constructed
load_dotenv()

# Create an API client
# NOTE(review): presumably picks up its API key from the environment loaded above — confirm
client = Anthropic()
# Model identifier sent with every request made through chat()
model="claude-sonnet-4-20250514"

# Helpers for building the message list and sending a request

def add_user_message(messages, text):
    """Append a user turn carrying ``text`` to ``messages`` (modified in place)."""
    messages.append(dict(role="user", content=text))

def add_assistant_message(messages, text):
    """Append an assistant turn carrying ``text`` to ``messages`` (modified in place)."""
    messages.append(dict(role="assistant", content=text))

def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send the conversation to the model and return the text of its reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, e.g. as
            built by add_user_message / add_assistant_message.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional sequence of strings forwarded as the
            request's stop sequences. ``None`` (the default) sends an
            empty list.

    Returns:
        The ``text`` attribute of the first content block of the response.
    """
    # A None sentinel replaces the former `[]` default: a list literal in the
    # signature is one object shared by every call, so any mutation of it
    # would leak into later requests.
    if stop_sequences is None:
        stop_sequences = []

    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        "stop_sequences": stop_sequences,
    }

    # Only include the system prompt when one was actually supplied.
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [4]:
import json


def generate_dataset():
    """Ask the model for a small evaluation dataset and return it parsed.

    The assistant turn is pre-filled with an opening ```json fence and
    generation is stopped at the next ``` so the reply can be handed
    directly to json.loads.

    Returns:
        Whatever json.loads decodes from the reply (the prompt asks for an
        array of objects with a "task" key).
    """
    dataset_prompt = """
Generate an evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects, each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
  {
    "task": "Description of task",
  },
  ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a single regex
* Focus on tasks that do not require writing much code

Please generate 3 objects.
"""
    conversation = []
    add_user_message(conversation, dataset_prompt)
    # Pre-fill the assistant reply so the model continues inside a JSON block.
    add_assistant_message(conversation, "```json")
    return json.loads(chat(conversation, stop_sequences=["```"]))

# Generate the dataset once and persist it so later cells can reload it from disk.
dataset = generate_dataset()
with open("dataset.json", "w") as dataset_file:
    json.dump(dataset, dataset_file, indent=2)

In [7]:
def grade_by_model(test_case, output):
    """Ask the model to grade a generated solution for one test case.

    Args:
        test_case: Mapping with a "task" key describing what was asked.
        output: The solution text that was produced for that task.

    Returns:
        The grader's reply decoded with json.loads. The prompt asks for an
        object with "strengths", "weaknesses", "reasoning" and "score" keys.

    Raises:
        KeyError: If ``test_case`` has no "task" key.
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    task = test_case["task"]

    # Create evaluation prompt.
    # This must be an f-string: as a plain string the placeholders were sent
    # to the grader literally, so it never saw the task or the solution.
    eval_prompt = f"""
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Task: {task}
    Solution: {output}
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    """

    messages = []
    add_user_message(messages, eval_prompt)
    # Pre-fill the reply with an opening fence and stop at the closing one so
    # only the JSON body comes back.
    add_assistant_message(messages, "```json")

    eval_text = chat(messages, stop_sequences=["```"])
    return json.loads(eval_text)

In [8]:
def run_prompt(test_case):
    """Send the test case's task to the model and return the reply text.

    Args:
        test_case: Mapping with a "task" key.

    Returns:
        The string returned by chat() for the single-turn conversation.
    """
    # Same text as the multi-line template: heading, an indented blank line,
    # the indented task, and a trailing newline.
    task_prompt = f'Please solve the following task:\n    \n    {test_case["task"]}\n'
    conversation = []
    add_user_message(conversation, task_prompt)
    return chat(conversation)

def run_test_case(test_case):
    """Run one test case through the prompt and the model grader.

    Args:
        test_case: Mapping with a "task" key.

    Returns:
        A dict with the generated "output", the original "test_case" and the
        grader's "score".
    """
    solution = run_prompt(test_case)
    grading = grade_by_model(test_case, solution)
    return dict(output=solution, test_case=test_case, score=grading["score"])

def run_eval(dataset):
    """Run every test case in ``dataset`` in order and collect the results.

    Args:
        dataset: Iterable of test cases accepted by run_test_case.

    Returns:
        A list with one run_test_case result per test case.
    """
    return [run_test_case(case) for case in dataset]

In [9]:
# Reload the saved test cases from disk.
with open("dataset.json", "r") as dataset_file:
    dataset = json.load(dataset_file)

# Run every test case through the prompt and the model grader.
results = run_eval(dataset)

# Show the collected outputs and scores as indented JSON.
print(json.dumps(results, indent=2))

[
  {
    "output": "Here's a Python function that extracts the bucket name from an AWS S3 bucket ARN:\n\n```python\ndef extract_bucket_name_from_arn(arn):\n    \"\"\"\n    Extract the bucket name from an AWS S3 bucket ARN.\n    \n    Args:\n        arn (str): The S3 bucket ARN in format 'arn:aws:s3:::bucket-name'\n    \n    Returns:\n        str: The bucket name\n    \n    Raises:\n        ValueError: If the ARN format is invalid\n    \"\"\"\n    if not isinstance(arn, str):\n        raise ValueError(\"ARN must be a string\")\n    \n    if not arn:\n        raise ValueError(\"ARN cannot be empty\")\n    \n    # Split the ARN by colons\n    arn_parts = arn.split(':')\n    \n    # Validate ARN format\n    if len(arn_parts) != 6:\n        raise ValueError(\"Invalid ARN format. Expected 6 parts separated by colons\")\n    \n    if arn_parts[0] != 'arn':\n        raise ValueError(\"ARN must start with 'arn'\")\n    \n    if arn_parts[1] != 'aws':\n        raise ValueError(\"ARN must specif