# dependencies

In [1]:
from anthropic import Anthropic

from dotenv import load_dotenv
import os
load_dotenv()

import logging

# Timestamped log lines tagged with logger name and level, at INFO and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Model identifier and sampling temperature sent with every request made by chat().
MODEL = "claude-3-haiku-20240307"
TEMPERATURE = 0.7

try:
    client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
except Exception as e:
    # Re-raise after logging: swallowing the error would leave `client`
    # undefined and surface later as a NameError inside chat().
    logger.error(f"Error initializing Anthropic client: {e}")
    raise

# helper functions

In [2]:
# Module-level message list. The functions visible in this notebook build
# their own local lists rather than reading this one.
messages = list()

def add_user_message(messages, text):
    """Append a user turn carrying ``text`` to ``messages`` (mutated in place)."""
    user_turn = {"role": "user", "content": text}
    messages.append(user_turn)
    
def add_assistant_message(messages, text):
    """Append an assistant turn carrying ``text`` to ``messages`` (mutated in place)."""
    messages.append(dict(role="assistant", content=text))

def chat(messages, system_prompt="", stop_sequences=None):
    """Send a conversation to the model and return the text of its reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts to send.
        system_prompt: Forwarded as the request's ``system`` value.
        stop_sequences: Optional list of strings forwarded as
            ``stop_sequences``. ``None`` (the default) is sent as an empty list.

    Returns:
        The ``text`` of the first content block in the response, or ``""``
        when the response has no content blocks.
    """
    # A None default avoids sharing one mutable list object across all calls.
    if stop_sequences is None:
        stop_sequences = []

    message = client.messages.create(
        model=MODEL,
        max_tokens=1000,
        temperature=TEMPERATURE,
        system=system_prompt,
        messages=messages,
        stop_sequences=stop_sequences
    )

    # Guard the index: an empty content list would otherwise raise IndexError.
    if not message.content:
        return ""
    return message.content[0].text

# generate evaluation dataset

In [3]:
import json


def generate_dataset(num_cases=3, output_file="aws_tasks.json"):
    """Ask the model for an evaluation dataset and save it as JSON.

    The assistant turn is prefilled up to an opening ```json fence and the
    request stops at the closing fence, so the reply is expected to be the
    bare JSON array.

    Args:
        num_cases: Number of task objects requested in the prompt.
        output_file: Path the parsed dataset is written to.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
        ValueError: If the reply parses but is not a JSON array.
    """
    prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task",
        "format": "json" or "python" or "regex",
        "solution_criteria": "Key criteria for evaluating the solution"
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""

    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "Here are some tasks that can be solved with Python, JSON, or Regex for AWS-related tasks: ```json")
    res = chat(messages=messages, stop_sequences=["```"])

    # Parse and check BEFORE opening the file: opening in "w" mode truncates
    # it, so a bad reply would otherwise leave an empty file behind that
    # main() then mistakes for an existing dataset.
    dataset = json.loads(res)
    if not isinstance(dataset, list):
        raise ValueError(
            f"Expected a JSON array of tasks, got {type(dataset).__name__}"
        )

    with open(output_file, "w") as f:
        json.dump(dataset, f, indent=2)

    logger.info("Dataset generated and saved to %s", output_file)


# Graders

In [4]:
def run_prompt(test_case):
    """Build the solve-this-task prompt for ``test_case`` and return the model's reply."""
    task = test_case["task"]

    task_prompt = f"""
    Please solve the following task:

    {task}

    * Respond only with Python, JSON, or a plain Regex
    * Do not add any comments or commentary or explanation
    """

    # Single-turn conversation: just the user prompt, no prefill.
    conversation = []
    add_user_message(conversation, task_prompt)
    return chat(messages=conversation)

## Model grader

In [5]:
def grade_by_model(test_case, output):
    """Have the model review ``output`` against the test case and return its verdict.

    The reply is prefilled with an opening ```json fence and cut at the closing
    fence, then parsed with ``json.loads``. The prompt asks for the keys
    "strengths", "weaknesses", "reasoning" and "score".
    """
    task = test_case["task"]
    criteria = test_case["solution_criteria"]

    review_prompt = f"""
    You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

    Original Task:
    <task>
    {task}
    </task>

    Solution to Evaluate:
    <solution>
    {output}
    </solution>

    Criteria you should use to evaluate the solution:
    <criteria>
    {criteria}
    </criteria>

    Output Format
    Provide your evaluation as a structured JSON object with the following fields, in this specific order:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement
    - "reasoning": A concise explanation of your overall assessment
    - "score": A number between 1-10

    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {{
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }}
    """

    conversation = []
    add_user_message(conversation, review_prompt)
    add_assistant_message(conversation, "```json")
    raw_verdict = chat(messages=conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

## Code grader 

### Functions to validate the output structure

In [6]:
import re
import ast


def validate_json(text):
    """Return 10 if ``text`` (whitespace-stripped) parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if ``text`` is non-empty, syntactically valid Python, otherwise 0.

    Only syntax is checked (via ``ast.parse``); the code is never executed.
    """
    source = text.strip()
    # An empty module parses cleanly, so without this guard a blank response
    # would receive full marks.
    if not source:
        return 0
    try:
        ast.parse(source)
    except (SyntaxError, ValueError):
        # ValueError: raised for null bytes in the source on Python versions
        # before 3.12 (newer versions raise SyntaxError instead).
        return 0
    return 10


def validate_regex(text):
    """Return 10 if ``text`` is a non-empty pattern that ``re.compile`` accepts, otherwise 0."""
    pattern = text.strip()
    # The empty pattern compiles fine, so without this guard a blank response
    # would receive full marks.
    if not pattern:
        return 0
    try:
        re.compile(pattern)
    except (re.error, OverflowError):
        # OverflowError: raised by the regex parser when a repeat count such
        # as a{99999999999999999999} is too large.
        return 0
    return 10


def grade_by_code(response, test_case):
    """Syntax-check ``response`` with the validator matching the test case's format.

    Args:
        response: The model output to validate.
        test_case: Dict whose "format" entry selects the validator
            ("json" or "python"; matched case-insensitively, ignoring
            surrounding whitespace).

    Returns:
        Whatever the selected validator returns. Any format other than
        "json" or "python" is validated as a regex.
    """
    # Normalise so values such as "JSON" or " Python " reach the right
    # validator instead of silently falling through to the regex check.
    output_format = str(test_case["format"]).strip().lower()
    if output_format == "json":
        return validate_json(response)
    if output_format == "python":
        return validate_python(response)
    return validate_regex(response)


# combined evaluation

In [7]:
def run_test_case(test_case):
    """Solve one test case with the model, grade it twice, and bundle the result.

    The final score is the mean of the model grader's "score" and the code
    grader's score.
    """
    solution = run_prompt(test_case)

    model_eval = grade_by_model(test_case=test_case, output=solution)
    logger.info("model grading successful.")

    syntax_score = grade_by_code(response=solution, test_case=test_case)
    logger.info("code grading successful.")

    combined_score = (model_eval["score"] + syntax_score) / 2

    report = {
        "output": solution,
        "test_case": test_case,
        "score": combined_score,
    }
    # Carry the model grader's qualitative feedback through unchanged.
    for field in ("strengths", "weaknesses", "reasoning"):
        report[field] = model_eval[field]
    return report

def run_evaluation(dataset_file="aws_tasks.json", output_file="evaluation_results.json"):
    """Run every test case in the dataset and save the graded results as JSON.

    Args:
        dataset_file: Path to a JSON file holding the list of test cases.
        output_file: Path the results are written to.

    The output is a JSON array with one entry per test case (as returned by
    ``run_test_case``) followed by a final ``{"average_score": ...}`` entry.
    For an empty dataset the average is written as ``null``.
    """
    from statistics import mean

    with open(dataset_file, "r") as f:
        dataset = json.load(f)

    results = [run_test_case(test_case) for test_case in dataset]

    if results:
        average_score = mean([result["score"] for result in results])
    else:
        # mean() raises StatisticsError on an empty sequence; record the
        # absence of a score instead of crashing.
        logger.warning("Dataset '%s' contains no test cases.", dataset_file)
        average_score = None

    results.append({
        "average_score": average_score
    })

    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)

    logger.info("Evaluation completed and results saved to '%s'.", output_file)

# main function

In [10]:
def main():
    """Entry point: create the dataset if it does not exist yet, then evaluate it."""
    dataset_file = "aws_tasks.json"

    dataset_missing = not os.path.exists(dataset_file)
    if dataset_missing:
        logger.info(f"Dataset file '{dataset_file}' not found. Generating dataset...")
        generate_dataset()

    run_evaluation(dataset_file)

In [11]:
# Run the pipeline: generate the dataset if the file is missing, then evaluate it.
main()

2025-08-07 00:37:26 - __main__ - INFO - Dataset file 'aws_tasks.json' not found. Generating dataset...
2025-08-07 00:37:29 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-08-07 00:37:29 - __main__ - INFO - Dataset generated and saved to aws_tasks.json
2025-08-07 00:37:30 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-08-07 00:37:31 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-08-07 00:37:31 - __main__ - INFO - model grading successful.
2025-08-07 00:37:31 - __main__ - INFO - code grading successful.
2025-08-07 00:37:34 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-08-07 00:37:35 - httpx - INFO - HTTP Request: POST https://api.anthropic.com/v1/messages "HTTP/1.1 200 OK"
2025-08-07 00:37:35 - __main__ - INFO - model grading successful.
2025-08-07 00:37:35 - __main__ - INFO - code grading s