# AgentCore 온라인 Evaluation

프로그래밍 방식 테스트를 위한 Agent 호출 및 evaluation 워크플로우.

## Import

In [None]:
import boto3
from IPython.display import Markdown, display
# utils 모듈에서 evaluation 관련 헬퍼 함수들 import
from utils import (
    EvaluationClient,
    generate_session_id,
    invoke_and_evaluate,
)

# Set your AWS credentials (add `import os` before uncommenting the lines below)

# os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'
# os.environ['AWS_ACCESS_KEY_ID'] = ''
# os.environ['AWS_SECRET_ACCESS_KEY'] = ''
# os.environ['AWS_SESSION_TOKEN'] = ''


## 설정

In [None]:
# Agent identifier; it is also the trailing "runtime/<id>" segment of the ARN below.
AGENT_ID = "strands_claude_eval"
# AWS region used for both the ARN and the service clients.
REGION = "us-east-1"
# ARN format: replace <YOUR_ACCOUNT_ID> with your actual AWS account ID.
# The region segment is taken from REGION so the ARN cannot drift out of sync
# with the region the clients are created in.
AGENT_ARN = f"arn:aws:bedrock-agentcore:{REGION}:<YOUR_ACCOUNT_ID>:runtime/{AGENT_ID}"

## 실험 설정

In [None]:
# Name attached to every evaluation submitted in this run.
EXPERIMENT_NAME = "my_experiment_v1"

# Evaluator selection:
#   None            -> run all 13 evaluators (comprehensive mode)
#   a list of names -> run only those evaluators, e.g. FLEXIBLE_EVALUATORS
#                      (that list is defined in a later cell, so run it first)
EXPERIMENT_EVALUATORS = None

# Evaluation scope; ignored while EXPERIMENT_EVALUATORS is None.
EXPERIMENT_SCOPE = "session"

# Delay in seconds: traces need at least 120 s to show up in AgentCore observability.
EXPERIMENT_DELAY = 120

# Session ID shared by the two prompts tagged turn 1 and turn 2 below.
planned_session = generate_session_id()

# One entry per agent invocation. "metadata" is optional but recommended for tracking.
# NOTE(review): an empty "session_id" presumably lets the helper start a new
# session for that prompt — confirm against utils.
EXPERIMENT_PROMPTS = [
    {
        "prompt": "What is 2 + 2?",
        "session_id": "",
        "metadata": {"category": "math"},
    },
    {
        "prompt": "What is the capital of France?",
        "session_id": "",
        "metadata": {"category": "geography"},
    },
    {
        "prompt": "Tell me about quantum physics",
        "session_id": "",
        "metadata": {"category": "science"},
    },
    {
        "prompt": "Hello, can you help me with math?",
        "session_id": planned_session,
        "metadata": {"turn": 1},
    },
    {
        "prompt": "What is 15 * 23?",
        "session_id": planned_session,
        "metadata": {"turn": 2},
    },
]

## Client 초기화

In [None]:
# boto3 client for the "bedrock-agentcore" service in the configured region.
agentcore_client = boto3.client("bedrock-agentcore", region_name=REGION)

# Evaluation client: custom class imported from the local utils module.
eval_client = EvaluationClient(region=REGION)

## 실험 실행

In [None]:
# Evaluators that can be used at both session and span scope.
FLEXIBLE_EVALUATORS = [
    f"Builtin.{name}"
    for name in (
        "Correctness",
        "Faithfulness",
        "Helpfulness",
        "ResponseRelevance",
        "Conciseness",
        "Coherence",
        "InstructionFollowing",
        "Refusal",
        "Harmfulness",
        "Stereotyping",
    )
]

# Evaluator that is only available at session scope.
SESSION_ONLY_EVALUATORS = [f"Builtin.{name}" for name in ("GoalSuccessRate",)]

# Evaluators that are only available at span scope (tool-related evaluation).
SPAN_ONLY_EVALUATORS = [
    f"Builtin.{name}"
    for name in (
        "ToolSelectionAccuracy",
        "ToolParameterAccuracy",
    )
]

In [None]:
# Run every configured prompt through the agent, evaluate it, and collect one
# record per prompt in batch_results: either
#   {"session_id", "prompt", "results"} on success, or
#   {"prompt", "error"} when the invocation/evaluation raised.
batch_results = []

# Number of evaluators that will run. When EXPERIMENT_EVALUATORS is unset the
# count is derived from the evaluator lists instead of a hard-coded 13, so the
# banner stays correct if those lists are edited.
# NOTE(review): assumes "run everything" covers exactly the three lists passed
# to invoke_and_evaluate below (10 + 1 + 2 = 13) — confirm against utils.
if EXPERIMENT_EVALUATORS:
    eval_count = len(EXPERIMENT_EVALUATORS)
else:
    eval_count = (
        len(FLEXIBLE_EVALUATORS)
        + len(SESSION_ONLY_EVALUATORS)
        + len(SPAN_ONLY_EVALUATORS)
    )
print(f"Experiment: {EXPERIMENT_NAME} | Prompts: {len(EXPERIMENT_PROMPTS)} | Evaluators: {eval_count}\n")

for config in EXPERIMENT_PROMPTS:
    prompt_text = config["prompt"]
    session_id = config.get("session_id", "")
    metadata = config.get("metadata", {})

    try:
        # Invoke the agent and run the evaluation (helper from the utils module).
        returned_session_id, content, results = invoke_and_evaluate(
            agentcore_client=agentcore_client,
            eval_client=eval_client,
            agent_arn=AGENT_ARN,
            agent_id=AGENT_ID,
            region=REGION,
            prompt=prompt_text,
            experiment_name=EXPERIMENT_NAME,
            session_id=session_id,
            metadata=metadata,
            evaluators=EXPERIMENT_EVALUATORS,
            scope=EXPERIMENT_SCOPE,
            delay=EXPERIMENT_DELAY,
            flexible_evaluators=FLEXIBLE_EVALUATORS,
            session_only_evaluators=SESSION_ONLY_EVALUATORS,
            span_only_evaluators=SPAN_ONLY_EVALUATORS,
        )

        # Render the first element of the agent response as Markdown.
        if content:
            display(Markdown(str(content[0])))

        batch_results.append({
            "session_id": returned_session_id,
            "prompt": prompt_text,
            "results": results,
        })

    except Exception as e:
        # Deliberate best-effort: one failing prompt must not abort the whole
        # batch, so the error is reported and recorded and the loop continues.
        print(f"Error: {e}\n")
        batch_results.append({"prompt": prompt_text, "error": str(e)})