# Actor Simulator를 사용한 AgentCore 온라인 Evaluation

**Pipeline:** AgentCore Online Eval Setup → DatasetGenerator → ActorSimulator → Agent 호출 → AgentCore가 CloudWatch를 통해 평가

이 노트북:
1. `strands_eval_config.py`에서 설정을 로드합니다 (아래 import 셀의 `from strands_eval_config import *`)
2. 내장 메트릭을 사용하여 AgentCore 온라인 evaluation을 설정합니다
3. DatasetGenerator를 사용하여 테스트 케이스를 생성합니다
4. actor simulator를 실행하여 멀티턴 대화로 agent를 호출합니다
5. AgentCore가 CloudWatch를 통해 자동으로 추적 및 평가합니다

## 1. Import 및 설정

In [None]:
import boto3
import json
import os
from strands_evals import ActorSimulator, Case  # Strands evaluation 라이브러리
from strands_evals.generators import DatasetGenerator  # 테스트 케이스 자동 생성
from strands_eval_config import *  # 외부 설정 파일에서 변수 import

# Publish the configured region through the AWS_DEFAULT_REGION environment variable.
os.environ['AWS_DEFAULT_REGION'] = AWS_REGION
# Settings are star-imported from `strands_eval_config`, so the status message
# names that module rather than a file this notebook never imports.
print("Configuration loaded from strands_eval_config.py")

## 2. AgentCore 온라인 Evaluation 설정

In [None]:
# Control-plane client used to manage AgentCore online evaluation configs.
evaluation_client = boto3.client(
    'agentcore-evaluation-controlplane',
    region_name=AWS_REGION,
)

# Build the request as a plain mapping first so every section is visible
# before the call is made.
# NOTE(review): the '_3' suffix is hard-coded into the config name — confirm
# whether it is intentional or a leftover from repeated runs.
create_request = {
    "onlineEvaluationConfigName": EVAL_CONFIG_NAME + '_3',
    "description": EVAL_DESCRIPTION,
    "rule": {
        # Sampling percentage for the evaluation.
        "samplingConfig": {"samplingPercentage": SAMPLING_PERCENTAGE},
        # Session timeout, in minutes.
        "sessionConfig": {"sessionTimeoutMinutes": SESSION_TIMEOUT_MINUTES},
    },
    "dataSourceConfig": {
        # Data is collected from CloudWatch Logs.
        "cloudWatchLogs": {
            "logGroupNames": [LOG_GROUP_NAME],
            "serviceNames": [SERVICE_NAME],
        },
    },
    # One entry per evaluator id listed in EVALUATORS.
    "evaluators": [{"evaluatorId": name} for name in EVALUATORS],
    "evaluationExecutionRoleArn": EVALUATION_ROLE_ARN,
    # Enable the config as soon as it is created.
    "enableOnCreate": True,
}
create_config_response = evaluation_client.create_online_evaluation_config(**create_request)

# Read the new config back by id to report its current status.
config_id = create_config_response['onlineEvaluationConfigId']
config_details = evaluation_client.get_online_evaluation_config(
    onlineEvaluationConfigId=config_id,
)

print(f"Created config: {config_id}")
print(f"Status: {config_details['status']}")

## 3. AgentCore Runtime Client

In [None]:
agentcore_client = boto3.client('bedrock-agentcore', region_name=AWS_REGION)

def invoke_agentcore(user_message):
    """Send one user message to the AgentCore runtime and return the reply as text.

    Args:
        user_message: Prompt text, sent as the ``prompt`` field of a JSON payload.

    Returns:
        For ``text/event-stream`` responses, the payloads of all ``data: ``
        lines joined with newlines. Otherwise the JSON-decoded body rendered
        with ``str()``. An empty string when the response carries no content.

    Raises:
        json.JSONDecodeError: If a non-streaming body is not valid JSON.
    """
    boto3_response = agentcore_client.invoke_agent_runtime(
        agentRuntimeArn=AGENT_ARN,
        qualifier=QUALIFIER,
        payload=json.dumps({"prompt": user_message})
    )

    content = []
    if "text/event-stream" in boto3_response.get("contentType", ""):
        # Streaming response: keep only the payload of each "data: " line.
        # The default chunk size is used; the whole stream is collected before
        # returning, so reading one byte at a time only added overhead.
        for line in boto3_response["response"].iter_lines():
            if not line:
                continue
            line = line.decode("utf-8")
            if line.startswith("data: "):
                content.append(line[6:])  # strip the "data: " prefix
    else:
        # Non-streaming response: iterating the body yields raw byte chunks,
        # so a body larger than one chunk is split across several items.
        # Join them all before decoding; parsing only the first chunk fails
        # on truncated JSON.
        # NOTE(review): assumes the body is a single JSON document — confirm.
        raw_body = b"".join(boto3_response.get("response", []))
        if raw_body:
            content.append(json.loads(raw_body.decode("utf-8")))

    return "\n".join(str(c) for c in content)

## 4. 테스트 케이스 생성

In [None]:
# DatasetGenerator 초기화 (입력/출력 타입 모두 str)
generator = DatasetGenerator[str, str](str, str)

# Agent 특성을 설명하는 task description 생성
task_description = f"""
Task: {AGENT_CAPABILITIES}
Limitations: {AGENT_LIMITATIONS}
Available tools: {', '.join(AGENT_TOOLS)}
Complexity: {AGENT_COMPLEXITY}
"""

# 테스트 케이스 자동 생성 (비동기)
dataset = await generator.from_scratch_async(
    topics=AGENT_TOPICS,
    task_description=task_description,
    num_cases=NUM_TEST_CASES
)

print(f"Generated {len(dataset.cases)} test cases")

## 5. 테스트 케이스 미리보기

In [None]:
# Show each generated case with its 1-based position and expected output.
for i, case in enumerate(dataset.cases, start=1):
    print(
        f"\nCase {i}: {case.input}",
        f"Expected: {case.expected_output}",
        sep="\n",
    )

## 6. Task 함수 정의

In [None]:
def task_function(case: Case) -> str:
    """Play out one simulated conversation for ``case`` and return the agent's last reply.

    A user simulator built from the case supplies follow-up messages. Each
    message is sent to the agent through ``invoke_agentcore`` until the
    simulator reports that no further turn is available.

    Args:
        case: Test case whose ``input`` opens the conversation.

    Returns:
        The agent's response from the final turn, or an empty string when the
        simulator allows no turn at all.
    """
    # Simulated user for this case; max_turns is taken from MAX_TURNS.
    simulator = ActorSimulator.from_case_for_user_simulator(case=case, max_turns=MAX_TURNS)

    pending_message = case.input
    last_reply = ""

    print(f"\n{'='*80}")
    print(f"Test Case: {case.input}")
    print(f"Expected: {case.expected_output}")
    print(f"{'='*80}")

    turn_number = 0
    while simulator.has_next():
        turn_number += 1
        print(f"\nTurn {turn_number}: {pending_message}")
        last_reply = invoke_agentcore(pending_message)
        # Only the first 200 characters are echoed to keep the log readable.
        print(f"Agent: {last_reply[:200]}...")

        # Let the simulated user react to the reply and produce the next message.
        simulated_turn = simulator.act(last_reply)
        pending_message = str(simulated_turn.structured_output.message)

    return last_reply

## 7. Evaluation 실행

In [None]:
results = []

total_cases = len(dataset.cases)
banner = '#' * 80

# Run every generated case; a failing case is recorded and the loop continues.
for i, case in enumerate(dataset.cases, start=1):
    print(f"\n\n{banner}")
    print(f"# Running Test Case {i}/{total_cases}")
    print(banner)

    try:
        response = task_function(case)
    except Exception as e:
        # Record the failure text in place of the agent's answer.
        print(f"ERROR: {e}")
        outcome = {"actual": str(e), "status": "error"}
    else:
        outcome = {"actual": response, "status": "success"}

    results.append({
        "case_number": i,
        "input": case.input,
        "expected": case.expected_output,
        **outcome,
    })

# Summary of how many cases ran and how they ended.
success_count = sum(r['status'] == 'success' for r in results)
error_count = sum(r['status'] == 'error' for r in results)
print(f"\n\nCompleted {len(results)} test cases")
print(f"Successful: {success_count}")
print(f"Errors: {error_count}")

## 8. 결과 보기

In [None]:
import pandas as pd

# Tabulate the per-case records collected in `results`.
df = pd.DataFrame(results)
# Bare expression as the last statement so the notebook cell displays the table.
df

# 끝