In [3]:
from datasets import Dataset
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from ragas import evaluate
import pandas as pd

# Load the comparison sheet. Only the first N_EVAL_ROWS rows are scored;
# presumably this bounds the number of LLM calls ragas makes — TODO confirm.
N_EVAL_ROWS = 15
df = pd.read_excel("모델 비교.xlsx").head(N_EVAL_ROWS)


def build_eval_dataset(frame, answer_column, context_column=None):
    """Build the ragas input dataset for one model's answers.

    Args:
        frame: DataFrame holding a "질문" (question) column and one answer
            column per model.
        answer_column: Name of the column whose text is scored as the answer.
        context_column: Optional name of a column holding the retrieved
            context for each row. When omitted, each answer is used as its
            own single context, which is what this cell has always done.

    Returns:
        A ``datasets.Dataset`` with "question", "answer" and "contexts"
        columns (any other columns of ``frame`` are carried along unchanged).

    Raises:
        KeyError: If a required column is absent. ``DataFrame.rename``
            silently ignores unknown keys, so without this check a wrong
            header only surfaces later as an opaque ``KeyError: 'answer'``.
    """
    required = ["질문", answer_column]
    if context_column is not None:
        required.append(context_column)
    missing = [name for name in required if name not in frame.columns]
    if missing:
        raise KeyError(f"evaluation sheet is missing expected columns: {missing}")

    renamed = frame.rename(columns={"질문": "question", answer_column: "answer"})
    # ragas expects `contexts` to be a list of strings per row.
    context_source = renamed["answer"] if context_column is None else frame[context_column]
    return Dataset.from_pandas(
        renamed.assign(contexts=[[text] for text in context_source])
    )


# NOTE(review): no retrieved-context column is passed below, so every answer
# is compared against itself. `faithfulness` is therefore close to 1.0 by
# construction and must not be read as a grounding score. Pass
# `context_column=` once the sheet carries the retrieved passages.
chainrouter_dataset = build_eval_dataset(df, "ChainRouter")
agent_dataset = build_eval_dataset(df, "Agent")

# Run the evaluation
metrics = [faithfulness, answer_relevancy]
chainrouter_result = evaluate(chainrouter_dataset, metrics=metrics)
agent_result = evaluate(agent_dataset, metrics=metrics)

print("🔹 ChainRouter 결과:", chainrouter_result)
print("🔹 Agent 결과:", agent_result)

Evaluating: 100%|██████████| 30/30 [00:38<00:00,  1.27s/it]
Evaluating: 100%|██████████| 30/30 [00:30<00:00,  1.00s/it]


🔹 ChainRouter 결과: {'faithfulness': 0.9889, 'answer_relevancy': 0.6942}
🔹 Agent 결과: {'faithfulness': 0.9833, 'answer_relevancy': 0.6946}


In [8]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_relevancy
import evaluate as hf_evaluate

# Load the evaluation spreadsheet.
# NOTE(review): "Ground_thruth" (sic) is kept exactly as this cell has always
# read it; presumably it matches the header in aa.xlsx. Fix the spelling in
# the sheet and here together.
REFERENCE_COLUMN = "Ground_thruth"
MODEL_COLUMNS = ["ChainRouter", "Agent"]

raw_df = pd.read_excel("aa.xlsx")
missing_columns = [
    name
    for name in ["question", REFERENCE_COLUMN] + MODEL_COLUMNS
    if name not in raw_df.columns
]
if missing_columns:
    raise KeyError(f"aa.xlsx is missing expected columns: {missing_columns}")

# Empty cells load as float NaN, which is not text and cannot be scored.
# Drop those rows up front so both models are scored on the same rows, and
# reset the index so Dataset.from_pandas does not carry it as an extra column.
df = raw_df.dropna(subset=MODEL_COLUMNS + [REFERENCE_COLUMN]).reset_index(drop=True)


def _build_scored_dataset(frame, answer_column):
    """Return the ragas dataset for one model's answers.

    The model column becomes "answer", the reference column becomes
    "ground_truth", and each answer is wrapped as its own single-item
    ``contexts`` list (no retrieved context is available in this sheet).
    """
    renamed = frame.rename(columns={
        answer_column: "answer",
        REFERENCE_COLUMN: "ground_truth",
    })
    return Dataset.from_pandas(
        renamed.assign(contexts=[[text] for text in renamed["answer"]])
    )


def _print_bertscore(title, scores):
    """Print the mean precision / recall / F1 of one BERTScore result.

    ``scores`` is the dict returned by ``bertscore.compute``; its
    "precision", "recall" and "f1" entries are per-example lists. An empty
    list is reported as ``nan`` instead of raising ZeroDivisionError.
    """
    print(title)
    for label, key in (("Precision: ", "precision"),
                       ("Recall:    ", "recall"),
                       ("F1:        ", "f1")):
        values = scores[key]
        mean = sum(values) / len(values) if values else float("nan")
        print(f"{label}{mean:.4f}")


# Datasets for the two systems under comparison
chain_ds = _build_scored_dataset(df, "ChainRouter")
agent_ds = _build_scored_dataset(df, "Agent")

# ragas evaluation
metrics = [answer_correctness, answer_relevancy]
chain_result = evaluate(chain_ds, metrics=metrics)
agent_result = evaluate(agent_ds, metrics=metrics)

print("🔹 ChainRouter (ragas)")
print(chain_result)

print("\n🔹 Agent (ragas)")
print(agent_result)

# -----------------------
# BERTScore evaluation
# -----------------------
bertscore = hf_evaluate.load("bertscore")

# Extract the texts to score; astype(str) guards against numeric-looking cells.
chain_preds = df["ChainRouter"].astype(str).tolist()
agent_preds = df["Agent"].astype(str).tolist()
references = df[REFERENCE_COLUMN].astype(str).tolist()

chain_bertscore = bertscore.compute(predictions=chain_preds, references=references, lang="ko")
agent_bertscore = bertscore.compute(predictions=agent_preds, references=references, lang="ko")

# Report the averaged scores
_print_bertscore("\n🔹 ChainRouter (BERTScore)", chain_bertscore)
_print_bertscore("\n🔹 Agent (BERTScore)", agent_bertscore)


Evaluating: 100%|██████████| 14/14 [00:23<00:00,  1.68s/it]
Evaluating: 100%|██████████| 14/14 [00:26<00:00,  1.93s/it]


🔹 ChainRouter (ragas)
{'answer_correctness': 0.3977, 'answer_relevancy': 0.7286}

🔹 Agent (ragas)
{'answer_correctness': 0.5776, 'answer_relevancy': 0.6236}


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



🔹 ChainRouter (BERTScore)
Precision: 0.6000
Recall:    0.6777
F1:        0.6345

🔹 Agent (BERTScore)
Precision: 0.6418
Recall:    0.7301
F1:        0.6823


In [10]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_correctness, answer_relevancy
import evaluate as hf_evaluate

# Read the comparison sheet, then discard every row that has a missing value
# in either model's answer column or in the reference column.
comparison_sheet = pd.read_excel("모델 비교.xlsx")
df = comparison_sheet.dropna(
    subset=[
        "ChainRouter",
        "Agent",
        "Ground_Truth",
    ]
)

# Build the dataset ragas consumes for one model's answers
def make_dataset(answer_column, frame=None):
    """Return a ragas-ready ``Dataset`` for the answers in ``answer_column``.

    Args:
        answer_column: Name of the column holding the model's answers; it is
            exposed to ragas as "answer".
        frame: DataFrame to read from. Defaults to the notebook-level ``df``
            so existing ``make_dataset("ChainRouter")`` calls keep working,
            while allowing the function to be used without hidden state.

    Returns:
        A ``datasets.Dataset`` in which "질문" is renamed to "question",
        ``answer_column`` to "answer", "Ground_Truth" to "ground_truth", and
        a ``contexts`` column holds each answer wrapped in a one-item list.

    Raises:
        KeyError: If ``answer_column`` is not a column of the frame.
            ``DataFrame.rename`` ignores unknown keys, so without this check
            the failure would be an unhelpful ``KeyError: 'answer'``.
    """
    source = df if frame is None else frame
    if answer_column not in source.columns:
        raise KeyError(f"column {answer_column!r} not found in the evaluation frame")

    renamed = source.rename(columns={
        "질문": "question",
        answer_column: "answer",
        "Ground_Truth": "ground_truth",
    })
    # No retrieved context is available here, so each answer stands in as
    # its own single context.
    with_contexts = renamed.assign(contexts=[[text] for text in renamed["answer"]])
    # preserve_index=False: a frame filtered with dropna may no longer have a
    # RangeIndex, and from_pandas would then store that index as an extra
    # "__index_level_0__" column.
    return Dataset.from_pandas(with_contexts, preserve_index=False)

# One dataset per system under comparison
chain_ds, agent_ds = make_dataset("ChainRouter"), make_dataset("Agent")

# Score both datasets with the same pair of ragas metrics
metrics = [answer_correctness, answer_relevancy]
chain_result = evaluate(chain_ds, metrics=metrics)
agent_result = evaluate(agent_ds, metrics=metrics)

# Print each result under its heading (ChainRouter first, then Agent)
for heading, scores in (
    ("🔹 ChainRouter (ragas 평가)", chain_result),
    ("\n🔹 Agent (ragas 평가)", agent_result),
):
    print(heading)
    print(scores)

# -----------------------------
# BERTScore evaluation
# -----------------------------
bertscore = hf_evaluate.load("bertscore")

# Predictions of each system and the shared references, as plain lists
references = df["Ground_Truth"].tolist()
chain_preds = df["ChainRouter"].tolist()
agent_preds = df["Agent"].tolist()

chain_bertscore = bertscore.compute(
    predictions=chain_preds,
    references=references,
    lang="ko",
)
agent_bertscore = bertscore.compute(
    predictions=agent_preds,
    references=references,
    lang="ko",
)

# For each system, print the arithmetic mean of the per-example
# precision, recall and F1 values.
for heading, scores in (
    ("\n🔹 ChainRouter (BERTScore)", chain_bertscore),
    ("\n🔹 Agent (BERTScore)", agent_bertscore),
):
    print(heading)
    for label, key in (
        ("Precision: ", "precision"),
        ("Recall:    ", "recall"),
        ("F1:        ", "f1"),
    ):
        values = scores[key]
        print(f"{label}{sum(values)/len(values):.4f}")


Evaluating: 100%|██████████| 30/30 [00:30<00:00,  1.00s/it]
Evaluating: 100%|██████████| 30/30 [00:36<00:00,  1.21s/it]


🔹 ChainRouter (ragas 평가)
{'answer_correctness': 0.3336, 'answer_relevancy': 0.7452}

🔹 Agent (ragas 평가)
{'answer_correctness': 0.3522, 'answer_relevancy': 0.6954}

🔹 ChainRouter (BERTScore)
Precision: 0.6188
Recall:    0.6722
F1:        0.6437

🔹 Agent (BERTScore)
Precision: 0.6167
Recall:    0.6759
F1:        0.6442
