# Extraction

In [13]:
import re
import pandas as pd

# Location of the pipeline log to parse
log_path = "/workspace/jke/pipeline/pipeline_results_qwen_final_exprement_50.log"

# Load the whole log at once so the multi-line regex can scan across line breaks
with open(log_path, "r", encoding="utf-8") as f:
    content = f.read()

# A record is "Question: ..." followed by "Final Answer: ...", where the answer
# runs up to (but not including) the next newline that is followed by "=" characters
pattern = re.compile(r"Question:\s*(.*?)\nFinal Answer:\s*(.*?)(?=\n=+)", re.S)
matches = pattern.findall(content)

# Whitespace-trimmed columns, kept in the order the records appear in the log
questions = [question.strip() for question, _ in matches]
answers = [answer.strip() for _, answer in matches]

# Assemble the two columns into a table
df = pd.DataFrame({"Question": questions, "Final Answer": answers})

# Write the table out as CSV (UTF-8 with BOM, no index column)
output_path = "predict_50.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

# Evaluation

In [2]:
# %pip installs into the environment of the running kernel (a bare !pip may hit another interpreter);
# the version is pinned to the one recorded in this cell's output for reproducibility.
%pip install bert-score==0.3.13

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13
[0m

In [3]:
# %pip installs into the environment of the running kernel (a bare !pip may hit another interpreter);
# the version is pinned to the one recorded in this cell's output for reproducibility.
%pip install rouge-score==0.1.2

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py (from rouge-score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge-score
[33m  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=5b4a8b7e078650373a3ff9bdf768

In [15]:
import re
import json
import pandas as pd
from bert_score import score
from rouge_score import rouge_scorer

# 간단 문자열 정규화 (공백 압축 + 따옴표/공백 제거)
_ws = re.compile(r"\s+")
def _norm(s: pd.Series) -> pd.Series:
    s = s.astype(str).fillna("")
    s = s.map(lambda x: x.strip().strip('"').strip("'"))
    s = s.map(lambda x: _ws.sub(" ", x))
    return s.str.strip()

def eval_from_two_csv(
    gt_path: str,
    pred_path: str,
    merge_on: str = "id",
    lang: str = "ko",
    use_xlm: bool = True,
    save_path: str = "evaluation_results.json"  # output path; ".json" and ".csv" extensions are written
):
    """Score a prediction CSV against a ground-truth CSV and save the metrics.

    The two files are inner-joined on ``merge_on``. The joined rows are scored
    with BERTScore, normalized case-insensitive exact match, and ROUGE-1/2/L
    (precision, recall, F1), each averaged over all joined rows.

    Args:
        gt_path: CSV that provides the ``ground_truth`` column.
        pred_path: CSV that provides the ``Final Answer`` column.
        merge_on: Column present in both CSVs that is used as the join key.
        lang: Language code passed to BERTScore when ``use_xlm`` is False.
        use_xlm: If True, BERTScore uses the ``xlm-roberta-large`` model
            instead of the default model for ``lang``.
        save_path: Where to write the metrics. Paths ending in ``.json`` or
            ``.csv`` are written; any other extension is reported and skipped.

    Returns:
        dict mapping metric name to its mean value: ``bert_score_precision``,
        ``bert_score_recall``, ``bert_score_f1``, ``exact_match_accuracy`` and
        ``rouge{1,2,L}_{precision,recall,f1}``.

    Raises:
        KeyError: If the joined frame lacks ``ground_truth`` or ``Final Answer``.
        ValueError: If the inner join produces no rows.
    """
    # ===== 1) Load both CSVs and join them =====
    df_gt = pd.read_csv(gt_path)
    df_pred = pd.read_csv(pred_path)

    df = pd.merge(df_gt, df_pred, on=merge_on, how="inner")

    if "ground_truth" not in df.columns:
        raise KeyError("ground_truth 컬럼이 필요합니다 (test.csv).")
    if "Final Answer" not in df.columns:
        raise KeyError("prediction CSV에 'Final Answer' 컬럼이 필요합니다.")
    if df.empty:
        # Without this guard the averages below divide by zero.
        raise ValueError(
            f"No rows left after merging {gt_path!r} and {pred_path!r} on {merge_on!r}."
        )

    # Fill missing values BEFORE casting to str; casting first can turn them
    # into the literal text "nan", which would then be scored as a real answer.
    gt_text = df["ground_truth"].fillna("").astype(str)
    pred_text = df["Final Answer"].fillna("").astype(str)
    gts = gt_text.tolist()
    preds = pred_text.tolist()

    # ===== 2) BERTScore (candidates first, references second) =====
    if use_xlm:
        P, R, F1 = score(preds, gts, model_type="xlm-roberta-large")
    else:
        P, R, F1 = score(preds, gts, lang=lang)

    # ===== 3) Exact match on normalized, lower-cased text =====
    gt_norm = _norm(gt_text).str.lower()
    pr_norm = _norm(pred_text).str.lower()
    exact_match = (gt_norm == pr_norm).astype(int)
    exact_match_accuracy = float(exact_match.mean())

    # ===== 4) ROUGE (precision, recall, F1) =====
    # NOTE(review): rouge_score's default tokenizer appears to keep only ASCII
    # letters and digits, so non-Latin text (e.g. Korean) may be dropped before
    # scoring; confirm before relying on these numbers for lang="ko".
    rouge_keys = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=True)

    rouge_scores = {k: {"p": [], "r": [], "f": []} for k in rouge_keys}

    for pred, gt in zip(preds, gts):
        scores = scorer.score(gt, pred)  # signature is score(target, prediction)
        for k in rouge_keys:
            rouge_scores[k]["p"].append(scores[k].precision)
            rouge_scores[k]["r"].append(scores[k].recall)
            rouge_scores[k]["f"].append(scores[k].fmeasure)

    n_rows = len(preds)  # > 0, guaranteed by the empty-merge check above
    rouge_summary = {}
    for k, parts in rouge_scores.items():
        rouge_summary[f"{k}_precision"] = sum(parts["p"]) / n_rows
        rouge_summary[f"{k}_recall"]    = sum(parts["r"]) / n_rows
        rouge_summary[f"{k}_f1"]        = sum(parts["f"]) / n_rows

    # ===== Result dict =====
    results = {
        "bert_score_precision": P.mean().item(),
        "bert_score_recall": R.mean().item(),
        "bert_score_f1": F1.mean().item(),
        "exact_match_accuracy": exact_match_accuracy,
        **rouge_summary
    }

    # ===== Save results =====
    if save_path.endswith(".json"):
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
    elif save_path.endswith(".csv"):
        pd.DataFrame([results]).to_csv(save_path, index=False, encoding="utf-8-sig")
    else:
        # Nothing was written, so do not claim the results were saved.
        print(f"Unsupported save_path extension, results were not written: {save_path}")
        return results

    print(f"평가 결과 저장 완료 → {save_path}")
    return results


import os

# Prediction CSVs to evaluate
prediction_files = [f"predict_{size}.csv" for size in (1, 5, 10, 20, 50)]

# Ground-truth CSV used for every evaluation
gt_file = "test.csv"

all_results = []

for pred_file in prediction_files:
    # Per-file output name (e.g. eval_results_predict_1.csv)
    base_name, _ext = os.path.splitext(os.path.basename(pred_file))
    save_csv = "eval_results_" + base_name + ".csv"

    results = eval_from_two_csv(
        gt_file,
        pred_file,
        merge_on="question",  # change to "id" to join on an id column instead
        save_path=save_csv,
    )

    # Record which prediction file these metrics belong to
    results.update(prediction_file=pred_file)
    all_results.append(results)

# Combine all per-file metrics into one summary CSV
summary_path = "eval_results_summary.csv"
summary_table = pd.DataFrame(all_results)
summary_table.to_csv(summary_path, index=False, encoding="utf-8-sig")
print(f"전체 결과 요약 저장 완료 → {summary_path}")



평가 결과 저장 완료 → eval_results_predict_1.csv
평가 결과 저장 완료 → eval_results_predict_5.csv
평가 결과 저장 완료 → eval_results_predict_10.csv
평가 결과 저장 완료 → eval_results_predict_20.csv
평가 결과 저장 완료 → eval_results_predict_50.csv
전체 결과 요약 저장 완료 → eval_results_summary.csv


In [16]:
import pandas as pd

# Reload the combined metrics table from the summary CSV written by the evaluation cell
results = pd.read_csv("eval_results_summary.csv")

In [19]:
# Display the summary table (one row per prediction file)
results

Unnamed: 0,bert_score_precision,bert_score_recall,bert_score_f1,exact_match_accuracy,rouge1_precision,rouge1_recall,rouge1_f1,rouge2_precision,rouge2_recall,rouge2_f1,rougeL_precision,rougeL_recall,rougeL_f1,prediction_file
0,0.827405,0.863926,0.844023,0.151246,0.195149,0.470792,0.199681,0.051308,0.10577,0.053191,0.194412,0.469515,0.19921,predict_1.csv
1,0.825578,0.864199,0.843278,0.161922,0.216157,0.503312,0.218037,0.075978,0.141788,0.075283,0.21561,0.501516,0.217499,predict_5.csv
2,0.832851,0.870039,0.849894,0.197509,0.252376,0.538728,0.255308,0.078885,0.139305,0.075671,0.251375,0.537083,0.254545,predict_10.csv
3,0.837923,0.875247,0.854887,0.204626,0.292003,0.574528,0.294936,0.093867,0.145926,0.088286,0.28924,0.569557,0.29205,predict_20.csv
4,0.843506,0.876638,0.858485,0.256228,0.347771,0.613358,0.349879,0.09805,0.148343,0.096073,0.343115,0.607663,0.345485,predict_50.csv
