# Qwen Evaluation Notebook

생성된 결과(JSON)를 `eval.py` 로직과 동일하게 F1 평가하고 요약을 확인할 수 있는 노트북입니다.

## 1. 경로 설정

In [None]:
from pathlib import Path

# Resolve the working directory once; every other path hangs off it.
ROOT = Path.cwd().resolve()
# Directory holding the generated result JSON files.
# Example: outputs/feta_tab_qwen/qwen_visdmrag
EVAL_DIR = ROOT.joinpath('outputs', 'feta_tab_qwen', 'qwen_visdmrag')
# Bare expression so the notebook cell displays the resolved path.
EVAL_DIR

## 2. 평가 함수 (eval.py와 동일)

In [None]:
import json
import re
import string
from collections import Counter

import pandas as pd


def normalize_answer(s):
    """Return a canonical form of the answer string *s* for token comparison.

    The steps are applied in this order:

    1. lowercase the text;
    2. delete every character found in ``string.punctuation``;
    3. replace the standalone words "a", "an" and "the" with a space;
    4. collapse all runs of whitespace into single spaces and strip the ends.
    """
    lowered = s.lower()
    # Deleting (rather than spacing out) punctuation means "a-b" becomes "ab".
    punctuation_free = lowered.translate(str.maketrans("", "", string.punctuation))
    # Articles are swapped for a space so the neighbouring words stay separate.
    article_free = re.sub(r"\b(a|an|the)\b", " ", punctuation_free)
    return " ".join(article_free.split())


def word_tokenize(text):
    """Normalize *text* with ``normalize_answer`` and return its whitespace-separated tokens."""
    normalized = normalize_answer(text)
    return normalized.split()


def calculate_f1(prediction, ground_truth):
    """Compute the bag-of-tokens F1 score between two token sequences.

    Args:
        prediction: Iterable of predicted tokens.
        ground_truth: Iterable of reference tokens.

    Returns:
        The harmonic mean of precision and recall, where the overlap is the
        multiset intersection of the two token collections. Returns ``0``
        when there is no overlap or either side is empty.
    """
    predicted = Counter(prediction)
    reference = Counter(ground_truth)

    # Multiset intersection: each token counts min(predicted, reference) times.
    overlap = sum((predicted & reference).values())
    predicted_total = sum(predicted.values())
    reference_total = sum(reference.values())

    # Guard each ratio so an empty side yields 0 instead of dividing by zero.
    precision = overlap / predicted_total if predicted_total else 0
    recall = overlap / reference_total if reference_total else 0

    combined = precision + recall
    if combined == 0:
        return 0
    return 2 * precision * recall / combined


def evaluate_directory(path: Path):
    """Score every ``*.json`` file directly inside *path* with token-level F1.

    Each file is expected to contain a JSON object. The prediction is read
    from the ``'Answer'`` key (falling back to ``'answer'``) and the reference
    from ``'gt_answer'``; falsy values are treated as missing.

    Args:
        path: Directory containing the result JSON files.

    Returns:
        A ``pandas.DataFrame`` with columns ``file`` (file name) and ``f1``
        (the score, or ``None`` when the prediction or reference is missing),
        ordered by sorted file path. The two columns are present even when no
        JSON file is found, so downstream code can always select ``'f1'``.
    """
    records = []
    for json_file in sorted(path.glob('*.json')):
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default encoding, which breaks non-ASCII answers on some systems.
        with json_file.open(encoding='utf-8') as fh:
            data = json.load(fh)
        pred = str(data.get('Answer') or data.get('answer') or '')
        gt = str(data.get('gt_answer') or '')
        if not pred or not gt:
            # Nothing to compare against; keep the row but leave it unscored.
            score = None
        else:
            score = calculate_f1(word_tokenize(pred), word_tokenize(gt))
        records.append({'file': json_file.name, 'f1': score})
    # Explicit columns keep the schema stable for an empty directory, where
    # DataFrame([]) would otherwise have no 'f1' column at all.
    return pd.DataFrame(records, columns=['file', 'f1'])


## 3. 평가 실행

In [None]:
# Fail fast with a clear error rather than evaluating a path that cannot
# hold any result files.
if not EVAL_DIR.exists():
    raise FileNotFoundError(f'EVAL_DIR not found: {EVAL_DIR}')
if not EVAL_DIR.is_dir():
    # A regular file passes exists() but contains no JSON files to evaluate.
    raise NotADirectoryError(f'EVAL_DIR is not a directory: {EVAL_DIR}')

results_df = evaluate_directory(EVAL_DIR)
# Bare expression so the notebook cell renders the per-file scores.
results_df

## 4. 통계 요약

In [None]:
# Keep only the files that received a score (rows whose f1 is not missing).
valid_df = results_df.dropna(subset=['f1'])
count = len(valid_df)
# With no scored files, report NaN instead of averaging an empty column.
avg_f1 = float('nan') if valid_df.empty else valid_df['f1'].mean()
print(f'Average F1: {avg_f1:.4f} over {count} files')
# Bare expression so the notebook cell renders the summary statistics.
valid_df.describe()
