In [None]:
# score_submission.ipynb
# 제출 파일(submission)과 정답(answer)을 비교해 character-level micro F1을 계산합니다.

In [1]:
from pathlib import Path
import pandas as pd

In [2]:
def num_same_chars(prediction: str, answer: str) -> int:
    """Count the positions at which both strings hold the same character.

    Characters are compared index by index; the comparison stops at the end
    of the shorter string, so surplus characters in the longer one never
    count as matches.

    Args:
        prediction: Predicted text.
        answer: Reference text.

    Returns:
        Number of aligned positions whose characters are equal.
    """
    # zip() truncates to the shorter input, which mirrors iterating over
    # range(min(len(prediction), len(answer))).
    return sum(pred_char == ans_char for pred_char, ans_char in zip(prediction, answer))


def f1_from_counts(num_same: int, pred_len: int, answer_len: int) -> float:
    """Turn aggregated character counts into an F1 score.

    Args:
        num_same: Number of matching characters.
        pred_len: Total number of predicted characters.
        answer_len: Total number of reference characters.

    Returns:
        ``1.0`` when both lengths are zero, ``0.0`` when exactly one length is
        zero or when nothing matched, otherwise the harmonic mean of
        precision (``num_same / pred_len``) and recall
        (``num_same / answer_len``).
    """
    # Either side empty: a perfect score only if the other side is empty too.
    if pred_len == 0 or answer_len == 0:
        return 1.0 if pred_len == answer_len else 0.0

    p = num_same / pred_len
    r = num_same / answer_len
    denominator = p + r
    # Guard the division below when there were no matches at all.
    if denominator == 0:
        return 0.0
    return 2 * p * r / denominator


In [None]:
# Set `submission_path` to the submission file you want to score.
# (The right-hand side used to be commented out, which made this cell a
# SyntaxError; the previously commented-out path is now the default.)
submission_path = Path('submission/submission.csv')
answer_path = Path('data/answer.csv')

# Column names and file encoding passed to the scoring call.
id_col = 'ID'
pred_col = 'output'
answer_col = 'output'
encoding = 'utf-8-sig'


In [5]:
def score_submission(
    submission_path: Path | str,
    answer_path: Path | str,
    *,
    id_col: str = 'ID',
    pred_col: str = 'output',
    answer_col: str = 'output',
    encoding: str = 'utf-8-sig',
) -> tuple[int, float]:
    submission_path = Path(submission_path)
    answer_path = Path(answer_path)

    sub_df = pd.read_csv(submission_path, encoding=encoding)
    ans_df = pd.read_csv(answer_path, encoding=encoding)

    for path, df in [(submission_path, sub_df), (answer_path, ans_df)]:
        if id_col not in df.columns:
            raise ValueError(
                f'Missing column {id_col!r} in {path}. Columns: {df.columns.tolist()}'
            )

    if pred_col not in sub_df.columns:
        raise ValueError(
            f'Missing prediction column {pred_col!r} in {submission_path}. '
            f'Columns: {sub_df.columns.tolist()}'
        )
    if answer_col not in ans_df.columns:
        raise ValueError(
            f'Missing answer column {answer_col!r} in {answer_path}. '
            f'Columns: {ans_df.columns.tolist()}'
        )

    sub_df = sub_df[[id_col, pred_col]].copy()
    ans_df = ans_df[[id_col, answer_col]].copy()

    if sub_df[id_col].isna().any():
        raise ValueError(f'{submission_path} contains empty IDs.')
    if ans_df[id_col].isna().any():
        raise ValueError(f'{answer_path} contains empty IDs.')

    if sub_df[id_col].duplicated().any():
        dup_examples = (
            sub_df.loc[sub_df[id_col].duplicated(), id_col].astype(str).head(10).tolist()
        )
        raise ValueError(f'Duplicate IDs found in {submission_path}. Examples: {dup_examples}')
    if ans_df[id_col].duplicated().any():
        dup_examples = (
            ans_df.loc[ans_df[id_col].duplicated(), id_col].astype(str).head(10).tolist()
        )
        raise ValueError(f'Duplicate IDs found in {answer_path}. Examples: {dup_examples}')

    sub_df[id_col] = sub_df[id_col].astype(str)
    ans_df[id_col] = ans_df[id_col].astype(str)

    sub_ids = set(sub_df[id_col])
    ans_ids = set(ans_df[id_col])
    missing = ans_ids - sub_ids
    extra = sub_ids - ans_ids
    if missing or extra:
        parts: list[str] = ['ID mismatch between submission and answer.']
        if missing:
            preview = sorted(list(missing))[:10]
            parts.append(f'- Missing in submission: {len(missing)} (e.g. {preview})')
        if extra:
            preview = sorted(list(extra))[:10]
            parts.append(f'- Extra in submission: {len(extra)} (e.g. {preview})')
        raise ValueError('\n'.join(parts))

    sub_df = sub_df.rename(columns={pred_col: 'prediction'})
    ans_df = ans_df.rename(columns={answer_col: 'answer'})

    merged = ans_df.merge(sub_df, on=id_col, how='inner', validate='one_to_one')
    if len(merged) != len(ans_df):
        raise ValueError(
            'Internal error: merged rows != answer rows '
            f'({len(merged)} != {len(ans_df)}).'
        )

    merged['prediction'] = merged['prediction'].fillna('').astype(str)
    merged['answer'] = merged['answer'].fillna('').astype(str)

    total_same = 0
    total_pred_len = 0
    total_answer_len = 0

    for prediction, answer in zip(merged['prediction'].tolist(), merged['answer'].tolist()):
        pred_len = len(prediction)
        answer_len = len(answer)
        num_same = num_same_chars(prediction, answer)

        total_same += num_same
        total_pred_len += pred_len
        total_answer_len += answer_len

    f1 = f1_from_counts(total_same, total_pred_len, total_answer_len)
    return len(merged), f1


In [6]:
# Score the configured submission against the answer file and report the
# number of scored rows together with the F1 value.
rows, f1 = score_submission(
    submission_path, answer_path,
    id_col=id_col, pred_col=pred_col, answer_col=answer_col,
    encoding=encoding,
)

print(f'Rows: {rows}', f'F1: {f1:.6f}', sep='\n')


Rows: 1263
F1: 0.971315
