In [None]:
%cd ./FinanceRAG
# Replace the path above with the location of the FinanceRAG-main folder on your own Drive.

In [None]:
!pip install pytrec_eval --quiet
!pip install datasets --quiet

In [3]:
import pytrec_eval
from typing import Dict, List, Tuple
import logging

# Logger named after the current module (`__name__`); used by evaluate().
logger = logging.getLogger(__name__)

def evaluate(
    qrels: Dict[str, Dict[str, int]],
    results: Dict[str, Dict[str, float]],
    k_values: List[int],
    ignore_identical_ids: bool = True
) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]]:
    """Score retrieval results against relevance judgements with pytrec_eval.

    For every cut-off ``k`` in ``k_values`` the NDCG, MAP, Recall and Precision
    values returned by ``pytrec_eval`` are averaged over the evaluated queries,
    rounded to 5 decimals, printed, and returned.

    Args:
        qrels: Relevance judgements, ``{query_id: {doc_id: relevance}}``.
        results: Retrieval output, ``{query_id: {doc_id: score}}``. This
            mapping is never modified by the function.
        k_values: Cut-offs at which each metric is computed.
        ignore_identical_ids: When true, a document whose id equals the id of
            its query is excluded from that query's results before scoring.

    Returns:
        A tuple ``(ndcg, map, recall, precision)`` of dictionaries keyed by
        ``"NDCG@k"``, ``"MAP@k"``, ``"Recall@k"`` and ``"P@k"`` respectively.
        If no query could be evaluated, every metric is ``0.0``.
    """
    # Keep only queries that have relevance judgements and, if requested, drop
    # identical query/document pairs. New inner dicts are built so the caller's
    # `results` stays untouched.
    filtered_results = {
        qid: {
            pid: score
            for pid, score in rels.items()
            if not (ignore_identical_ids and pid == qid)
        }
        for qid, rels in results.items()
        if qid in qrels
    }

    # Start every metric at 0.0 for each requested cut-off.
    ndcg = {f"NDCG@{k}": 0.0 for k in k_values}
    _map = {f"MAP@{k}": 0.0 for k in k_values}
    recall = {f"Recall@{k}": 0.0 for k in k_values}
    precision = {f"P@{k}": 0.0 for k in k_values}

    # pytrec_eval measure names take the cut-offs as a comma-separated suffix.
    cutoffs = ",".join(str(k) for k in k_values)
    measures = {
        "map_cut." + cutoffs,
        "ndcg_cut." + cutoffs,
        "recall." + cutoffs,
        "P." + cutoffs,
    }

    evaluator = pytrec_eval.RelevanceEvaluator(qrels, measures)
    scores = evaluator.evaluate(filtered_results)

    # Sum the per-query scores for each cut-off.
    for per_query in scores.values():
        for k in k_values:
            ndcg[f"NDCG@{k}"] += per_query["ndcg_cut_" + str(k)]
            _map[f"MAP@{k}"] += per_query["map_cut_" + str(k)]
            recall[f"Recall@{k}"] += per_query["recall_" + str(k)]
            precision[f"P@{k}"] += per_query["P_" + str(k)]

    num_queries = len(scores)
    if num_queries == 0:
        # Nothing was scored: leave the zero-initialised metrics as they are
        # instead of dividing by zero.
        logger.warning("No queries were evaluated; all metrics are reported as 0.0.")
    else:
        # Turn the sums into means over the evaluated queries.
        for k in k_values:
            ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"] / num_queries, 5)
            _map[f"MAP@{k}"] = round(_map[f"MAP@{k}"] / num_queries, 5)
            recall[f"Recall@{k}"] = round(recall[f"Recall@{k}"] / num_queries, 5)
            precision[f"P@{k}"] = round(precision[f"P@{k}"] / num_queries, 5)

    # Print each metric so the values show up in the notebook output.
    for _eval in (ndcg, _map, recall, precision):
        logger.info("\n")
        for name, value in _eval.items():
            print("{}: {:.4f}".format(name, value))

    return ndcg, _map, recall, precision

result_output 파일에서 test set에 대한 점수 계산

In [4]:
import pandas as pd
import json

In [None]:
# Read the ConvFinQA relevance judgements (qrels); per the original note, this
# TSV contains 30% of the answer labels.
df = pd.read_csv('../files/ConvFinQA_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/ConvFinQA/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)

In [None]:
# Read the FinanceBench relevance judgements (qrels); per the original note,
# this TSV contains 30% of the answer labels.
df = pd.read_csv('../files/FinanceBench_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/FinanceBench/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)

In [None]:
# Read the FinDER relevance judgements (qrels); per the original note, this
# TSV contains 30% of the answer labels.
df = pd.read_csv('../files/FinDER_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/FinDER/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)

In [None]:
# Read the FinQA relevance judgements (qrels); per the original note, this TSV
# contains 30% of the answer labels.
df = pd.read_csv('../files/FinQA_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/FinQA/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)

In [None]:
# Read the FinQABench relevance judgements (qrels); per the original note,
# this TSV contains 30% of the answer labels.
df = pd.read_csv('../files/FinQABench_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/FinQABench/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)

In [None]:
# Read the MultiHiertt relevance judgements (qrels); per the original note,
# this TSV contains 30% of the answer labels.
# NOTE(review): the qrels filename is spelled 'MultiHeirtt' while the results
# directory below is spelled 'MultiHiertt' - confirm both match the files on disk.
df = pd.read_csv('../files/MultiHeirtt_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/MultiHiertt/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)

In [None]:
# Read the TAT-QA relevance judgements (qrels); per the original note, this
# TSV contains 30% of the answer labels.
df = pd.read_csv('../files/TATQA_qrels.tsv', sep='\t')

# Reshape both tables into {query_id: {corpus_id: score}}, the nested-dict
# layout that evaluate() takes for `qrels` and `results`.
qrels_dict = (
    df.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)
df_submission = pd.read_csv('../results/TAT-QA/results_score.csv')
df_submission_dict = (
    df_submission.groupby('query_id')
    .apply(lambda group: dict(zip(group['corpus_id'], group['score'])))
    .to_dict()
)

# Score the submission at cut-offs 1, 5 and 10.
result = evaluate(
    qrels=qrels_dict,
    results=df_submission_dict,
    k_values=[1, 5, 10],
    ignore_identical_ids=True,
)