In [68]:
import pandas as pd
import numpy as np
from utils1.retreiver_setting import faiss_retriever_loading
from utils1.chain_setting import create_chain
from transformers import BertTokenizer
from rank_bm25 import BM25Okapi

account_retriever, business_retriever, business_retriever2, self_retriever = faiss_retriever_loading()
simple_chain, classification_chain, account_chain, extract_chain, business_chain, hybrid_chain, financial_chain = create_chain()

In [69]:
from transformers import BertTokenizer
from rank_bm25 import BM25Okapi

# 한국어 형태소 분석기
def preprocess(text):
    tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base')
    tokens = tokenizer.tokenize(text)  # BERT tokenizer로 단어 분리
    return tokens

# BM25 계산 함수
def calculate_bm25(query, documents):
    # 문서 토큰화 (BERT tokenizer 사용)
    corpus = [preprocess(doc.page_content) for doc in documents]

    # BM25 모델 적용
    bm25 = BM25Okapi(corpus)

    # 쿼리 토큰화 (BERT tokenizer 사용)
    query_tokens = preprocess(query)  # 쿼리도 형태소 분석
    scores = bm25.get_scores(query_tokens)

    return scores

# BM25 검색 함수
def bm25_search(query, top_k=5):
    documents = account_retriever.invoke(query)  # 문서 검색
    bm25_scores = calculate_bm25(query, documents)  # BM25 점수 계산

    # 문서 ID와 내용만 포함한 결과 반환
    sorted_docs = sorted(zip(documents, bm25_scores), key=lambda x: x[1], reverse=True)[:top_k]
    return [{'id': doc.id, 'content': doc.page_content} for doc, _ in sorted_docs]  # 문서 객체가 아닌, 문서 ID와 내용을 반환

In [70]:
def compute_metrics(predicted, relevant_dict, k=5):
    """
    Precision@k, Recall@k, MRR, AP 계산
    """
    # Precision@k: 상위 k 중 관련(grade>=1) 문서 비율
    hits = sum([1 for doc in predicted[:k] if doc['id'] in relevant_dict])  # 'id'로 문서 ID 비교
    precision = hits / k

    # Recall@k: 관련 문서 총 개수 대비 상위 k 중 회수된 관련 개수
    total_relevant = len(relevant_dict)
    recall = hits / total_relevant if total_relevant > 0 else 0

    # MRR: 첫 번째 관련 문서 위치 기반
    rr = 0
    for idx, doc in enumerate(predicted):
        if doc['id'] in relevant_dict:  # 'id'로 문서 ID 비교
            rr = 1 / (idx + 1)
            break

    # 단일 AP 계산 (MAP를 위해)
    num_correct = 0
    precisions = []
    for i, doc in enumerate(predicted[:k]):
        if doc['id'] in relevant_dict:  # 'id'로 문서 ID 비교
            num_correct += 1
            precisions.append(num_correct / (i + 1))
    ap = np.mean(precisions) if precisions else 0

    return precision, recall, rr, ap


def evaluate_all(method_results, queries, k=5):
    """
    모든 쿼리에 대해 성능 평가를 수행하고 평균을 계산합니다.
    """
    prec_list, rec_list, rr_list, ap_list = [], [], [], []

    for query in queries:
        qid = query['query_id']
        relevant_dict = parse_relevant(query['relevant_doc_ids'])
        predicted = method_results[qid]

        p, r, rr, ap = compute_metrics(predicted, relevant_dict, k)

        prec_list.append(p)
        rec_list.append(r)
        rr_list.append(rr)
        ap_list.append(ap)

    # 평균 지표 반환
    return {
        'P@k': np.mean(prec_list),
        'R@k': np.mean(rec_list),
        'MRR': np.mean(rr_list),
        'MAP': np.mean(ap_list)
    }

In [71]:
# 예시로 주어진 queries 리스트
queries = [
    {"query_id": "Q1", "query_text": "기업회계기준서 제1109호 금융상품 관련", "relevant_doc_ids": "a9792da2-2636-400e-a37b-6d7ce7547778=1"},
    {"query_id": "Q2", "query_text": "사업결합 관련 기업회계기준서", "relevant_doc_ids": "27da9efc-1aa5-4ab3-98f0-3a0e10ba2b9c=1;256e05b0-43a5-43d4-b696-7c7405abc463=2"},
    {"query_id": "Q3", "query_text": "회계정책과 회계추정치 변경 관련", "relevant_doc_ids": "19df546f-a4ce-4b40-8971-2730cc6e24f4=1;256e05b0-43a5-43d4-b696-7c7405abc463=2"},
    {"query_id": "Q4", "query_text": "농림어업 관련 회계기준서", "relevant_doc_ids": "f15bf88e-1f13-44d2-88fb-0ea1f67633cc=1;5e47c132-1b65-4476-bdac-6fc5b4089fea=2"}
]


def handle_accounting_non_bm25(question: str) -> list:
    """
    BM25를 사용하지 않고 회계 질문에 답변하는 함수
    """
    # 문서 가져오기 (BM25 미적용, 상위 5개 문서만 가져옴)
    docs = account_retriever.invoke(question)
    docs = docs[:5]  # BM25 적용하지 않으면 그냥 상위 5개 문서
    return [{'id': doc.id, 'content': doc.page_content} for doc in docs]

def parse_relevant(relevant_doc_ids):
    return {doc_id.split('=')[0] for doc_id in relevant_doc_ids.split(';')}

# BM25 계산 함수 예시
from rank_bm25 import BM25Okapi

def calculate_bm25(query, documents):
    corpus = [doc.page_content.split() for doc in documents]
    bm25 = BM25Okapi(corpus)
    query_tokens = query.split()  # query를 토큰화
    scores = bm25.get_scores(query_tokens)
    return scores


# 평가
bm25_results = {}
for query in queries:
    qid = query['query_id']
    query_text = query['query_text']
    bm25_results[qid] = bm25_search(query_text, top_k=5)

# Non-BM25 평가
non_bm25_results = {}
for query in queries:
    qid = query['query_id']
    query_text = query['query_text']
    non_bm25_results[qid] = handle_accounting_non_bm25(query_text)

bm25_evaluation = evaluate_all(bm25_results, queries, k=5)
non_bm25_evaluation = evaluate_all(non_bm25_results, queries, k=5)

print("BM25 Evaluation:", bm25_evaluation)
print("Non-BM25 Evaluation:", non_bm25_evaluation)

BM25 Evaluation: {'P@k': np.float64(0.05), 'R@k': np.float64(0.125), 'MRR': np.float64(0.0625), 'MAP': np.float64(0.0625)}
Non-BM25 Evaluation: {'P@k': np.float64(0.15000000000000002), 'R@k': np.float64(0.375), 'MRR': np.float64(0.4583333333333333), 'MAP': np.float64(0.4583333333333333)}


In [72]:
pd.DataFrame([bm25_evaluation, non_bm25_evaluation], index=['BM25', 'Non-BM25'])

Unnamed: 0,P@k,R@k,MRR,MAP
BM25,0.05,0.125,0.0625,0.0625
Non-BM25,0.15,0.375,0.458333,0.458333
