In [None]:
# 평가를 위한 데이터셋을 불러와 1,000개만 선별

from datasets import load_dataset

# Load the KLUE MRC validation split and keep only the 1,000-example 'test'
# part of a seeded (seed=42) train/test split as the evaluation set.
klue_mrc_test = (
    load_dataset('klue', 'mrc', split='validation')
    .train_test_split(test_size=1000, seed=42)['test']
)

In [None]:
# 임베딩을 저장하고 검색하는 함수 구현
import faiss

def make_embedding_index(sentence_model, corpus):
    """Encode ``corpus`` and return a FAISS flat L2 index holding the vectors.

    Args:
        sentence_model: Object whose ``encode(corpus)`` returns a 2-D array
            (its ``shape[1]`` is used as the vector dimension).
        corpus: Collection of texts passed straight to ``encode``.

    Returns:
        A ``faiss.IndexFlatL2`` to which every encoded vector has been added.
    """
    vectors = sentence_model.encode(corpus)
    dimension = vectors.shape[1]
    l2_index = faiss.IndexFlatL2(dimension)
    l2_index.add(vectors)
    return l2_index

def find_embedding_top_k(query, sentence_model, index, k=5):
    """Return the ids of the ``k`` nearest indexed vectors for ``query``.

    Args:
        query: Single query text; it is wrapped in a one-element list before
            being encoded.
        sentence_model: Object providing ``encode(list_of_texts)``.
        index: Object providing ``search(vectors, k)`` that returns a
            ``(distances, indices)`` pair.
        k: Number of neighbours to request.

    Returns:
        The ``indices`` half of ``index.search``'s result, unchanged. Callers
        in this file take element ``[0]`` to get the row for the single query.
    """
    query_vector = sentence_model.encode([query])
    # Only the neighbour ids are needed; the distances are discarded.
    _, neighbor_ids = index.search(query_vector, k)
    return neighbor_ids

In [None]:
# 교차 인코더를 활용한 순위 재정렬 함수 정의

def make_question_context_pairs(question_idx, indices):
    """Build ``[question, context]`` pairs for one question and its candidates.

    Both texts are read from the module-level ``klue_mrc_test`` dataset.

    Args:
        question_idx: Position of the question in ``klue_mrc_test``.
        indices: Iterable of positions of candidate contexts in
            ``klue_mrc_test``.

    Returns:
        A list with one ``[question_text, context_text]`` list per candidate,
        in the same order as ``indices``.
    """
    # Look the columns up once instead of on every iteration of the loop.
    question = klue_mrc_test['question'][question_idx]
    contexts = klue_mrc_test['context']
    return [[question, contexts[idx]] for idx in indices]

def rerank_top_k(cross_model, question_idx, indices, k):
    """Re-order candidate contexts by cross-encoder score and keep the top ``k``.

    Args:
        cross_model: Object whose ``predict(pairs)`` returns one relevance
            score per ``[question, context]`` pair (higher is treated as more
            relevant).
        question_idx: Position of the question in ``klue_mrc_test``.
        indices: Candidate context positions; converted with ``np.asarray``
            so both arrays and plain sequences are accepted.
        k: Maximum number of candidates to return.

    Returns:
        A NumPy array with at most ``k`` entries of ``indices``, ordered from
        the highest to the lowest relevance score.
    """
    input_examples = make_question_context_pairs(question_idx, indices)
    relevance_scores = cross_model.predict(input_examples)
    # argsort is ascending, so reverse it to put the best-scoring pair first.
    best_first = np.argsort(relevance_scores)[::-1]
    # Truncate to k: previously the k argument was accepted but ignored.
    return np.asarray(indices)[best_first][:k]

In [None]:
# 성능 지표(히트율)와 평가에 걸린 시간을 반환하는 함수 정의

import time
def evaluate_hit_rate(datasets, embedding_model, index, k=10):
    """Measure the top-``k`` retrieval hit rate and the time taken to compute it.

    For every question in ``datasets`` the ``k`` nearest contexts are fetched
    from ``index``. A question counts as a hit when any retrieved context has
    the same text as the context stored at the question's own position.

    Args:
        datasets: Dataset exposing ``'question'`` and ``'context'`` columns
            whose rows are aligned with the vectors stored in ``index``.
        embedding_model: Model passed to ``find_embedding_top_k`` to encode
            each question.
        index: Search index passed to ``find_embedding_top_k``.
        k: Number of contexts retrieved per question.

    Returns:
        A ``(hit_rate, elapsed_seconds)`` tuple.

    Raises:
        ZeroDivisionError: If ``datasets`` contains no questions.
    """
    start_time = time.time()
    predictions = [
        find_embedding_top_k(question, embedding_model, index, k)[0]
        for question in datasets['question']
    ]

    contexts = datasets['context']
    # Compare context text with context text (not with the question). Matching
    # on text rather than on position also counts a retrieved row that holds
    # the same passage under a different index.
    hit_count = sum(
        any(contexts[pred] == contexts[idx] for pred in prediction)
        for idx, prediction in enumerate(predictions)
    )

    end_time = time.time()
    return hit_count / len(predictions), end_time - start_time

#### 1. 기본 임베딩 모델로 검색하기

In [None]:
from sentence_transformers import SentenceTransformer
# Baseline run: load the base embedding model, index every context of the
# evaluation set, then evaluate top-10 retrieval (hit rate, elapsed seconds).
base_embedding_model = SentenceTransformer(
    'shangrilar/klue-roberta-base-klue-sts'
)
base_index = make_embedding_index(
    sentence_model=base_embedding_model,
    corpus=klue_mrc_test['context'],
)
evaluate_hit_rate(
    datasets=klue_mrc_test,
    embedding_model=base_embedding_model,
    index=base_index,
    k=10,
)

#### 2. 미세 조정한 임베딩 모델로 검색하기

In [None]:
# Same evaluation as the baseline, but with the fine-tuned embedding model:
# build a separate index from its vectors and evaluate top-10 retrieval.
finetuned_embedding_model = SentenceTransformer(
    'shangrilar/klue-roberta-base-klue-sts-mrc'
)
finetuned_index = make_embedding_index(
    sentence_model=finetuned_embedding_model,
    corpus=klue_mrc_test['context'],
)
evaluate_hit_rate(
    datasets=klue_mrc_test,
    embedding_model=finetuned_embedding_model,
    index=finetuned_index,
    k=10,
)

#### 3. 순위 재정렬을 포함한 평가 함수

In [None]:
import time
import numpy as np
from tqdm.auto import tqdm

def evaluate_hit_rate_with_rerank(datasets, embedding_model, cross_model, index, bi_k=30, cross_k=10):
    """Measure the hit rate of retrieval followed by cross-encoder re-ranking.

    For every question, ``bi_k`` candidate contexts are fetched from ``index``,
    re-ordered by ``rerank_top_k``, and only the best ``cross_k`` are kept. A
    question counts as a hit when any kept context has the same text as the
    context stored at the question's own position.

    Note: ``rerank_top_k`` resolves ``question_idx`` and the candidate ids
    against the module-level ``klue_mrc_test`` dataset, so ``datasets`` is
    expected to be that same dataset.

    Args:
        datasets: Dataset exposing ``'question'`` and ``'context'`` columns
            whose rows are aligned with the vectors stored in ``index``.
        embedding_model: Model passed to ``find_embedding_top_k`` to encode
            each question.
        cross_model: Model passed to ``rerank_top_k`` to score the candidates.
        index: Search index passed to ``find_embedding_top_k``.
        bi_k: Number of candidates retrieved per question before re-ranking.
        cross_k: Number of re-ranked candidates that count towards a hit.

    Returns:
        A ``(hit_rate, elapsed_seconds)`` tuple.

    Raises:
        ZeroDivisionError: If ``datasets`` contains no questions.
    """
    start_time = time.time()
    predictions = []
    for question_idx, question in enumerate(tqdm(datasets['question'])):
        candidates = find_embedding_top_k(question, embedding_model, index, bi_k)[0]
        reranked = rerank_top_k(cross_model, question_idx, candidates, cross_k)
        # Enforce the cut-off here as well: without it all bi_k candidates
        # would be scored as hits and the re-ranking order would not matter.
        predictions.append(reranked[:cross_k])

    contexts = datasets['context']
    # Compare context text with context text (not with the question).
    hit_count = sum(
        any(contexts[pred] == contexts[idx] for pred in prediction)
        for idx, prediction in enumerate(predictions)
    )

    end_time = time.time()
    return hit_count / len(predictions), end_time - start_time