## 혼합 검색 기능 활용
- 의미 검색과 키워드(통계기반) 검색 혼합
- RRF(상호 순위 조합, Reciprocal Rank Fusion) 작성

In [1]:
# RRF 함수 작성
# 각 순위 점수 = 1 / (k + 순위)
from collections import defaultdict
from typing import List

def reciprocal_rank_fusion(rankings:List[List[int]], k=5):
    """Fuse several ranked lists of document ids with Reciprocal Rank Fusion.

    Each appearance of a document at 1-based position ``p`` in a ranking adds
    ``1 / (k + p)`` to that document's fused score.

    Args:
        rankings: Ranked lists of document ids, best match first.
        k: Constant added to every position before taking the reciprocal.

    Returns:
        A list of ``(doc_id, fused_score)`` tuples sorted by descending score.
    """
    fused_scores = defaultdict(float)
    for ranked_ids in rankings:
        for position, doc_id in enumerate(ranked_ids, start=1):
            fused_scores[doc_id] += 1.0 / (k + position)

    ordered = list(fused_scores.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    return ordered

### RRF 함수 알아가기

In [2]:
# Two example rankings of document ids, best match first.
rank_list = [
    [1, 4, 3, 5, 6],  # ranking of ids from semantic search
    [2, 1, 3, 6, 4],  # ranking of ids from statistical (keyword) search
]

reciprocal_rank_fusion(rank_list)

[(1, 0.30952380952380953),
 (3, 0.25),
 (4, 0.24285714285714285),
 (6, 0.2111111111111111),
 (2, 0.16666666666666666),
 (5, 0.1111111111111111)]

In [3]:
# Same rankings with a larger k: each term 1 / (k + position) gets smaller, so
# the fused scores shrink and lie closer together.
reciprocal_rank_fusion(rank_list, k=40)

[(1, 0.04819976771196283),
 (3, 0.046511627906976744),
 (4, 0.046031746031746035),
 (6, 0.04494949494949495),
 (2, 0.024390243902439025),
 (5, 0.022727272727272728)]

## 실제 구현

In [4]:
from datasets import load_dataset

# Load the train split of the 'mrc' configuration of the KLUE dataset.
# Dataset viewer: https://huggingface.co/datasets/klue/klue/viewer/mrc
klue_mrc_dataset = load_dataset('klue','mrc', split='train')
# Uncomment to display the dataset summary in the notebook:
# klue_mrc_dataset

README.md:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17554 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5841 [00:00<?, ? examples/s]

### 의미 검색 구현

In [5]:
from sentence_transformers import SentenceTransformer
# Pretrained sentence-embedding model used for the semantic-search side.
sentence_model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

# Embed every passage in the 'context' column; the resulting 2-D array is what
# gets added to the vector index further down.
context_embedded = sentence_model.encode(klue_mrc_dataset['context'])



The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/336k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/967k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/549 [00:00<?, ?it/s]

In [6]:
!pip install faiss-cpu faiss-gpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu, faiss-cpu
Successfully installed faiss-cpu-1.10.0 faiss-gpu-1.7.2


In [7]:
import faiss # 메타 API로 벡터 거리 계산용

# Create the FAISS L2-distance index used for nearest-neighbour search, sized
# to the embedding dimension (the original note says 768 — TODO confirm for
# this model).
index_knn = faiss.IndexFlatL2(context_embedded.shape[1])

# Store the passage embeddings in the index. This is comparable to filling a
# table: the index then serves as an in-memory vector database.
index_knn.add(context_embedded)

### 통계 검색 구현

In [8]:
import math
import numpy as np
from typing import List
from transformers import PreTrainedTokenizer
from collections import defaultdict

class BM25:
    """In-memory BM25 keyword index over a list of text documents.

    Documents and queries are converted to token ids by the supplied
    tokenizer. Document frequencies, IDF weights, per-document term counts
    and document lengths are computed once at construction time.
    """

    def __init__(self, corpus: List[str], tokenizer: "PreTrainedTokenizer"):
        """Tokenize ``corpus`` and precompute the BM25 statistics.

        Args:
            corpus: Raw document texts, one string per document.
            tokenizer: Callable used as
                ``tokenizer(texts, add_special_tokens=False)['input_ids']``;
                it must yield one list of token ids per input text.
        """
        self.tokenizer = tokenizer
        self.corpus = corpus

        # Token ids of every document. Only the ids are used (for counting).
        self.tokenized_corpus = self.tokenizer(corpus, add_special_tokens=False)['input_ids']

        # Number of documents in the corpus.
        self.n_docs = len(self.tokenized_corpus)

        # Document lengths in tokens, cached so scoring never recomputes them.
        self.doc_lens = np.array([len(doc) for doc in self.tokenized_corpus], dtype=float)

        # Average document length; 0.0 for an empty corpus instead of raising
        # ZeroDivisionError.
        self.avg_doc_lens = (
            sum(len(doc) for doc in self.tokenized_corpus) / self.n_docs
            if self.n_docs
            else 0.0
        )

        # Inverse document frequency of every token seen in the corpus.
        self.idf = self._calculate_idf()

        # Per-document term counts.
        self.term_freqs = self._calculate_term_freqs()

    def _calculate_idf(self):
        """Return a mapping ``token_id -> IDF weight`` for every corpus token.

        The weight is ``log((N - df + 0.5) / (df + 0.5) + 1)`` where ``N`` is
        the number of documents and ``df`` the number of documents containing
        the token. The mapping is a ``defaultdict(float)``, so ``[]`` lookups
        of unseen tokens yield ``0.0``.
        """
        # Count the number of documents containing each token.
        doc_freqs = defaultdict(int)
        for doc in self.tokenized_corpus:
            for token_id in set(doc):
                doc_freqs[token_id] += 1

        # Build the weights in a separate mapping rather than overwriting the
        # counts while iterating over them.
        idf = defaultdict(float)
        for token_id, doc_frequency in doc_freqs.items():
            idf[token_id] = math.log(((self.n_docs - doc_frequency + 0.5) / (doc_frequency + 0.5)) + 1)

        return idf

    def _calculate_term_freqs(self):
        """Return one ``token_id -> count`` mapping per document, in corpus order."""
        term_freqs = [defaultdict(int) for _ in range(self.n_docs)]
        for doc_counts, doc in zip(term_freqs, self.tokenized_corpus):
            for token_id in doc:
                doc_counts[token_id] += 1

        return term_freqs

    def get_scores(self, query: str, k1: float = 1.2, b: float = 0.75) -> np.ndarray:
        """Score every document against ``query`` with BM25.

        Args:
            query: Query text; it is tokenized with the index's tokenizer.
            k1: Term-frequency saturation parameter.
            b: Strength of the document-length normalization.

        Returns:
            Array of length ``n_docs`` holding one score per document, in
            corpus order. A query token that occurs several times contributes
            once per occurrence.
        """
        query_ids = self.tokenizer([query], add_special_tokens=False)['input_ids'][0]
        scores = np.zeros(self.n_docs)

        # No document contains any token (empty corpus or only empty
        # documents): every score is 0 and the length ratio is undefined.
        if self.avg_doc_lens == 0:
            return scores

        # Length-normalization term of the BM25 denominator; it does not
        # depend on the query token, so compute it once.
        length_norm = k1 * (1 - b + b * (self.doc_lens / self.avg_doc_lens))

        for token_id in query_ids:
            # Use .get(): indexing the defaultdicts with [] would insert a new
            # zero entry on every miss and grow the index with each query.
            idf = self.idf.get(token_id)
            if idf is None:
                # Token never occurs in the corpus, so it matches no document.
                continue

            tf = np.fromiter(
                (doc_counts.get(token_id, 0) for doc_counts in self.term_freqs),
                dtype=float,
                count=self.n_docs,
            )
            numerator = idf * (tf * (k1 + 1))
            denominator = tf + length_norm
            # A zero denominator needs tf == 0 together with a zero length
            # term (e.g. k1 == 0); such documents get a 0 contribution.
            scores += np.divide(
                numerator,
                denominator,
                out=np.zeros(self.n_docs),
                where=denominator != 0,
            )

        return scores

    def get_top_k(self, query: str, k: int, k1: float = 1.2, b: float = 0.75):
        """Return the ``k`` best-scoring documents for ``query``.

        Args:
            query: Query text.
            k: Number of documents to return. Values ``<= 0`` give empty
                results; values above ``n_docs`` return every document.
            k1: Forwarded to ``get_scores``.
            b: Forwarded to ``get_scores``.

        Returns:
            ``(top_k_scores, top_k_indices)``: two arrays ordered from the
            highest score to the lowest.
        """
        scores = self.get_scores(query, k1=k1, b=b)

        # Sort descending, then cut. Slicing the ascending order with [-k:]
        # would return *all* documents for k == 0.
        descending = np.argsort(scores)[::-1]
        top_k_indices = descending[:max(k, 0)]

        # Retrieve the scores for the top-k documents.
        top_k_scores = scores[top_k_indices]

        return top_k_scores, top_k_indices

In [10]:
from transformers import AutoTokenizer
# Tokenizer handed to BM25 to turn documents and queries into token ids.
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-base')


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [11]:
# Build the BM25 keyword index over the same passages used for semantic search.
# NOTE: the tokenizer's "sequence length is longer than the specified maximum"
# warning is emitted here; BM25 only counts the token ids and never runs them
# through a model.
index_bm25_klue_mrc = BM25(klue_mrc_dataset['context'], tokenizer)

Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 512). Running this sequence through the model will result in indexing errors


### 혼합 검색 구현

In [19]:
def hybrid_RRF_search(query, top_k=50):
    """Run semantic and BM25 keyword search for ``query`` and fuse the rankings.

    Args:
        query: Query text.
        top_k: Number of candidates requested from each retriever before the
            two rankings are fused.

    Returns:
        The ``reciprocal_rank_fusion`` result: ``(doc_id, fused_score)``
        tuples ordered from the best fused score to the worst.
    """
    # Semantic search: embed the query and look up its nearest neighbours.
    # The candidate count belongs to the index search, not to the encoder, so
    # encode() receives only the text.
    query_embedded = sentence_model.encode([query])
    _, indices = index_knn.search(query_embedded, top_k)
    # FAISS pads the labels with -1 when it finds fewer than top_k vectors;
    # those placeholders are not document ids and must not be ranked.
    semantic_ranking = [int(doc_id) for doc_id in indices[0] if doc_id >= 0]

    # Statistical (keyword) search with BM25.
    _, tops_ranking = index_bm25_klue_mrc.get_top_k(query, top_k)
    keyword_ranking = [int(doc_id) for doc_id in tops_ranking]

    return reciprocal_rank_fusion([semantic_ranking, keyword_ranking])

In [21]:
query = '이번 연도에는 언제 비가 많이 올까?'
# Fused ranking built from semantic search plus statistical (keyword) search.
results = hybrid_RRF_search(query)
results[:3]  # display the three best (doc_id, fused_score) pairs

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[(9205, 0.16993464052287582),
 (8704, 0.16666666666666666),
 (1326, 0.16666666666666666)]

In [24]:
# Print the fused score and the first 60 characters of each top-3 passage.
# The 'context' column is read once instead of once per result.
contexts = klue_mrc_dataset['context']
for idx, score in results[:3]:
    context = contexts[idx][:60]
    print(f'score: {score}, context: {context}')

score: 0.16993464052287582, context: 다음달엔 평년에 비해 때 이른 무더위가 기승을 부릴 전망이다. 8월에는 대기불안정과 저기압의 영향으로 많은 
score: 0.16666666666666666, context: 올 들어 한반도 날씨가 수상쩍다. 23일 하루 동안 서울 등 중부지방엔 호우특보와 폭염특보가 번갈아 발령되는
score: 0.16666666666666666, context: 케이팝 팬덤을 위한 어플리케이션 ‘블립’ 조사 결과, NCT 팬들이 가장 많이 입덕한 노래는 ‘보스(BOSS
