In [22]:
import pandas as pd
# Load the comfort dataset from Excel. Later cells select rows from `df` by
# index label (`df.loc[...]`), so the index produced here must not be changed.
df = pd.read_excel("../resources/comfort_datasets.xlsx")

In [36]:
from onnxruntime import InferenceSession
from transformers import AutoTokenizer
import torch
import numpy as np
from tqdm import tqdm

# Model identifier handed to AutoTokenizer; only the tokenizer is loaded from it here.
MODEL_PATH = "j5ng/et5-sentence-comfort"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# ONNX Runtime session over a local encoder file, restricted to the CPU provider.
# NOTE(review): "uint8" in the file name suggests a quantized export — confirm.
sess = InferenceSession("./encoder.onnx_uint8.onnx" , providers=["CPUExecutionProvider"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the attention mask.

    Args:
        model_output: Sequence whose first element is a NumPy array of token
            embeddings (used here as batch x tokens x hidden).
        attention_mask: NumPy array with one entry per token (batch x tokens);
            positions with 0 contribute nothing to the average.

    Returns:
        A 3-tuple of torch tensors:
        ``(mean_embeddings, expanded_mask, token_counts)`` where
        ``expanded_mask`` is the mask broadcast to the embeddings' shape and
        ``token_counts`` is its sum over the token axis, clamped to at least
        1e-9 so the division never hits zero.
    """
    # The first output of the model holds every token embedding.
    token_embeddings = torch.from_numpy(model_output[0])
    mask = torch.from_numpy(attention_mask)

    # Add a trailing axis and broadcast the mask across the hidden dimension.
    input_mask_expanded = mask.unsqueeze(-1).expand(token_embeddings.size())

    masked_total = (token_embeddings * input_mask_expanded).sum(dim=1)
    token_counts = input_mask_expanded.sum(1).clamp(min=1e-9)

    pooled = masked_total / token_counts
    return pooled, input_mask_expanded, token_counts

def embedding_query(query: str, normalize_embeddings=False) -> np.ndarray:
    """Encode a single query string into one embedding vector.

    The query is tokenized, fed through the ONNX session ``sess``, and the
    resulting token embeddings are mean-pooled with the attention mask. Only
    the first row of the pooled batch is returned.

    Args:
        query: Text to embed.
        normalize_embeddings: When true, divide the vector by its norm
            (``np.linalg.norm`` with default arguments) before returning.

    Returns:
        The pooled embedding as a NumPy array.
    """
    encoded = tokenizer(query, return_tensors="pt")

    # ONNX Runtime takes NumPy inputs, so convert every tokenizer tensor.
    inputs_onnx = {}
    for input_name, tensor in encoded.items():
        inputs_onnx[input_name] = tensor.cpu().detach().numpy()

    encoder_outputs = sess.run(None, inputs_onnx)
    pooled, _, _ = mean_pooling(encoder_outputs, inputs_onnx["attention_mask"])
    query_embedding = pooled[0]

    if not normalize_embeddings:
        return query_embedding.numpy()

    norm = np.linalg.norm(query_embedding)
    return (query_embedding / norm).numpy()

In [24]:
import faiss

# Load the prebuilt FAISS index from disk; `reply` searches it with query embeddings.
# NOTE(review): presumably built from the rows of `df` with the same encoder — confirm,
# since `reply` uses the returned ids directly as `df` index labels.
index = faiss.read_index("./faiss_onnx_uint8")

In [28]:
def reply(query: str, k: int = 5):
    """Return the dataset rows nearest to ``query`` in the FAISS index.

    Args:
        query: Text to search for; it is embedded with
            ``embedding_query(..., normalize_embeddings=True)``.
        k: Maximum number of neighbours to retrieve. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        Rows of ``df`` selected by the returned ids (as index labels), in the
        order FAISS returned them. Fewer than ``k`` rows come back when the
        index cannot supply ``k`` hits.
    """
    # FAISS expects a 2-D batch of vectors, so add a leading batch axis.
    embedding = np.expand_dims(embedding_query(query, normalize_embeddings=True), axis=0)
    _, neighbor_ids = index.search(embedding, k)

    # FAISS pads the id row with -1 when it finds fewer than k results; those
    # are not valid labels and would make df.loc raise KeyError.
    found_ids = neighbor_ids[0][neighbor_ids[0] >= 0]
    return df.loc[found_ids]

In [123]:
from kiwipiepy import Kiwi
from kiwipiepy.utils import Stopwords
import pickle

# Stopword set built with default arguments; passed to kiwi.tokenize() by `tokenize`.
stopwords = Stopwords()
# Kiwi analyzer instance (default arguments) shared by every `tokenize` call.
kiwi = Kiwi()

def tokenize(text):
    """Split ``text`` with Kiwi and return its tokens as ``(form, tag)`` pairs.

    Tokens are produced by ``kiwi.tokenize`` with the module-level
    ``stopwords`` applied; each result contributes its ``form`` and ``tag``
    attributes as a tuple.
    """
    pairs = []
    for morpheme in kiwi.tokenize(text, stopwords=stopwords):
        pairs.append((morpheme.form, morpheme.tag))
    return pairs

# Load the pre-fitted BM25 object used by `bm25_chatbot_response`.
# SECURITY: unpickling can execute arbitrary code — only load a 'bm25result'
# file that you created yourself or otherwise trust.
# The context manager closes the file handle, which the previous bare
# `open(...)` call left open.
with open('bm25result', 'rb') as bm25_file:
    bm25result = pickle.load(bm25_file)
    
def bm25_chatbot_response(query, n=5):
    """Return the ``n`` dataset rows with the highest BM25 score for ``query``.

    Args:
        query: Text to search for; tokenized with ``tokenize`` before scoring.
        n: Number of rows to return. Previously this argument was accepted
            but ignored (the slice was hard-coded to 5); it is now honoured.
            Values below zero are treated as zero.

    Returns:
        Rows of ``df`` selected by label, best score first.
    """
    tokenized_query = tokenize(query)
    # NOTE(review): assumes get_scores returns one score per row of `df`,
    # in row order — confirm against how `bm25result` was fitted.
    scores = bm25result.get_scores(tokenized_query)

    # argsort is ascending, so reverse it and keep the first n positions.
    # (Slicing with [-n:] would return everything for n == 0.)
    top_n_indices = np.argsort(scores)[::-1][:max(n, 0)]
    return df.loc[top_n_indices]

In [51]:
class RRF:
    """Weighted reciprocal-rank fusion of two ranked id lists.

    Each id scores ``EMBED_WEIGHT / rank_in_first_list`` plus
    ``KEYWORD_WEIGHT / rank_in_second_list``; a list that does not contain
    the id contributes 0. Ranks are 1-based.

    Args:
        embed_weight: Weight applied to the first (embedding) ranking.
        keyword_weight: Weight applied to the second (keyword) ranking.
    """

    def __init__(self, embed_weight: float = 0.6, keyword_weight: float = 0.4):
        # Kept as upper-case instance attributes for existing callers.
        self.EMBED_WEIGHT: float = embed_weight
        self.KEYWORD_WEIGHT: float = keyword_weight

    @staticmethod
    def _first_positions(ids):
        """Map each id to the 1-based position of its first occurrence in ``ids``."""
        positions = {}
        for position, value in enumerate(ids, start=1):
            # setdefault keeps the earliest position, matching list.index().
            positions.setdefault(value, position)
        return positions

    def get_ranking(self, query1_ids, query2_ids):
        """Collect the rank of every distinct id in both result lists.

        Args:
            query1_ids: Ranked ids from the first retriever (best first).
            query2_ids: Ranked ids from the second retriever (best first).
                Any iterable of hashable ids is accepted.

        Returns:
            Dict mapping each id to ``[rank_in_first, rank_in_second]`` with
            1-based ranks and ``None`` where the id is absent from a list.
        """
        # One pass per list instead of a linear list.index() scan per id.
        ranks1 = self._first_positions(query1_ids)
        ranks2 = self._first_positions(query2_ids)

        # dict.fromkeys de-duplicates while keeping first-seen order.
        all_values = dict.fromkeys([*ranks1, *ranks2])
        return {value: [ranks1.get(value), ranks2.get(value)] for value in all_values}

    def reciprocal_rank(self, rank):
        """Return the reciprocal rank ``1 / rank``, or 0.0 when ``rank`` is not a number.

        A ``None`` rank (id missing from a list) makes the division raise
        ``TypeError``, which is mapped to a contribution of 0.0.
        """
        try:
            return 1 / rank
        except TypeError:
            return 0.0

    def get_rrf_scores(self, query1_ids, query2_ids):
        """Fuse two rankings and return ids with their scores, best first.

        Args:
            query1_ids: Ranked ids from the embedding retriever.
            query2_ids: Ranked ids from the keyword retriever.

        Returns:
            ``{"ids": [...], "scores": [...]}`` with both lists aligned and
            sorted by descending score. Ties are ordered by descending id,
            so ids with equal scores must be mutually comparable.
        """
        ranking = self.get_ranking(query1_ids, query2_ids)

        scored = []
        for key, (embed_rank, keyword_rank) in ranking.items():
            # Reciprocal rank from each retriever.
            embed_rr = self.reciprocal_rank(embed_rank)
            keyword_rr = self.reciprocal_rank(keyword_rank)

            # Weighted sum of the two reciprocal ranks.
            rrf = (self.EMBED_WEIGHT * embed_rr) + (self.KEYWORD_WEIGHT * keyword_rr)
            scored.append((rrf, key))

        # A single sort keeps ids and scores aligned.
        scored.sort(reverse=True)
        return {
            "ids": [key for _, key in scored],
            "scores": [score for score, _ in scored],
        }

In [52]:
# Shared fusion helper; the demo cell calls rrf.get_rrf_scores() on the two id lists.
rrf = RRF()

In [163]:
# Demo: fuse the embedding-based and BM25 results for one query.
query = "왜 나한테만 그래?"

# Both retrievers return rows of `df`; their index labels serve as document ids.
rerank = rrf.get_rrf_scores(
    reply(query).index.to_list(),
    bm25_chatbot_response(query).index.to_list(),
)

# Pull the fused ids in ranked order and attach the aligned fusion scores.
res_df = df.loc[rerank["ids"]].assign(scores=rerank["scores"])
res_df.head(3)

Unnamed: 0,user,answer,class,scores
17520,선생님이 나한테 왜 그럴까?,선생님께 서운한 일이 있으셨나봐요.. 그래도 좋은 분이실테니 잘 지내시길 바랄게요,일상,0.6
10593,네가 나한테 왜 그랬는지 아직도 이해가 안돼.,상대방의 행동으로 인해 상처를 받았군요. 남에게 상처를 주는 행동으로 하는 사람들의...,사연,0.4
23831,왜 나한테만 이런일이 몰려올까,저도 비슷한 고민을 많이 해요. 생각이 많은 것보다 단순하게 사는 게 오히려 좋은 ...,사연,0.3
