In [5]:
import os

from FlagEmbedding import FlagModel
from pinecone import Pinecone

# Embedding model whose `encode_queries` is used by search_arxiv_texts() to
# turn a question into a dense query vector.
model = FlagModel(
    'bge_large_fin',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
)

# The Pinecone credential is read from the environment so that no secret is
# stored in the notebook. Set PINECONE_API_KEY before running this cell; a
# missing variable fails immediately with a KeyError naming it.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
Index = pc.Index("bge-fin")


def search_arxiv_texts(query):
    """Return the best dense-retrieval hit for ``query``.

    The query is embedded with the module-level ``model`` and the single
    nearest vector (``top_k=1``) is requested, with metadata, from the
    module-level Pinecone ``Index``.

    Args:
        query: Query text.

    Returns:
        ``[pmid, arxiv_text]`` for the top match, where ``pmid`` is the
        match's ``metadata['pmid']`` converted to ``int`` and ``arxiv_text``
        is ``metadata['arxiv_text']`` converted to ``str``.
        When the index returns no matches the result is ``[None, '']``, so
        callers that index the pair or concatenate the text keep working
        instead of hitting an ``IndexError``.

    Raises:
        KeyError: If the top match lacks the ``pmid`` or ``arxiv_text``
            metadata field.
    """
    query_vector = model.encode_queries([query])[0].tolist()
    response = Index.query(vector=query_vector, top_k=1, include_metadata=True)

    matches = response['matches']
    if not matches:
        # Nothing retrieved: report "no hit" rather than indexing an empty list.
        return [None, '']

    metadata = matches[0]['metadata']
    return [int(metadata['pmid']), str(metadata['arxiv_text'])]


# query = "What was the purpose of the US Food and Drug Administration-cosponsored forum on laser-based imaging?"
# top_arxiv_texts = search_arxiv_texts(query)
# print(top_arxiv_texts)


In [91]:
import math
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from typing import List
import yake
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# spaCy English pipeline. It is loaded here but not referenced by the other
# cells visible in this notebook.
nlp = spacy.load("en_core_web_sm")

# YAKE keyword extractor shared by extract_keywords().
kw_extractor = yake.KeywordExtractor(
    n=1,
    dedupLim=0.9,
    top=10,
    features=None,
)


class BM25:
    """BM25 scorer over a pre-tokenized corpus.

    Document frequencies, IDF weights and per-document term counts are
    computed once at construction time; changes made to ``corpus`` afterwards
    are not reflected in those tables.

    Args:
        corpus: List of documents, each a list of string tokens. May be empty.
        k1: Multiplier applied to the length-normalisation term and used in
            the ``(k1 + 1)`` numerator factor.
        b: Weight of the document-length ratio inside the normalisation term.

    Attributes:
        documents_number: Number of documents in ``corpus``.
        avgdl: Mean document length in tokens (``0.0`` for an empty corpus).
        df: ``{token: number of documents containing it}``.
        idf: ``{token: log((N - df + 0.5) / (df + 0.5) + 1)}``.
    """

    def __init__(self, corpus: List[List[str]], k1=1.5, b=0.95):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        self.documents_number = len(corpus)
        total_tokens = sum(len(document) for document in corpus)
        # An empty corpus has no average length; 0.0 keeps construction from
        # dividing by zero.
        self.avgdl = total_tokens / self.documents_number if self.documents_number else 0.0
        # Term counts are built once so scoring a query never rescans a
        # document's token list.
        self._term_counts = [self._count_terms(document) for document in corpus]
        self.df = self._calculate_df()
        self.idf = self._calculate_idf()

    @staticmethod
    def _count_terms(document):
        """Return a ``{token: occurrences}`` mapping for one document."""
        counts = {}
        for word in document:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def _calculate_df(self):
        """Count, for every token, the number of documents that contain it."""
        df = {}
        for counts in self._term_counts:
            for word in counts:
                df[word] = df.get(word, 0) + 1
        return df

    def _calculate_idf(self):
        """Derive the IDF weight of every token from its document frequency."""
        return {
            word: math.log((self.documents_number - freq + 0.5) / (freq + 0.5) + 1)
            for word, freq in self.df.items()
        }

    def _score_counts(self, counts, doc_len, query):
        """Score one document, given its term counts and its token length."""
        # The length normalisation depends only on the document, so it is
        # computed once per document instead of once per query term.
        length_term = self.b * doc_len / self.avgdl if self.avgdl else 0.0
        norm = self.k1 * (1 - self.b + length_term)

        score = 0.0
        for word in query:
            term_freq = counts.get(word, 0)
            # Terms absent from the document contribute exactly 0; skipping
            # them also avoids 0/0 when the normalisation term is itself 0.
            # Terms unknown to the corpus have no IDF and are ignored.
            if term_freq and word in self.idf:
                score += (self.idf[word] * term_freq * (self.k1 + 1)) / (term_freq + norm)
        return score

    def _score(self, document, query):
        """Return the BM25 score of a tokenized ``document`` for ``query``.

        ``query`` is an iterable of tokens; a token repeated in the query is
        counted once per occurrence.
        """
        return self._score_counts(self._count_terms(document), len(document), query)

    def get_scores(self, query):
        """Score every corpus document against ``query``.

        Args:
            query: Iterable of query tokens.

        Returns:
            List of ``(document_index, score)`` pairs in corpus order.
        """
        return [
            (index, self._score_counts(counts, len(document), query))
            for index, (document, counts) in enumerate(zip(self.corpus, self._term_counts))
        ]


def search(query: str, df, keywords, bm25, top_k):
    """Rank the rows of ``df`` against ``query`` using a BM25 index.

    The query is split on whitespace and ``keywords`` are appended as extra
    query terms. ``bm25.get_scores`` must yield ``(row_position, score)``
    pairs whose positions address ``df`` through ``iloc``.

    Args:
        query: Query text.
        df: DataFrame with a ``PMID`` column (and ``chunk_text`` when
            ``top_k == 1``).
        keywords: Additional query tokens.
        bm25: Object exposing ``get_scores(tokens)``.
        top_k: Number of best-scoring rows to keep.

    Returns:
        When ``top_k == 1`` and at least one row was scored:
        ``[PMID, chunk_text]`` of the best row. Otherwise: the distinct PMIDs
        of the ``top_k`` best rows, best first (an empty list if nothing was
        scored).
    """
    query_terms = [*query.split(), *keywords]
    ranked = sorted(
        bm25.get_scores(query_terms),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_k]

    if top_k == 1 and ranked:
        best_position, _ = ranked[0]
        best_row = df.iloc[best_position]
        return [best_row['PMID'], best_row['chunk_text']]

    # dict keys keep first-seen order, so repeated PMIDs collapse onto their
    # best-ranked occurrence.
    return list(dict.fromkeys(df.iloc[position]['PMID'] for position, _ in ranked))


def extract_keywords(text):
    """Return the distinct keywords that ``kw_extractor`` finds in ``text``.

    Duplicates are dropped while the extractor's own output order is kept, so
    the result is identical on every run. Going through a ``set`` instead
    would order the keywords by string hash, which changes between
    interpreter sessions.

    Args:
        text: Text to extract keywords from.

    Returns:
        List of unique keyword strings.
    """
    keywords = kw_extractor.extract_keywords(text)
    return list(dict.fromkeys(word for word, _ in keywords))


def query_pre_process(query):
    """Extract keywords from ``query`` and drop English stop words.

    Keywords come from extract_keywords(); one is discarded when its
    lower-cased form is in scikit-learn's ``ENGLISH_STOP_WORDS``.

    Returns:
        List of the remaining keywords, in the order extract_keywords()
        produced them.
    """
    return [
        keyword
        for keyword in extract_keywords(query)
        if keyword.lower() not in ENGLISH_STOP_WORDS
    ]

# Abstract-level table: every abstract is whitespace-tokenized and indexed
# with BM25. Row positions in `df` line up with document indices in
# `bm25_abstract`, which is what search() relies on when it uses `iloc`.
df = pd.read_csv('./PubmedDataSet.csv')
texts = df['Abstract'].tolist()
tokenized_texts = [abstract.split() for abstract in texts]
bm25_abstract = BM25(tokenized_texts)

# Chunk-level table; weightBM25() reads its `PMID` and `chunk_text` columns.
df2 = pd.read_csv('./splitted_pubmed_data_NLTK.csv')

def weightBM25(query):
    """Two-stage BM25 retrieval: rank abstracts, then rank their chunks.

    Stage one scores every abstract in ``df`` with ``bm25_abstract`` and keeps
    the PMIDs of the 30 best rows. Stage two builds a fresh BM25 index over
    the rows of ``df2`` whose ``PMID`` is among those candidates and picks the
    single best chunk. The same extracted keywords extend the query in both
    stages.

    Args:
        query: Query text.

    Returns:
        ``[PMID, chunk_text]`` of the best-scoring chunk, or ``[None, '']``
        when none of the candidate PMIDs has a row in ``df2``.
    """
    keywords = query_pre_process(query)
    candidate_pmids = search(query, df, keywords, bm25_abstract, top_k=30)

    candidate_chunks = df2[df2['PMID'].isin(candidate_pmids)]
    if candidate_chunks.empty:
        # Nothing to rank: no chunk row belongs to any candidate document.
        return [None, '']

    chunk_tokens = [chunk.split() for chunk in candidate_chunks['chunk_text'].tolist()]
    bm25_chunk = BM25(chunk_tokens)
    return search(query, candidate_chunks, keywords, bm25_chunk, top_k=1)


In [106]:
def retrieval(question):
    """Retrieve PMIDs for ``question`` using one round of query expansion.

    First pass: the dense retriever (search_arxiv_texts) and the lexical
    retriever (weightBM25) each return their best passage for the raw
    question. Both passages are appended to the question, separated by single
    spaces, and the two retrievers are run again on the expanded text.

    Returns:
        ``[semantic_pmid, lexical_pmid]`` from the second pass.
    """
    semantic_passage = search_arxiv_texts(question)[1]
    lexical_passage = weightBM25(question)[1]
    expanded_question = ' '.join([question, semantic_passage, lexical_passage])

    # List elements are evaluated left to right: dense retriever first, then
    # the lexical one.
    return [
        search_arxiv_texts(expanded_question)[0],
        weightBM25(expanded_question)[0],
    ]


In [107]:
import random
# Evaluation set: one [question, gold PMID] pair per row of the CSV.
df_test = pd.read_csv('./evaluation.csv')
test = [[row["Question"], row["PMID"]] for _, row in df_test.iterrows()]

# Seeding before the shuffle makes the 100-question sample reproducible.
random.seed(42)
random.shuffle(test)
test = test[:100]

In [108]:
# A question counts as a hit when its gold PMID equals either of the two PMIDs
# returned by retrieval() (one from the dense retriever, one from BM25).
t = sum(1 for question, pmid in tqdm(test) if pmid in retrieval(question))
print(f'top1-accuracy of PRF search {t / len(test)}')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [32:37<00:00, 19.57s/it]

recall of mixed search algorithm 0.79



