In [60]:
from pinecone import Pinecone
from FlagEmbedding import FlagModel

model = FlagModel('bge_large_fin',
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
                  use_fp16=True)

pc = Pinecone(api_key="621f7574-8c97-4f46-8c5e-186dd099d33b")
Index = pc.Index("bge-fin")


def search_arxiv_texts(query):
    query_vector = model.encode_queries([query])[0].tolist()

    response = Index.query(
        vector=query_vector,
        top_k=5,
        include_metadata=True
    )

    arxiv_texts = [str(match['metadata']['arxiv_text']) for match in response['matches']]
    pmid = [int(match['metadata']['pmid']) for match in response['matches']]
    ans = {}
    for i in range(5):
        ans[pmid[i]] = arxiv_texts[i]

    return ans


# query = "What was the purpose of the US Food and Drug Administration-cosponsored forum on laser-based imaging?"
# top_arxiv_texts = search_arxiv_texts(query)
# print(top_arxiv_texts)


In [74]:
import math
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from typing import List
import yake
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm")
kw_extractor = yake.KeywordExtractor(n=1,
                                     dedupLim=0.9,
                                     top=10,
                                     features=None)


class BM25:
    def __init__(self, corpus: List[List[str]], k1=1.5, b=0.95):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        self.documents_number = len(corpus)
        self.avgdl = sum(len(document) for document in corpus) / self.documents_number
        self.df = self._calculate_df()
        self.idf = self._calculate_idf()

    def _calculate_df(self):
        df = {}
        for document in self.corpus:
            for word in set(document):
                df[word] = df.get(word, 0) + 1
        return df

    def _calculate_idf(self):
        idf = {}
        for word, freq in self.df.items():
            idf[word] = math.log((self.documents_number - freq + 0.5) / (freq + 0.5) + 1)
        return idf

    def _score(self, document, query):
        score = 0.0
        for word in query:
            if word in self.df:
                idf = self.idf[word]
                term_freq = document.count(word)
                score += (idf * term_freq * (self.k1 + 1)) / (
                        term_freq + self.k1 * (1 - self.b + self.b * len(document) / self.avgdl))
        return score

    def get_scores(self, query):
        scores = []
        for index, document in enumerate(self.corpus):
            score = self._score(document, query)
            scores.append((index, score))
        return scores


def search(query: str, df, keywords, bm25, top_k):
    tokenized_query = query.split()
    tokenized_query.extend(keywords)
    scores = bm25.get_scores(tokenized_query)
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]
    result = {}
    for doc_index, score in sorted_scores:
        pmid = df.iloc[doc_index]['PMID']
        result[pmid] = score
    return list(result.keys())


def extract_keywords(text):
    keywords = kw_extractor.extract_keywords(text)
    keywords_set = {word for word, _ in keywords}
    return list(set(keywords_set))


def query_pre_process(query):
    proper_nouns = extract_keywords(query)
    proper_nouns = [word for word in proper_nouns if word.lower() not in ENGLISH_STOP_WORDS] * 1
    return proper_nouns

df = pd.read_csv('./PubmedDataSet.csv')
texts = df['Abstract'].tolist()
tokenized_texts = [doc.split() for doc in texts]
bm25_abstract = BM25(tokenized_texts)
df2 = pd.read_csv('./splitted_pubmed_data_NLTK.csv')

def weightBM25(query):
    keywords = query_pre_process(query)
    result_pmids_scores = search(query, df, keywords, bm25_abstract, top_k=30)
    mask = df2['PMID'].isin(result_pmids_scores)
    df_t = df2[mask]
    texts = df_t['chunk_text'].tolist()
    tokenized_texts = [doc.split() for doc in texts]
    
    bm25_chunk = BM25(tokenized_texts)
    result_pmids_scores = search(query, df_t, keywords, bm25_chunk, top_k=5)
    
    res = {}
    for id in result_pmids_scores:
        res[id] = df_t[df_t["PMID"] == id]['chunk_text'].iloc[0]
    return res


In [69]:
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"


def QWEN_score(sentence1, sentence2):
    """
    cos similarity of two sentences
    :param sentence1: string
    :param sentence2: string
    :return: number
    """

    model_name = "Qwen/Qwen1.5-0.5B-Chat"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    inputs1 = tokenizer(sentence1, return_tensors="pt", padding=True, truncation=True).to(device)
    inputs2 = tokenizer(sentence2, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        outputs1 = model(**inputs1).last_hidden_state
        outputs2 = model(**inputs2).last_hidden_state

    features1 = outputs1[:, 0, :]
    features2 = outputs2[:, 0, :]

    cosine_similarity = torch.nn.functional.cosine_similarity(features1, features2)
    print(cosine_similarity.item())
    return cosine_similarity.item()
    
def filter(query, res):
    ans_pmid = 0
    max_cos = -1.0
    for pmid, chunk in res.items():
        t = QWEN_score(query, chunk)
        if t > max_cos:
            max_cos = t
            ans_pmid = pmid
    return max_cos,ans_pmid

In [71]:
def retrieval(question):    
    res_semantic = search_arxiv_texts(question)
    cos1,res_semantic_pmid = filter(question, res_semantic)
    
    
    res_lex = weightBM25(question)
    res_lex_pmid = filter(question, res_lex)

    # if res_lex_pmid == []:
    #     return list(res_lex.keys())[0]
    # return res_lex_pmid[0]
    
    # intersectChunk = list(set(res_semantic).intersection(set(res_lex)))
    # if len(intersectChunk) != 0:
    #     for id in res_semantic:
    #         if id in intersectChunk:
    #             return id
    #             break
    # else:
    #     return [1]

In [72]:
import random
df_test = pd.read_csv('./evaluation.csv')
test = []

for i, row in df_test.iterrows():
    small_list = [row["Question"], row["PMID"]]
    test.append(small_list)
random.seed(42)
random.shuffle(test)
test = test[:10]

In [73]:
t = 0
for question, pmid in tqdm(test):
    top_pmids = retrieval(question)
    if pmid == top_pmids:
        t += 1
print(f'recall of mixed search algorithm {t / len(test)}')

  0%|                                                                                                                                                              | 0/10 [00:00<?, ?it/s]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 10%|███████████████                                                                                                                                       | 1/10 [00:03<00:31,  3.55s/it]

0.8314301371574402


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9903689622879028


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 20%|██████████████████████████████                                                                                                                        | 2/10 [00:09<00:37,  4.67s/it]

0.9824298024177551


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.937097430229187


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9541096687316895


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 30%|█████████████████████████████████████████████                                                                                                         | 3/10 [00:15<00:38,  5.55s/it]

0.9541096687316895


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8655787706375122


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9072640538215637


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9520996809005737


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 40%|████████████████████████████████████████████████████████████                                                                                          | 4/10 [00:24<00:42,  7.01s/it]

0.9852910041809082


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 50%|███████████████████████████████████████████████████████████████████████████                                                                           | 5/10 [00:28<00:28,  5.76s/it]

0.7115401029586792


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8314301371574402


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8314301371574402


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9072640538215637


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8617032766342163


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 60%|██████████████████████████████████████████████████████████████████████████████████████████                                                            | 6/10 [00:40<00:31,  7.97s/it]

0.8617032766342163


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8717578649520874


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8717578649520874


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9322625994682312


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                             | 7/10 [00:50<00:25,  8.49s/it]

0.8686384558677673


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.966816782951355


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8163858652114868


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9857695698738098


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9226570129394531


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 80%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 8/10 [01:01<00:18,  9.25s/it]

0.8275462985038757


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8314301371574402


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.8314301371574402


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.926376461982727


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9614694714546204


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 9/10 [01:11<00:09,  9.69s/it]

0.8218483328819275


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0.9350156188011169


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [01:16<00:00,  7.68s/it]

0.9620165824890137
recall of mixed search algorithm 0.7



