In [1]:
import math
import random
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import spacy
import yake
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from typing import List

from tqdm import tqdm


# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before paying the load cost.
nlp = spacy.load("en_core_web_sm")
# Shared YAKE keyword extractor; `extract_keywords` calls it for every query.
# NOTE(review): per the YAKE docs, n=1 should limit keywords to single words and
# top=10 should cap how many are returned — confirm against the installed version.
kw_extractor = yake.KeywordExtractor(n=1,
                                     dedupLim=0.9,
                                     top=10,
                                     features=None)


class BM25:
    """BM25 relevance scorer over a corpus of pre-tokenized documents.

    Document frequencies, IDF weights and per-document term counts are computed
    once at construction time, so scoring a query only needs dictionary lookups
    instead of rescanning every document for every query term.

    Args:
        corpus: List of documents, each given as a list of string tokens.
        k1: Term-frequency saturation parameter of the BM25 formula.
        b: Strength of the document-length normalization (0 disables it).

    Attributes:
        documents_number: Number of documents in ``corpus``.
        avgdl: Average document length in tokens (0.0 for an empty corpus).
        df: Mapping token -> number of documents containing it.
        idf: Mapping token -> ``log((N - df + 0.5) / (df + 0.5) + 1)``.
    """

    def __init__(self, corpus: List[List[str]], k1=1.5, b=0.95):
        self.corpus = corpus
        self.k1 = k1
        self.b = b
        self.documents_number = len(corpus)
        total_tokens = sum(len(document) for document in corpus)
        # An empty corpus (e.g. a candidate filter that matched nothing) must not
        # raise ZeroDivisionError; there is simply nothing to score.
        self.avgdl = total_tokens / self.documents_number if self.documents_number else 0.0
        # Per-document term counts, aligned with ``self.corpus`` by position.
        self._term_counts = [self._count_terms(document) for document in corpus]
        self.df = self._calculate_df()
        self.idf = self._calculate_idf()

    @staticmethod
    def _count_terms(document):
        """Return a ``{token: occurrences}`` mapping for one document."""
        counts = {}
        for word in document:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def _calculate_df(self):
        """Count, for every token, how many documents contain it at least once."""
        df = {}
        for counts in self._term_counts:
            for word in counts:
                df[word] = df.get(word, 0) + 1
        return df

    def _calculate_idf(self):
        """Derive the (always positive) IDF weight of every indexed token."""
        idf = {}
        for word, freq in self.df.items():
            idf[word] = math.log((self.documents_number - freq + 0.5) / (freq + 0.5) + 1)
        return idf

    def _score(self, document, query, term_counts=None):
        """Return the BM25 score of ``document`` for the token list ``query``.

        Args:
            document: Token list of the document being scored.
            query: Iterable of query tokens; repeated tokens contribute repeatedly.
            term_counts: Optional precomputed ``{token: count}`` for ``document``;
                computed on the fly when omitted.
        """
        if not self.avgdl:
            # No tokens were indexed at all, so no query term can match.
            return 0.0
        if term_counts is None:
            term_counts = self._count_terms(document)
        # The length normalization is the same for every query term; hoist it.
        length_norm = self.k1 * (1 - self.b + self.b * len(document) / self.avgdl)
        score = 0.0
        for word in query:
            if word in self.df:
                term_freq = term_counts.get(word, 0)
                if not term_freq:
                    # Absent terms add exactly 0; skipping also avoids 0/0 when
                    # the normalization term itself is 0.
                    continue
                idf = self.idf[word]
                score += (idf * term_freq * (self.k1 + 1)) / (term_freq + length_norm)
        return score

    def get_scores(self, query):
        """Score every corpus document against ``query``.

        Returns:
            List of ``(document_index, score)`` tuples in corpus order.
        """
        # Materialize once so a one-shot iterable is not exhausted by the first document.
        query = list(query)
        return [
            (index, self._score(document, query, term_counts))
            for index, (document, term_counts) in enumerate(zip(self.corpus, self._term_counts))
        ]

In [2]:
def search_bm25(query: str, bm25, top_k):
    """Return the PMIDs of the ``top_k`` best BM25 matches for ``query``.

    The query is split on whitespace and scored by ``bm25``. The positional
    indices of the best-scoring documents are mapped to PMIDs through the
    notebook-level ``df`` frame. Repeated PMIDs are collapsed, keeping the
    first (highest-ranked) occurrence.
    """
    ranked = list(bm25.get_scores(query.split()))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    best = ranked[:top_k]
    # dict.fromkeys drops duplicate PMIDs while keeping rank order.
    return list(dict.fromkeys(df.iloc[doc_index]['PMID'] for doc_index, _ in best))

In [3]:
def search(query: str, df, keywords, bm25, top_k):
    """Return the PMIDs of the ``top_k`` best matches for a keyword-expanded query.

    The whitespace tokens of ``query`` are followed by ``keywords`` to form the
    scored term list. Positional indices returned by ``bm25`` are resolved to
    PMIDs through ``df`` (rows must be in the same order as the indexed corpus).
    Repeated PMIDs are collapsed, keeping the highest-ranked occurrence.
    """
    terms = [*query.split(), *keywords]
    ranked = list(bm25.get_scores(terms))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    best = ranked[:top_k]
    # dict.fromkeys drops duplicate PMIDs while keeping rank order.
    return list(dict.fromkeys(df.iloc[doc_index]['PMID'] for doc_index, _ in best))

In [4]:
def extract_keywords(text):
    """Return the distinct keywords the shared extractor finds in ``text``.

    Args:
        text: Raw text to extract keywords from.

    Returns:
        List of unique keyword strings, in the order the extractor produced them.
    """
    scored_keywords = kw_extractor.extract_keywords(text)
    # De-duplicate with dict.fromkeys so the extractor's order is kept. The former
    # set-based version returned an arbitrary, hash-seed-dependent order, which
    # made the expanded query differ between otherwise identical runs.
    return list(dict.fromkeys(word for word, _ in scored_keywords))


def query_pre_process(query):
    """Extract keywords from ``query`` and drop English stop words.

    The stop-word check is case-insensitive; surviving keywords keep their
    original casing and the order returned by ``extract_keywords``.
    """
    kept = []
    for keyword in extract_keywords(query):
        if keyword.lower() in ENGLISH_STOP_WORDS:
            continue
        kept.append(keyword)
    return kept

In [5]:
# Abstract-level index: one whitespace-tokenized document per row of the PubMed table.
df = pd.read_csv('./PubmedDataSet.csv')
texts = df['Abstract'].tolist()
tokenized_texts = [abstract.split() for abstract in texts]
bm25_abstract = BM25(tokenized_texts)

# Evaluation set: [question, gold PMID] pairs, shuffled with a fixed seed and
# truncated to the first 100 so the sample is reproducible.
df_test = pd.read_csv('./evaluation.csv')
test = [[row["Question"], row["PMID"]] for _, row in df_test.iterrows()]
random.seed(42)
random.shuffle(test)
test = test[:100]
    


In [11]:
# Top-1 accuracy of flat BM25: a query is a hit when the single best-ranked
# abstract carries the gold PMID.
t = 0
for query, pmid in tqdm(test):
    top_pmids = search_bm25(query, bm25_abstract, 1)
    t += int(pmid in top_pmids)
print(f'top1-accuracy of bm25 {t / len(test)}')

100%|█████████████████████████████████████████| 100/100 [03:21<00:00,  2.02s/it]

top1-accuracy of bm25 0.71





In [12]:
# Mean reciprocal rank (MRR) of flat BM25 over the evaluation sample.
# NOTE: with MRR_TOP_K = 1 the only possible rank is 1, so this value is
# necessarily identical to top-1 accuracy. Raise MRR_TOP_K (e.g. to 10) to
# obtain a rank-sensitive MRR; the value is kept at 1 here so the recorded
# output stays reproducible.
MRR_TOP_K = 1
Q = len(test)
MRR = 0
for query, pmid in tqdm(test):
    top_pmids = search_bm25(query, bm25_abstract, MRR_TOP_K)
    try:
        rank = top_pmids.index(pmid) + 1
    except ValueError:
        # Gold PMID was not retrieved: it contributes 0 to the sum. Only the
        # "not in list" error is expected; anything else should surface.
        rank = 0
    else:
        MRR += 1 / rank
# Guard the empty-evaluation-set case instead of dividing by zero.
MRR = MRR / Q if Q else 0.0
print(f'MRR of bm25 is {MRR}')

100%|█████████████████████████████████████████| 100/100 [03:20<00:00,  2.01s/it]

MRR of bm25 is 0.71





In [13]:
import random

import pandas as pd
import spacy
import yake
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from typing import List

from tqdm import tqdm



# NOTE(review): this cell repeats the spaCy / YAKE setup from the first cell of
# the notebook with identical arguments; it only matters when the notebook is
# resumed from here on a kernel where that cell has not been run.
# `nlp` is not referenced by any cell visible in this notebook.
nlp = spacy.load("en_core_web_sm")
# Shared YAKE keyword extractor used by `extract_keywords`.
kw_extractor = yake.KeywordExtractor(n=1,
                                     dedupLim=0.9,
                                     top=10,
                                     features=None)

In [14]:
# Rebuild the abstract-level BM25 index (same file and tokenization as the
# earlier load cell) so the hierarchical experiment starts from a known state.
df = pd.read_csv('./PubmedDataSet.csv')
texts = df['Abstract'].tolist()
tokenized_texts = [abstract.split() for abstract in texts]
bm25_abstract = BM25(tokenized_texts)

In [15]:
# Chunk-level table: read from the pre-split CSV; later cells use its
# 'PMID' and 'chunk_text' columns.
df2 = pd.read_csv('./splitted_pubmed_data_NLTK.csv')

# Evaluation set: [question, gold PMID] pairs, shuffled with a fixed seed and
# truncated to the first 100 (same sample as the flat BM25 experiment).
df_test = pd.read_csv('./evaluation.csv')
test = [[row["Question"], row["PMID"]] for _, row in df_test.iterrows()]
random.seed(42)
random.shuffle(test)
test = test[:100]

In [16]:
# Top-1 accuracy of two-stage ("hierarchical") BM25:
#   stage 1 ranks whole abstracts and keeps the best CANDIDATE_TOP_K PMIDs,
#   stage 2 builds a throw-away BM25 index over only those PMIDs' chunks and
#   returns the PMID of the best-matching chunk.
CANDIDATE_TOP_K = 30
CHUNK_TOP_K = 1
t = 0
for query, pmid in tqdm(test):
    keywords = query_pre_process(query)
    candidate_pmids = search(query, df, keywords, bm25_abstract, top_k=CANDIDATE_TOP_K)
    df_t = df2[df2['PMID'].isin(candidate_pmids)]
    if df_t.empty:
        # No chunk rows exist for any candidate. BM25 over an empty corpus
        # divides by zero, so count the query as a miss and move on.
        continue
    # Use loop-local names so the abstract-level `texts` / `tokenized_texts`
    # built earlier in the notebook are not overwritten on every iteration.
    chunk_tokens = [chunk.split() for chunk in df_t['chunk_text'].tolist()]

    bm25_chunk = BM25(chunk_tokens)
    result_pmids_scores = search(query, df_t, keywords, bm25_chunk, top_k=CHUNK_TOP_K)
    if pmid in result_pmids_scores:
        t += 1
print(f'top1-accuracy of hierarchical bm25  {t / len(test)}')

100%|█████████████████████████████████████████| 100/100 [04:55<00:00,  2.96s/it]

top1-accuracy of hierarchical bm25  0.74





In [17]:
# Mean reciprocal rank (MRR) of two-stage ("hierarchical") BM25.
# NOTE: with CHUNK_TOP_K = 1 the only possible rank is 1, so this value is
# necessarily identical to top-1 accuracy. Raise CHUNK_TOP_K to obtain a
# rank-sensitive MRR; it is kept at 1 here so the recorded output stays
# reproducible.
CANDIDATE_TOP_K = 30
CHUNK_TOP_K = 1
Q = len(test)
MRR = 0
for query, pmid in tqdm(test):
    keywords = query_pre_process(query)
    candidate_pmids = search(query, df, keywords, bm25_abstract, top_k=CANDIDATE_TOP_K)
    df_t = df2[df2['PMID'].isin(candidate_pmids)]
    if df_t.empty:
        # No chunk rows exist for any candidate. BM25 over an empty corpus
        # divides by zero, so the query contributes 0 to the sum.
        continue
    # Use loop-local names so the abstract-level `texts` / `tokenized_texts`
    # built earlier in the notebook are not overwritten on every iteration.
    chunk_tokens = [chunk.split() for chunk in df_t['chunk_text'].tolist()]

    bm25_chunk = BM25(chunk_tokens)
    result_pmids_scores = search(query, df_t, keywords, bm25_chunk, top_k=CHUNK_TOP_K)

    try:
        rank = result_pmids_scores.index(pmid) + 1
    except ValueError:
        # Gold PMID was not retrieved: it contributes 0 to the sum. Only the
        # "not in list" error is expected; anything else should surface.
        rank = 0
    else:
        MRR += 1 / rank
# Guard the empty-evaluation-set case instead of dividing by zero.
MRR = MRR / Q if Q else 0.0
print(f'MRR of hierarchical bm25 is {MRR}')

100%|█████████████████████████████████████████| 100/100 [04:59<00:00,  3.00s/it]

MRR of hierarchical bm25 is 0.74



