In [None]:
import json
from sentence_transformers import SentenceTransformer, util
from summarizer.sbert import SBertSummarizer
from top2vec import Top2Vec

In [None]:
# Number of documents loaded from the dataset to serve as retrieval contexts.
data_size = 100
# Number of tokens ingested at a time when searching for the most similar documents.
search_text_size = 128

In [None]:
# Read up to data_size documents; each line of the file is parsed as one JSON object.
articles = []
# The context manager closes the file even when a line fails to parse.
with open('pubmed-dataset/pubmed-dataset/val.txt', encoding='utf-8', mode='r') as file:
    # zip() stops after data_size lines, or earlier if the file has fewer lines,
    # so a short file no longer feeds an empty string to json.loads.
    for line_no, line in zip(range(data_size), file):
        if not line.strip():
            continue  # tolerate blank lines instead of raising JSONDecodeError
        art = json.loads(line)
        # Remove the <S>/</S> sentence markers and flatten the abstract into one string.
        abstract = " ".join([l.replace('<S>', '').replace('</S>', '') for l in art['abstract_text']])
        passage = " ".join(art['article_text'])
        # Each entry is an [abstract, passage] pair.
        articles.append([abstract, passage])

In [None]:
# Summarizer; a later cell calls it to shorten abstracts/articles longer than 256 words.
model = SBertSummarizer('paraphrase-MiniLM-L6-v2') # summarizer model
# Encoder; later cells call encoder.encode() on the documents, the question and the text windows.
encoder = SentenceTransformer('all-MiniLM-L6-v2') # encoder model

In [None]:
def encode(encoder, text, window_size, stride):
    """
    Encode a text into a vector by averaging the encodings of word windows.

    The text is split on whitespace. A text with at most ``window_size`` words
    is encoded in a single call. A longer text is cut into windows of up to
    ``window_size`` words whose starts are ``stride`` words apart; all windows
    are encoded in one batch and the result is averaged over the window axis.

    Args:
        encoder: Callable that takes a list of strings and returns an object
            supporting ``.mean(axis=0)`` (e.g. a 2-D array, one row per string).
        text: The text to encode.
        window_size: Maximum number of words per window.
        stride: Number of words between the starts of consecutive windows.
            Must be positive when the text is longer than ``window_size``.
            A stride larger than ``window_size`` leaves gaps between windows.

    Returns:
        For texts longer than ``window_size`` words, the mean of the window
        encodings. For shorter texts, the raw result of ``encoder([text])``.
        NOTE(review): the short-text result is not averaged, so it likely keeps
        a leading batch axis of length 1 that the long-text result lacks. It is
        left unchanged for backward compatibility -- confirm what callers expect.

    Raises:
        ValueError: If the text needs windowing and ``stride`` is not positive.
    """
    words = text.split()

    if len(words) <= window_size:
        return encoder([text])

    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    windows = []
    for start in range(0, len(words), stride):
        windows.append(" ".join(words[start:start + window_size]))
        # Stop at the first window that reaches the end of the text: this
        # guarantees the tail is covered and avoids extra windows that would
        # only repeat words already contained in this one.
        if start + window_size >= len(words):
            break

    # Encode all windows in a single batch and average over the window axis.
    return encoder(windows).mean(axis=0)

In [None]:
# Collect the abstracts and the main contents of the context documents.
# Any text longer than 256 words is first shortened with the summarizer model.
# NOTE(review): min_length receives search_text_size, which is described as a token
# count -- confirm that this is the unit the summarizer expects for min_length.
abses = []
arts = []
# The loop variables deliberately avoid the name `abs`: binding it at cell level
# would overwrite the built-in abs() for every cell executed afterwards.
for abstract_text, article_text in articles:
    if len(abstract_text.split()) > 256:
        abstract_text = model(abstract_text, min_length=search_text_size)
    if len(article_text.split()) > 256:
        article_text = model(article_text, min_length=search_text_size)
    abses.append(abstract_text)
    arts.append(article_text)

arts_embeddings = encoder.encode(arts) # encode the main contents of the context documents
abs_embeddings = encoder.encode(abses) # encode the abstracts of the context documents

In [None]:
# Cluster the context documents so that searching is easier.

# Only the article bodies (second item of each [abstract, passage] pair) are clustered.
docs = [passage for _abstract, passage in articles]
# Fit the Top2Vec model on the article bodies.
topic_model = Top2Vec(docs, embedding_model="all-MiniLM-L6-v2")

In [None]:
# Retrieve the context documents most similar to the question, using the question's embedding.

# The question to be answered.
question = "who decides whether stroke status is correct?"
# Embed the question with the encoder model.
question_embedding = encoder.encode(question, convert_to_tensor=True)
# Compare the question against the article embeddings and keep the 5 best matches.
hits = util.semantic_search(question_embedding, arts_embeddings, top_k=5)
# hits is a list of hit lists; the first hit of the first list is taken as the best match,
# and the article body of that document becomes the reference text.
top_hit = hits[0][0]
reference_text = articles[top_hit['corpus_id']][1]

In [17]:
# top 5 documents most relevant to the given question

hits

[[{'corpus_id': 1, 'score': 0.4862869679927826},
  {'corpus_id': 0, 'score': 0.36778485774993896},
  {'corpus_id': 51, 'score': 0.3365175426006317},
  {'corpus_id': 82, 'score': 0.3221747875213623},
  {'corpus_id': 98, 'score': 0.31176990270614624}]]

In [18]:
# load the question answering model
# NOTE(review): `pipeline` is not imported in any cell visible here, so this cell raises
# NameError on a fresh kernel. It presumably comes from the `transformers` package
# (`from transformers import pipeline`) -- confirm and add it to the imports cell.

question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad') 

In [20]:
# instead of Top2Vec, we can also use semantic_search to find the part of the reference document most similar to the question
# the reference text is cut into strided word windows: up to window_size words each, starting every stride words
window_size = 256
stride = 128
reference_text = articles[hits[0][0]['corpus_id']][1]
words = reference_text.split() # split the reference text into words
sub = []
for start in range(0, len(words), stride):
    sub.append(" ".join(words[start:start + window_size]))
    # stop at the first window that reaches the end of the text; starting the range at 0
    # (rather than bounding it by len(words) - stride) also keeps a text of at most
    # `stride` words from producing no windows at all
    if start + window_size >= len(words):
        break

# for each sub-text, compute the embedding and search for the most similar sub-texts to the question
sub_embs = encoder.encode(sub)
sub_hits = util.semantic_search(question_embedding, sub_embs, top_k=5) # get top 5 most similar sub-texts to the question
for h in sub_hits[0]:
    print(h)


{'corpus_id': 26, 'score': 0.6570420265197754}
{'corpus_id': 0, 'score': 0.6569380760192871}
{'corpus_id': 1, 'score': 0.6192592978477478}
{'corpus_id': 17, 'score': 0.6186012029647827}
{'corpus_id': 8, 'score': 0.6182588934898376}


In [21]:
# Zoom into the sub-text that best matches the question and run the
# question answering model on that sub-text only.
best_window_id = sub_hits[0][0]['corpus_id']
sub_reference_text = sub[best_window_id]
result = question_answerer(question=question, context=sub_reference_text)
print(result)

{'score': 0.920117974281311, 'start': 484, 'end': 496, 'answer': 'neurologists'}


In [22]:
# using the Top2Vec model, retrieve the documents most similar to the question
# found_doc is a tuple whose first element is an array of document texts (see the printed output);
# the next cell uses found_doc[0][0] as the context for question answering
# NOTE(review): the second argument (5) is presumably the number of documents to return -- confirm against the Top2Vec API
# NOTE(review): print() dumps the full text of the returned documents; consider showing a short preview instead

found_doc = topic_model.query_documents(question, 5)
print(found_doc)

(array(["there is an epidemic of stroke in low and middle - income countries due to rapidly increasing prevalence of vascular risk factors such as hypertension , diabetes mellitus , dyslipidaemia . the need has thus arisen to conduct studies aimed at epidemiologic , genetic and phenotypic characterisation of stroke in sub - saharan africa to provide evidence - based information to confront this menace . to this end , it is important that controls selected for comparison with cases are recruited with a high degree of certainty that they indeed do not have stroke or tia to allow for valid comparisons to be made . in view of this simple , quick and accurate assessment of symptoms of stroke is needed for epidemiologic studies particularly those of the case  control type . in 2000 , meschia et al . developed the questionnaire for verifying stroke - free status ( qvsfs ) as a method for verifying the stroke - free phenotype in participants of clinical , epidemiological and genetic studies . 

In [24]:
# Answer the question using the first document returned by the Top2Vec query as the context.
top2vec_context = found_doc[0][0]
result = question_answerer(question=question, context=top2vec_context)
print(result)

{'score': 0.9832802414894104, 'start': 19799, 'end': 19811, 'answer': 'neurologists'}
