In [24]:
import json
from sentence_transformers import SentenceTransformer, util
from summarizer.sbert import SBertSummarizer
from transformers import BertTokenizer,AlbertTokenizer,AutoTokenizer, AlbertForQuestionAnswering, AutoModelForQuestionAnswering ,BertForQuestionAnswering

from transformers import BertTokenizer
import torch
import numpy as np
from transformers import pipeline
from top2vec import Top2Vec

In [28]:
# Number of records (lines) read from the validation file by the loading cell.
data_size = 100
# Passed as `min_length` to the summarizer when long texts are condensed.
search_text_size = 128

In [29]:
# Load up to `data_size` records from the JSON-lines validation file as
# [abstract, passage] pairs.
articles = []
# The context manager closes the file even if a record fails to parse.
with open('data/val.txt', encoding='utf-8', mode='r') as file:
    # zip() stops at whichever ends first: the requested count or the file.
    # This avoids feeding the empty string returned at EOF to json.loads.
    for _, line in zip(range(data_size), file):
        if not line.strip():
            continue  # tolerate blank lines instead of crashing on them
        art = json.loads(line)
        # Abstract sentences carry <S>...</S> markers; strip them before joining.
        abstract = " ".join(
            sentence.replace('<S>', '').replace('</S>', '')
            for sentence in art['abstract_text']
        )
        passage = " ".join(art['article_text'])
        articles.append([abstract, passage])

In [4]:
# Summarizer; later cells call it as model(text, min_length=...) to shorten long texts.
model = SBertSummarizer('paraphrase-MiniLM-L6-v2')
# Sentence encoder used for the article, abstract, question and window embeddings.
encoder = SentenceTransformer('all-MiniLM-L6-v2')

In [20]:
def encode(encoder, text, window_size, stride):
    """Embed ``text`` by averaging the encodings of sliding word windows.

    Args:
        encoder: Callable that takes a list of strings and returns an array
            with one row per string, supporting ``.mean(axis=0)``
            (e.g. ``SentenceTransformer.encode``).
        text: Input text; it is split on whitespace to build the windows.
        window_size: Maximum number of words per window.
        stride: Number of words between consecutive window starts.

    Returns:
        For a text of at most ``window_size`` words, the unreduced result of
        ``encoder([text])`` (one row, batch axis kept). Otherwise the mean of
        the window encodings along axis 0.

    Raises:
        ValueError: If the text needs windowing and ``stride`` is below 1.
    """
    words = text.split()

    if len(words) <= window_size:
        return encoder([text])

    if stride < 1:
        raise ValueError("stride must be a positive integer")

    # Start a window every `stride` words until one reaches the end of the
    # text. Bounding by `len(words) - window_size + stride` keeps the tail
    # covered for any stride <= window_size and avoids redundant windows.
    stop = len(words) - window_size + stride
    windows = [
        " ".join(words[start:start + window_size])
        for start in range(0, stop, stride)
    ]

    # Encode once; the encoder call is the expensive part.
    return encoder(windows).mean(axis=0)


# encs = []
# for abs, art in articles:
#     en = encode(encoder.encode, art, 256, 128)
#     encs.append(en)

# encs = np.vstack(encs)
# print(encs.shape)

(10, 384)


In [5]:
# print(articles[0][1])
# print("================================")
# Replace every abstract/article longer than 256 whitespace-separated words
# with the summarizer's output, then embed both collections.
# The loop variables avoid the name `abs`: binding it at cell (kernel) scope
# would shadow the builtin abs() for the rest of the session.
abses = []
arts = []
for abstract_text, article_text in articles:
    if len(abstract_text.split()) > 256:
        abstract_text = model(abstract_text, min_length=search_text_size)
    if len(article_text.split()) > 256:
        article_text = model(article_text, min_length=search_text_size)
    abses.append(abstract_text)
    arts.append(article_text)

arts_embeddings = encoder.encode(arts)
abs_embeddings = encoder.encode(abses)

In [26]:
# arts[0]

'if undiagnosed , asymptomatic vte can lead to chronic venous disease or recurrent vte and long - term debilitating sequelae such as postthrombotic syndrome and chronic thromboembolic pulmonary hypertension . given the growing burden of vte in india and lack of substantial indian data on characteristics of vte patients , use of diagnostics tools , prophylaxis , treatment options , and clinical outcomes in vte , there was a need to systematically collect such data . risk factors for venous thromboembolism based on a review of the available records , 157 patients had a single co - morbidity , 81 had two co - morbidities , 23 had three co - morbidities , and 16 had four or more co - morbidities . ( myocardial infarction , heart failure , chronic obstructive pulmonary disease , ventilator dependency , sepsis , or pneumonia ) [ table 4 ] . a total of 31% ( 143/462 ) patients had dvt in the right limb , 54% ( 249/462 ) in the left limb and 9% ( 41/462 ) in both limbs ( site not known in 29 p

In [31]:
# Fit a Top2Vec topic model on the article bodies (second item of each pair).
docs = [pair[1] for pair in articles]
topic_model = Top2Vec(
    docs,
    embedding_model="all-MiniLM-L6-v2",
)

2023-04-01 00:39:32,977 - top2vec - INFO - Pre-processing documents for training
2023-04-01 00:39:35,264 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2023-04-01 00:39:35,778 - top2vec - INFO - Creating joint document/word embedding
2023-04-01 00:39:41,855 - top2vec - INFO - Creating lower dimension embedding of documents
2023-04-01 00:39:45,722 - top2vec - INFO - Finding dense areas of documents
2023-04-01 00:39:45,730 - top2vec - INFO - Finding topics


In [6]:
# Only the last expression of a cell is displayed, so show both shapes as one tuple.
abs_embeddings.shape, arts_embeddings.shape

(10, 384)

In [7]:
# Rank the (possibly summarized) articles against the question and keep the
# full text of the best match.
question = "who decides whether stroke status is correct?"
question_embedding = encoder.encode(question, convert_to_tensor=True)
hits = util.semantic_search(
    query_embeddings=question_embedding,
    corpus_embeddings=arts_embeddings,
    top_k=5,
)
top_hit = hits[0][0]
reference_text = articles[top_hit['corpus_id']][1]

In [8]:
# Show the ranked article hits (corpus_id / score) for the single question.
hits

[[{'corpus_id': 1, 'score': 0.5580443739891052},
  {'corpus_id': 0, 'score': 0.36922186613082886},
  {'corpus_id': 2, 'score': 0.2462112158536911},
  {'corpus_id': 4, 'score': 0.2442772388458252},
  {'corpus_id': 5, 'score': 0.19890615344047546}]]

In [9]:
# Abstract of the top-ranked article; the index is taken from `hits` so it
# stays correct when the question or the loaded data changes.
articles[hits[0][0]['corpus_id']][0]

' backgroundthe questionnaire for verifying stroke - free status ( qvsfs ) has been validated in western populations as a method for verifying stroke - free status in participants of clinical , epidemiological and genetic studies .   this instrument has not been validated in low - income settings where populations have limited knowledge of stroke symptoms and literacy levels are low.objectiveto simultaneously validate the 8-item qvsfs in 3 languages spoken in west africa ( yoruba , hausa and akan ) for ascertainment of stroke - free status of control subjects in siren.methodsusing a cross - sectional study design , 100 participants each from the 3 linguistic groups will be consecutively recruited from neurology and general medicine clinics of 5 tertiary referral hospitals in nigeria and ghana .   ascertainment of stroke status will be determined by neurologists using structured neurological examination , review of case records and neuro - imaging ( gold standard ) .   the relative perf

In [10]:
# Extractive question-answering pipeline used on the retrieved passages.
question_answerer = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
)

In [11]:
# Split the best-matching article into overlapping word windows and rank the
# windows against the question.
window_size = 256
stride = 128
reference_text = articles[hits[0][0]['corpus_id']][1]
words = reference_text.split()

# Start a window every `stride` words until one reaches the end of the text.
# A text of at most `window_size` words yields exactly one window, so `sub`
# is never empty and the later lookup of the best window cannot fail.
stop = max(len(words) - window_size, 0) + stride
sub = [
    " ".join(words[start:start + window_size])
    for start in range(0, stop, stride)
]

sub_embs = encoder.encode(sub)
sub_hits = util.semantic_search(question_embedding, sub_embs, top_k=5)
for h in sub_hits[0]:
    print(h)


{'corpus_id': 26, 'score': 0.6570421457290649}
{'corpus_id': 0, 'score': 0.6569384932518005}
{'corpus_id': 1, 'score': 0.6192595958709717}
{'corpus_id': 17, 'score': 0.6186014413833618}
{'corpus_id': 8, 'score': 0.6182590126991272}


In [12]:
# Answer the question from the highest-scoring window only.
best_window_id = sub_hits[0][0]['corpus_id']
sub_reference_text = sub[best_window_id]
result = question_answerer(
    question=question,
    context=sub_reference_text,
)
print(result)

{'score': 0.9201184511184692, 'start': 484, 'end': 496, 'answer': 'neurologists'}


In [33]:
# Retrieve the single closest document to the question from the topic model
# and print its text (first entry of the first element of the result).
found_doc = topic_model.query_documents(query=question, num_docs=1)
best_documents = found_doc[0]
print(best_documents[0])

there is an epidemic of stroke in low and middle - income countries due to rapidly increasing prevalence of vascular risk factors such as hypertension , diabetes mellitus , dyslipidaemia . the need has thus arisen to conduct studies aimed at epidemiologic , genetic and phenotypic characterisation of stroke in sub - saharan africa to provide evidence - based information to confront this menace . to this end , it is important that controls selected for comparison with cases are recruited with a high degree of certainty that they indeed do not have stroke or tia to allow for valid comparisons to be made . in view of this simple , quick and accurate assessment of symptoms of stroke is needed for epidemiologic studies particularly those of the case  control type . in 2000 , meschia et al . developed the questionnaire for verifying stroke - free status ( qvsfs ) as a method for verifying the stroke - free phenotype in participants of clinical , epidemiological and genetic studies . having on

In [34]:
# Answer the question using the whole document returned by the topic model.
full_document = found_doc[0][0]
result = question_answerer(
    question=question,
    context=full_document,
)
print(result)

  tensor = as_tensor(value)
  p_mask = np.asarray(


{'score': 0.9833484888076782, 'start': 19799, 'end': 19811, 'answer': 'neurologists'}


In [36]:
# First entry of the second element of the query result; presumably the
# similarity score of the retrieved document (verify against the Top2Vec docs).
found_doc[1][0]

0.6279185