# Implements rolling questions

- Try topic modeling and GloVe word embeddings for similarity.
- Try Seq2Seq modeling for question generation.

In [2]:
# Sample text used as input by the analysis cells below: a lower-cased,
# punctuation-free abstract. Because it is an indented triple-quoted string,
# every line after the first starts with the indentation as part of the text,
# and the string ends with a newline followed by indentation.
answer = """background on 31 december 2019 the world health organization
            china country office was informed of cases of pneumonia of
            unknown aetiology since then there have been over 75 000 cases
            globally of the 2019 novel coronavirus 2000 deaths and over 14
            000 cases recovered outbreaks of novel agents represent
            opportunities for clinical research to inform realtime public
            health action in 2018 we conducted a systematic review to
            identify priority research questions for severe acute
            respiratory syndrome related coronavirus sarscov and middle east
            respiratory syndrome related coronavirus merscov here we review
            information available on covid19 and provide an evidencedbased
            framework for priority clinical research in the current outbreak
            methods three bibliographic databases were searched to identify
            clinical studies published on sarscov and merscov in the
            outbreak setting studies were grouped thematically according to
            clinical research questions addressed in february 2020 available
            information on covid19 was reviewed and compared to the results
            of the sarscov and merscov systematic review from the research
            objectives for sarscov and merscov ten themes in the literature
            were identified clinical characterisation prognosis diagnosis
            clinical management viral pathogenesis epidemiological
            characterisation infection prevention and controltransmission
            susceptibility psychosocial and aetiology for covid19 some
            information on clinical presentation diagnostic testing and
            aetiology is available but many clinical research gaps have yet
            to be filled based on a systematic review of other severe
            coronaviruses we summarise the state of clinical research for
            covid19 highlight the research gaps and provide recommendations
            for the implementation of standardised protocols databased on
            internationally standardised protocols will inform clinical
            practice realtime
            """

### POS Tagging

In [3]:
import spacy

# Parse the sample text with the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(answer)

# Print one line per token: surface text, lemma, coarse and fine POS tags,
# dependency label, word shape, and the alphabetic / stop-word flags.
for tok in doc:
    row = (
        tok.text,
        tok.lemma_,
        tok.pos_,
        tok.tag_,
        tok.dep_,
        tok.shape_,
        tok.is_alpha,
        tok.is_stop,
    )
    print(*row)

background background NOUN NN nsubjpass xxxx True False
on on ADP IN prep xx True True
31 31 NUM CD nummod dd False False
december december PROPN NNP pobj xxxx True False
2019 2019 NUM CD nummod dddd False False
the the DET DT det xxx True True
world world PROPN NNP compound xxxx True False
health health PROPN NNP compound xxxx True False
organization organization PROPN NNP appos xxxx True False

             
             SPACE _SP  
     False False
china china PROPN NNP compound xxxx True False
country country PROPN NNP compound xxxx True False
office office PROPN NNP nsubjpass xxxx True False
was be AUX VBD auxpass xxx True True
informed inform VERB VBN ROOT xxxx True False
of of ADP IN prep xx True True
cases case NOUN NNS pobj xxxx True False
of of ADP IN prep xx True True
pneumonia pneumonia NOUN NN pobj xxxx True False
of of ADP IN prep xx True True

             
             SPACE _SP  
     False False
unknown unknown ADJ JJ amod xxxx True False
aetiology aetiology NOUN NN p

### Extract Named Entities

In [4]:
import spacy

# Collect the text of every named entity spaCy finds in `answer`.
bag_of_words = []
nlp = spacy.load("en_core_web_sm")
doc = nlp(answer)
for ent in doc.ents:
    # `answer` is an indented triple-quoted string, so an entity that spans a
    # line break carries the newline and indentation inside `ent.text`.
    # Collapse every whitespace run to a single space so each entity is stored
    # and printed as one clean line.
    ent_text = " ".join(ent.text.split())
    bag_of_words.append(ent_text)
    # The character offsets still index into the original, un-normalised `answer`.
    print(ent_text, ent.start_char, ent.end_char, ent.label_)

31 december 2019 14 30 DATE
the world health organization
            china country office 31 93 ORG
2019 235 239 DATE
2000 258 262 CARDINAL
2018 453 457 DATE
middle east 625 636 LOC
coronavirus merscov 678 697 PERSON
three 885 890 CARDINAL
february 2020 1139 1152 DATE
ten 1361 1364 CARDINAL


### TF-IDF

Use TF-IDF weights to find words that don't appear frequently.

In [6]:
# https://www.machinelearningplus.com/nlp/gensim-tutorial/#8howtocreatethetfidfmatrixcorpusingensim

from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import numpy as np

documents = [answer]

# Tokenise each document once and reuse the tokens for both the dictionary
# and the bag-of-words corpus.
tokenized_docs = [simple_preprocess(line) for line in documents]

# Create the Dictionary and Corpus
mydict = corpora.Dictionary(tokenized_docs)
corpus = [mydict.doc2bow(tokens) for tokens in tokenized_docs]

# Show the Word Weights in Corpus
# (loop variables are deliberately not called `doc` / `id`: `doc` is the spaCy
# Doc built by the earlier cells, and `id` is a Python builtin)
for bow in corpus:
    print([[mydict[token_id], count] for token_id, count in bow])

# Create the TF-IDF model
# NOTE(review): `documents` holds a single document, so every term occurs in
# 1 of 1 documents; the idf factor is then presumably log(1/1) = 0 for all
# terms, which would make the weights below uninformative. Confirm, and
# consider splitting the text into several documents.
tfidf = models.TfidfModel(corpus, smartirs='ntc')

# Show the TF-IDF weights
for weighted_bow in tfidf[corpus]:
    print([[mydict[token_id], np.around(weight, decimals=2)] for token_id, weight in weighted_bow])

[['according', 1], ['action', 1], ['acute', 1], ['addressed', 1], ['aetiology', 3], ['agents', 1], ['an', 1], ['and', 11], ['available', 3], ['background', 1], ['based', 1], ['be', 1], ['been', 1], ['bibliographic', 1], ['but', 1], ['cases', 3], ['china', 1], ['clinical', 10], ['compared', 1], ['conducted', 1], ['coronavirus', 3], ['coronaviruses', 1], ['country', 1], ['covid', 4], ['current', 1], ['databased', 1], ['databases', 1], ['deaths', 1], ['december', 1], ['diagnosis', 1], ['diagnostic', 1], ['east', 1], ['epidemiological', 1], ['evidencedbased', 1], ['february', 1], ['filled', 1], ['for', 7], ['framework', 1], ['from', 1], ['gaps', 2], ['globally', 1], ['grouped', 1], ['have', 2], ['health', 2], ['here', 1], ['highlight', 1], ['identified', 1], ['identify', 2], ['implementation', 1], ['in', 5], ['infection', 1], ['inform', 2], ['information', 3], ['informed', 1], ['internationally', 1], ['is', 1], ['literature', 1], ['management', 1], ['many', 1], ['merscov', 4], ['methods', 

### Sentence Similarity using Word Embeddings

Good Medium article: https://medium.com/@adriensieg/text-similarities-da019229c894

Remove Stop Words

LDA: https://www.kaggle.com/ktattan/lda-and-document-similarity

Get BERT Embeddings for words: https://towardsdatascience.com/bow-to-bert-2695cdb19787