In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
4,What are the symptoms of COVID-19?,Some people become infected but don’t develop ...
73,Can I touch and hold my newborn baby if I have...,"Yes. Close contact and early, exclusive breast..."
31,Are women taking oral contraceptives more at r...,There is an elevated risk of clotting from ora...
3,Why is the virus so quick to spread,There are a couple of factors that help the vi...
44,How long is the incubation period for COVID-19?,Most estimates of the incubation period for CO...
36,What is multisystem in ammatory syndrome in ch...,CDC is working with state and local health dep...
35,Why are there reagent shortages that are impac...,It is unclear why these shortages are occurrin...
35,"Is there a vaccine, drug or treatment for COVI...",The most effective ways to protect yourself an...
45,How can I protect myself against coronaviruses?,The World Health Organization suggests avoidin...
58,Is there anything I should not do?,Taking antibiotics


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [23]:
def combine_cols(l1,l2):
    r = []
    for q, a in  zip(l1,l2):
        r.append(q+" "+a)
    return r

In [24]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))

In [25]:
qa_combined = combine_cols(q_new,a_new)

In [26]:
QA['q_new'] =  q_new
QA['a_new'] = a_new
QA['qa_combined'] = qa_combined

In [27]:
QA.head()

Unnamed: 0,q,a,q_new,a_new,qa_combined
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...,what is a novel coronavirus? a novel coronav...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a...",whi is the diseas be call coronavirus diseas ...
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...,whi might someon blame or avoid individu and ...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...,how can peopl help stop stigma relat to covid...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...,whi do some state covid-19 case number someti...


In [29]:
count_vect = TfidfVectorizer(ngram_range=(1,2))
bow_data = count_vect.fit_transform(QA['qa_combined'])

In [30]:
tq = ["cornonavirus"]
tq = clean(tq)
tq = clean_col(tq)

In [31]:
r = count_vect.transform(tq)

In [32]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [33]:
def cv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    r = count_vect.transform(tq)
    return r

In [34]:
QA['q'].sample(20)

35     Why are there reagent shortages that are impac...
1                        Why is it called a coronavirus?
23                        How can I help protect myself?
105                                 Can I walk my dog?\n
24      Can I get COVID-19 through second-hand clothe...
32                            When is testing important?
81                 What does more severe illness mean?\n
4              Who is most at risk of catching COVID-19?
59                 Is the source causing COVID-19 known?
14     Can someone who has been quarantined for COVID...
15                  Is there a vaccine against COVID-19?
57     Are there acceptable designs of PPE that local...
17     What can I do to protect myself and prevent th...
83     Are people with high blood pressure (hypertens...
17     What can I do to ensure my workplace is safe f...
68     Can people expect to be immune once they’ve ha...
114    What are community mitigation actions for COVI...
22     What is known about how 

In [35]:
QA.shape[0] == bow_data.shape[0]

True

In [36]:
bow_data[0].shape

(1, 16234)

In [37]:
r.shape

(1, 16234)

In [None]:
while True:
    q = input("Q: ")
    q = cv_test(q)
    ranks = np.array([cosine_similarity(q,bow_data[i]) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: what is coronavirus
A:  Coronaviruses are a large family of viruses which may cause illness in animals or humans. 
Q: keep my family safe
A:  Teach and reinforce everyday preventive actions.
Parents and caretakers play an important role in teaching children to wash their hands. Explain that hand
washing can keep them healthy and stop the virus from spreading to others.
Be a good role model—if you wash your hands often, they’re more likely to do the same.
Make handwashing a family activity.

Q: signs

Trouble breathing
Persistent pain or pressure in the chest
New confusion or inability to arouse
Bluish lips or face
Q: testing where
A:  There are actually many tests being used to diagnose COVID-19 that the U.S. Food & Drug Administration (FDA) has
authorized for use during the current emergency. All of these viral tests identify the virus in samples from the
respiratory system, such as from nasal or nasopharyngeal swabs. Some tests are conducted at the testing site you
visit, and resu