In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
102,Should I avoid contact with pets or other anim...,"We are still learning about this virus, but it..."
3,What are the symptoms of COVID-19?,The most common symptoms of COVID-19 are fever...
48,Cleaning produce brought home from the grocery...,Here is general guidance on fruits and vegetab...
61,How did the first human SARS-CoV-2 infections ...,"SARS-CoV, the virus which caused the SARS outb..."
94,Who should clean and disinfect community space...,Regular cleaning sta can clean and disinfect c...
65,Will businesses and schools close or stay clos...,"CDC makes recommendations, shares information,..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...
14,Are there different “species” of coronavirus l...,Among the thousands of samples of the long str...
82,Does the science support limiting the number o...,Yes. We should be practicing social distancing...
115,Who is involved in community mitigation action...,"Individuals, communities, schools, businesses ..."


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [16]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))

In [17]:
QA['q_new'] =  q_new
QA['a_new'] = a_new

In [18]:
QA.head()

Unnamed: 0,q,a,q_new,a_new
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...


In [23]:
count_vect = CountVectorizer(binary=True)
bow_data = count_vect.fit_transform(QA['q_new'])

In [24]:
tq = ["cornonavirus"]
tq = clean(tq)
tq = clean_col(tq)

In [25]:
r = count_vect.transform(tq)

In [26]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [27]:
def cv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    r = count_vect.transform(tq)
    return r

In [28]:
QA['q'].sample(20)

61                 Are my pets at risk of getting sick?
18    Is it safe to use public transport (Dalla Dall...
67    Wet markets in China are widely believed to be...
21    Does eating bush meat pose a risk for COVID-19...
52      What if my child needs to go to the hospital?\n
28    Should people on immunosuppressive drugs stop ...
6                    What are the symptoms of COVID-19?
36    Is the current spike in cases a result of incr...
0                                What is a coronavirus?
21      Am I at risk for COVID-19 in the United States?
72         What treatments are there for coronaviruses?
42    While school’s out, how can I keep my family h...
49      How can my family cope with the added stress?\n
9         When and how should I seek medical attention?
13                                                  NaN
52    Is it correct that if someone in your home is ...
80    For businesses that remain open, what are best...
98    Can sanitizing tunnels be used at building

In [29]:
QA.shape[0] == bow_data.shape[0]

True

In [30]:
bow_data[0].shape

(1, 723)

In [31]:
r.shape

(1, 723)

In [None]:
while True:
    q = input("Q: ")
    q = cv_test(q)
    ranks = np.array([cosine_similarity(q,bow_data[i]) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: keep my family safe
A:  Watch your child for any signs of illness. If you see any sign of illness consistent with symptoms of COVID-19, particularly fever, cough, or shortness of breath, call your healthcare provider and keep your child at home and away from others as much as possible.Follow CDC’s guidance on “What to do if you are sick.”

Q: sings
A:  A novel coronavirus is a new coronavirus that has not been previously identied. The virus causing coronavirus
disease 2019 (COVID-19), is not the same as the coronaviruses that commonly circulate among humans and cause
mild illness, like the common cold.

Q: signs
A:  A novel coronavirus is a new coronavirus that has not been previously identied. The virus causing coronavirus
disease 2019 (COVID-19), is not the same as the coronaviruses that commonly circulate among humans and cause
mild illness, like the common cold.

Q: test
A:  Testing is particularly important for people who are seriously ill. Knowing the diagnosis is important fo