In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
109,Are pets from a shelter safe to adopt?\n,Based on the limited information available to ...
44,"While school’s out, how can I keep my family h...",Help your child stay socially connected.\nReac...
21,Is there evidence of a heightened risk to preg...,"So far, we haven’t seen any particular data th..."
12,What percentage of the population will become ...,It is difficult to estimate what percentage of...
21,Am I at risk for COVID-19 in the United States?,This is a rapidly evolving situation and the r...
78,Are there any medications I should avoid takin...,"Currently, there is no evidence to show that t..."
7,How does COVID-19 spread?,People can catch COVID-19 from others who have...
114,What are community mitigation actions for COVI...,Some community mitigation actions may include:...
43,How long is the incubation period for COVID-19?,The “incubation period” means the time between...
68,Should pregnant women be tested for COVID-19?,Testing protocols and eligibility vary dependi...


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [16]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))

In [17]:
QA['q_new'] =  q_new
QA['a_new'] = a_new

In [18]:
QA.head()

Unnamed: 0,q,a,q_new,a_new
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...


In [19]:
count_vect = TfidfVectorizer()
bow_data = count_vect.fit_transform(QA['q_new'])

In [20]:
tq = ["cornonavirus"]
tq = clean(tq)
tq = clean_col(tq)

In [21]:
r = count_vect.transform(tq)

In [22]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [23]:
def cv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    r = count_vect.transform(tq)
    return r

In [24]:
QA['q'].sample(20)

97     Should outdoor playgrounds, like those at scho...
42     How to put on, use, take off and dispose of a ...
8      Are people with high blood pressure (hypertens...
70     Is it possible that some people may be exposed...
71     Do pregnant women with suspected or confirmed ...
16     What can I do to protect myself and prevent th...
9          When and how should I seek medical attention?
87     What should I do if my family member died from...
9      Should I wear a facemask to protect myself aga...
60              Will I get sick if I help care for them?
51     What if my child’s symptoms of their underlyin...
50     What if my child or someone else in the home i...
15     Can the virus that causes COVID-19 be spread t...
14     Can someone who has been quarantined for COVID...
10                      What is the source of the virus?
57     Are there acceptable designs of PPE that local...
77     How are COVID-19 and influenza viruses different?
29     Should contact lens wear

In [25]:
QA.shape[0] == bow_data.shape[0]

True

In [26]:
bow_data[0].shape

(1, 723)

In [27]:
r.shape

(1, 723)

In [None]:
while True:
    q = input("Q: ")
    q = cv_test(q)
    ranks = np.array([cosine_similarity(q,bow_data[i]) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: can animals spread the virus
A:  The virus that causes COVID-19 is thought to spread mainly from person to person, mainly through respiratory
droplets produced when an infected person coughs or sneezes. These droplets can land in the mouths or noses of
people who are nearby or possibly be inhaled into the lungs. Spread is more likely when people are in close contact
with one another (within about 6 feet).
COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in many a ected
geographic areas. Community spread means people have been infected with the virus in an area, including some who
are not sure how or where they became infected.

Q: can animals spread the virus
A:  The virus that causes COVID-19 is thought to spread mainly from person to person, mainly through respiratory
droplets produced when an infected person coughs or sneezes. These droplets can land in the mouths or noses of
people who are nearby or possibly be inhaled into the lungs. S