In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
24,What should I do if I have had close contact w...,There is information for people who have had c...
106,Can I take my dog to daycare or a groomer?\n,Until we know more about how this virus a ects...
43,How much information about the movements of in...,These decisions should be made with local publ...
19,What can I do to protect myself and prevent th...,Keep up to date on the latest COVID-19 hotspot...
82,How large does a meeting or event need to be i...,High profile international sporting events suc...
72,Can women with COVID-19 breastfeed?,Yes. Women with COVID-19 can breastfeed if the...
111,Can I travel to the United States with dogs or...,Please refer to CDC’s requirements for bringin...
8,Why does it take long to develop a vaccine,"Developing a vaccine takes time, in most cases..."
45,"While school’s out, limit time with older adul...",Older adults and people who have serious under...
8,How does COVID-19 spread?,People can also catch COVID-19 if they breathe...


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [16]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))

In [17]:
QA['q_new'] =  q_new
QA['a_new'] = a_new

In [18]:
QA.head()

Unnamed: 0,q,a,q_new,a_new
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...


In [19]:
w2v_data = QA['q_new']

In [20]:
w2v_data 

0                          what is a novel coronavirus?
1      whi is the diseas be call coronavirus diseas ...
2      whi might someon blame or avoid individu and ...
3      how can peopl help stop stigma relat to covid...
4      whi do some state covid-19 case number someti...
                            ...                        
80     are smoker and tobacco user at higher risk of...
81     are smoker and tobacco user at higher risk of...
82     how larg doe a meet or event need to be in or...
83     how larg doe a meet or event need to be in or...
84     doe who recommend that all intern mass gather...
Name: q_new, Length: 323, dtype: object

In [21]:
splitted = []
for row in w2v_data: 
    splitted.append([word for word in row.split()]) 

In [24]:
[[word for word in row.split()] for row in w2v_data]

[['what', 'is', 'a', 'novel', 'coronavirus?'],
 ['whi',
  'is',
  'the',
  'diseas',
  'be',
  'call',
  'coronavirus',
  'diseas',
  '2019,',
  'covid-19?'],
 ['whi',
  'might',
  'someon',
  'blame',
  'or',
  'avoid',
  'individu',
  'and',
  'group',
  '(creat',
  'stigma)',
  'becaus',
  'of',
  'covid-19?'],
 ['how', 'can', 'peopl', 'help', 'stop', 'stigma', 'relat', 'to', 'covid-19?'],
 ['whi',
  'do',
  'some',
  'state',
  'covid-19',
  'case',
  'number',
  'sometim',
  'di',
  'er',
  'from',
  'what',
  'is',
  'post',
  'on',
  'cdc',
  'website?'],
 ['how',
  'do',
  'cdc',
  'covid-19',
  'case',
  'number',
  'compar',
  'with',
  'those',
  'provid',
  'by',
  'the',
  'world',
  'health',
  'organ',
  '(who)',
  'or'],
 ['john', 'hopkins?'],
 ['whi',
  'do',
  'the',
  'number',
  'of',
  'case',
  'for',
  'previous',
  'day',
  'increase?'],
 ['are',
  'peopl',
  'with',
  'high',
  'blood',
  'pressur',
  '(hypertension)',
  'at',
  'higher',
  'risk',
  'from',
  

In [22]:
splitted

[['what', 'is', 'a', 'novel', 'coronavirus?'],
 ['whi',
  'is',
  'the',
  'diseas',
  'be',
  'call',
  'coronavirus',
  'diseas',
  '2019,',
  'covid-19?'],
 ['whi',
  'might',
  'someon',
  'blame',
  'or',
  'avoid',
  'individu',
  'and',
  'group',
  '(creat',
  'stigma)',
  'becaus',
  'of',
  'covid-19?'],
 ['how', 'can', 'peopl', 'help', 'stop', 'stigma', 'relat', 'to', 'covid-19?'],
 ['whi',
  'do',
  'some',
  'state',
  'covid-19',
  'case',
  'number',
  'sometim',
  'di',
  'er',
  'from',
  'what',
  'is',
  'post',
  'on',
  'cdc',
  'website?'],
 ['how',
  'do',
  'cdc',
  'covid-19',
  'case',
  'number',
  'compar',
  'with',
  'those',
  'provid',
  'by',
  'the',
  'world',
  'health',
  'organ',
  '(who)',
  'or'],
 ['john', 'hopkins?'],
 ['whi',
  'do',
  'the',
  'number',
  'of',
  'case',
  'for',
  'previous',
  'day',
  'increase?'],
 ['are',
  'peopl',
  'with',
  'high',
  'blood',
  'pressur',
  '(hypertension)',
  'at',
  'higher',
  'risk',
  'from',
  

In [21]:
train_w2v = Word2Vec(splitted,min_count=1,size=100, workers=4)

In [22]:
avg_data = []
for row in splitted:
    vec = np.zeros(100)
    count = 0
    for word in row:
        try:
            vec += train_w2v[word]
            count += 1
        except:
            pass
    avg_data.append(vec/count)

  import sys


In [23]:
avg_data = np.array(avg_data)

In [24]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [25]:
def wv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    splitted_test = []
    for row in tq: 
        splitted_test.append([word for word in row.split()]) 
    avg_data_test = []
    for row in splitted_test:
        vec = np.zeros(100)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        avg_data_test.append(vec/count)
    return np.array(avg_data_test)

In [26]:
QA['q'].sample(20)

4     What symptoms do coronaviruses typically cause...
23                       How can I help protect myself?
77    How are COVID-19 and influenza viruses different?
68    Is it possible to have the flu and COVID-19 at...
74    Is there evidence for using hydroxychloroquine...
7     Is the loss of smell an early coronavirus indi...
60    How did the first human SARS-CoV-2 infections ...
60    What cleaning products should I use to protect...
19    What can I do to protect myself and prevent th...
21    Is there evidence of a heightened risk to preg...
12                 Why are we seeing a rise in cases?\n
89                  What is CDC doing about COVID-19?\n
32    Are the symptoms of COVID-19 di erent in child...
6     Is there any concern mosquitoes carying the vi...
91    Is it safe to vacuum in a school, business, or...
24            Are COVID toes related to blood clotting?
7                             How does COVID-19 spread?
82    Does the science support limiting the numb

In [None]:
while True:
    q = input("Q: ")
    q = wv_test(q)
    ranks = np.array([cosine_similarity(q.reshape(1,100),avg_data[i].reshape(1,100)) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: can animals spread the virus


  


A:  At this time, CDC has no data to suggest that this new coronavirus or other similar coronaviruses are spread by
mosquitoes or ticks. The main way that COVID-19 spreads is from person to person. See How Coronavirus Spreads for
more information.

Q: can the virus cause depression


  


A:  At this time, CDC has no data to suggest that this new coronavirus or other similar coronaviruses are spread by
mosquitoes or ticks. The main way that COVID-19 spreads is from person to person. See How Coronavirus Spreads for
more information.

Q: when will it end


  


A:  Testing is particularly important for people who are seriously ill. Knowing the diagnosis is important for clinical care, allows health care workers to protect themselves, and is necessary for research into treatments.

For people who are mildly or moderately ill, testing can help assure that they isolate themselves and alert people they have come in contact with of the potential need for quarantine. By quickly identifying individuals who are sick and isolating them, public health authorities can reduce the spread of the novel coronavirus.

To assure both that tests are available for those who need them and that the health professionals are prepared to do the test safely, adequate safety equipment is essential. Not every clinic or medical office will conduct the testing. Physicians and public health authorities should direct people to where they can be tested. Given the limited availability of testing, there is much less urgency to test people who are feeling well.

Also, wherever 

  


A:  The virus that causes COVID-19 is thought to spread mainly from person to person, mainly through respiratory
droplets produced when an infected person coughs or sneezes. These droplets can land in the mouths or noses of
people who are nearby or possibly be inhaled into the lungs. Spread is more likely when people are in close contact
with one another (within about 6 feet).
COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in many a ected
geographic areas. Community spread means people have been infected with the virus in an area, including some who
are not sure how or where they became infected.

Q: symptoms


  


A:  Common signs of infection include runny nose, cough, fever, sore throat, and shortness of breath. In more severe cases, infection can cause pneumonia, severe acute respiratory syndrome, kidney failure, and even death.
Q:  What is CDC doing about COVID-19


  


A:  CDC is working with other federal partners in a whole-of-government response. This is an emerging, rapidly evolving
situation and CDC will continue to provide updated information as it becomes available. CDC works 24/7 to protect
people’s health. More information about CDC’s response to COVID-19 is available online.

Q: do we have treatment?


  


A:  People who have symptoms should assume they have it, isolate themselves from other people, talk to people they may have exposed in the days prior to experiencing symptoms, and recommend that those contacts self-quarantine. Mayors should reinforce this message. Having a call center or using 311 for coronavirus response could help mayors and public health officials provide these messages. Such a system can also be a way to keep track of the epidemic. If you give people experiencing symptoms the option to go online or call for help with food delivery or other services, you can log them into a database, and, to the extent that the health department is able, do contact tracing, make referrals, and offer support. 
Q: treatment?


  


A:  Every viral disease needs to be taken seriously and COVID-19 is highly contagious. Therefore, it can spread rapidly and infect many people in a short period of time. But, the symptoms of COVID-19 are in most cases mild. Many patients will never even see a doctor. With about 4 % the average death rate for COVID-19 is also rather low compared to other diseases that we are facing in the region. For comparison: The death rate for measles outbreaks of for the Ebola Virus Disease can be above 60 % and up to 100 % of people infected with Rabies will die.

Another example: In China, with a total population of more than 1,4 billion people, only about 80,000 cases of COVID-19 were confirmed.
Q: jogging


  


A:  No. The majority of transmissions of this virus occur through droplet transmission. This means that a sick patient who coughs or sneezes is expelling virus through droplets that don’t travel that far. They tend to fall to surfaces within a six-foot distance. If you touch a surface that is contaminated with droplets from someone who has coughed or sneezed on it and then you touch your eyes or nose or mouth, you could potentially become infected that way. 
Q: pregnant women


  


A:  Testing protocols and eligibility vary depending on where you live.
Q: Should I stay inside the house


  


A:  It is very difficult to accurately predict the surge without having sufficient testing capacity for the population. Depending on the scale of testing in the area, cities may be able to look at test results to understand the likely surge in patients coming in the ensuing 2–4 weeks. Cities should also closely monitor the hospital system to assess the number of hospitalized and ICU patients, as well as the doubling time in the growth of the number of these patients. If there is a single call center for people with symptoms, the number of calls should be tracked as one measure to watch. There are other emerging sources of data, such as fevers reported by companies that make internet-enabled thermometers.
Q: Should I stay inside the house


  


A:  It is very difficult to accurately predict the surge without having sufficient testing capacity for the population. Depending on the scale of testing in the area, cities may be able to look at test results to understand the likely surge in patients coming in the ensuing 2–4 weeks. Cities should also closely monitor the hospital system to assess the number of hospitalized and ICU patients, as well as the doubling time in the growth of the number of these patients. If there is a single call center for people with symptoms, the number of calls should be tracked as one measure to watch. There are other emerging sources of data, such as fevers reported by companies that make internet-enabled thermometers.


In [None]:
jogging