In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
54,There’s a holding-your-breath test sweeping th...,There’s no way to tell if you have COVID-19 wi...
108,Why are animals being tested when many people ...,Animals are only being tested in very rare cir...
58,If someone goes for a jog through a neighborho...,No. The majority of transmissions of this viru...
100,Do I need to get my pet tested for COVID-19?\n,"No. At this time, routine testing of animals f..."
10,Can CoVID-19 be caught from a person who has n...,The main way the disease spreads is through re...
14,Are there different “species” of coronavirus l...,Among the thousands of samples of the long str...
9,Should I wear a facemask to protect myself aga...,Do not need to wear a facemask unless you are ...
44,"While school’s out, how can I keep my family h...",Help your child stay socially connected.\nReac...
97,"Should outdoor playgrounds, like those at scho...",Outdoor areas generally require normal routine...
18,What is community spread?\n,Community spread means people have been infect...


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [16]:
def combine_cols(l1,l2):
    r = []
    for q, a in  zip(l1,l2):
        r.append(q+" "+a)
    return r

In [17]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))
qa_combined = combine_cols(q_new,a_new)


In [19]:
QA['q_new'] =  q_new
QA['a_new'] = a_new
QA['qa_combined'] = qa_combined

In [20]:
QA.head()

Unnamed: 0,q,a,q_new,a_new,qa_combined
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...,what is a novel coronavirus? a novel coronav...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a...",whi is the diseas be call coronavirus diseas ...
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...,whi might someon blame or avoid individu and ...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...,how can peopl help stop stigma relat to covid...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...,whi do some state covid-19 case number someti...


In [22]:
w2v_data = QA['qa_combined']

In [23]:
splitted = []
for row in w2v_data: 
    splitted.append([word for word in row.split()]) 

In [24]:
train_w2v = Word2Vec(splitted,min_count=1,size=100, workers=4)

In [25]:
avg_data = []
for row in splitted:
    vec = np.zeros(100)
    count = 0
    for word in row:
        try:
            vec += train_w2v[word]
            count += 1
        except:
            pass
    avg_data.append(vec/count)

  import sys


In [26]:
avg_data = np.array(avg_data)

In [27]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [28]:
def wv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    splitted_test = []
    for row in tq: 
        splitted_test.append([word for word in row.split()]) 
    avg_data_test = []
    for row in splitted_test:
        vec = np.zeros(100)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        avg_data_test.append(vec/count)
    return np.array(avg_data_test)

In [29]:
QA['q'].sample(20)

9      Can the virus that causes COVID-19 be transmit...
22     What is known about how this virus responds to...
39     While school’s out, will kids have access to m...
10     How are coronaviruses transmitted between people?
104                   Should I worry about my pet cat?\n
45       How can I protect myself against coronaviruses?
5                     What are the symptoms of COVID-19?
82     How large does a meeting or event need to be i...
80     Are smokers and tobacco users at higher risk o...
23     There have been more cases in the northern hem...
81     Are smokers and tobacco users at higher risk o...
20     Are Africans less at risk of contracting the C...
1                        Why is it called a coronavirus?
5      How do CDC’s COVID-19 case numbers compare wit...
6                                         Johns Hopkins?
22     Are patients who recovered from COVID-19 immun...
75     Who is at higher risk for serious illness from...
40       Are false negatives a 

In [None]:
while True:
    q = input("Q: ")
    q = wv_test(q)
    ranks = np.array([cosine_similarity(q.reshape(1,100),avg_data[i].reshape(1,100)) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: does it kill people


  


A:  It is not certain how long the virus that causes COVID-19 survives on surfaces, but it seems to behave like other coronaviruses.
Q: what is coronavirus


  


A:  The Coronavirus Disease is a flu-like respiratory disease that is highly contagious.
Q: does it kill people


  


A:  It is not certain how long the virus that causes COVID-19 survives on surfaces, but it seems to behave like other coronaviruses.
Q: keep my family safe


  


A:  Watch your child for any signs of illness. If you see any sign of illness consistent with symptoms of COVID-19, particularly fever, cough, or shortness of breath, call your healthcare provider and keep your child at home and away from others as much as possible.Follow CDC’s guidance on “What to do if you are sick.”

Q: signs


  


A:  Smoking
Q: symptoms


  


A:  The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Some patients may have aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin gradually.
