In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
68,Should pregnant women be tested for COVID-19?,Testing protocols and eligibility vary dependi...
56,When to Seek Emergency Medical Attention\n,Look for emergency warning signs* for COVID-19...
67,What are the symptoms and complications that C...,People with COVID-19 have had a wide range of ...
27,"Am I at risk for COVID-19 from mail, packages,...",There is still a lot that is unknown about COV...
2,What is COVID-19?,The Coronavirus Disease is a flu-like respirat...
24,Can I get COVID-19 through second-hand clothe...,The virus that causes COVID-19 can survive on ...
74,"If I have recovered from COVID-19, will I be i...",CDC and partners are investigating to determin...
24,Are COVID toes related to blood clotting?,Yes. People with coronavirus have the ability ...
55,Should the general public cover their faces wi...,The CDC now recommends that people without sym...
12,Why are we seeing a rise in cases?\n,The number of cases of COVID-19 being reported...


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [16]:
def combine_cols(l1,l2):
    r = []
    for q, a in  zip(l1,l2):
        r.append(q+" "+a)
    return r

In [17]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))
qa_combined = combine_cols(q_new,a_new)


In [18]:
QA['q_new'] =  q_new
QA['a_new'] = a_new
QA['qa_combined'] = qa_combined

In [19]:
QA.head()

Unnamed: 0,q,a,q_new,a_new,qa_combined
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...,what is a novel coronavirus? a novel coronav...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a...",whi is the diseas be call coronavirus diseas ...
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...,whi might someon blame or avoid individu and ...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...,how can peopl help stop stigma relat to covid...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...,whi do some state covid-19 case number someti...


In [20]:
w2v_data = QA['qa_combined']

In [21]:
splitted = []
for row in w2v_data: 
    splitted.append([word for word in row.split()]) 

In [22]:
train_w2v = Word2Vec(splitted,min_count=1,size=100, workers=4)

In [23]:
avg_data = []
for row in splitted:
    vec = np.zeros(100)
    count = 0
    for word in row:
        try:
            vec += train_w2v[word]
            count += 1
        except:
            pass
    avg_data.append(vec/count)

  import sys


In [24]:
avg_data = np.array(avg_data)

In [25]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [26]:
def wv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    splitted_test = []
    for row in tq: 
        splitted_test.append([word for word in row.split()]) 
    avg_data_test = []
    for row in splitted_test:
        vec = np.zeros(100)
        count = 0
        for word in row:
            try:
                vec += train_w2v[word]
                count += 1
            except:
                pass
        avg_data_test.append(vec/count)
    return np.array(avg_data_test)

In [27]:
QA['q'].sample(20)

58                    Is there anything I should not do?
57                    Is there anything I should not do?
35     Why are there reagent shortages that are impac...
26     Is it safe to get care for my other medical co...
80     What does a well-controlled health condition m...
77     How are COVID-19 and influenza viruses different?
39     If you are tested and the test is negative, do...
40       Are false negatives a problem with COVID tests?
4      What symptoms do coronaviruses typically cause...
71     Do pregnant women with suspected or confirmed ...
99     Can I get COVID-19 from my pets or other anima...
19     What temperature kills the virus that causes C...
60     What cleaning products should I use to protect...
115    Who is involved in community mitigation action...
81     What should we do to protect employees at busi...
41     While school’s out, how can I keep my family h...
20     What can I do to protect myself andProtection ...
64     Does a person’s blood ty

In [None]:
while True:
    q = input("Q: ")
    q = wv_test(q)
    ranks = np.array([cosine_similarity(q.reshape(1,100),avg_data[i].reshape(1,100)) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: Should I stay inside the house


  


A:  Pregnant women should take the same precautions to avoid COVID-19 infection as other people.
