In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import stop_words
import os
from nltk.corpus import stopwords 
import nltk
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer          #For Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
#nltk.download('stopwords')

In [3]:
stop = set(['?',',',':','@']) 

In [4]:
ROOT = '../data/'

QA_dict = {}

In [5]:
os.listdir(ROOT)

['CDC.xlsx',
 'Coronavirus (COVID-19) frequently asked questions _ CDC.pdf',
 'covidquestionsca.xlsx',
 'EAC.xlsx',
 'JHU.xlsx',
 'MOHKE.xlsx',
 'sources.txt',
 'WHO.xlsx']

In [6]:
paths = [ROOT+d for d in os.listdir(ROOT) if d.endswith(".xlsx")]

In [7]:
paths

['../data/CDC.xlsx',
 '../data/covidquestionsca.xlsx',
 '../data/EAC.xlsx',
 '../data/JHU.xlsx',
 '../data/MOHKE.xlsx',
 '../data/WHO.xlsx']

In [8]:
data = [ pd.read_excel(p, names = ['q','a']) for p in paths]

In [9]:
QA = pd.DataFrame()

In [10]:
for d in data[:]:
    if d.shape[0]>0:
        print(d.shape)
        QA = pd.concat((QA,d),axis = 0)

(116, 2)
(10, 2)
(26, 2)
(86, 2)
(85, 2)


In [11]:
QA.shape

(323, 2)

In [12]:
QA.sample(10)

Unnamed: 0,q,a
9,Can the virus that causes COVID-19 be transmit...,Studies to date suggest that the virus that ca...
50,What if my child or someone else in the home i...,If your child with special healthcare needs be...
33,"Is there a vaccine, drug or treatment for COVI...",Those affected should receive care to relieve ...
23,How safe are goods from China?,Scientific experiments showed that the virus t...
8,How does COVID-19 spread?,People can also catch COVID-19 if they breathe...
75,How are COVID-19 and influenza viruses similar?,"Both viruses are transmitted by contact, dropl..."
13,Can someone who has had COVID-19 spread the il...,The virus that causes COVID-19 is spreading fr...
81,Are smokers and tobacco users at higher risk o...,Smoking products such as water pipes often inv...
80,"For businesses that remain open, what are best...",CDC has released guidelines for infection cont...
57,"How can I prepare in case my child’s school, c...",Talk to the school or facility about their eme...


In [13]:
 snow = nltk.stem.SnowballStemmer('english')

In [14]:
def clean(q_a):
    temp = []
    snow = nltk.stem.SnowballStemmer('english')
    for sentence in q_a:
        sentence = str(sentence)
        sentence = sentence.lower()                 # Converting to lowercase
        cleanr = re.compile('<.*?>')
        sentence = re.sub(cleanr, ' ', sentence)        #Removing HTML tags
        sentence = re.sub(r'[?|!|\'|"|#]\d',r'',sentence)
        sentence = re.sub(r'[.|,|)|(|\|/]@',r' ',sentence)        #Removing Punctuations
        words = [snow.stem(word) for word in sentence.split() if word not in stop]   # Stemming and removing stopwords
        temp.append(words)
    q_a = temp  
    return q_a

In [15]:
def clean_col(c):
    sent = []
    for row in c:
        sequ = ''
        for word in row:
            sequ = sequ + ' ' + word
        sent.append(sequ)
    c = sent
    return c

In [16]:
q_new =  clean_col(clean(QA['q']))
a_new =  clean_col(clean(QA['a']))

In [17]:
QA['q_new'] =  q_new
QA['a_new'] = a_new

In [18]:
QA.head()

Unnamed: 0,q,a,q_new,a_new
0,What is a novel coronavirus?,A novel coronavirus is a new coronavirus that ...,what is a novel coronavirus?,a novel coronavirus is a new coronavirus that...
1,Why is the disease being called coronavirus di...,"On February 11, 2020 the World Health Organiza...",whi is the diseas be call coronavirus diseas ...,"on februari 11, 2020 the world health organ a..."
2,Why might someone blame or avoid individuals a...,People in the U.S. may be worried or anxious a...,whi might someon blame or avoid individu and ...,peopl in the u.s. may be worri or anxious abo...
3,How can people help stop stigma related to COV...,People can ght stigma by providing social supp...,how can peopl help stop stigma relat to covid...,peopl can ght stigma by provid social support...
4,Why do some state’s COVID-19 case numbers some...,CDC’s overall case numbers are validated throu...,whi do some state covid-19 case number someti...,cdc overal case number are valid through a co...


In [20]:
count_vect = TfidfVectorizer(ngram_range=(1,2))
bow_data = count_vect.fit_transform(QA['q_new'])

In [21]:
tq = ["cornonavirus"]
tq = clean(tq)
tq = clean_col(tq)

In [22]:
r = count_vect.transform(tq)

In [23]:
for i,(q , a) in enumerate(zip(QA['q'],QA['a'])):
    QA_dict[i] = [q,a]                   

In [24]:
def cv_test(q):
    tq = [q]
    tq = clean(tq)
    tq = clean_col(tq)
    r = count_vect.transform(tq)
    return r

In [25]:
QA['q'].sample(20)

106         Can I take my dog to daycare or a groomer?\n
3                     What are the symptoms of COVID-19?
27          Who is at risk of developing severe illness?
2                      What is different about the virus
34     Is there a vaccine, drug or treatment for COVI...
100       Do I need to get my pet tested for COVID-19?\n
48     Can humans be infected with the COVID-19 from ...
95     How long do companies need to close for disinf...
20     Are Africans less at risk of contracting the C...
15     What can I do to protect myself and prevent th...
24                    How likely am I to catch COVID-19?
46     Is my child with an underlying medical conditi...
46     Can humans be infected with the COVID-19 from ...
9      Can the virus that causes COVID-19 be transmit...
63     Will schools be dismissed if there is an outbr...
3      How can people help stop stigma related to COV...
38     What is the difference between molecular tests...
56     Is there official guidan

In [26]:
QA.shape[0] == bow_data.shape[0]

True

In [27]:
bow_data[0].shape

(1, 2729)

In [28]:
r.shape

(1, 2729)

In [None]:
while True:
    q = input("Q: ")
    q = cv_test(q)
    ranks = np.array([cosine_similarity(q,bow_data[i]) for i in range(QA.shape[0])]).reshape(QA.shape[0],1)
    loc = np.argmax(ranks)  
    a = QA_dict[loc][1]
    print("A: ",a)

Q: can animals spread virus
A:  Although we know certain bacteria and fungi can be carried on fur and hair, there is no evidence that viruses,
including the virus that causes COVID-19, can spread to people from the skin, fur, or hair of pets.
However, because animals can sometimes carry other germs that can make people sick, it’s always a good idea to
practice healthy habits around pets and other animals, including washing hands before and after interacting with
them.

Q: what coronavirus
A:  Coronaviruses are a large family of viruses which occur naturally in animals and may cause illness in animals and humans. In humans, coronaviruses cause respiratory infections ranging from common colds with mild symptoms to more severe diseases such as the Middle East Respiratory Syndrome (MERS) and the Severe Acute Respiratory Syndrome (SARS). The most recently discovered virus in this family causes the Coronavirus Disease, COVID-19.
Q: what is covid-19
A:  The Coronavirus Disease is a flu-like r