In [1]:
import requests
import operator
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import pickle
from chatterbot import ChatBot
from chatterbot.trainers import ChatterBotCorpusTrainer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

In [2]:
def get_FDA_QA(url='https://www.fda.gov/emergency-preparedness-and-response/coronavirus-disease-2019-covid-19/covid-19-frequently-asked-questions', timeout=30):
    """Scrape the FDA COVID-19 FAQ page into a question/answer table.

    Parameters
    ----------
    url : str, optional
        Page to download. Defaults to the FDA COVID-19 FAQ page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Two string columns, ``Questions`` and ``Answers``, one row per
        accordion panel that contains both a ``Q:`` and an ``A:`` marker.
        The frame is empty (but still has both columns) when nothing matches.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): the class string starts with a space; confirm it still
    # matches with the installed BeautifulSoup version and current page markup.
    panels = soup.find_all(class_=' panel panel-default fda-accordion-panel')

    questions = []
    answers = []
    for panel in panels:
        # Serialise the tags to HTML text first: Tag objects are iterable, so
        # handing them to pandas directly makes the resulting frame layout
        # depend on how pandas chooses to expand them.
        question_parts = str(panel.find('a')).split('Q:')
        answer_parts = str(panel.find('p')).split('A:')
        if len(question_parts) < 2 or len(answer_parts) < 2:
            # Missing tag or missing marker: skip the panel rather than
            # aborting the whole scrape with an IndexError.
            continue
        # Question text runs from "Q:" up to (not including) the first "?".
        questions.append(question_parts[1].split('?')[0].strip())
        # Answer text runs from "A:" up to the closing </p tag; any inline
        # markup inside the paragraph is kept as-is.
        answers.append(answer_parts[1].split('</p')[0])

    return pd.DataFrame({'Questions': questions, 'Answers': answers})

In [3]:
def get_CDC_QA(url='https://www.cdc.gov/coronavirus/2019-ncov/faq.html', timeout=30):
    """Scrape the CDC COVID-19 FAQ page into a question/answer table.

    Parameters
    ----------
    url : str, optional
        Page to download. Defaults to the CDC COVID-19 FAQ page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Two string columns, ``Questions`` and ``Answers``. Cards without the
        expected markup, or whose first paragraph is empty, are left out.
        Rows are numbered consecutively from 0.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): the class string ends with a space; confirm it still
    # matches with the installed BeautifulSoup version and current page markup.
    cards = soup.find_all(class_='card card-accordion ')

    questions = []
    answers = []
    for card in cards:
        # Serialise the tags to HTML text first: Tag objects are iterable, so
        # handing them to pandas directly makes the resulting frame layout
        # depend on how pandas chooses to expand them.
        question_parts = str(card.find('div')).split('role="heading">')
        answer_parts = str(card.find('p')).split('<p>')
        if len(question_parts) < 2 or len(answer_parts) < 2:
            # Missing tag or unexpected markup: skip the card rather than
            # aborting the whole scrape with an IndexError.
            continue
        answer = answer_parts[1].split('</p>')[0]
        if answer == '':
            # Empty first paragraph carries no usable answer; drop the pair.
            continue
        # Question text runs from the heading marker up to the "?</span>" that
        # closes it, so the trailing question mark is not included.
        questions.append(question_parts[1].split('?</span>')[0])
        answers.append(answer)

    return pd.DataFrame({'Questions': questions, 'Answers': answers})

In [5]:
#get all FAQ source
# Scrape the two government FAQ pages, then load the locally kept spreadsheet.
FDA=get_FDA_QA()
CDC=get_CDC_QA()
# NOTE(review): other_source is presumably a sheet with 'Questions' and
# 'Answers' columns like the scraped frames -- confirm against the file.
other_source=pd.read_excel(r'/Users/faye/Desktop/Chatbot/Capstone/other_source.xlsx')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and, like append, keeps each frame's original row labels.
final_df=pd.concat([FDA, CDC, other_source], sort=False)

In [6]:
# Renumber the stacked rows 0..n-1 so row labels match row positions.
final_df = final_df.reset_index(drop=True)
# Lower-cased copy of the question text, used as the bank to match against.
data = final_df['Questions'].map(lambda text: text.lower())

In [14]:
# Save the combined question/answer table to disk. The row index is written
# as the first CSV column because index=False is not passed.
final_df.to_csv(r'/Users/faye/Desktop/Chatbot/Capstone/question.csv')

In [7]:
#when user enter new questions:
def ask_question(string,question_bank):
    """Build TF-IDF vectors for the question bank plus one new user question.

    Parameters
    ----------
    string : str
        The user's question; it is lower-cased before vectorising.
    question_bank : pandas.Series or iterable of str
        Known question texts.

    Returns
    -------
    numpy.matrix
        Dense TF-IDF matrix with one row per bank question, in bank order,
        followed by a final row for the user's question.
    """
    user_question = pd.Series([string.lower()])
    # Series.append was removed in pandas 2.0; concat with ignore_index=True
    # gives the same result: the bank followed by the new question, renumbered.
    corpus = pd.concat([pd.Series(question_bank), user_question], ignore_index=True)
    # The vectoriser is fitted on bank + question together so that all rows
    # share one vocabulary and one set of IDF weights.
    vectorizer = TfidfVectorizer(use_idf=True)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix.todense()

In [8]:
#cos similarity
def search_best_answer(string,question_bank):
    """Return the position of the bank question most similar to ``string``.

    Similarity is the cosine between the TF-IDF vector of the user's question
    and that of each bank question, as produced by ``ask_question``.

    Parameters
    ----------
    string : str
        The user's question.
    question_bank : pandas.Series
        Known question texts.

    Returns
    -------
    int
        Zero-based position in ``question_bank`` of the best match. On ties
        the earliest position wins. If the user's question shares no
        vocabulary with the bank, every score is 0 and position 0 is returned.

    Raises
    ------
    ValueError
        If ``question_bank`` is empty.
    """
    # ask_question returns a numpy.matrix whose rows stay 2-D when indexed;
    # convert to a plain ndarray so rows are ordinary 1-D vectors.
    vectors = np.asarray(ask_question(string, question_bank), dtype=float)
    if vectors.shape[0] < 2:
        raise ValueError("question_bank must contain at least one question")

    bank_vectors = vectors[:-1]
    user_vector = vectors[-1]  # the user's question is always the last row

    dot_products = bank_vectors @ user_vector
    norm_products = np.linalg.norm(bank_vectors, axis=1) * np.linalg.norm(user_vector)
    # A zero-length vector has no direction; score it 0 instead of producing
    # NaN, which would make the choice of maximum unreliable.
    cos_score = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )
    return int(np.argmax(cos_score))

In [9]:
def return_answer(string,question_bank):
    """Print the stored answer whose question best matches ``string``.

    Parameters
    ----------
    string : str
        The user's question.
    question_bank : pandas.Series
        Known question texts; assumed to be row-aligned with the global
        ``final_df`` (row i of the bank belongs to row i of ``final_df``).

    Returns
    -------
    None
        The answer is printed, not returned.
    """
    index=search_best_answer(string,question_bank)
    # search_best_answer yields a row position, so look it up positionally;
    # a label lookup would pick the wrong row(s) if final_df's index is not
    # exactly 0..n-1.
    print(final_df['Answers'].iloc[index])

In [11]:
# Interactive demo: read one question from the user and print the answer of
# the closest-matching question in the lower-cased bank `data`.
question = input("What's your question: ")
return_answer(question,data)

What's your question: what's coronavirous
COVID-19 is the infectious disease caused by the most recently discovered coronavirus. This new virus and disease were unknown before the outbreak began in Wuhan, China, in December 2019.
