# Create Your Own Chatbot

In this notebook, you generate a set of intents (each FAQ question plus automatically generated paraphrases) and package them into an importable Dialogflow agent.

You may optionally mount Google Drive so that one of its directories can be used as the notebook's file system.

In [None]:
# mount Google Drive at /content/drive (uses the google.colab helper module)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Use this block to declare any variables you will use throughout the notebook

In [None]:
# directory that acts as the working file system for this notebook
BASE_PATH = '/content/drive/MyDrive/VaccineFAQs-BigShot/questions'

# source HTML/XML document the Q/A pairs are extracted from
HTML_DOC_PATH = f'{BASE_PATH}/COVID-19-Vaccine-FAQs.html'

# plain two-column Q/A CSV that agent generation starts from
QA_CSV_SIMPLE_PATH = f'{BASE_PATH}/india-qa-simple.csv'

# intermediate CSV holding the dialogflow intents (question + paraphrases)
QA_CSV_FORMATTED_PATH = f'{BASE_PATH}/akuryla-qa-formatted.csv'

# Parsing an HTML File for Q/A Pairs (Example)

We'll use lxml to parse Q/A pairs out of our HTML site, and export our parsed data into a simple Q/A CSV

In [None]:
from lxml import html, etree

# utility method to print xml elements
def xml_print(elmt):
  """Print an lxml element as indented, human-readable markup.

  Args:
    elmt: the lxml element (or tree) to serialise.

  `encoding='unicode'` makes `etree.tostring` return a `str`. With the
  default encoding it returns `bytes`, which `print` renders as a single
  `b'...'` literal with escaped newlines, defeating `pretty_print`.
  """
  print(etree.tostring(elmt, pretty_print=True, encoding='unicode'))

# we fetch the content from the html document, stored in drive;
# the `with` block closes the file handle as soon as parsing is finished
with open(HTML_DOC_PATH, mode='r') as html_doc:
  doc = html.parse(html_doc, parser=html.html_parser)

# parse the html document into sets of Q/A pairs:
# every matched wrapper <div> is read as [first child <div>, second child <div>]
qa_pairs = []
elmt_list = doc.xpath('/html/body/section[4]/div[1]/div/div/div[2]/div[2]/div/div/div/span/div')

for elmt in elmt_list:
  elmt_children = elmt.xpath('div')

  # a wrapper without both a question and an answer <div> cannot form a pair
  if len(elmt_children) < 2:
    continue

  qa_pairs.append([elmt_children[0].text_content(), elmt_children[1].text_content()])

# utility function to print Q/A pairs
def print_pairs(pairs):
  """Print every (question, answer) pair in `pairs`.

  Args:
    pairs: iterable of two-item sequences `[question, answer]`.

  Iterates the `pairs` argument itself rather than the notebook-level
  `qa_pairs`, so the function prints whatever list it is given.
  """
  for q, a in pairs:
    print("Q:", q)
    print("A:", a, "\n")

# show the pairs parsed from the HTML document
print_pairs(qa_pairs)

In [None]:
import csv

# write Q/A pairs to csv
def write_pairs_to_csv(qa_pairs, csv_file_path):
  """Write Q/A pairs to a two-column CSV file with a header row.

  Args:
    qa_pairs: iterable of `[question, answer]` rows.
    csv_file_path: destination path; an existing file is overwritten.
  """
  # newline='' leaves line endings to the csv module, as its documentation
  # requires; otherwise platforms that translate '\n' emit '\r\r\n'
  with open(csv_file_path, 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile)

    # write header
    filewriter.writerow(['Question', 'Answer'])

    # write contents
    filewriter.writerows(qa_pairs)

# write_pairs_to_csv(qa_pairs, QA_CSV_SIMPLE_PATH)

# Generate Intents for Dialogflow Agent

- Parse Q/A pairs from our simple CSV file 
- Generate paraphrases for each question
- Output a list of dialogflow intents to our formatted CSV file
- You may manually edit the formatted CSV file after this step to ensure all intents are up to standard

In [None]:
import csv

# read entries from csv
def read_entries_from_csv(csv_file_path):
  """Read the simple Q/A CSV into a list of FAQ dicts.

  Args:
    csv_file_path: path to a CSV whose first row is a header and whose
      remaining rows are `question, answer`.

  Returns:
    A list of `{"question": ..., "answer": ...}` dicts, one per data row,
    with every '/' removed from both fields.
  """
  # newline='' is what the csv module expects, so that newlines embedded in
  # quoted fields are parsed correctly
  with open(csv_file_path, 'r', newline='') as csvfile:
    filereader = csv.reader(csvfile)

    # drop the header row up front instead of converting it and slicing it off
    next(filereader, None)

    # blank lines come back as [] from csv.reader; skip anything that does
    # not carry both a question and an answer column
    return [
        {"question": row[0].replace('/', ''), "answer": row[1].replace('/', '')}
        for row in filereader
        if len(row) >= 2
    ]

# load the simple Q/A CSV; the bare `faqs` shows the parsed entries as the cell output
faqs = read_entries_from_csv(QA_CSV_SIMPLE_PATH)
faqs

In [None]:
!pip install transformers==2.8.0

In [None]:
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer

def set_seed(seed):
  """Seed torch's CPU generator and, when CUDA is present, every GPU generator."""
  torch.manual_seed(seed)

  # nothing more to seed on a CPU-only machine
  if not torch.cuda.is_available():
    return

  torch.cuda.manual_seed_all(seed)

# fix the seed first so the sampled paraphrases are reproducible
set_seed(42)

# pretrained text-to-text checkpoint shared by the model and its tokenizer
PARAPHRASER_CHECKPOINT = 'ramsrigouthamg/t5_paraphraser'
model = T5ForConditionalGeneration.from_pretrained(PARAPHRASER_CHECKPOINT)
tokenizer = T5Tokenizer.from_pretrained(PARAPHRASER_CHECKPOINT)

# global device: the GPU (cuda) when one is available, the CPU otherwise
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print("device ", device)

# place the model on the selected device
model = model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1208.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691413.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…


device  cuda


In [None]:
def paraphrase_sentence(sentence, max_length=256, num_return_sequences=10):
  """Generate paraphrases of `sentence` with the notebook-level T5 model.

  Args:
    sentence: the text to paraphrase.
    max_length: maximum length passed to `model.generate`.
    num_return_sequences: how many candidates to sample.

  Returns:
    The decoded candidates in generation order, without the input sentence
    itself and without repeats. Both checks ignore letter case, so two
    candidates that differ only in capitalisation count as one paraphrase.

  Relies on the notebook-level `model`, `tokenizer` and `device`.
  """
  # generate input encoding ("paraphrase: " prefix plus explicit end-of-sequence marker)
  text = "paraphrase: " + sentence + " </s>"
  encoding = tokenizer.encode_plus(text, return_tensors="pt")
  input_ids = encoding["input_ids"].to(device)
  attention_masks = encoding["attention_mask"].to(device)

  # specify parameters for paraphrasing model and generate tokenized paraphrases;
  # do_sample=True means these are sampled sequences (top-k / top-p), not beams
  sampled_outputs = model.generate(
      input_ids=input_ids, attention_mask=attention_masks,
      do_sample=True,
      max_length=max_length,
      top_k=120,
      top_p=0.98,
      early_stopping=True,
      num_return_sequences=num_return_sequences
  )

  # decode the candidates and drop repeats; seeding `seen` with the input
  # sentence also rejects candidates that merely echo it back
  final_outputs = []
  seen = {sentence.lower()}
  for sampled_output in sampled_outputs:
    sent = tokenizer.decode(sampled_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    key = sent.lower()
    if key not in seen:
      seen.add(key)
      final_outputs.append(sent)

  return final_outputs

In [None]:
# generate all paraphrases
# (each faq dict is updated in place with a new 'paraphrases' list)
for entry in faqs:
  entry['paraphrases'] = paraphrase_sentence(entry['question'])

In [None]:
# write faqs in desired format
def write_formatted_faq_to_csv(faqs, csv_file_path):
  """Write FAQ entries to the formatted intent CSV.

  Args:
    faqs: list of dicts with 'question', 'answer' and (optionally)
      'paraphrases' keys.
    csv_file_path: destination path; an existing file is overwritten.

  Layout: a header row, then for each entry one row carrying the intent
  name, the original question and the response, followed by one row per
  paraphrase that carries only the intent id and the paraphrase text.
  Intent ids are 1-based positions in `faqs`.
  """
  # newline='' leaves line endings to the csv module, as its documentation requires
  with open(csv_file_path, 'w', newline='') as csvfile:
    filewriter = csv.writer(csvfile)

    # write header
    filewriter.writerow(['IntentID', 'IntentName', 'Query', 'Response'])

    # write contents
    for position, entry in enumerate(faqs, start=1):
      intent_id = str(position)
      # the slash-free question serves as both the intent name and the first query
      question = entry['question'].replace('/', '')
      filewriter.writerow([intent_id, question, question, entry['answer']])

      # an entry that has no paraphrases yet contributes just its main row
      for paraphrase in entry.get('paraphrases', []):
        filewriter.writerow([intent_id, '', paraphrase, ''])

# persist the questions and their paraphrases to the formatted CSV
write_formatted_faq_to_csv(faqs, QA_CSV_FORMATTED_PATH)

# Generate Dialogflow Agent

- Specify output path
- Read intents from formatted CSV
- Specify keywords to filter out unnecessary or incorrectly paraphrased questions
- Add default intents and format output
- Generate an importable Dialogflow agent

In [None]:
# name of the folder (inside BASE_PATH) that receives the generated agent
OUTPUT_FOLDER_NAME = 'akuryla_usa_output'

# agent root folder and its 'intents' sub-folder
output_path = f'{BASE_PATH}/{OUTPUT_FOLDER_NAME}'
intent_path = f'{output_path}/intents'

In [None]:
import csv

def read_formatted_csv_to_faq(csv_file_path):
  """Rebuild the FAQ list from the formatted intent CSV.

  Args:
    csv_file_path: path to a CSV with the columns
      IntentID, IntentName, Query, Response (header in the first row).

  Returns:
    A list of `{'question', 'answer', 'paraphrases'}` dicts, one per run of
    consecutive rows that share an IntentID. The first row of a run supplies
    the question (IntentName column) and the answer (Response column); the
    Query column of every further row in the run becomes a paraphrase.
  """
  # newline='' is what the csv module expects when reading
  with open(csv_file_path, 'r', newline='') as csvfile:
    filereader = csv.reader(csvfile)

    # skip headers
    next(filereader, None)

    faqs = []
    curr_entry = None
    curr_intent_id = None

    for row in filereader:
      # blank lines (for example left behind by manual editing) carry no data
      if not row:
        continue

      # check whether it's a new question
      if curr_entry is None or curr_intent_id != row[0]:
        curr_intent_id = row[0]
        curr_entry = {'question': row[1], 'answer': row[3], 'paraphrases': []}
        # register the entry as soon as it is created, so the final intent
        # in the file is kept as well
        faqs.append(curr_entry)
      else:
        # add paraphrase to curr_entry
        curr_entry['paraphrases'].append(row[2])

    return faqs

# reload the (possibly hand-edited) formatted CSV; the bare `faqs` shows it as the cell output
faqs = read_formatted_csv_to_faq(QA_CSV_FORMATTED_PATH)
faqs

In [None]:
def add_default_intents(faqs):
  """Append the welcome, fallback and end-session entries to `faqs` in place."""
  greeting_phrases = ["Hi",
                      "Hello",
                      "Hi there",
                      "Hey there",
                      "Heya",
                      "Howdy",
                      "How are you?",
                      "Just going to say hi"]
  farewell_phrases = ["OK",
                      "Thank you",
                      "That's enough",
                      "Good bye",
                      "Bye",
                      "See you",
                      "Stop",
                      "No"]

  # (question, answer, paraphrases) for each built-in entry, in append order
  built_in_entries = (
      ("Default Welcome Intent",
       "Greetings! I am Vaccine chatbot. You can ask me questions about COVID-19 vaccines such as vaccine safety, side effects, immunity and allergies.",
       greeting_phrases),
      ("Default Fallback Intent",
       "I'm sorry, I don't think I can answer that question. Please try again.",
       []),
      ("End Session",
       "",
       farewell_phrases),
  )

  for name, reply, phrases in built_in_entries:
    faqs.append({"question": name, "answer": reply, "paraphrases": phrases})


# paraphrases can be dropped by keyword
def should_filter(phrase, keywords):
  """Return True when any keyword occurs in `phrase` as a substring, ignoring case."""
  lowered_phrase = phrase.lower()
  for keyword in keywords:
    if keyword.lower() in lowered_phrase:
      return True
  return False
  
def filter_keywords(faqs, keywords):
  """Replace each entry's 'paraphrases' with only those `should_filter` does not reject."""
  for faq in faqs:
    kept = []
    for candidate in faq['paraphrases']:
      if should_filter(candidate, keywords):
        continue
      kept.append(candidate)
    faq['paraphrases'] = kept

# append the built-in intents to the list read from the formatted CSV
add_default_intents(faqs)

# any paraphrase containing one of these terms (case-insensitive) is dropped
keywords = [
    "HIV",
    "AIDS",
    "cholera",
    "cattle",
    "cow",
    "colibid",
    "covirid",
    "covarid",
    "SVDC-19",
]

filter_keywords(faqs, keywords)
faqs

In [None]:
import os

def make_folder_if_absent(output_path, intent_path):
  """Create the agent output folder and the intents folder if they are missing.

  Args:
    output_path: root folder of the generated agent.
    intent_path: folder that will hold the per-intent JSON files.

  Raises:
    FileExistsError: if either path exists but is not a directory.
  """
  # exist_ok=True makes creation a single idempotent call, with no gap
  # between an existence check and the mkdir
  for folder in (output_path, intent_path):
    os.makedirs(folder, exist_ok=True)

# make sure the agent folder and its intents sub-folder exist before writing
make_folder_if_absent(output_path, intent_path)

In [None]:
import uuid

# every original or paraphrased question becomes one question entity element
def make_question_entity_element(question):
  """Build the dict for one question text, tagged with a fresh uuid1 id."""
  text_part = {
    "text": question,
    "userDefined": False
  }

  element = {"id": str(uuid.uuid1())}
  element["data"] = [text_part]
  element["isTemplate"] = False
  element["count"] = 0
  element["lang"] = "en"
  element["updated"] = 0
  return element

# a question entity is the list of elements for a question and its paraphrases
def make_question_entity(question, paraphrases):
  """Return one element for `question` followed by one per paraphrase, in order."""
  all_phrasings = [question]
  all_phrasings.extend(paraphrases)
  return [make_question_entity_element(phrasing) for phrasing in all_phrasings]

# an answer entity is a single object describing the intent and its response
def make_answer_entity(intent_name, answer):
  """Build the intent dict named `intent_name` whose only text reply is `answer`."""
  # the one text message that carries the answer
  message = {
    "type": "0",
    "title": "",
    "textToSpeech": "",
    "lang": "en",
    "speech": [answer],
    "condition": ""
  }

  # the single response wrapping that message
  response = {
    "resetContexts": False,
    "action": "",
    "affectedContexts": [],
    "parameters": [],
    "messages": [message],
    "speech": []
  }

  # NOTE(review): "fallbackIntent" is False for every intent built here,
  # including the one named "Default Fallback Intent" - confirm this is intended
  answer_entity = {
    "id": str(uuid.uuid1()),
    "name": intent_name,
    "auto": True,
    "contexts": [],
    "responses": [response],
    "priority": 500000,
    "webhookUsed": False,
    "webhookForSlotFilling": False,
    "fallbackIntent": False,
    "events": [],
    "conditionalResponses": [],
    "condition": "",
    "conditionalFollowupEvents": []
  }
  return answer_entity

In [None]:
def format_answer(question, answer):
  """Return the response text for an intent.

  Args:
    question: the intent's question (or the name of a built-in intent).
    answer: the raw answer text.

  Returns:
    `answer` unchanged for the built-in intents, otherwise the answer
    prefixed with the question as "Q: <question>\\nA: <answer>".
  """
  # only format non-default answers; "End Session" is also a built-in intent
  # (with an empty answer) and must not be rendered as "Q: End Session\nA: "
  default_intent_names = ("Default Welcome Intent", "Default Fallback Intent", "End Session")
  if question.startswith(default_intent_names):
    return answer

  return "Q: " + question + "\nA: " + answer

# intent names are capped at this many characters
MAX_INTENT_NAME_LEN = 56

# attach the intent name and the question/answer entities to every entry.
# the name is also used as the intent's file name when the agent is written,
# so two questions sharing their first characters must not get the same
# truncated name - the later intent's files would overwrite the earlier one's
used_intent_names = set()

for idx, entry in enumerate(faqs):
  intent_name = f"VaccineFAQ.{entry['question']}"[:MAX_INTENT_NAME_LEN]

  # on a clash, replace the tail of the name with the entry's position so the
  # result is distinct and still within the length cap
  if intent_name in used_intent_names:
    suffix = f".{idx}"
    intent_name = intent_name[:MAX_INTENT_NAME_LEN - len(suffix)] + suffix
  used_intent_names.add(intent_name)

  entry['intent_name'] = intent_name
  entry['question_entity'] = make_question_entity(entry['question'], entry['paraphrases'])
  entry['answer_entity'] = make_answer_entity(entry['intent_name'], format_answer(entry['question'], entry['answer']))

In [None]:
import json

# agent-level settings; this dict is serialised as-is to agent.json in the output folder
# NOTE(review): "secondaryKey" below is a hardcoded key-like value - confirm it
# is not a live credential before sharing the notebook or the generated agent
agent={
  "description": "",
  "language": "en",
  "shortDescription": "",
  "examples": "",
  "linkToDocs": "",
  "displayName": "Vaccine-Bot-FAQ",
  "disableInteractionLogs": False,
  "disableStackdriverLogs": True,
  "defaultTimezone": "America/New_York",
  "isPrivate": False,
  "mlMinConfidence": 0.3,
  "supportedLanguages": ["en"],
  "enableOnePlatformApi": True,
  "onePlatformApiVersion": "v2beta1",
  "secondaryKey": "9d74e6a3640d4ce3807cf42e2fdcea79",
  "analyzeQueryTextSentiment": False,
  "enabledKnowledgeBaseNames": [],
  "knowledgeServiceConfidenceAdjustment": 0.0,
  "dialogBuilderMode": False,
  "baseActionPackagesUrl": "",
  "enableSpellCorrection": False
}

# package metadata; this dict is serialised as-is to package.json in the output folder
package={
  "version": "1.0.0"
}

def write_to_output_path(faqs):
  """Write agent.json, package.json and two JSON files per intent under `output_path`.

  Uses the notebook-level `output_path`, `agent` and `package`. Each entry in
  `faqs` must already carry 'intent_name', 'question_entity' and 'answer_entity'.
  """
  def dump_json(payload, file_path):
    # serialise one object to one file, replacing any existing content
    with open(file_path, 'w') as outfile:
      json.dump(payload, outfile)

  # agent-level files
  dump_json(agent, output_path + "/agent.json")
  dump_json(package, output_path + "/package.json")

  # per intent: the user-says phrases first, then the intent definition
  intents_prefix = output_path + "/intents/"
  for entry in faqs:
    intent_file_stem = intents_prefix + entry["intent_name"]
    dump_json(entry['question_entity'], intent_file_stem + "_usersays_en.json")
    dump_json(entry['answer_entity'], intent_file_stem + ".json")

# write the agent, package and per-intent JSON files into the output folder
write_to_output_path(faqs)