In [23]:
import os
import io
import json
import re
import pandas as pd
import ast

import ruamel.yaml
yaml = ruamel.yaml.YAML()

from dotenv import load_dotenv
load_dotenv();

In [24]:
# paths
#
# Every path below is resolved against PROJ_HOME, read from the environment
# (load_dotenv() in the imports cell runs before this lookup).

home = os.getenv("PROJ_HOME")
if home is None:
    # os.getenv returns None for an unset variable; without this check the
    # failure would only surface below as an opaque TypeError from os.path.join.
    raise RuntimeError(
        "PROJ_HOME is not set; define it in the environment or in the .env file "
        "so that project paths can be resolved."
    )

data_in_filepath = os.path.join(home, "data/chatbot_knowledgebase/subject_matter/encyclopedia_britannica.csv")
subject_list_filepath = os.path.join(home, "rasa/data/nlu/lookups/subjects.txt")

In [25]:
# load encyclopedia data
#
# header=0 marks the file's first row as a header, and `names` replaces that
# header with the labels below; index_col=0 then uses the first of them ("Id")
# as the DataFrame index.

encyclopedia_columns = ["Id", "Title", "Alt Title", "Field", "Is Person", "Text"]

data = pd.read_csv(
    data_in_filepath,
    header=0,
    names=encyclopedia_columns,
    index_col=0,
)

In [71]:
# write list of unique subjects to a txt file for lookup

titles = data["Title"]
# Each "Alt Title" cell is parsed with ast.literal_eval and then extended onto a
# list, so it is presumably the repr of a list of alias strings -- TODO confirm
# against the CSV.
alt_titles = data["Alt Title"].apply(ast.literal_eval)

subjects = []
for title, aliases in zip(titles, alt_titles):
    subjects.append(title)
    subjects.extend(aliases)

subjects = list(set(subjects))

paren_pattern = r"(\(.*?\))|(\(.*)"


def clean_subject(subject):
    """Reduce one raw title to a bare subject name.

    Steps, in order:
      1. keep only the text before the first ":";
      2. remove parenthesised text, including an unclosed "(" through to the
         end of the string;
      3. cut at the first "." that follows a segment longer than 4 characters,
         so that short dotted segments (initials, abbreviations) keep their
         dots while a trailing sentence is dropped;
      4. strip surrounding whitespace.
    """
    head = re.sub(paren_pattern, "", subject.split(":")[0])
    segments = head.split(".")
    last_index = len(segments) - 1
    kept = ""
    for index, segment in enumerate(segments):
        if len(segment) <= 4 and index < last_index:
            kept += segment + "."
        else:
            kept += segment
            break
    return kept.strip()


# De-duplicate AFTER cleaning: distinct raw titles can collapse to the same
# name once their qualifiers are removed. Sorting gives a stable order across
# kernel restarts (a bare set of strings has none), and names that clean down
# to the empty string are dropped so no blank entry reaches the lookup file.
clean_subjects = sorted({clean_subject(subject) for subject in subjects} - {""})

# Explicit UTF-8 so the output does not depend on the platform's default encoding.
with open(subject_list_filepath, "w", encoding="utf-8") as txt_file:
    txt_file.writelines(f"{item}\n" for item in clean_subjects)

In [72]:
# generate annotated ask_for_explanation intents from items in subject_list

no_annotations_filepath = os.path.join(home, "rasa/data/nlu/intents/ask_for_explanation/no_annotation.txt")

# Hand-written examples that carry no subject annotation, one per line.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding, and blank lines are skipped so they do not
# turn into empty training examples.
with open(no_annotations_filepath, "r", encoding="utf-8") as file:
    non_anno_examples = [line.strip("\n") for line in file if line.strip()]

In [73]:
templates_filepath = os.path.join(home, "rasa/data/nlu/intents/ask_for_explanation/templates.txt")

# One template per line. The file is read as UTF-8 explicitly so the result
# does not depend on the platform's default encoding. Blank lines are skipped:
# a blank template has no [...] placeholder for fill_template to find.
with open(templates_filepath, "r", encoding="utf-8") as file:
    templates = [line.strip("\n") for line in file if line.strip()]

In [74]:
# merge subject list and intent templates to form list of annotated ask_for_explanation intent examples

# Raw string: in a plain literal "\[" is an invalid escape sequence, which
# Python flags with a warning. The compiled pattern is unchanged -- it captures
# the first, shortest "[...]" span.
anno_pattern = r"(\[.*?\])"

num_templates = len(templates)

def fill_template(filler, n):
    """Return a template with its bracketed placeholder replaced by "[filler]".

    The template is chosen round-robin: index ``n % num_templates`` of the
    module-level ``templates`` list. The placeholder is the first span matched
    by ``anno_pattern``; every occurrence of that exact text in the template
    is replaced.

    Args:
        filler: text to put between the square brackets.
        n: running counter used to cycle through the templates.

    Returns:
        The filled-in template string.

    Raises:
        ValueError: if no templates are loaded, or the chosen template
            contains no "[...]" placeholder.
    """
    if not num_templates:
        raise ValueError("no templates loaded; cannot build an annotated example")
    template = templates[n % num_templates]
    match = re.search(anno_pattern, template)
    if match is None:
        # Without this check a placeholder-less template fails with an
        # AttributeError on None that does not say which template is at fault.
        raise ValueError(f"template has no [...] placeholder to fill: {template!r}")
    return template.replace(match.group(0), "[" + filler + "]")

# Start from a copy rather than an alias of non_anno_examples: appending to an
# alias would also grow the source list, so re-running this cell would stack a
# second copy of every generated example on top of the first.
intent_examples = list(non_anno_examples)
for i, subject in enumerate(clean_subjects):
    # The running index makes fill_template cycle through the templates.
    intent_examples.append(fill_template(subject, i))

In [75]:
# write ask_for_explanation intent examples to a txt file

intents_txt = os.path.join(home, "rasa/data/nlu/intents/ask_for_explanation/examples.txt")

# One example per line. Explicit UTF-8 so the output does not depend on the
# platform's default encoding.
with open(intents_txt, "w", encoding="utf-8") as txt_file:
    txt_file.writelines(f"{item}\n" for item in intent_examples)

In [76]:
# generate yaml file for intent ask_for_explanation

# A single nlu entry pairing the intent name with its examples list, wrapped
# in the top-level document together with the format version.
ask_for_explanation_entry = {"intent": "ask_for_explanation", "examples": intent_examples}
intent_dict = {"version": "3.1", "nlu": [ask_for_explanation_entry]}

def literalize_list(v):
    """Render a flat list as the text of a YAML literal block, one "- item" line each.

    The lines are written directly rather than through yaml.dump. The dumper
    treats every element as a YAML scalar, so it quotes elements containing
    YAML-significant text (for example ": " or " #") and folds elements longer
    than its line width onto continuation lines. Inside a literal block that
    markup is plain text, so a consumer reading the block line by line would
    see the quotes and the broken lines as part of the examples.

    Args:
        v: list whose elements are single-line values (typically strings).

    Returns:
        A ruamel LiteralScalarString holding "- <element>\\n" per element.
    """
    assert isinstance(v, list)
    block = "".join(f"- {item}\n" for item in v)
    return ruamel.yaml.scalarstring.LiteralScalarString(block)

def transform_value(d, key, transformation):
    """Apply `transformation` in place to every value stored under `key`.

    Walks nested dicts and lists starting at `d`. A dict value found under
    `key` is replaced by ``transformation(value)`` and is not searched any
    further; every other dict value and every list element is walked
    recursively. Anything that is neither a dict nor a list is left alone.
    Nothing is returned.
    """
    if isinstance(d, list):
        for member in d:
            transform_value(member, key, transformation)
        return
    if not isinstance(d, dict):
        return
    for name, current in d.items():
        if name != key:
            transform_value(current, key, transformation)
            continue
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over its items.
        d[name] = transformation(current)

# Replace every 'examples' list in intent_dict with its literal-block text so
# that it is dumped as a "|" block scalar.
transform_value(intent_dict, 'examples', literalize_list)

yaml_file = os.path.join(home, "rasa/data/nlu/intent_ask_for_explanation.yml")

# Explicit UTF-8: the dumper writes text to this handle, so the bytes on disk
# would otherwise follow the platform's default encoding.
with open(yaml_file, "w", encoding="utf-8") as file:
    yaml.dump(intent_dict, file)