In [122]:
import os
import io
import json
import re
import random

import ruamel.yaml
# Shared YAML handler: the yaml.dump calls further down all go through this
# single instance.
yaml = ruamel.yaml.YAML()

from processing import processing

# Reload the project-local processing module so that edits made to it since
# the kernel first imported it are picked up when this cell is re-run.
from importlib import reload
reload(processing)

In [123]:
# define paths

# Every path below is resolved against the PROJ_HOME environment variable.
# Fail fast with a readable message when it is missing: otherwise the first
# os.path.join(None, ...) call dies with an opaque TypeError.
home = os.getenv("PROJ_HOME")
if not home:
    raise RuntimeError(
        "PROJ_HOME is not set; point it at the project root before running this notebook"
    )

# input knowledge-base files (JSON)
articles_filepath = os.path.join(home, "chatbot/data/chatbot_knowledgebase/subject_matter/article_data.json")
exhibits_filepath = os.path.join(home, "chatbot/data/chatbot_knowledgebase/institutional/exhibits.json")
galleries_filepath = os.path.join(home, "chatbot/data/chatbot_knowledgebase/institutional/galleries.json")

# output directories for generated lookup/synonym files, and the directory
# holding the intent template text files
lookup_directory = os.path.join(home, "chatbot/rasa/data/nlu/lookups")
synonym_directory = os.path.join(home, "chatbot/rasa/data/nlu/synonyms")
intent_templates_directory = os.path.join(home, "chatbot/rasa/data/txt_files/templates")

In [124]:
# remove extra spaces and punctuation

def remove_punc(text):
    """Run ``text`` through ``processing.preprocess`` with the optional steps off.

    The accented-character, contraction, number-conversion, lemmatization and
    stop-word options are all explicitly disabled, so only whatever
    ``preprocess`` does by default is applied (per the cell comment: removing
    extra spaces and punctuation -- NOTE(review): confirm against the
    ``processing`` module).

    Args:
        text: the string to clean.

    Returns:
        Whatever ``processing.preprocess`` returns for ``text``.
    """
    disabled_steps = {
        "accented_chars": False,
        "contractions": False,
        "convert_num": False,
        "lemmatization": False,
        "stop_words": False,
    }
    return processing.preprocess(text, **disabled_steps)

# fill intent template with given entity having index ind

def fill_template(entity, templates, ind, pattern=r"\[.*?\]"):
    """Fill the bracketed annotation of one template with ``entity``.

    The template is picked round-robin: ``templates[ind % len(templates)]``.
    The first ``[...]`` span found in it is taken as the placeholder, and every
    occurrence of that exact placeholder text in the template is replaced by
    ``"[" + entity + "]"``.

    Args:
        entity: text to put between the square brackets.
        templates: non-empty sequence of template strings, each containing at
            least one ``[...]`` annotation.
        ind: index of the entity; wrapped modulo the number of templates.
        pattern: regular expression that locates the placeholder. It defaults
            to the bracket pattern so the function does not depend on a
            module-level variable defined in another cell.

    Returns:
        The selected template with its placeholder filled in.

    Raises:
        ValueError: if ``templates`` is empty or the selected template contains
            no match for ``pattern``.
    """
    if not templates:
        raise ValueError("fill_template needs at least one template")

    template = templates[ind % len(templates)]

    placeholder = re.search(pattern, template)
    if placeholder is None:
        raise ValueError(f"template has no annotation to fill: {template!r}")

    # str.replace (not re.sub) so the entity is inserted literally, with no
    # special meaning given to backslashes or group references.
    return template.replace(placeholder.group(0), "[" + entity + "]")

# functions for converting a list to a yaml string literal

def literalize_list(v):
    """Render a list as YAML text wrapped in a ``LiteralScalarString``.

    The list is dumped with the notebook's shared ``yaml`` handler into an
    in-memory buffer, and the resulting text is returned as a
    ``ruamel.yaml.scalarstring.LiteralScalarString``.

    Args:
        v: the list to render; anything else trips the assertion.

    Returns:
        A ``LiteralScalarString`` holding the dumped YAML text of ``v``.
    """
    assert isinstance(v, list)
    with io.StringIO() as stream:
        yaml.dump(v, stream)
        rendered = stream.getvalue()
    return ruamel.yaml.scalarstring.LiteralScalarString(rendered)

def transform_value(d, key, transformation):
    """Apply ``transformation`` in place to every value stored under ``key``.

    Walks nested dicts and lists. When a dict entry's key equals ``key`` its
    value is replaced by ``transformation(value)`` and is not descended into;
    all other dict values and all list elements are searched recursively.
    Anything that is neither a dict nor a list is left alone.

    Args:
        d: the structure to walk (modified in place).
        key: the dict key whose values should be transformed.
        transformation: one-argument callable producing the replacement value.

    Returns:
        None.
    """
    if isinstance(d, dict):
        # Only existing keys are reassigned, so the dict's size never changes
        # while it is being iterated.
        for name in d:
            if name != key:
                transform_value(d[name], key, transformation)
            else:
                d[name] = transformation(d[name])
        return

    if isinstance(d, list):
        for item in d:
            transform_value(item, key, transformation)


In [125]:
# get list of unique subjects

# Read as UTF-8 explicitly so the result does not depend on the platform's
# locale encoding.
with open(articles_filepath, "r", encoding="utf-8") as f:
    articles = json.load(f)

subjects = []
for article in articles:
    subjects.append(article["title"])
    subjects += article["aliases"]
# Sorted so the order is identical on every run; the iteration order of a set
# of strings changes between interpreter runs.
subjects = sorted(set(subjects))

# a parenthesised segment, or an unclosed "(" through to the end of the text
paren_pattern = r"(\(.*?\))|(\(.*)"


def _clean_subject(subject):
    """Reduce a raw title/alias to a short subject name.

    Steps, in order:
      1. keep only the text before the first ":";
      2. remove everything matched by ``paren_pattern``;
      3. split on "." and keep leading pieces of at most 4 characters
         (re-attaching their "."), then the first longer piece -- or the last
         piece -- and drop the rest;
      4. strip surrounding whitespace.
    """
    text = re.sub(paren_pattern, "", subject.split(":")[0])
    parts = text.split(".")
    last = len(parts) - 1

    kept = ""
    for position, part in enumerate(parts):
        if len(part) <= 4 and position < last:
            kept += part + "."
        else:
            kept += part
            break
    return kept.strip()


# De-duplicate AFTER cleaning: distinct raw names can collapse to the same
# cleaned name. Names that clean down to nothing are dropped, since an empty
# entity cannot be looked up or used to fill a template.
clean_subjects = sorted({_clean_subject(subject) for subject in subjects} - {""})

num_subjects = len(clean_subjects)

In [126]:
# get list of exhibit names, including aliases

# Read as UTF-8 explicitly so the result does not depend on the platform's
# locale encoding.
with open(exhibits_filepath, "r", encoding="utf-8") as f:
    exhibit_data = json.load(f)

exhibits = []           # every cleaned title and alias, duplicates included
exhibit_synonyms = {}   # exhibit id -> cleaned names that refer to it

for item in exhibit_data:
    title = remove_punc(item["title"])
    aliases = [remove_punc(alias) for alias in item["aliases"]]
    these_aliases = [title] + aliases
    exhibits += these_aliases

    # Several records may share an id; merge their names, keeping first-seen
    # order and skipping repeats and names that cleaned down to nothing.
    synonyms = exhibit_synonyms.setdefault(item["id"], [])
    for name in these_aliases:
        if name and name not in synonyms:
            synonyms.append(name)

# Sorted so the order is identical on every run (set order of strings is not);
# empty names are dropped because they cannot serve as an entity value.
clean_exhibits = sorted(set(exhibits) - {""})

# NOTE(review): this counts distinct exhibit ids, whereas num_subjects and
# num_galleries count entries of their clean_* lists -- confirm which is meant.
num_exhibits = len(exhibit_synonyms)

In [127]:
# get list of gallery names, including aliases (TBA)

# Read as UTF-8 explicitly so the result does not depend on the platform's
# locale encoding.
with open(galleries_filepath, "r", encoding="utf-8") as f:
    gallery_data = json.load(f)

# only the title of each gallery record is used for now
galleries = [item["title"] for item in gallery_data]

# Sorted so the order is identical on every run; the iteration order of a set
# of strings changes between interpreter runs.
clean_galleries = sorted(set(galleries))

num_galleries = len(clean_galleries)

In [128]:
# export entity lists found above to yaml lookup files

# entity name -> list of values; one "<entity>.yml" file is written per entry
entity_names = {
    "subject": clean_subjects,
    "exhibit": clean_exhibits,
    "gallery": clean_galleries
}

for entity, names in entity_names.items():
    lookup_dict = {
        "version": "3.1",
        "nlu": [{"lookup": entity, "examples": names}]
    }

    # turn the python list under "examples" into a YAML literal block string
    transform_value(lookup_dict, 'examples', literalize_list)

    filepath = os.path.join(lookup_directory, entity + ".yml")
    # Write UTF-8 explicitly: names may contain non-ASCII characters and the
    # default file encoding depends on the platform locale.
    with open(filepath, "w", encoding="utf-8") as f:
        yaml.dump(lookup_dict, f)

In [129]:
# export exhibit alias names to yaml file for synonym matching

# one {"synonym": <exhibit id>, "examples": [<names>]} entry per exhibit id
synonym_dict = {
    "version": "3.1",
    "nlu": [
        {"synonym": exhibit_id, "examples": names}
        for exhibit_id, names in exhibit_synonyms.items()
    ]
}

# turn each python list under "examples" into a YAML literal block string
transform_value(synonym_dict, 'examples', literalize_list)

filepath = os.path.join(synonym_directory, "exhibits.yml")
# Write UTF-8 explicitly: names may contain non-ASCII characters and the
# default file encoding depends on the platform locale.
with open(filepath, "w", encoding="utf-8") as f:
    yaml.dump(synonym_dict, f)

In [130]:
# generate yaml nlu files containing annotated intents

# A line containing a [...] span is treated as an annotated template.
anno_pattern = r"\[.*?\]"

# Dedicated, seeded generator so the padding drawn below is the same on every
# run and does not disturb the global random state.
padding_rng = random.Random(0)

# sorted() gives a stable processing order; os.listdir order is arbitrary
for infile in sorted(os.listdir(intent_templates_directory)):
    infilepath = os.path.join(intent_templates_directory, infile)
    if not os.path.isfile(infilepath):
        # e.g. checkpoint folders: open() on a directory would raise
        continue

    # The intent name is the file name up to "_template"; keywords in it
    # select which entity list fills the templates and where the result goes.
    # NOTE(review): these output directories sit under "<home>/rasa/...",
    # while the lookup and synonym directories sit under
    # "<home>/chatbot/rasa/..." -- confirm this difference is intended.
    intent = infile.split("_template")[0]
    if "explanation" in intent:
        source_entities = clean_subjects
        dir_out = os.path.join(home, "rasa/data/nlu/subjects")
    elif "exhibit" in intent:
        source_entities = clean_exhibits
        dir_out = os.path.join(home, "rasa/data/nlu/exhibits")
    elif "gallery" in intent:
        source_entities = clean_galleries
        dir_out = os.path.join(home, "rasa/data/nlu/galleries")
    else:
        continue

    # Split the template file into annotated templates and plain examples.
    nonanno_examples = []
    anno_examples = []
    with open(infilepath, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip("\n")
            if not line.strip():
                # a blank line would otherwise become an empty example
                continue
            if re.search(anno_pattern, line):
                anno_examples.append(line)
            else:
                nonanno_examples.append(line)

    num_templates = len(anno_examples)

    # Work on a copy: padding must never leak into the shared clean_* lists,
    # which other intents (and other cells) read as well.
    entity_list = list(source_entities)
    num_entities = len(entity_list)

    # Templates are assigned round-robin by entity index, so with fewer
    # entities than templates some templates would never be used. Pad with
    # randomly re-drawn entities until every template gets one.
    if entity_list and (delta := num_templates - num_entities) > 0:
        entity_list += [padding_rng.choice(source_entities) for _ in range(delta)]

    # plain examples first, then one filled template per entity
    examples = list(nonanno_examples)
    if anno_examples:
        for count, entity in enumerate(entity_list):
            example = fill_template(entity=entity, templates=anno_examples, ind=count)
            examples.append(example)

    intent_dict = {
        "version": "3.1",
        "nlu": [{"intent": intent, "examples": examples}]
    }

    # turn the python list under "examples" into a YAML literal block string
    transform_value(intent_dict, 'examples', literalize_list)
    outfile = intent + ".yml"
    outfilepath = os.path.join(dir_out, outfile)

    # Write UTF-8 explicitly: examples may contain non-ASCII characters and
    # the default file encoding depends on the platform locale.
    with open(outfilepath, "w", encoding="utf-8") as f:
        yaml.dump(intent_dict, f)