In [129]:
import json
# --- Run configuration ------------------------------------------------------
# Index of the MultiWOZ "dialogues_<ind>.json" file to convert.
file_ind = "017"
# Prefix for every generated file; use "" to write into the working directory.
target_dir = "test_gen/"
mwoz_dir = "/code/workflows/multiwoz/data/MultiWOZ_2.2/train/"

# Input dialogue file and the three output files derived from file_ind.
mwoz_file = "".join([mwoz_dir, "dialogues_", file_ind, ".json"])
seq_in_file = "".join([target_dir, "seq", file_ind, ".in"])    # raw utterances
seq_out_file = "".join([target_dir, "seq", file_ind, ".out"])  # BIO tag sequences
label_file = "".join([target_dir, "label", file_ind])          # intent labels

# Only dialogues/frames touching one of these services are exported.
service_domains = ["hotel", "restaurant"]
# When True, every output line is prefixed with a running line number.
include_line_num = False

In [130]:
# Translation table: maps domain-specific intent and slot names to generic
# names. It is only built (and xlate_dict only defined) when translation is on.
xlate_slots_intents = True
xlate_file = "slot_xlate.json"
if xlate_slots_intents:
    xlate_dict = {}
    with open(xlate_file, "r") as f_xlate:
        xlate_list = json.load(f_xlate)
    for entry in xlate_list:
        # Intents first, then slots, so a slot entry wins on a name clash
        # within the same entry; later entries override earlier ones.
        xlate_dict.update(
            (intent["name"], intent["new-name"]) for intent in entry["intents"]
        )
        xlate_dict.update(
            (slot["name"], slot["new-name"]) for slot in entry["slots"]
        )

In [131]:
# Parse the selected MultiWOZ dialogue file into memory in one go.
with open(mwoz_file) as dialogue_fh:
    mwoz_dialogues_list = json.loads(dialogue_fh.read())

In [132]:
# Open (and truncate) the three output files: utterances, BIO tags, intents.
# The handles are left open here; a later cell is responsible for closing them.
fp_in, fp_out, fp_label = [
    open(out_path, "w") for out_path in (seq_in_file, seq_out_file, label_file)
]

In [133]:
def domain_of_interest(services, service_domains):
    """Return 1 if any entry of `service_domains` is contained in `services`, else 0.

    Containment uses the `in` operator, so `services` may be a collection
    (membership test) or a string (substring test).
    """
    matched = any(candidate in services for candidate in service_domains)
    return 1 if matched else 0

In [134]:
# Running counter and prefix used only when line numbering is enabled.
line_cnt = 0
line_num_str = ""
# Punctuation characters that are stripped from the end of an utterance word
# before it is compared with slot values.
remove_set = set(".;,!?<>(){}[]")

In [135]:
def _bio_tags(utterance, slot_names, slot_values, strip_chars):
    """Return one BIO tag per whitespace-separated word of `utterance`.

    Each word is normalised by dropping a single trailing character found in
    `strip_chars` and lower-casing it. At every position the longest slot
    value (split on whitespace) that matches the upcoming words is tagged
    "B-<name>" followed by "I-<name>" for its remaining words; unmatched words
    get "O". Matching whole word sequences is what lets multi-word slot values
    be labelled at all.

    Args:
        utterance: raw utterance text.
        slot_names: slot label for each entry of `slot_values` (same order).
        slot_values: lower-cased slot value strings to look for.
        strip_chars: collection of trailing characters to remove from a word.

    Returns:
        List of tag strings whose length equals len(utterance.split()).
    """
    tokens = []
    for word in utterance.split():  # split() never yields empty strings
        if word[-1] in strip_chars:
            word = word[:-1]
        tokens.append(word.lower())

    phrases = [value.split() for value in slot_values]

    tags = []
    pos = 0
    while pos < len(tokens):
        best_ind, best_len = -1, 0
        for ind, phrase in enumerate(phrases):
            # Strict '>' keeps the earliest slot on equal length and never
            # selects an empty phrase.
            if len(phrase) > best_len and tokens[pos:pos + len(phrase)] == phrase:
                best_ind, best_len = ind, len(phrase)
        if best_len == 0:
            tags.append("O")
            pos += 1
        else:
            tags.append("B-" + slot_names[best_ind])
            tags.extend(["I-" + slot_names[best_ind]] * (best_len - 1))
            pos += best_len
    return tags


# Export one (utterance, BIO tags, intent) triple per user turn and per frame
# whose service is in service_domains.
for dialogue in mwoz_dialogues_list:  # each dialogue in the dialogue file
    services = dialogue["services"]
    if not domain_of_interest(set(services), service_domains):  # no configured domain involved
        continue

    for turn in dialogue["turns"]:
        if turn["speaker"] == "SYSTEM":  # skip all the system generated turns
            continue

        for dom in turn["frames"]:  # one frame per service in the turn
            if not domain_of_interest(dom["service"], service_domains):
                continue

            active_intent = dom["state"]["active_intent"]
            slot_values = dom["state"]["slot_values"]

            # Nothing to learn from a frame with neither an intent nor slots.
            if active_intent == "NONE" and slot_values == {}:
                continue

            if xlate_slots_intents:
                # Unknown intents are kept unchanged.
                active_intent = xlate_dict.get(active_intent, active_intent)

            slotNameList = []
            slotValList = []
            for slotName, slotVal in slot_values.items():
                if not slotVal:  # no candidate value to search for
                    continue
                newSlotName = slotName.lower()
                if xlate_slots_intents:
                    # Unknown slot names are kept in their lower-cased form.
                    newSlotName = xlate_dict.get(newSlotName, newSlotName)
                slotNameList.append(newSlotName)
                slotValList.append(slotVal[0].lower())  # only the first listed value is used

            tags = _bio_tags(turn["utterance"], slotNameList, slotValList, remove_set)
            # Every tag is followed by a space, including the last one.
            bio_str = "".join(tag + " " for tag in tags)

            if include_line_num:
                line_cnt += 1
                line_num_str = str(line_cnt) + " "

            fp_in.write(line_num_str + turn["utterance"] + "\n")
            fp_out.write(line_num_str + bio_str + "\n")
            fp_label.write(line_num_str + active_intent + "\n")
            

In [136]:
# Flush and release the three output files, in the order they were opened.
for out_handle in (fp_in, fp_out, fp_label):
    out_handle.close()