In [154]:
import json
import string
# Which MultiWOZ 2.2 training shard to convert, and where the outputs are written.
file_ind = "017"
target_dir = "test_gen/"
# target_dir = ""   # alternative: write the outputs next to the notebook
mwoz_dir = "/code/workflows/multiwoz/data/MultiWOZ_2.2/train/"

# Input shard plus the three output files (utterances, BIO tags, intent labels).
mwoz_file = "{}dialogues_{}.json".format(mwoz_dir, file_ind)
seq_in_file = "{}seq{}.in".format(target_dir, file_ind)
seq_out_file = "{}seq{}.out".format(target_dir, file_ind)
label_file = "{}label{}".format(target_dir, file_ind)

# Only dialogues/frames belonging to these services are exported.
service_domains = ["hotel", "restaurant"]

In [155]:
# Translation table: maps domain-specific intent and slot names to generic
# names. It is only built when xlate_slots_intents is switched on.
xlate_slots_intents = True
xlate_file = "slot_xlate.json"
if xlate_slots_intents:
    xlate_dict = {}
    with open(xlate_file, "r") as f_xlate:
        xlate_list = json.load(f_xlate)
    for entry in xlate_list:
        # Intents first, then slots, so a slot entry wins if both share a name.
        xlate_dict.update(
            (renamed["name"], renamed["new-name"]) for renamed in entry["intents"]
        )
        xlate_dict.update(
            (renamed["name"], renamed["new-name"]) for renamed in entry["slots"]
        )

In [156]:
# Read the whole dialogue shard into memory as parsed JSON.
with open(mwoz_file, "r") as fp:
    mwoz_dialogues_list = json.loads(fp.read())

In [157]:
# Output handles stay open across cells; they are closed in the final cell.
fp_in = open(seq_in_file, mode="w")      # one raw utterance per line
fp_out = open(seq_out_file, mode="w")    # matching BIO tag sequence per line
fp_label = open(label_file, mode="w")    # matching intent label per line

In [158]:
def domain_of_interest(services, service_domains):
    """Report whether any wanted domain is contained in ``services``.

    Args:
        services: container tested with ``in`` (a set or list of service
            names; a plain string is tested by substring).
        service_domains: iterable of the domain names being looked for.

    Returns:
        1 if at least one entry of ``service_domains`` is in ``services``,
        otherwise 0.
    """
    wanted_present = any(wanted in services for wanted in service_domains)
    return int(wanted_present)

In [159]:
def get_bio_string(utterance, slotValList, slotNameList):
    """Build a BIO tag string with exactly one tag per whitespace token.

    The utterance is lower-cased and split on whitespace. Each slot value is
    also split on whitespace and searched for as a run of consecutive tokens.
    The first token of a match is tagged ``B-<slot name>``, the remaining
    ones ``I-<slot name>``; every other token is tagged ``O``.

    Matching is done on whole tokens (ignoring punctuation attached to the
    edges of a token) rather than by substituting text inside the utterance.
    This keeps the number of tags equal to the number of tokens even when a
    value is followed by punctuation ("centre."), and prevents a value from
    matching in the middle of a word or inside a previously assigned tag.

    Args:
        utterance: the raw utterance text.
        slotValList: slot values to look for; compared case-insensitively.
        slotNameList: slot names, parallel to ``slotValList``.

    Returns:
        The tags joined by single spaces, with a trailing space (``""`` for
        an utterance without tokens).
    """
    tokens = utterance.lower().split()
    # Edge punctuation is dropped for comparison only; a token made purely of
    # punctuation is kept as-is so it cannot match an empty string.
    comparable = [tok.strip(string.punctuation) or tok for tok in tokens]
    tags = ["O"] * len(tokens)

    for slot_val, slot_name in zip(slotValList, slotNameList):
        wanted = [
            part.strip(string.punctuation) or part
            for part in slot_val.lower().split()
        ]
        width = len(wanted)
        if width == 0:  # empty or whitespace-only value: nothing to tag
            continue

        for start in range(len(tokens) - width + 1):
            stop = start + width
            if comparable[start:stop] != wanted:
                continue
            # Never overwrite tokens already claimed by an earlier slot.
            if any(tag != "O" for tag in tags[start:stop]):
                continue
            tags[start] = "B-" + slot_name
            tags[start + 1:stop] = ["I-" + slot_name] * (width - 1)
            break  # only the first occurrence of a value is tagged

    return "".join(tag + " " for tag in tags)

In [160]:
def get_label(intent):
    """Pick a single intent label from the per-service intents of a turn.

    Args:
        intent: dict mapping a service name to its intent name, where the
            string "NONE" marks a service without an active intent.

    Returns:
        The first value (in dict order) that is not "NONE". Returns "NONE"
        when every value is "NONE" or when the dict is empty.
    """
    # Scanning all values handles any number of services; the empty dict
    # falls through to "NONE" instead of raising IndexError.
    for label in intent.values():
        if label != "NONE":
            return label
    return "NONE"

In [161]:
# For every USER turn that has an active hotel/restaurant frame, write one
# line to each output file: the raw utterance, its BIO tag string and its
# intent label.
for dialogue in mwoz_dialogues_list:
    services = dialogue["services"]
    if not domain_of_interest(set(services), service_domains):
        continue  # dialogue never touches a domain of interest

    for turn in dialogue["turns"]:
        if turn["speaker"] == "SYSTEM":  # only user utterances are exported
            continue

        slotNameList = []
        slotValList = []
        intent = {}
        hotel_restaurant = False  # set once a relevant, non-empty frame is seen
        for dom in turn["frames"]:
            # The service name is wrapped in a set so this is an exact
            # membership test; passing the bare string would make it a
            # substring test.
            if not domain_of_interest({dom["service"]}, service_domains):
                continue

            active_intent = dom["state"]["active_intent"]
            slot_values = dom["state"]["slot_values"]

            # A frame with neither an intent nor slot values carries nothing.
            if active_intent == "NONE" and slot_values == {}:
                continue

            hotel_restaurant = True

            # Always record the intent. Previously it was only stored when
            # translation was enabled, leaving `intent` empty (and get_label
            # failing) with xlate_slots_intents turned off.
            if xlate_slots_intents:
                intent[dom["service"]] = xlate_dict.get(active_intent, active_intent)
            else:
                intent[dom["service"]] = active_intent

            for slotName, slotVal in slot_values.items():
                if not slotVal:  # no value recorded for this slot
                    continue
                newSlotName = slotName.lower()
                if xlate_slots_intents:
                    # Untranslated names are kept as they are.
                    newSlotName = xlate_dict.get(newSlotName, newSlotName)
                slotNameList.append(newSlotName)
                # Only the first listed value of a slot is used.
                slotValList.append(slotVal[0].lower())

        if hotel_restaurant:
            bio_str = get_bio_string(turn["utterance"], slotValList, slotNameList)
            label = get_label(intent)

            fp_in.write(turn["utterance"] + "\n")
            fp_out.write(bio_str + "\n")
            fp_label.write(label + "\n")
            

In [162]:
# Flush and release the three output files, in the order they were opened.
for handle in (fp_in, fp_out, fp_label):
    handle.close()