In [443]:
import json
# --- Run configuration ------------------------------------------------------
# Index (as it appears in the file name) of the MultiWOZ dialogue file to convert.
file_ind = "017"

# Output directory prefix. Paths are built by plain concatenation, so the
# trailing slash is required. Alternative used for other runs: "test_police/"
target_dir = "debugs/"

# Input directory prefix (trailing slash required for the same reason).
mwoz_dir = "/code/workflows/multiwoz/data/MultiWOZ_2.2/train/"

# Input dialogue file and the three output files:
#   seq_in_file  - one cleaned user utterance per line
#   seq_out_file - the matching BIO tag sequence per line
#   label_file   - the intent label per line
mwoz_file = "{}dialogues_{}.json".format(mwoz_dir, file_ind)
seq_in_file = "{}seq{}.in".format(target_dir, file_ind)
seq_out_file = "{}seq{}.out".format(target_dir, file_ind)
label_file = "{}label{}".format(target_dir, file_ind)

# Domains to extract. Available domains:
#   hotel, train, attraction, restaurant, hospital, taxi, bus, police
# Example of a multi-domain selection: ["hotel", "restaurant"]
service_domains = ["train"]

In [444]:
# Build the translation table that maps domain-specific intent and slot names
# to generic names.
xlate_slots_intents = True  # set to False to keep the original names
xlate_file = "slot_xlate.json"

# Defined unconditionally so the name exists even when translation is disabled.
xlate_dict = {}
if xlate_slots_intents:
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # locale encoding when reading it.
    with open(xlate_file, "r", encoding="utf-8") as f_xlate:
        xlate_list = json.load(f_xlate)
    # Every entry carries an "intents" list and a "slots" list; each item in
    # those lists maps an original "name" to its generic "new-name".
    # Intents and slots share one flat table; later entries overwrite earlier
    # ones that use the same "name".
    for entry in xlate_list:
        for intent_entry in entry["intents"]:
            xlate_dict[intent_entry["name"]] = intent_entry["new-name"]
        for slot_entry in entry["slots"]:
            xlate_dict[slot_entry["name"]] = slot_entry["new-name"]

In [445]:
# Load the selected dialogue file; the conversion loop iterates the result as
# a list of dialogues. Read explicitly as UTF-8 (the encoding JSON is specified
# in) instead of relying on the platform's default locale encoding.
with open(mwoz_file, "r", encoding="utf-8") as fp:
    mwoz_dialogues_list = json.load(fp)

In [446]:
# Open the three output files (utterances, BIO tags, intent labels). They are
# written line-by-line by the conversion loop and closed in the last cell.
# NOTE: mode "w" truncates, so re-running this cell discards earlier output.
# The encoding is pinned to UTF-8 so that non-ASCII characters in utterances
# cannot fail or be written differently depending on the machine's locale.
fp_in = open(seq_in_file, "w", encoding="utf-8")
fp_out = open(seq_out_file, "w", encoding="utf-8")
fp_label = open(label_file, "w", encoding="utf-8")

In [447]:
def domain_of_interest(services, service_domains):
    """Report whether any configured domain is among the given services.

    Args:
        services: Either a collection of service names (for example a set
            built from a dialogue's "services" list) or a single service
            name given as a string.
        service_domains: Iterable of domain names to look for.

    Returns:
        1 if at least one entry of ``service_domains`` equals one of the
        services, otherwise 0.
    """
    # A bare string is ONE service name. Without this wrap, ``in`` would fall
    # back to substring matching (e.g. "bus" would match "business").
    if isinstance(services, str):
        services = (services,)
    return 1 if any(domain in services for domain in service_domains) else 0

In [448]:
def get_bio_utterance(turn_utterance: str, slotValList: list, slotNameList: list) -> tuple:
    """Build a BIO tag sequence and a matching cleaned-up utterance for one turn.

    Two working strings are edited in lock-step:

    * ``bio_utterance`` (the lower-cased utterance): every slot value that is
      found is replaced by its tags, ``B-<slot>`` for the first
      whitespace-separated part of the value and ``I-<slot>`` for each
      further part.
    * ``turn_utterance`` (original casing): a blank is inserted in front of
      every slot value that is found, so the value starts a token of its own.

    Afterwards both strings are split on whitespace. Every remaining
    non-tag word of ``bio_utterance`` becomes ``O``; stand-alone punctuation
    is dropped from the tag sequence and glued onto a neighbouring word in
    the utterance.

    Args:
        turn_utterance: The raw utterance text.
        slotValList: Slot values to look for; they are searched in the
            lower-cased utterance, so they are expected to be lower-case
            strings.
        slotNameList: Slot names, index-aligned with ``slotValList``.

    Returns:
        A ``(bio_str, user_utterance)`` tuple of stripped, blank-separated
        strings. They are intended to have the same number of tokens, but
        this is not guaranteed here.

    NOTE(review): matching is a plain substring search on the first
    occurrence, so a value can match inside a longer word, or inside a tag
    that was inserted for an earlier slot - confirm this is acceptable.
    NOTE(review): an empty slot value matches at index 0 and yields no tags,
    so the first word of ``bio_utterance`` is removed - verify callers never
    pass one.
    """
    bio_utterance = turn_utterance.lower()
    # Single characters that count as punctuation when they stand alone as a
    # word (or when the last word consists of nothing else).
    punctuations = set(['.', '?', ':', ';', ',', '"', "'", '!', '<', '>', '(', ')', '[', ']', '-'])
    bio_str = ""
    user_utterance = ""
    utter_indList = []  # never read afterwards
    for i in range(len(slotValList)): # for each slot value in the slotValList
        try: # check if the slot value occurs in the utterance
            # First occurrence only; raises ValueError (handled below) when absent.
            ind = bio_utterance.index(slotValList[i]) # get the index of the slot value
            parts = slotValList[i].split() # handle blank inside slot value
            BI_slotName = ""
            for j in range(len(parts)):
                if j == 0:
                    BI_slotName += ' ' + "B-" + slotNameList[i] # 1st part is B-slotname
                    # Locate the slot value in turn_utterance. A ValueError here is
                    # swallowed by the outer handler and leaves this slot untagged.
                    user_ind = turn_utterance.lower().index(slotValList[i])
                    # Introduce a blank before the slot value so that it starts a
                    # new token in the final utterance.
                    turn_utterance = turn_utterance[: user_ind] + ' ' + turn_utterance[user_ind :]
                else:
                    # Every further part of a multi-word value gets an I- tag.
                    BI_slotName += ' ' + "I-" + slotNameList[i]
            try:
                # Find the next blank after the match: the slot value may be glued
                # to trailing characters, which are replaced together with it.
                next_word_pos = bio_utterance[ind + len(slotValList[i]) :].index(' ')
                ind2 = ind + len(slotValList[i]) + next_word_pos # go past the word that contains the slotValList[i]
                bio_utterance = bio_utterance[: ind] + BI_slotName + bio_utterance[ind2 :]
            except ValueError:
                # No blank follows: the value reaches the end of the utterance,
                # so everything from the match onwards is replaced by the tags.
                bio_utterance = bio_utterance[: ind] + BI_slotName
        except ValueError:
            # Slot value not present in the utterance: nothing to tag.
            pass
    
    # Turn the tagged string into the final tag sequence. bio_utterance was
    # lower-cased, so a leading "B-"/"I-" can only come from an inserted tag.
    words = bio_utterance.strip().split()
    for i in range(len(words)): # check if each word in utterance exist in slot value
        word = words[i]
        if "B-" in word[0:2] or "I-" in word[0:2]:
                bio_str += word + ' '
        else:
            if i == len(words) - 1: # special handling for the last word; drop if all punctuations
                # Emits a single "O" as soon as one non-punctuation character is found.
                for k in range(len(word)):
                    if word[k] in punctuations:
                        continue
                    else:
                        bio_str += "O "
                        break
            else:
                # A stand-alone punctuation character gets no tag of its own.
                if not word in punctuations:
                    bio_str += "O "
                    
    # Rebuild the utterance with the same token rules, so that stand-alone
    # punctuation does not count as a token here either.
    words = turn_utterance.strip().split()
    for i in range(len(words)):
        word = words[i]
        if i == len(words) - 1: # last word
            for k in range(len(word)): # if the last word is only punctuations then drop this word
                if word[k] in punctuations:
                    continue
                else:
                    user_utterance += word + ' '
                    break
        else:
            if word in punctuations:
                if i == 0: # very 1st word
                    # No trailing blank: the punctuation is glued to the next word.
                    user_utterance += word
                else: # add the punctuation after removing the blank
                    user_utterance = user_utterance.strip() + word + ' '
            else:
                user_utterance += word + ' '
    
    return bio_str.strip(), user_utterance.strip()

In [449]:
def get_label(intent):
    """Pick a single intent label for a turn from its per-domain intents.

    Args:
        intent: Mapping of service/domain name to intent name. Entries are
            considered in the mapping's iteration (insertion) order.

    Returns:
        The first intent value that is not "NONE". If every value is "NONE",
        or the mapping is empty, "NONE" is returned.
    """
    # Scan every domain instead of only the first two, so an active intent in
    # a third (or later) domain is not lost.
    for label in intent.values():
        if label != "NONE":
            return label
    return "NONE"

In [450]:
# Convert every USER turn of the dialogues that involve a configured domain
# into three aligned lines: utterance -> fp_in, BIO tags -> fp_out,
# intent label -> fp_label.
transaction_cnt = domain_transaction_cnt = domain_dialog_cnt = 0

for dialogue in mwoz_dialogues_list:
    transaction_cnt += 1
    services = dialogue["services"]
    if not domain_of_interest(set(services), service_domains):
        # The dialogue never touches one of the configured domains.
        continue
    domain_transaction_cnt += 1

    for turn in dialogue["turns"]:
        if turn["speaker"] == "SYSTEM":
            # Only user utterances are exported.
            continue

        slotNameList, slotValList, intent = [], [], {}
        # Despite its name, this flag means "at least one configured domain
        # is active in this turn", whatever service_domains contains.
        hotel_restaurant = False

        for dom in turn["frames"]:  # one frame per domain
            if not domain_of_interest(dom["service"], service_domains):
                continue

            state = dom["state"]
            active_intent = state["active_intent"]
            slot_values = state["slot_values"]
            if active_intent == "NONE" and slot_values == {}:
                # The domain is listed but carries nothing for this turn.
                continue

            hotel_restaurant = True

            if dom["slots"] != []:
                # When the frame has "slots" entries, use only those and
                # ignore the accumulated state. Values are normalised to
                # lists so they look like the state's slot_values.
                slot_values = {}
                for entry in dom["slots"]:
                    value = entry["value"]
                    slot_values[entry["slot"]] = value if type(value) is list else [value]

            # Intent for this domain, translated to its generic name when a
            # translation exists; otherwise left as-is.
            if xlate_slots_intents:
                intent[dom["service"]] = xlate_dict.get(active_intent, active_intent)
            else:
                intent[dom["service"]] = active_intent

            # Slot names are lower-cased (and translated when possible); only
            # the first value of each slot is used, lower-cased.
            for slotName, slotVal in slot_values.items():
                newSlotName = slotName.lower()
                if xlate_slots_intents:
                    newSlotName = xlate_dict.get(newSlotName, newSlotName)
                slotNameList.append(newSlotName)
                slotValList.append(slotVal[0].lower())

        if not hotel_restaurant:
            continue

        bio_str, user_utterance = get_bio_utterance(turn["utterance"], slotValList, slotNameList)
        label = get_label(intent)

        if len(user_utterance.split()) == len(bio_str.split()):
            fp_in.write(user_utterance + "\n")
            fp_out.write(bio_str + "\n")
            fp_label.write(label + "\n")
            domain_dialog_cnt += 1
        else:
            # Token and tag counts disagree: report the turn and skip it.
            print("dialog_id:", file_ind)
            print(bio_str)
            print(user_utterance)
            print(turn["utterance"])

print("Number of transactions in the file:", transaction_cnt)
print("Number of domain relevant transactions:", domain_transaction_cnt)
print("Number of utterances relevant for the domain:", domain_dialog_cnt)

Number of transactions in the file: 246
Number of domain relevant transactions: 82
Number of utterances relevant for the domain: 539


In [451]:
# Flush and release the three output files opened earlier in the notebook.
for _fp in (fp_in, fp_out, fp_label):
    _fp.close()