In [120]:
import json
# --- Run configuration ---------------------------------------------------
# Index of the dialogue file to convert; also embedded in the output file names.
file_ind = "017"
# Output location; used as a plain string prefix, so the trailing slash matters.
target_dir = "test/"
# Services whose dialogues/frames are exported by the conversion loop.
service_domains = ["hotel", "restaurant"]
# When True, every output line is prefixed with a running line number.
include_line_num = False

# Input: MultiWOZ 2.2 training dialogue file for the chosen index.
mwoz_file = "".join(["/code/workflows/multiwoz/data/MultiWOZ_2.2/train/dialogues_", file_ind, ".json"])

# Outputs: utterances, slot tag sequences, and intent labels (one line each).
seq_in_file = "".join([target_dir, "seq", file_ind, ".in"])
seq_out_file = "".join([target_dir, "seq", file_ind, ".out"])
label_file = "".join([target_dir, "label", file_ind])

In [121]:
# Load the whole dialogue file; the result is iterated as a list of dialogue dicts.
# The encoding is given explicitly so decoding does not depend on the platform's
# locale default (JSON interchange text is UTF-8).
with open(mwoz_file, "r", encoding="utf-8") as fp:
    mwoz_dialogues_list = json.load(fp)

In [122]:
# Open the three output files: utterances, slot tag sequences, intent labels.
# Mode "w" truncates existing files. The handles stay open across cells and are
# closed by the final cell of the notebook.
# UTF-8 is requested explicitly so that utterances containing non-ASCII
# characters are written the same way regardless of the platform locale.
fp_in = open(seq_in_file, "w", encoding="utf-8")
fp_out = open(seq_out_file, "w", encoding="utf-8")
fp_label = open(label_file, "w", encoding="utf-8")

In [123]:
def domain_of_interest(services, service_domains):
    """Return 1 if any of ``service_domains`` is among ``services``, else 0.

    Args:
        services: Either a single service name (str) or a collection of
            service names such as a set or list.
        service_domains: Iterable of service names to look for.

    Returns:
        int: 1 when at least one domain matches, otherwise 0.
    """
    if isinstance(services, str):
        # ``x in some_str`` is a substring test ("hotel" in "hotel_v2" is True).
        # Wrap a bare name so it is compared as a whole service name instead.
        services = (services,)
    return int(any(domain in services for domain in service_domains))

In [124]:
# State shared with the export loop.
line_num_str = ""  # line-number prefix; stays empty unless include_line_num is set
line_cnt = 0  # running line counter; only advanced when line numbering is on
# Punctuation characters trimmed from the end of a token before slot matching.
remove_set = set(".;,!?<>(){}[]")

In [125]:
def _bio_tags(utterance, slot_values, strip_chars):
    """Build one BIO slot tag per whitespace-separated token of ``utterance``.

    Args:
        utterance: Raw utterance text.
        slot_values: Mapping of slot name -> list of value strings. Only the
            first value of each slot is matched; slots with an empty list are
            ignored.
        strip_chars: Characters removed from the end of a token (at most one
            per token) before it is compared with slot values.

    Returns:
        list of str: "B-<slot>" on the first token of a matched value,
        "I-<slot>" on its remaining tokens and "O" everywhere else. The list
        always has the same length as ``utterance.split()``.
    """
    # (slot name, value tokens), both lower-cased.
    candidates = [
        (name.lower(), values[0].lower().split())
        for name, values in slot_values.items()
        if values
    ]
    # A blank value has no tokens and can never match anything.
    candidates = [cand for cand in candidates if cand[1]]
    # Try longer values first so a multi-word value is not shadowed by a shorter
    # one starting at the same token. The sort is stable, so values of equal
    # length keep the dict order (first matching slot wins).
    candidates.sort(key=lambda cand: -len(cand[1]))

    tokens = []
    for token in utterance.split():  # split() never yields empty tokens
        if token[-1] in strip_chars:
            token = token[:-1]
        tokens.append(token.lower())

    tags = ["O"] * len(tokens)
    pos = 0
    while pos < len(tokens):
        step = 1
        for name, value_tokens in candidates:
            span = len(value_tokens)
            if tokens[pos:pos + span] == value_tokens:
                tags[pos] = "B-" + name
                for offset in range(1, span):
                    tags[pos + offset] = "I-" + name
                step = span  # continue after the matched value
                break
        pos += step
    return tags


# Export one line per (user turn, frame of interest) to the three output files:
# the raw utterance, its BIO slot tags, and the frame's active intent. A turn
# with frames for several configured services is therefore written several times.
for dialogue in mwoz_dialogues_list:
    services = dialogue["services"]
    if not domain_of_interest(set(services), service_domains):  # dialogue touches no configured service
        continue

    for turn in dialogue["turns"]:
        if turn["speaker"] == "SYSTEM":  # only non-system turns are exported
            continue

        for dom in turn["frames"]:
            # Passed as a one-element list so the service name is compared as a
            # whole rather than by substring.
            if not domain_of_interest([dom["service"]], service_domains):
                continue

            active_intent = dom["state"]["active_intent"]
            slot_values = dom["state"]["slot_values"]

            # Nothing to learn from a frame with neither an intent nor slots.
            if active_intent == "NONE" and slot_values == {}:
                continue

            # Every tag is followed by a space (including the last one), which
            # keeps the line format of the slot file unchanged.
            bio_str = "".join(tag + " " for tag in _bio_tags(turn["utterance"], slot_values, remove_set))

            if include_line_num:
                line_cnt += 1
                line_num_str = str(line_cnt) + " "

            fp_in.write(line_num_str + turn["utterance"] + "\n")
            fp_out.write(line_num_str + bio_str + "\n")
            fp_label.write(line_num_str + active_intent + "\n")
            

In [126]:
# Close the three output files, in the order they were opened.
for handle in (fp_in, fp_out, fp_label):
    handle.close()