In [63]:
import json
# --- Input / output locations ---
utterance_file = "test_multi.txt"    # annotated utterances read by the conversion
outfile = "dialogues_051.json"       # destination of the generated JSON

# --- Domain configuration ---
service_domain = "restaurant"        # service name attached to annotated frames
other_domains = [
    "taxi",
    "train",
    "bus",
    "police",
    "hotel",
    "attraction",
    "hospital",
]                                    # domains that only receive empty placeholder frames

# --- Mutable conversion state (updated by the conversion loop) ---
dialog_cnt = 1000                    # incremented before each dialogue id is built
num_workflows = -1                   # track the number of processed workflows
json_list = []                       # accumulates one dict per dialogue

In [64]:
# Load the whole utterance file into memory, one list entry per line
# (line terminators are kept, exactly as readlines() would return them).
with open(utterance_file) as source_handle:
    Lines = list(source_handle)

In [65]:
def init_dict(dialog_cnt):
    """Create the empty record for one dialogue.

    Args:
        dialog_cnt: value embedded in the dialogue identifier.

    Returns:
        dict: a new dict with ``dialogue_id`` set to "CAiFE<dialog_cnt>.json",
        ``services`` set to ["restaurant"] and an empty ``turns`` list.
    """
    record = {}
    record["dialogue_id"] = "CAiFE{}.json".format(dialog_cnt)
    record["services"] = ["restaurant"]
    record["turns"] = []  # filled in turn by turn by the caller
    return record

In [66]:
def add_other_domains():
    """Build one empty placeholder frame for every entry of ``other_domains``.

    Returns:
        list: a new list with one freshly created dict per domain. Each dict
        has empty ``actions`` and ``slots`` lists and a ``state`` whose
        ``active_intent`` is "NONE" with no requested slots or slot values.
    """
    return [
        {
            "actions": [],
            "service": domain_name,
            "slots": [],
            "state": {
                "active_intent": "NONE",
                "requested_slots": [],
                "slot_values": {},
            },
        }
        for domain_name in other_domains
    ]

In [67]:
def _split_sections(attr_vals):
    """Map each ``keyword: body`` segment of a ';'-separated annotation to its body."""
    sections = {}
    if attr_vals in ("", "[]"):  # "[]" marks a turn without annotations
        return sections
    for segment in attr_vals.split(';'):
        # Split on the first ':' only so bodies such as "time=18:30" stay intact.
        keyword, colon, body = segment.partition(':')
        if colon:
            sections[keyword.strip()] = body.strip()
    return sections


def _parse_pairs(body):
    """Parse ``name=value, name=value`` into a list of (name, value) tuples."""
    pairs = []
    for item in body.split(','):
        item = item.strip()
        if not item:  # tolerate empty bodies and trailing commas
            continue
        # Split on the first '=' only so a value may itself contain '='.
        name, equals, value = item.partition('=')
        if not equals:
            raise ValueError("expected 'slot=value' but got %r" % item)
        pairs.append((name.strip(), value.strip()))
    return pairs


def get_slots_list(speaker, utterance, attr_vals):
    """Parse the annotation line that belongs to one utterance.

    The annotation is a ';'-separated list of ``keyword: body`` segments.
    Recognised keywords are ``slots``, ``req_slot`` (``req_slots`` is accepted
    as well), ``slot_vals`` and ``intent``. Each segment is looked up by its
    own keyword, so a ``req_slots:`` segment is never mistaken for ``slots:``
    and the order of the segments does not matter. Both the utterance and the
    annotation are lower-cased before parsing.

    Args:
        speaker: speaker label; "SYSTEM" selects the short return form.
        utterance: utterance text in which the slot values are located.
        attr_vals: annotation string, or "[]" when there is nothing to parse.

    Returns:
        For ``speaker == "SYSTEM"``: the list of slot span dicts
        (``exclusive_end``, ``slot``, ``start``, ``value``).
        Otherwise: the tuple
        ``(slots_list, requested_slots, slot_values, active_intent)``.

    Raises:
        ValueError: if a ``slots``/``slot_vals`` item has no '=' or a slot
            value does not occur in the utterance.
    """
    utterance = utterance.lower()
    sections = _split_sections(attr_vals.lower().strip())

    slots_list = []
    for slot_name, slot_value in _parse_pairs(sections.get("slots", "")):
        start = utterance.find(slot_value)  # first occurrence of the value
        if start < 0:
            raise ValueError("slot value %r not found in utterance %r"
                             % (slot_value, utterance))
        slots_list.append({"exclusive_end": start + len(slot_value),
                           "slot": slot_name,
                           "start": start,
                           "value": slot_value})

    if speaker == "SYSTEM":  # system turns only carry slot spans
        return slots_list

    req_body = sections.get("req_slot", sections.get("req_slots", ""))
    requested_slots = [name.strip() for name in req_body.split(',') if name.strip()]

    # NOTE(review): each value is split on whitespace into a list of tokens,
    # exactly as before; confirm that multi-word values are meant to become
    # several list entries rather than a single one.
    slot_values = {name: value.split()
                   for name, value in _parse_pairs(sections.get("slot_vals", ""))}

    active_intent = sections.get("intent", "NONE")

    return slots_list, requested_slots, slot_values, active_intent

In [68]:
# Convert the raw lines into one dict per dialogue.
#
# Expected layout of `Lines`: a line starting with "----" opens a new dialogue;
# after it, lines come in pairs of "<SPEAKER>: <utterance>" followed by that
# utterance's annotation line ("[]" when there is none).
#
# NOTE: this cell mutates `json_list`, `dialog_cnt` and `num_workflows`;
# re-run the configuration cell before re-running it.
line_num = 0
line_dict = None  # dialogue currently being filled; None until the first separator
while line_num < len(Lines):
    line = Lines[line_num]
    if line[0:4] == "----":
        num_workflows += 1  # 0 for the 1st time this loop is executed
        if line_dict is not None:  # flush the dialogue that just ended
            json_list.append(line_dict)
        dialog_cnt += 1
        line_dict = init_dict(dialog_cnt)
        turn_id = 0
        line_num += 1
        continue

    if not line.strip():  # tolerate blank lines, e.g. at the end of the file
        line_num += 1
        continue

    if ':' not in line:
        raise ValueError("line %d: expected '<speaker>: <utterance>' but got %r"
                         % (line_num + 1, line))
    if line_dict is None:
        raise ValueError("line %d: utterance found before the first '----' separator"
                         % (line_num + 1))

    # speaker is everything up to the 1st ':'; the rest is the utterance.
    # strip() removes the newline, so a final line without one loses nothing.
    speaker, _, utterance = line.partition(':')
    speaker = speaker.strip()
    utterance = utterance.strip()

    # for each turn_id set up the dictionary in the turns list
    turn = {"frames": [], "speaker": speaker,
            "turn_id": turn_id, "utterance": utterance}
    line_dict["turns"].append(turn)

    # the annotation sits on the next line; a missing one at EOF means "none"
    if line_num + 1 < len(Lines):
        attr_vals = Lines[line_num + 1].strip()
    else:
        attr_vals = "[]"

    # if SYSTEM has attributes fill in; if no attributes then init above is enough
    if speaker == "SYSTEM" and attr_vals != "[]":
        # the result must be bound to the same name that the frame uses below
        slot_list = get_slots_list(speaker, utterance, attr_vals)
        frame_dict = {"actions": [], "service": service_domain, "slots": slot_list}
        turn["frames"].append(frame_dict)

    if speaker == "USER":  # case of speaker being USER
        slot_list, requested_slots, slot_values, active_intent = get_slots_list(
            speaker, utterance, attr_vals)

        frame_dict = {"actions": [], "service": service_domain,
                      "slots": slot_list,
                      "state": {"active_intent": active_intent,
                                "requested_slots": requested_slots,
                                "slot_values": slot_values
                               }
                     }
        turn["frames"].append(frame_dict)

        # user turns also carry an empty frame for every other domain
        turn["frames"].extend(add_other_domains())

    turn_id += 1
    line_num += 2  # skip the utterance line and its annotation line

if line_dict is not None:  # flush the last dialogue; nothing to flush for empty input
    json_list.append(line_dict)

In [69]:
# Serialise before opening the file, so a serialisation error cannot leave a
# truncated output file behind.
json_object = json.dumps(json_list, indent=2)

with open(outfile, mode="w") as sink_handle:
    sink_handle.write(json_object)