In [6]:
import json
# Path of the input file; it is parsed with json.load in the loading cell.
mwoz_file = "dialogues_016.json"
# Path the converted dialogues are written to as JSON.
outfile = "bert2_dialogue_016.json"
# Only dialogues that list this service, and only frames whose "service"
# equals it, are converted.
service_domain = "restaurant"

In [7]:
# Parse the input dialogues. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding (which is not UTF-8
# on every system and would garble or reject non-ASCII utterances).
with open(mwoz_file, encoding="utf-8") as fp:
    mwoz_dialogues_list = json.load(fp)

In [8]:
def get_first_dialog(turn_list, service_domain):
    """Build the first Bert/DST turn entry for ``service_domain``.

    Walks ``turn_list`` in order and stops at the first frame whose
    ``"service"`` equals ``service_domain`` and that either has a non-empty
    ``"slots"`` list or an ``"active_intent"`` other than ``"NONE"``.

    Args:
        turn_list: Sequence of turn dicts. Each turn is read for its
            ``"utterance"`` and ``"frames"`` keys; each frame for
            ``"service"``, ``"slots"`` and ``"state"`` (with
            ``"active_intent"``, ``"requested_slots"`` and ``"slot_values"``).
        service_domain: Service name to look for (e.g. ``"restaurant"``).

    Returns:
        A 4-tuple ``(bert_dialog_dict, next_turn_ind, accum_slot_name_dict,
        accum_slot_act_dict)``:

        * ``bert_dialog_dict`` - the turn entry (``turn_idx`` 0, empty
          ``system_transcript``), or ``{}`` when nothing matched.
        * ``next_turn_ind`` - position in ``turn_list`` right after the
          matched turn, or ``-1`` when nothing matched.
        * ``accum_slot_name_dict`` - slot name -> first listed value.
        * ``accum_slot_act_dict`` - slot name -> active intent it was seen with.
    """
    accum_slot_name_dict = {}  # slot -> value seen so far in this dialogue
    accum_slot_act_dict = {}   # slot -> active intent seen with that slot
    belief_state_list = []
    turn_label_list = []

    # enumerate() yields the position inside the list that was actually
    # passed in, so the result does not depend on any notebook-level variable.
    for turn_pos, turn in enumerate(turn_list):
        for dom in turn["frames"]:
            if dom["service"] != service_domain:
                continue
            # Skip frames of the right service that carry no information yet.
            if len(dom["slots"]) == 0 and dom["state"]["active_intent"] == "NONE":
                continue

            state = dom["state"]
            act = state["active_intent"]
            for key, val in state["slot_values"].items():
                # Only the first listed value of each slot is kept.
                belief_state_list.append({"slots": [[key, val[0]]], "act": act})
                turn_label_list.append([key, val[0]])
                accum_slot_name_dict[key] = val[0]
                accum_slot_act_dict[key] = act

            bert_dialog_dict = {
                "system_transcript": "",
                "turn_idx": 0,
                "belief_state": belief_state_list,
                "turn_label": turn_label_list,
                "transcript": turn["utterance"],
                # The requested slots are what gets stored as "system_acts".
                "system_acts": state["requested_slots"],
                "domain": service_domain,
            }
            return bert_dialog_dict, turn_pos + 1, accum_slot_name_dict, accum_slot_act_dict

    # No frame matched: hand back the "not found" sentinel values.
    return {}, -1, accum_slot_name_dict, accum_slot_act_dict

In [9]:
# Convert every dialogue that involves `service_domain` into the Bert/DST
# layout: one dict per dialogue with a "dialogue" list of turn entries.
total_dialogs = 0
restaurant_dialogs = 0
json_list = []
for mwoz_dialogue in mwoz_dialogues_list:
    total_dialogs += 1
    if service_domain not in mwoz_dialogue["services"]:
        continue

    restaurant_dialogs += 1
    turns = mwoz_dialogue["turns"]

    # get the first relevant dialog for the service domain
    bert_dialog_dict, next_turn_ind, accum_slot_name_dict, accum_slot_act_dict = \
        get_first_dialog(turns, service_domain)

    if next_turn_ind < 0:
        # No turn carries any state for the domain. Continuing would emit an
        # empty first turn and pair turns[-1] with turns[0], so skip instead.
        print("Skipping dialogue without", service_domain, "state:",
              mwoz_dialogue["dialogue_id"])
        continue

    bert_dict = {
        "dialogue_idx": mwoz_dialogue["dialogue_id"],
        "domains": [service_domain],
        "dialogue": [bert_dialog_dict],
    }
    turn_idx = 1

    # From next_turn_ind on, turns are consumed in (SYSTEM, USER) pairs; the
    # slots and values are taken from the USER turn of each pair.
    turn_len = len(turns)
    while next_turn_ind <= turn_len - 2:
        utter_sys = turns[next_turn_ind]       # represents SYSTEM
        utter_user = turns[next_turn_ind + 1]  # represents USER

        belief_state_list = []
        turn_label_list = []
        # Reset for every pair so a user turn without a frame for the domain
        # can neither hit an undefined name nor reuse an older turn's value.
        req_slot = []
        for dom in utter_user["frames"]:
            # consider only service = "restaurant" (i.e. service_domain)
            if dom["service"] != service_domain:
                continue
            state = dom["state"]
            act = state["active_intent"]
            req_slot = state["requested_slots"]
            for key, val in state["slot_values"].items():
                # Only the first listed value of each slot is kept.
                belief_state_list.append({"slots": [[key, val[0]]], "act": act})
                turn_label_list.append([key, val[0]])
                accum_slot_name_dict[key] = val[0]
                accum_slot_act_dict[key] = act

        if belief_state_list:  # only if belief state has data
            bert_dict["dialogue"].append({
                "system_transcript": utter_sys["utterance"],
                "turn_idx": turn_idx,
                "belief_state": belief_state_list,
                "turn_label": turn_label_list,
                "transcript": utter_user["utterance"],
                "system_acts": req_slot,
                "domain": service_domain,
            })
            turn_idx += 1

        next_turn_ind += 2

    # Handle the last system utterance. If the pairing consumed every turn
    # there is no trailing system turn; use an empty transcript rather than
    # indexing past the end of the list.
    if next_turn_ind < turn_len:
        system_transcript = turns[next_turn_ind]["utterance"]
    else:
        system_transcript = ""

    # dump all the accumulated slots and values into the belief state of the last turn
    belief_state_list = []
    turn_label_list = []
    for s, v in accum_slot_name_dict.items():
        belief_state_list.append({"slots": [[s, v]], "act": accum_slot_act_dict[s]})
        turn_label_list.append([s, v])

    bert_dict["dialogue"].append({
        "system_transcript": system_transcript,
        "turn_idx": turn_idx,
        "belief_state": belief_state_list,
        "turn_label": turn_label_list,
        "transcript": "",
        "system_acts": [],
        "domain": service_domain,
    })

    json_list.append(bert_dict)

print("Total dialogs:", total_dialogs, "Restaurant dialogs:", restaurant_dialogs)

Total dialogs: 512 Restaurant dialogs: 234


In [10]:
# Render the converted dialogues as JSON text indented by two spaces.
json_object = json.dumps(json_list, indent=2)

# Persist the rendered text, replacing any previous content of `outfile`.
with open(outfile, mode="w") as out_handle:
    out_handle.write(json_object)