In [1]:
import sys
import os
import json

In [2]:
def load_json(data_file):
    if os.path.isfile(data_file):
        with open(data_file, 'r') as read_file:
            data = json.load(read_file)
            return data

def load_list_file(list_file):
    with open(list_file, 'r') as read_file:
        dialog_id_list = read_file.readlines()
        dialog_id_list = [l.strip('\n') for l in dialog_id_list]
        return dialog_id_list
    return

In [3]:
dialog_data_file = './MultiWOZ/data.json'
dialog_data = load_json(dialog_data_file)
dialog_id_list = list(set(dialog_data.keys())) # Bug: v1.0 contains duplicate id in the valid data
print('# of dialogs:', len(dialog_data))
# print(dialog_data['PMUL4641.json']) # print a sample dialog

# of dialogs: 9855


In [4]:
valid_list_file = './MultiWOZ/valListFile.json'
test_list_file = './MultiWOZ/testListFile.json'

valid_id_list = list(set(load_list_file(valid_list_file))) # Bug: v1.0 contains duplicate id in the valid data
test_id_list = load_list_file(test_list_file)
train_id_list = [did for did in dialog_id_list if did not in (valid_id_list + test_id_list)]
print('# of train dialogs:', len(train_id_list))
print('# of valid dialogs:', len(valid_id_list))
print('# of test dialogs :', len(test_id_list))
assert(len(dialog_id_list) == len(train_id_list) + len(valid_id_list) + len(test_id_list))

# of train dialogs: 7953
# of valid dialogs: 902
# of test dialogs : 1000


In [5]:
train_data = [v for k, v in dialog_data.items() if k in train_id_list]
valid_data = [v for k, v in dialog_data.items() if k in valid_id_list]
test_data = [v for k, v in dialog_data.items() if k in test_id_list]
assert(len(train_data) == len(train_id_list))
assert(len(valid_data) == len(valid_id_list))
assert(len(test_data) == len(test_id_list))

In [6]:
train_data[0]

{'goal': {'attraction': {},
  'eod': True,
  'hospital': {},
  'hotel': {'book': {'day': 'tuesday',
    'invalid': False,
    'people': '6',
    'pre_invalid': True,
    'stay': '2'},
   'fail_book': {'stay': '3'},
   'fail_info': {},
   'info': {'internet': 'yes',
    'parking': 'yes',
    'pricerange': 'cheap',
    'type': 'hotel'}},
  'message': ["You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>cheap</span> price range and should be in the type of <span class='emphasis'>hotel</span>",
   "The hotel should <span class='emphasis'>include free parking</span> and should <span class='emphasis'>include free wifi</span>",
   "Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>6 people</span> and <span class='emphasis'>3 nights</span> starting from <span class='emphasis'>tuesday</span>",
   "If the booking fails how about <span class='emphasis'>2 nights</span>",
   "Make su

In [7]:
def get_dst_diff(prev_d, crnt_d):
    assert len(prev_d) == len(crnt_d)
    diff = {}
    for ((k1, v1), (k2, v2)) in zip(prev_d.items(), crnt_d.items()):
        assert k1 == k2
        if v1 != v2: # updated
            diff[k2] = v2
    return diff

def analyze_dialog(d, print_dialog=True):
    assert 'log' in d
    assert 'goal' in d
    domains = []
    ignore_keys_in_goal = ['eod', 'messageLen', 'message'] # eod (probably) means the user archieved the goal. 
    for dom_k, dom_v  in d['goal'].items():
        if dom_v and dom_k not in ignore_keys_in_goal: # check whether contains some goal entities
            domains.append(dom_k)
    print('{} domain(s): {}'.format(len(domains), domains))
    
    if print_dialog:
        prev_d = None
        for i, t in enumerate(d['log']):
            spk = 'Usr' if i % 2 == 0 else 'Sys' # Turn 0 is always a user's turn in this corpus.
            if spk == 'Sys':
                if prev_d is None:
                    prev_d = t['metadata']
                else:
                    crnt_d = t['metadata']
                    dst_diff = get_dst_diff(prev_d, crnt_d)
                    print('Updated DST:', dst_diff)
                    prev_d = crnt_d
            u = t['text']
            print('{}: {}'.format(spk, u))

for d in train_data[:3]:
    print('-' * 50)
    analyze_dialog(d, True)

--------------------------------------------------
2 domain(s): ['hotel', 'topic']
Usr: am looking for a place to to stay that has cheap price range it should be in a type of hotel
Sys: Okay, do you have a specific area you want to stay in?
Usr: no, i just need to make sure it's cheap. oh, and i need parking
Updated DST: {'hotel': {'book': {'booked': [], 'stay': '', 'day': '', 'people': ''}, 'semi': {'name': 'not mentioned', 'area': 'not mentioned', 'parking': 'yes', 'pricerange': 'cheap', 'stars': 'not mentioned', 'internet': 'not mentioned', 'type': 'hotel'}}}
Sys: I found 1 cheap hotel for you that includes parking. Do you like me to book it?
Usr: Yes, please. 6 people 3 nights starting on tuesday.
Updated DST: {'hotel': {'book': {'booked': [], 'stay': '3', 'day': 'tuesday', 'people': '6'}, 'semi': {'name': 'not mentioned', 'area': 'not mentioned', 'parking': 'yes', 'pricerange': 'cheap', 'stars': 'not mentioned', 'internet': 'not mentioned', 'type': 'hotel'}}}
Sys: I am sorry but I

## DB

In [8]:
hotel_db_list = load_json('./MultiWOZ/hotel_db.json')
train_db_list = load_json('./MultiWOZ/train_db.json')
attractin_db_list = load_json('./MultiWOZ/attraction_db.json')
restaurant_db_list = load_json('./MultiWOZ/restaurant_db.json')
print(hotel_db_list[0]) # print a sample entity

{'address': '124 tenison road', 'area': 'east', 'internet': 'yes', 'parking': 'no', 'id': '0', 'location': [52.1963733, 0.1987426], 'name': 'a and b guest house', 'phone': '01223315702', 'postcode': 'cb12dp', 'price': {'double': '70', 'family': '90', 'single': '50'}, 'pricerange': 'moderate', 'stars': '4', 'takesbookings': 'yes', 'type': 'guesthouse'}
