In [100]:
import json
import re
import os

In [28]:
# Log files to parse.
LOGS = [
    'metalwoz_pure_ext_info.log',
    'metalwoz_cross_ext_info.log',
    'multiwoz_ext_info.log',
]
# Test-spec files, index-aligned with LOGS: the i-th spec supplies the
# dialogue ids used to parse the i-th log (the two lists are zipped together).
TESTSPECS = [
    'datasets/dstc8-fast-adaptation-evaluation/test_spec_metalwoz_heldout_pure_task.jsonl',
    'datasets/dstc8-fast-adaptation-evaluation/test_spec_metalwoz_heldout_cross_task.jsonl',
    'datasets/dstc8-fast-adaptation-evaluation/test_spec_multiwoz2.0.jsonl',
]

In [90]:
def parse_utterance(in_line):
    """Split a log line into ``(side, utterance)``.

    Only the text before the first ``'INFO'`` is considered. That text is cut
    at the first ``':'``: the last space-separated token to the left of the
    colon is the side, and the stripped text to the right is the utterance.

    Raises:
        IndexError: if there is no non-empty token to the left of the colon.
    """
    payload = in_line.split('INFO', 1)[0]
    head, _, tail = payload.partition(':')
    # Split on single spaces only (not arbitrary whitespace) and drop the
    # empty strings produced by runs of spaces.
    words = list(filter(None, head.split(' ')))
    speaker = words[-1]
    return speaker, tail.strip()

In [87]:
def parse_candidate(in_line):
    """Parse a candidate log line into ``(utterance, score, method)``.

    The text returned by ``parse_utterance`` is expected to look like
    ``<utterance> [<score>] [<method>]``.

    Returns:
        A 3-tuple ``(utterance, score, method)``:
        * ``utterance`` - stripped text before the first ``'['``;
        * ``score`` - the first bracketed value as a float, or ``None`` when
          it is missing or not a number;
        * ``method`` - content of the second bracket pair, or ``None`` when it
          is missing.

        ``None`` for score/method marks the field as still unknown, so that a
        caller can fill it in from a following continuation line.
    """
    _, utterance_and_info = parse_utterance(in_line)
    utterance, _, info = utterance_and_info.partition('[')
    utterance = utterance.strip()
    score_text, _, method_part = info.partition(']')
    try:
        score = float(score_text)
    except ValueError:
        # No (or malformed) score: the line was most likely cut before the
        # bracketed info, so neither field can be trusted.
        return utterance, None, None
    _, _, method = method_part.partition('[')
    method = method.strip(']')
    # An empty method means the tag is absent from this line; report it as
    # None rather than '' so it is not mistaken for a parsed value.
    return utterance, score, (method or None)

In [64]:
def get_dialogue_ids_from_testspec(in_testspec_file):
    """Collect the ``target_dlg`` ids from a JSON-lines test-spec file.

    Args:
        in_testspec_file: path to a file holding one JSON object per line,
            each with a ``'target_dlg'`` key.

    Returns:
        A set with the ``target_dlg`` value of every record.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``'target_dlg'`` key.
    """
    with open(in_testspec_file) as spec_in:
        # Blank lines (e.g. a trailing empty line) are skipped instead of
        # being handed to json.loads, which would raise on them.
        return {json.loads(line)['target_dlg'] for line in spec_in if line.strip()}

In [95]:
def parse_log(in_lines, in_dialogue_ids):
    """Parse prediction-log lines into a list of per-dialogue records.

    Args:
        in_lines: iterable of log lines (e.g. an open file).
        in_dialogue_ids: collection of dialogue-id strings; a line containing
            one of them starts a new record.

    Returns:
        A list of dicts with keys ``'id'``, ``'input'`` (list of
        ``{'side', 'utterance'}``), ``'target'`` (str) and ``'candidates'``
        (list of ``{'utterance', 'score', 'method'}``), in log order.

    Lines are classified by the first keyword they contain: ``INPUT``,
    ``TARGET``, ``CANDIDATE``. Any other line carrying a bracketed score or
    method tag is treated as the continuation of the most recent candidate:
    its leading text is appended to that candidate's utterance and missing
    score/method values are filled in. Lines seen before the first dialogue id
    cannot be attributed to a record and are ignored.
    """
    # Raw strings keep the backslashes literal; the decimal point is escaped
    # so only real numbers such as "[0.614]" or "[-2]" are taken as scores.
    method_pattern = re.compile(r'\[(retrieved|generated)\]')
    score_pattern = re.compile(r'\[(-?\d+(?:\.\d+)?)\]')

    predictions = []
    for line in in_lines:
        # NOTE(review): the reason for dropping 'tri[' is not visible in this
        # file; presumably it removes a log artifact that would otherwise
        # confuse the bracket parsing below - confirm against the log format.
        line = line.replace('tri[', '')
        for dialogue_id in in_dialogue_ids:
            if dialogue_id in line:
                predictions.append({'id': dialogue_id, 'input': [], 'target': '', 'candidates': []})
        # NOTE(review): a line that starts a record still falls through to the
        # keyword checks below. The original code had a `continue` here that
        # only affected the inner loop, so this fall-through is the behaviour
        # that has always been in effect and is kept as is.
        if not predictions:
            # Nothing to attach this line to yet.
            continue
        current = predictions[-1]

        if 'INPUT' in line:
            side, utterance = parse_utterance(line)
            current['input'].append({'side': side, 'utterance': utterance})
            continue
        if 'TARGET' in line:
            _, utterance = parse_utterance(line)
            current['target'] = utterance
            continue
        if 'CANDIDATE' in line:
            utterance, score, method = parse_candidate(line)
            current['candidates'].append({'utterance': utterance,
                                          'score': score,
                                          'method': method})
            continue

        # Possible continuation of a candidate that was broken across lines.
        method_matches = method_pattern.findall(line)
        score_matches = score_pattern.findall(line)
        if not current['candidates'] or not (method_matches or score_matches):
            continue
        last_candidate = current['candidates'][-1]
        rest_of_utterance = line.partition('[')[0].strip()
        if rest_of_utterance:
            print('fixed broken utterance: {} | {}'.format(last_candidate['utterance'], rest_of_utterance))
            last_candidate['utterance'] = last_candidate['utterance'].strip() + ' ' + rest_of_utterance
        # Only fill in values the CANDIDATE line itself could not provide.
        if method_matches and last_candidate['method'] is None:
            last_candidate['method'] = method_matches[0]
        if score_matches and last_candidate['score'] is None:
            last_candidate['score'] = float(score_matches[0])
    return predictions

In [96]:
# Parse each log against the dialogue ids listed in its matching test spec;
# results are keyed by the log file name.
parsed_logs = {}
for log_path, spec_path in zip(LOGS, TESTSPECS):
    with open(log_path) as log_file:
        parsed_logs[log_path] = parse_log(log_file, get_dialogue_ids_from_testspec(spec_path))

fixed broken utterance: I am going to be in Toronto next week. I need 1 room with 2 beds. It also needs to be on the 2nd floor in a | downtown hotel.
fixed broken utterance: I'm going to be visiting there soon, but I'm not too interested in the beaches or food attractions. What else is | good to see?
fixed broken utterance: Oh that's a very popular place that I have seen pictures of. I'm interested in participating in on a group tour. | Are there any available for the Colosseum?
fixed broken utterance: I'm going to be visiting there soon, but I'm not too interested in the beaches or food attractions. What else is | good to see?
fixed broken utterance: I'm not really sure, we've already had pre-drinks and I forget where I am. I think I might be in Florida or | something.
fixed broken utterance: I'm wanting to find some festivals going on on Montreal, Quebec, Canada during the summer. Something where a big | party is bound to happen.
fixed broken utterance: I am going on vacation there i

fixed broken utterance: I 'd like to go to a museum. Can you tell me your favorite? And if you have their phone number that would be | really great.
fixed broken utterance: The amount of stars don't matter as long as it is in the same price range Does Worth House in the North include | free parking?
fixed broken utterance: I think I would like it near the centre. Does it have free parking? I would like the price range to be cheap but | at least a 3
fixed broken utterance: Just gathering information about the hotel first. Could you tell me what area of the city they're in, the price, | and if they have internet connectivity
fixed broken utterance: Money is no object, but I 'd like it to be a 4 star location please. And I 'd only like to stay on the north | side of
fixed broken utterance: Are you sure, but if not, book the Super 5 for 8 people for 5 nights starting Monday. I will need the reference | number.
fixed broken utterance: I don't really have a price range. Anything is fine. I w

In [97]:
# Display the second parsed record of 'metalwoz_pure_ext_info.log' as a spot check.
parsed_logs['metalwoz_pure_ext_info.log'][1]

{'id': '6e759078',
 'input': [{'side': 'Wizard', 'utterance': 'Hello how may I help you?'},
  {'side': 'User', 'utterance': 'I need help on a reservation'},
  {'side': 'Wizard', 'utterance': 'What is it about >'}],
 'target': 'I have a reservation at a hotel in NYC that I want to cancel',
 'candidates': [{'utterance': 'I already have reservations at a hotel in New York City. I need to get them canceled.',
   'score': 1.128,
   'method': 'retrieved'},
  {'utterance': 'I need to cancel the reservation',
   'score': 0.614,
   'method': 'generated'}]}

In [98]:
# Sanity check: after parsing, every candidate must carry both a score and a
# method; the offending candidate is shown in the assertion message.
for dataset_predictions in parsed_logs.values():
    all_candidates = (cand for pred in dataset_predictions for cand in pred['candidates'])
    for cand in all_candidates:
        assert not (cand['score'] is None or cand['method'] is None), cand

In [103]:
# Write each parsed log next to its source as '<log name without extension>_parsed.json'.
for log_name, log_predictions in parsed_logs.items():
    output_path = os.path.splitext(log_name)[0] + '_parsed.json'
    with open(output_path, 'w') as parsed_out:
        json.dump(log_predictions, parsed_out)