In [1]:
import time
import sys
import numpy
import collections
import os
import json

import dataset_walker
import ontology_reader
from baseline import BaselineMethod1, BaselineMethod2

In [2]:
DATASET = 'dstc5_dev'
DATAROOT = '../data'
TRACKFILE = 'baseline_dev.json'
ONTOLOGY_FILE = 'config/ontology_dstc5.json'
METHOD = '1'

ONTOLOGY = ontology_reader.OntologyReader(ONTOLOGY_FILE)

In [3]:
def get_dataset_info(in_dataset):
    stats = collections.defaultdict(lambda: 0)
    dataset = dataset_walker.dataset_walker(in_dataset, dataroot=DATAROOT, labels=True, translations=True)
    for call in dataset:
        stats['dialogs'] += 1
        for (utter, translations, labels) in call:
            stats['utterances'] += 1
            bio_tag = utter['segment_info']['target_bio']
            if bio_tag == 'B':
                stats['segments'] += 1
            if bio_tag != 'O':
                stats['informative_utterances'] += 1
    return stats

In [4]:
def print_ontology_info(in_ontology):
    print in_ontology
print dir (ONTOLOGY)
print ONTOLOGY.get_topics()
print ONTOLOGY.get_slots('TRANSPORTATION')
print ONTOLOGY.get_tagsets()['TRANSPORTATION']['FROM'][:10]
print ONTOLOGY.get_translated_tagsets()['TRANSPORTATION']['FROM'][:10]

['__doc__', '__init__', '__module__', 'get_pilot_tagsets', 'get_slots', 'get_tagsets', 'get_topics', 'get_translated_tagsets', 'get_translations', 'ontology', 'pilot_tagsets', 'tagsets', 'translations']
[u'FOOD', u'ATTRACTION', u'TRANSPORTATION', u'SHOPPING', u'ACCOMMODATION']
[u'INFO', u'FROM', u'TO', u'STATION', u'LINE', u'TYPE', u'TICKET']
[u'1-Altitude Gallery & Bar', u'112 Katong', u'2am Dessert Bar', u'313@Somerset', u'5footway.inn Project Ann Siang', u'5footway.inn Project Boat Quay', u'5footway.inn Project Bugis', u'5footway.inn Project Chinatown', u'5footway.inn Project Chinatown 2', u'7-Eleven']
[{'entry_en': u'1-Altitude Gallery & Bar', 'translated_cn': [u'1-altitude\u5eca\u53ca\u9152\u5427', u'1-altitude\u5eca\u53ca\u9152\u5427', u'1-altitude\u53ca\u9152\u5427', u'1-altitude\u5eca\u53ca\u5f8b\u5e08', u'1-altitude\u5eca\u53ca', u'1-altitude\u53ca\u5f8b\u5e08']}, {'entry_en': u'112 Katong', 'translated_cn': [u'\u7b2c112\u52a0\u4e1c', u'\u7b2c112\u52a0\u4e1c', u'\u7b2c112\u52a

Train Set Info
==

In [5]:
for key, value in get_dataset_info('dstc5_train').iteritems():
    print '{}:\t{}'.format(key, value)

segments:	4296
informative_utterances:	25338
dialogs:	35
utterances:	31034


Dev Set Info
==

In [29]:
for key, value in get_dataset_info('dstc5_dev').iteritems():
    print '{}:\t{}'.format(key, value)

segments:	253
informative_utterances:	2189
dialogs:	2
utterances:	3130


In [30]:
print_ontology_info(ONTOLOGY)

<ontology_reader.OntologyReader instance at 0x1127dc830>


DSTC4 - Train Set Info
==

In [31]:
for key, value in get_dataset_info('dstc4_train').iteritems():
    print '{}:\t{}'.format(key, value)

segments:	1747
informative_utterances:	9974
dialogs:	14
utterances:	12759


DSTC4 - Dev Set Info
==

In [32]:
for key, value in get_dataset_info('dstc4_dev').iteritems():
    print '{}:\t{}'.format(key, value)

segments:	632
informative_utterances:	4139
dialogs:	6
utterances:	4812


DSTC4 - Test Set Info
==

In [33]:
for key, value in get_dataset_info('dstc4_test').iteritems():
    print '{}:\t{}'.format(key, value)

segments:	1147
informative_utterances:	6528
dialogs:	9
utterances:	7848


In [35]:
dir(ONTOLOGY)

['__doc__',
 '__init__',
 '__module__',
 'get_pilot_tagsets',
 'get_slots',
 'get_tagsets',
 'get_topics',
 'get_translated_tagsets',
 'get_translations',
 'ontology',
 'pilot_tagsets',
 'tagsets',
 'translations']

In [9]:
# print ONTOLOGY.get_topics()
for topic in ONTOLOGY.get_topics():
    print topic, '::', ONTOLOGY.get_slots(topic)

FOOD :: [u'INFO', u'CUISINE', u'TYPE_OF_PLACE', u'DRINK', u'PLACE', u'MEAL_TIME', u'DISH', u'NEIGHBOURHOOD']
ATTRACTION :: [u'INFO', u'TYPE_OF_PLACE', u'ACTIVITY', u'PLACE', u'TIME', u'NEIGHBOURHOOD']
TRANSPORTATION :: [u'INFO', u'FROM', u'TO', u'STATION', u'LINE', u'TYPE', u'TICKET']
SHOPPING :: [u'INFO', u'TYPE_OF_PLACE', u'PLACE', u'NEIGHBOURHOOD', u'TIME']
ACCOMMODATION :: [u'INFO', u'TYPE_OF_PLACE', u'PLACE', u'NEIGHBOURHOOD']


Exploring dialog states
==

In [9]:
slot_value_map = collections.defaultdict(lambda: collections.defaultdict(lambda: 0))
for root, dirs, files in os.walk(DATAROOT):
    for filename in files:
        if not filename == 'label.json':
            continue
        content = json.load(open(os.path.join(root, filename)))
        for utterance in content['utterances']:
            for slot, values in utterance.get('frame_label', {}).items():
                slot_value_map[slot][len(values)] += 1

for slot, values_dist in slot_value_map.items():
    values_mean = sum([key * value for key, value in values_dist.items()]) / float(sum(values_dist.values()))
    print slot, values_mean

INFO 1.11442457698
CUISINE 1.24077046549
FROM 1.01831210191
TYPE_OF_PLACE 1.03892535491
DRINK 1.09386281588
TIME 1.2495697074
TO 1.03622641509
STATION 1.01346153846
PLACE 1.09922535211
MEAL_TIME 1.03020134228
ACTIVITY 1.07260963336
DISH 1.15974625144
TICKET 1.0037037037
TYPE 1.05574280529
LINE 1.08549618321
NEIGHBOURHOOD 1.03237109962


In [20]:
for topic, slots in ONTOLOGY.tagsets.items():
    print topic, slots.keys()

FOOD [u'INFO', u'CUISINE', u'TYPE_OF_PLACE', u'DRINK', u'PLACE', u'MEAL_TIME', u'DISH', u'NEIGHBOURHOOD']
ATTRACTION [u'INFO', u'TYPE_OF_PLACE', u'ACTIVITY', u'PLACE', u'TIME', u'NEIGHBOURHOOD']
TRANSPORTATION [u'INFO', u'FROM', u'TO', u'STATION', u'LINE', u'TYPE', u'TICKET']
SHOPPING [u'INFO', u'TYPE_OF_PLACE', u'PLACE', u'NEIGHBOURHOOD', u'TIME']
ACCOMMODATION [u'INFO', u'TYPE_OF_PLACE', u'PLACE', u'NEIGHBOURHOOD']


In [3]:
print ONTOLOGY.get_slots('OPENING')

None
