In [39]:
import re
from codecs import getreader
from operator import itemgetter


def read_task(in_file_name):
    """Parse a dialog task file into a list of dialogs.

    The file is expected to be UTF-8 text in which dialogs are separated by
    an empty line and every non-blank line has the form
    ``[<turn number><whitespace>]<user utterance>\\t<system utterance>``.

    Args:
        in_file_name: path of the task file to read.

    Returns:
        A list with one entry per dialog; each entry is a list of
        ``{'user': ..., 'system': ...}`` dicts in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab.
    """
    # Open in binary mode so the codec reader is the only layer that decodes
    # the bytes (a text-mode handle would hand it already-decoded text on
    # interpreters where open() decodes by itself).
    with getreader('utf-8')(open(in_file_name, 'rb')) as task_in:
        task_content = task_in.read()
    # Binary mode performs no newline translation, so normalise CRLF here;
    # otherwise '\n\n' never matches and turns keep a trailing '\r'.
    task_content = task_content.replace('\r\n', '\n')

    result = []
    for dialog in task_content.split('\n\n'):
        # Drop blank / whitespace-only lines inside the dialog chunk.
        lines = [line for line in dialog.split('\n') if line.strip()]
        if not lines:
            # Consecutive separators or leading/trailing blank space.
            continue
        turns = []
        for line in lines:
            # Strip the leading turn counter ("12 ") if present.
            line = re.sub(r'^\d+\s', '', line)
            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    'expected "<user>\\t<system>" in {}, got: {!r}'.format(
                        in_file_name, line
                    )
                )
            user_turn, system_turn = fields
            turns.append({'user': user_turn, 'system': system_turn})
        result.append(turns)
    return result


def _mean(values):
    """Return the arithmetic mean of `values` as a float (NaN when empty)."""
    values = list(values)
    if not values:
        return float('nan')
    return sum(values) / float(len(values))


def print_dataset_stats(in_train, in_dev, in_test):
    """Print size and length statistics for a train/dev/test split.

    Args:
        in_train, in_dev, in_test: lists of dialogs, where each dialog is a
            list of dicts with 'user' and 'system' string entries.

    Prints the number of dialogs per split, then (over all three splits
    together) the mean number of turns per dialog and the mean number of
    whitespace-separated words per system and per user turn. A mean over an
    empty collection is reported as ``nan`` rather than raising
    ZeroDivisionError.
    """
    # Single-argument parenthesised print emits the same text whether print
    # is a statement or a function.
    print('#Dialogs:    {} (train), {} (dev), {} (test)'.format(
        len(in_train),
        len(in_dev),
        len(in_test)
    ))
    joint_dataset = in_train + in_dev + in_test
    print('Mean #turns:    {0:.3f}'.format(
        _mean(len(dialog) for dialog in joint_dataset)
    ))
    # Flatten in a single pass instead of repeated list concatenation.
    system_turns = [
        turn['system'] for dialog in joint_dataset for turn in dialog
    ]
    user_turns = [
        turn['user'] for dialog in joint_dataset for turn in dialog
    ]
    print('Mean system turn length (words):    {0:.3f}'.format(
        _mean(len(line.split()) for line in system_turns)
    ))
    print('Mean user turn length (words):    {0:.3f}'.format(
        _mean(len(line.split()) for line in user_turns)
    ))

In [40]:
# Task 1: train / dev / test / out-of-vocabulary test splits, read in that order.
task1_train, task1_dev, task1_test, task1_oov_test = map(
    read_task,
    (
        'dialog-babi-task1-API-calls-trn.txt',
        'dialog-babi-task1-API-calls-dev.txt',
        'dialog-babi-task1-API-calls-tst.txt',
        'dialog-babi-task1-API-calls-tst-OOV.txt',
    )
)

# Task 2: same four splits.
task2_train, task2_dev, task2_test, task2_oov_test = map(
    read_task,
    (
        'dialog-babi-task2-API-refine-trn.txt',
        'dialog-babi-task2-API-refine-dev.txt',
        'dialog-babi-task2-API-refine-tst.txt',
        'dialog-babi-task2-API-refine-tst-OOV.txt',
    )
)

In [41]:
# The regular and OOV test sets are reported together as one "test" split.
print('Task 1 stats:')
print_dataset_stats(
    task1_train,
    task1_dev,
    task1_test + task1_oov_test
)
# Visual gap between the two reports.
print('\n')

print('Task 2 stats:')
print_dataset_stats(
    task2_train,
    task2_dev,
    task2_test + task2_oov_test
)

Task 1 stats:
#Dialogs:    1000 (train), 1000 (dev), 2000 (test)
Mean #turns:    5.999
Mean system turn length (words):    6.250
Mean user turn length (words):    3.804


Task 2 stats:
#Dialogs:    1000 (train), 1000 (dev), 2000 (test)
Mean #turns:    9.484
Mean system turn length (words):    6.156
Mean user turn length (words):    4.501
