In [1]:
from operator import itemgetter
from os import path
import re

from config import CONFIG
from lib.babi import read_task

In [2]:
def count_hesitations(in_turns):
    """Count hesitation markers in the user turns of a dialogue.

    A hesitation is a pair of adjacent tokens whose first token is the leading
    word of one of the CONFIG['action_templates']['hesitate'] templates and
    whose second token is neither 'sorry' nor 'yeah'.

    Args:
        in_turns: iterable of turn dicts; every turn must provide an 'agent'
            key, and user turns a 'text' key holding a whitespace-separated
            string.

    Returns:
        A (hesitation_triggers, hesitation_turns) tuple: the total number of
        matching token pairs, and the number of user turns containing at
        least one such pair.
    """
    # Built eagerly as a set: membership is tested once per token pair, and a
    # lazy map() object (Python 3) would be exhausted by the first test.
    # Templates without any token are skipped instead of raising IndexError.
    hesitate_starts = set(
        template.split()[0]
        for template in CONFIG['action_templates']['hesitate']
        if template.split()
    )
    # A marker followed by 'yeah' is the pattern count_restarts() looks for;
    # 'sorry' is presumably excluded for a similar reason (correction marker)
    # -- TODO confirm against the templates.
    excluded_followers = ('sorry', 'yeah')

    hesitation_triggers = 0
    hesitation_turns = 0
    for turn in in_turns:
        if turn['agent'] != 'user':
            continue
        tokens = turn['text'].strip().split()
        triggers_in_turn = sum(
            1
            for token_i, token_j in zip(tokens, tokens[1:])
            if token_i in hesitate_starts and token_j not in excluded_followers
        )
        hesitation_triggers += triggers_in_turn
        if triggers_in_turn:
            hesitation_turns += 1
    return hesitation_triggers, hesitation_turns

In [3]:
def count_restarts(in_turns):
    """Count restart markers in the user turns of a dialogue.

    A restart marker is the token 'uhm' or 'sorry' immediately followed by
    'yeah', found at a position that still has at least one more token after
    the 'yeah'.

    Args:
        in_turns: iterable of turn dicts; every turn must provide an 'agent'
            key, and user turns a 'text' key holding a whitespace-separated
            string.

    Returns:
        A (restart_triggers, restart_turns) tuple: the total number of
        markers found, and the number of user turns containing at least one.
    """
    openers = ['uhm', 'sorry']
    total_markers = 0
    turns_with_marker = 0
    for turn in in_turns:
        if turn['agent'] != 'user':
            continue
        words = turn['text'].strip().split()
        # Scan three-token windows only: a marker in the final two tokens of
        # a turn is deliberately left out, as nothing follows it.
        hits = [
            position
            for position in range(len(words) - 2)
            if words[position] in openers and words[position + 1] == 'yeah'
        ]
        total_markers += len(hits)
        if hits:
            turns_with_marker += 1
    return total_markers, turns_with_marker

In [4]:
def count_corrections(in_turns, in_slot_values):
    """Count correction markers in the user turns of a dialogue.

    Each CONFIG['action_templates']['correct'] template is reduced to its
    tokens that do not start with '$'. A correction is counted for every
    three-token window (a, b, c) of a user turn where either
      * (a,) is such a reduced template and b is a slot value, or
      * (a, b) is such a reduced template and c is a slot value.

    Args:
        in_turns: iterable of turn dicts; every turn must provide an 'agent'
            key, and user turns a 'text' key holding a whitespace-separated
            string.
        in_slot_values: container of slot-value tokens supporting `in`.

    Returns:
        A (correction_triggers, correction_turns) tuple: the total number of
        matching windows, and the number of user turns containing at least
        one.
    """
    # Tuples in a set: hashable, O(1) lookup, and independent of whether
    # filter() returns a list (Python 2) or a lazy object that never compares
    # equal to a list (Python 3).
    correction_starts = set(
        tuple(token for token in phrase.split() if not token.startswith('$'))
        for phrase in CONFIG['action_templates']['correct']
    )

    correction_triggers = 0
    correction_turns = 0
    for turn in in_turns:
        if turn['agent'] != 'user':
            continue
        tokens = turn['text'].strip().split()
        triggers_in_turn = 0
        # NOTE(review): windows are three tokens wide, so a one-word marker
        # followed by a slot value in the final two tokens of a turn is never
        # examined -- confirm this is intended before changing it, since it
        # affects the reported counts.
        for token_i, token_j, token_k in zip(tokens, tokens[1:], tokens[2:]):
            one_word_marker = (token_i,) in correction_starts and token_j in in_slot_values
            two_word_marker = (token_i, token_j) in correction_starts and token_k in in_slot_values
            if one_word_marker or two_word_marker:
                triggers_in_turn += 1
        correction_triggers += triggers_in_turn
        if triggers_in_turn:
            correction_turns += 1
    return correction_triggers, correction_turns

In [8]:
def count_long_distance_corrections(in_turns, in_slot_values):
    """Count long-distance correction markers in the user turns of a dialogue.

    Each CONFIG['action_templates']['correct_long_distance'] template is
    reduced to its tokens that do not start with '$'. A correction is counted
    for every three-token window (a, b, c) of a user turn where either
      * (a,) is such a reduced template, b is one of 'with', 'for', 'in',
        'a', and c is a slot value, or
      * (a, b) is such a reduced template and c is one of 'with', 'for',
        'in', 'a'.

    Args:
        in_turns: iterable of turn dicts; every turn must provide an 'agent'
            key, and user turns a 'text' key holding a whitespace-separated
            string.
        in_slot_values: container of slot-value tokens supporting `in`.

    Returns:
        A (correction_triggers, correction_turns) tuple: the total number of
        matching windows, and the number of user turns containing at least
        one.
    """
    # Tuples in a set: hashable, O(1) lookup, and independent of whether
    # filter() returns a list (Python 2) or a lazy object that never compares
    # equal to a list (Python 3).
    correction_starts = set(
        tuple(token for token in phrase.split() if not token.startswith('$'))
        for phrase in CONFIG['action_templates']['correct_long_distance']
    )
    prepositions = set(['with', 'for', 'in', 'a'])

    correction_triggers = 0
    correction_turns = 0
    for turn in in_turns:
        if turn['agent'] != 'user':
            continue
        tokens = turn['text'].strip().split()
        triggers_in_turn = 0
        for token_i, token_j, token_k in zip(tokens, tokens[1:], tokens[2:]):
            one_word_marker = (
                (token_i,) in correction_starts
                and token_j in prepositions
                and token_k in in_slot_values
            )
            # The window ends at the preposition here, so this branch does not
            # check that a slot value follows.
            two_word_marker = (token_i, token_j) in correction_starts and token_k in prepositions
            if one_word_marker or two_word_marker:
                triggers_in_turn += 1
        correction_triggers += triggers_in_turn
        if triggers_in_turn:
            correction_turns += 1
    return correction_triggers, correction_turns

In [14]:
def collect_corpus_stats(in_corpus_root):
    """Print turn totals and hesitation/restart/correction counts for a corpus.

    Reads the four task-1 files (trn, dev, tst, tst-OOV) under
    `in_corpus_root` with read_task(), pools the turns of every dialogue,
    collects slot values from the arguments of turns whose text starts with
    'api_call', and prints the counts returned by count_hesitations(),
    count_restarts(), count_corrections() and
    count_long_distance_corrections().

    Args:
        in_corpus_root: directory containing the four
            'dialog-babi-task1-API-calls-*.txt' files.

    Returns:
        None; the statistics are written to standard output only.
    """
    task_files = (
        'dialog-babi-task1-API-calls-trn.txt',
        'dialog-babi-task1-API-calls-dev.txt',
        'dialog-babi-task1-API-calls-tst.txt',
        'dialog-babi-task1-API-calls-tst-OOV.txt',
    )
    turns_all = []
    for task_file in task_files:
        for dialogue in read_task(path.join(in_corpus_root, task_file)):
            # The turn list is the second element of each dialogue.
            turns_all += dialogue[1]

    slot_values = set()
    for turn in turns_all:
        if turn['text'].startswith('api_call'):
            # Everything after the leading 'api_call' token is a slot value.
            slot_values.update(turn['text'].split()[1:])

    result = {
        'hesitate': count_hesitations(turns_all),
        'restart': count_restarts(turns_all),
        'correct': count_corrections(turns_all, slot_values),
        'correct_long_distance': count_long_distance_corrections(turns_all, slot_values),
    }

    # Counted explicitly rather than via len(filter(...)), which only works
    # when filter() returns a list.
    user_turns_number = sum(1 for turn in turns_all if turn['agent'] == 'user')

    # print(...) with one parenthesised argument produces the same output
    # whether print is a statement or a function.
    print('Overall number of turns:\t{}'.format(len(turns_all)))
    print('Among them user\'s turns:\t{}'.format(user_turns_number))

    report = (
        ('hesitate', 'hesitations'),
        ('restart', 'restarts'),
        ('correct', 'corrections'),
        ('correct_long_distance', 'long-distance corrections'),
    )
    for key, label in report:
        triggers, turns_affected = result[key]
        # An empty corpus reports 0.00% instead of raising ZeroDivisionError.
        if user_turns_number:
            percentage = 100. * turns_affected / user_turns_number
        else:
            percentage = 0.
        print('Number of {0} triggered:\t{1}'.format(label, triggers))
        print('User turns containing {0}:\t{1} ({2:.2f}%)'.format(label, turns_affected, percentage))

bAbI+
==

In [9]:
# Print turn totals and hesitation/restart/correction counts for the corpus files under 'babi_plus'.
collect_corpus_stats('babi_plus')

Overall number of turns:	47990
Among them user's turns:	23995
Number of hesitations triggered:	1194
User turns containing hesitations:	1167 (4.86%)
Number of restarts triggered:	9579
User turns containing restarts:	9579 (39.92%)
Number of corrections triggered:	4128
User turns containing corrections:	2556 (10.65%)
Number of long-distance corrections triggered:	4014
User turns containing long-distance corrections:	2476 (10.32%)


bAbI+ 2x
==

In [10]:
# Print turn totals and hesitation/restart/correction counts for the corpus files under 'babi_plus_2x'.
collect_corpus_stats('babi_plus_2x')

Overall number of turns:	95980
Among them user's turns:	47990
Number of hesitations triggered:	2402
User turns containing hesitations:	2334 (4.86%)
Number of restarts triggered:	19254
User turns containing restarts:	19254 (40.12%)
Number of corrections triggered:	8357
User turns containing corrections:	5190 (10.81%)
Number of long-distance corrections triggered:	7961
User turns containing long-distance corrections:	4890 (10.19%)


bAbI+ 5x
==

In [11]:
# Print turn totals and hesitation/restart/correction counts for the corpus files under 'babi_plus_5x'.
collect_corpus_stats('babi_plus_5x')

Overall number of turns:	239950
Among them user's turns:	119975
Number of hesitations triggered:	6180
User turns containing hesitations:	6018 (5.02%)
Number of restarts triggered:	47894
User turns containing restarts:	47894 (39.92%)
Number of corrections triggered:	20328
User turns containing corrections:	12789 (10.66%)
Number of long-distance corrections triggered:	19810
User turns containing long-distance corrections:	12214 (10.18%)


bAbI+ 10x
==

In [12]:
# Print turn totals and hesitation/restart/correction counts for the corpus files under 'babi_plus_10x'.
collect_corpus_stats('babi_plus_10x')

Overall number of turns:	479900
Among them user's turns:	239950
Number of hesitations triggered:	11953
User turns containing hesitations:	11664 (4.86%)
Number of restarts triggered:	96129
User turns containing restarts:	96129 (40.06%)
Number of corrections triggered:	41286
User turns containing corrections:	25920 (10.80%)
Number of long-distance corrections triggered:	39608
User turns containing long-distance corrections:	24465 (10.20%)


bAbI+ 50x
==

In [15]:
# Print turn totals and hesitation/restart/correction counts for the corpus files under 'babi_plus_50x'.
collect_corpus_stats('babi_plus_50x')

Overall number of turns:	2399500
Among them user's turns:	1199750
Number of hesitations triggered:	60100
User turns containing hesitations:	58483 (4.87%)
Number of restarts triggered:	479221
User turns containing restarts:	479221 (39.94%)
Number of corrections triggered:	205220
User turns containing corrections:	128796 (10.74%)
Number of long-distance corrections triggered:	199215
User turns containing long-distance corrections:	123417 (10.29%)


bAbI+ 100x
==

In [16]:
# Print turn totals and hesitation/restart/correction counts for the corpus files under 'babi_plus_100x'.
collect_corpus_stats('babi_plus_100x')

Overall number of turns:	4799000
Among them user's turns:	2399500
Number of hesitations triggered:	119308
User turns containing hesitations:	116042 (4.84%)
Number of restarts triggered:	959610
User turns containing restarts:	959610 (39.99%)
Number of corrections triggered:	412214
User turns containing corrections:	258775 (10.78%)
Number of long-distance corrections triggered:	398451
User turns containing long-distance corrections:	246918 (10.29%)
