In [12]:
import re
from collections import defaultdict
import string
import os

import numpy as np
import nltk

from swda import swda
from swda_utils import (extract_nonspeech,
                        filter_nonspeech,
                        extract_nonsentence,
                        filter_nonsentence,
                        extract_restarts_with_repair_and_nonsentence,
                        filter_restarts_with_repair_and_nonsentence,
                        extract_restarts_with_repair,
                        filter_restarts_with_repair,
                        extract_restarts_without_repair,
                        filter_restarts_without_repair)

In [13]:
reader = swda.CorpusReader('swda/swda')

In [14]:
print extract_nonsentence('Actually, {F uh, }')
print filter_nonsentence('Actually, {F uh, }')

[('{F...}', ('{F uh, }', 'uh, '))]
('Actually, uh, ', 'Actually, ')


In [15]:
print extract_restarts_with_repair_and_nonsentence('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')
print filter_restarts_with_repair_and_nonsentence('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')

[('[RM + {} RR]', ('[ I, + {F uh, } I]', ' I,', '{F uh, }', 'F', 'uh, ', 'I'))]
('Actually, I, uh, I guess I am [I, + I], {F uh, }', 'Actually, I guess I am [I, + I], {F uh, }')


In [16]:
print extract_restarts_with_repair('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')
print filter_restarts_with_repair('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')

[('[RM + RR]', ('[I, + I]', 'I,', 'I'))]
('Actually, [ I, + {F uh, } I] guess I am I, I, {F uh, }', 'Actually, [ I, + {F uh, } I] guess I am I, {F uh, }')


In [17]:
print extract_restarts_without_repair('Actually, [ I + ] guess I am [I, + I], {F uh, }')
print filter_restarts_without_repair('Actually, [ I + ] guess I am [I, + I], {F uh, }')

[('[RM +]', ('[ I + ]', ' I'))]
('Actually, I guess I am [I, + I], {F uh, }', 'Actually, guess I am [I, + I], {F uh, }')


In [18]:
pipeline = [
    (extract_restarts_with_repair_and_nonsentence, filter_restarts_with_repair_and_nonsentence),
    (extract_restarts_with_repair, filter_restarts_with_repair),
    (extract_restarts_without_repair, filter_restarts_without_repair),
    (extract_nonsentence, filter_nonsentence),
]
disfluency_stats = defaultdict(lambda: 0)
parallel_corpus = []
for utt in reader.iter_utterances(display_progress=False):
    local_disfluencies = defaultdict(lambda: 0)
    utt_filtered = filter_nonspeech(utt.text)
    utt_original, utt_clean = utt_filtered, utt_filtered
    for extract_step, filter_step in pipeline:
        disfluencies = extract_step(utt_original)
        if not len(disfluencies):
            continue
        utt_original, utt_clean = filter_step(utt_original)[0], filter_step(utt_clean)[1]
        if not len(re.findall('\w+', utt_clean)):
            break
        for disfluency_type, disfluency in disfluencies:
            local_disfluencies[disfluency_type] += 1
    if local_disfluencies and len(re.findall('\w+', utt_clean)):
        for disfluency_type, count in local_disfluencies.iteritems():
            disfluency_stats[disfluency_type] += count
        parallel_corpus.append((utt_original, utt_clean))

In [19]:
# 30% of utternaces in the final corpus will be fluent
fluent_corpus_size = int(0.3 * len(parallel_corpus) / 0.7)
parallel_corpus_fluent = []
for utt in reader.iter_utterances(display_progress=False):
    utt_filtered = filter_nonspeech(utt.text)
    utt_original, utt_clean = utt_filtered, utt_filtered
    fluent = True
    for extract_step, filter_step in pipeline:
        disfluencies = extract_step(utt_original)
        if not len(disfluencies):
            fluent = False
            break
    if fluent and len(re.findall('\w+', utt_clean)):
        parallel_corpus_fluent.append((utt_clean, utt_clean))
    if fluent_corpus_size == len(parallel_corpus_fluent):
        break

Corpus stats
==

In [20]:
final_corpus = [(nltk.word_tokenize(utt_from.translate(None, string.punctuation)), nltk.word_tokenize(utt_to.translate(None, string.punctuation)))
                for utt_from, utt_to in parallel_corpus + parallel_corpus_fluent]

In [21]:
print 'Total number of utterances with disfluencies: {}'.format(len(parallel_corpus))
print 'Total number of utterances without disfluencies: {}'.format(len(parallel_corpus_fluent))
print 'Mean utterance length (utterance_from): {:.3f}'.format(np.mean([len(utt_from) for utt_from, utt_to in final_corpus]))
print 'Mean utterance length (utterance_to): {:.3f}'.format(np.mean([len(utt_to) for utt_from, utt_to in final_corpus]))
print 'Dislfuency stats by type:'
for key, value in disfluency_stats.iteritems():
    print '{}:\t{}'.format(key, value)

Total number of utterances with disfluencies: 95900
Total number of utterances without disfluencies: 99
Mean utterance length (utterance_from): 10.828
Mean utterance length (utterance_to): 8.696
Dislfuency stats by type:
[RM + {} RR]:	6467
{E...}:	3257
{A...}:	294
{C...}:	48083
{D...}:	28696
{F...}:	39464
[RM + RR]:	30532
[RM +]:	2964


In [28]:
utts = []
for utt in reader.iter_utterances(display_progress=False):
    if extract_restarts_with_repair_and_nonsentence(utt.text):
        utts.append(utt)
        break

In [35]:
utts[0].text

'{C But, } {F uh, } I bought it for target practicing [ and, + {F uh, } and ] also because I wanted a weapon.  /'