In [189]:
import re
from collections import defaultdict
import string
import os

import numpy as np
import nltk

from deep_disfluency.corpus import swda

In [46]:
reader = swda.CorpusReader('../swda/swda')

In [92]:
def extract_nonspeech(in_utterance):
    return map(lambda x: ('nonspeech', x), re.findall('<[^<]+?>', in_utterance))


def filter_nonspeech(in_utterance):
    return re.sub('<[^<]+?>', '', in_utterance).strip()

In [101]:
NONSENTENCE_RE = '({(\w) ([^{]+?)})'


def extract_nonsentence(in_utterance):
    return map(lambda (body, disfluency_type, text): ('{{{}...}}'.format(disfluency_type), (body, text)),
               re.findall(NONSENTENCE_RE, in_utterance))


def filter_nonsentence(in_utterance):
    disfluent, clean = in_utterance, in_utterance
    for disfluency_type, (body, text) in extract_nonsentence(in_utterance):
        disfluent = disfluent.replace(body, text)
        clean = clean.replace(body, '')
    return re.sub('\s+', ' ', disfluent), re.sub('\s+', ' ', clean)

In [102]:
print extract_nonsentence('Actually, {F uh, }')
print filter_nonsentence('Actually, {F uh, }')

[('{F...}', ('{F uh, }', 'uh, '))]
('Actually, uh, ', 'Actually, ')


In [134]:
def extract_restarts_with_repair_and_nonsentece(in_utterance):
    return map(lambda x: ('[RM + {} RR]', x),
               re.findall('(\[([^[]+?)\s+\+\s+{}\s+([^[]+?)\])'.format(NONSENTENCE_RE), in_utterance))


def filter_restarts_with_repair_and_nonsentece(in_utterance):
    disfluent, clean = in_utterance, in_utterance
    for repair_type, (body, rm, nonsent_body, nonsent_type, nonsent_text, rr) in extract_restarts_with_repair_and_nonsentece(in_utterance):
        disfluent = disfluent.replace(body, ' '.join([rm, nonsent_text, rr]))
        clean = clean.replace(body, rr)
    return re.sub('\s+', ' ', disfluent), re.sub('\s+', ' ', clean)

In [135]:
print extract_restarts_with_repair_and_nonsentece('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')
print filter_restarts_with_repair_and_nonsentece('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')

[('[RM + {} RR]', ('[ I, + {F uh, } I]', ' I,', '{F uh, }', 'F', 'uh, ', 'I'))]
('Actually, I, uh, I guess I am [I, + I], {F uh, }', 'Actually, I guess I am [I, + I], {F uh, }')


In [138]:
def extract_restarts_with_repair(in_utterance):
    return map(lambda x: ('[RM + RR]', x),
               re.findall('(\[([^[]+?)\s+\+\s+([^[{]+?)\])', in_utterance))


def filter_restarts_with_repair(in_utterance):
    disfluent, clean = in_utterance, in_utterance
    for repair_type, (body, rm, rr) in extract_restarts_with_repair(in_utterance):
        disfluent = disfluent.replace(body, ' '.join([rm, rr]))
        clean = clean.replace(body, rr)
    return re.sub('\s+', ' ', disfluent), re.sub('\s+', ' ', clean)

In [139]:
print extract_restarts_with_repair('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')
print filter_restarts_with_repair('Actually, [ I, + {F uh, } I] guess I am [I, + I], {F uh, }')

[('[RM + RR]', ('[I, + I]', 'I,', 'I'))]
('Actually, [ I, + {F uh, } I] guess I am I, I, {F uh, }', 'Actually, [ I, + {F uh, } I] guess I am I, {F uh, }')


In [144]:
def extract_restarts_without_repair(in_utterance):
    return map(lambda x: ('[RM +]', x),
               re.findall('(\[([^[]+?)\s+\+\s+\])', in_utterance))


def filter_restarts_without_repair(in_utterance):
    disfluent, clean = in_utterance, in_utterance
    for repair_type, (body, rm) in extract_restarts_without_repair(in_utterance):
        disfluent = disfluent.replace(body, ' '.join([rm]))
        clean = clean.replace(body, '')
    return re.sub('\s+', ' ', disfluent), re.sub('\s+', ' ', clean)

In [145]:
print extract_restarts_without_repair('Actually, [ I + ] guess I am [I, + I], {F uh, }')
print filter_restarts_without_repair('Actually, [ I + ] guess I am [I, + I], {F uh, }')

[('[RM +]', ('[ I + ]', ' I'))]
('Actually, I guess I am [I, + I], {F uh, }', 'Actually, guess I am [I, + I], {F uh, }')


In [172]:
pipeline = [
    (extract_restarts_with_repair_and_nonsentece, filter_restarts_with_repair_and_nonsentece),
    (extract_restarts_with_repair, filter_restarts_with_repair),
    (extract_restarts_without_repair, filter_restarts_without_repair),
    (extract_nonsentence, filter_nonsentence),
]
disfluency_stats = defaultdict(lambda: 0)
parallel_corpus = []
for utt in reader.iter_utterances(display_progress=False):
    local_disfluencies = defaultdict(lambda: 0)
    utt_filtered = filter_nonspeech(utt.text)
    utt_original, utt_clean = utt_filtered, utt_filtered
    for extract_step, filter_step in pipeline:
        disfluencies = extract_step(utt_original)
        if not len(disfluencies):
            continue
        utt_original, utt_clean = filter_step(utt_original)[0], filter_step(utt_clean)[1]
        if not len(re.findall('\w+', utt_clean)):
            break
        for disfluency_type, disfluency in disfluencies:
            local_disfluencies[disfluency_type] += 1
    if local_disfluencies and len(re.findall('\w+', utt_clean)):
        for disfluency_type, count in local_disfluencies.iteritems():
            disfluency_stats[disfluency_type] += count
        parallel_corpus.append((utt_original, utt_clean))

../swda/swda/sw00utt/sw_0001_4325.utt.csv
../swda/swda/sw00utt/sw_0002_4330.utt.csv
../swda/swda/sw00utt/sw_0003_4103.utt.csv
../swda/swda/sw00utt/sw_0004_4327.utt.csv
../swda/swda/sw00utt/sw_0005_4646.utt.csv
../swda/swda/sw00utt/sw_0006_4108.utt.csv
../swda/swda/sw00utt/sw_0007_4171.utt.csv
../swda/swda/sw00utt/sw_0008_4321.utt.csv
../swda/swda/sw00utt/sw_0009_4329.utt.csv
../swda/swda/sw00utt/sw_0010_4356.utt.csv
../swda/swda/sw00utt/sw_0011_4358.utt.csv
../swda/swda/sw00utt/sw_0012_4360.utt.csv
../swda/swda/sw00utt/sw_0013_4617.utt.csv
../swda/swda/sw00utt/sw_0014_4619.utt.csv
../swda/swda/sw00utt/sw_0015_4877.utt.csv
../swda/swda/sw00utt/sw_0016_3389.utt.csv
../swda/swda/sw00utt/sw_0017_4036.utt.csv
../swda/swda/sw00utt/sw_0018_4082.utt.csv
../swda/swda/sw00utt/sw_0019_4104.utt.csv
../swda/swda/sw00utt/sw_0020_4109.utt.csv
../swda/swda/sw00utt/sw_0021_4168.utt.csv
../swda/swda/sw00utt/sw_0022_4320.utt.csv
../swda/swda/sw00utt/sw_0023_4341.utt.csv
../swda/swda/sw00utt/sw_0024_4688.

../swda/swda/sw02utt/sw_0203_4603.utt.csv
../swda/swda/sw02utt/sw_0204_4698.utt.csv
../swda/swda/sw02utt/sw_0205_4725.utt.csv
../swda/swda/sw02utt/sw_0206_4859.utt.csv
../swda/swda/sw02utt/sw_0207_2039.utt.csv
../swda/swda/sw02utt/sw_0208_2094.utt.csv
../swda/swda/sw02utt/sw_0209_2102.utt.csv
../swda/swda/sw02utt/sw_0210_2113.utt.csv
../swda/swda/sw02utt/sw_0211_2163.utt.csv
../swda/swda/sw02utt/sw_0212_2275.utt.csv
../swda/swda/sw02utt/sw_0213_2285.utt.csv
../swda/swda/sw02utt/sw_0214_2302.utt.csv
../swda/swda/sw02utt/sw_0215_2314.utt.csv
../swda/swda/sw02utt/sw_0216_2336.utt.csv
../swda/swda/sw02utt/sw_0217_2421.utt.csv
../swda/swda/sw02utt/sw_0218_2465.utt.csv
../swda/swda/sw02utt/sw_0219_2472.utt.csv
../swda/swda/sw02utt/sw_0220_2549.utt.csv
../swda/swda/sw02utt/sw_0221_2566.utt.csv
../swda/swda/sw02utt/sw_0222_2676.utt.csv
../swda/swda/sw02utt/sw_0223_2703.utt.csv
../swda/swda/sw02utt/sw_0224_2818.utt.csv
../swda/swda/sw02utt/sw_0225_2877.utt.csv
../swda/swda/sw02utt/sw_0226_3081.

../swda/swda/sw04utt/sw_0402_2634.utt.csv
../swda/swda/sw04utt/sw_0403_2650.utt.csv
../swda/swda/sw04utt/sw_0404_2667.utt.csv
../swda/swda/sw04utt/sw_0405_2717.utt.csv
../swda/swda/sw04utt/sw_0406_2784.utt.csv
../swda/swda/sw04utt/sw_0407_2826.utt.csv
../swda/swda/sw04utt/sw_0408_2860.utt.csv
../swda/swda/sw04utt/sw_0409_2866.utt.csv
../swda/swda/sw04utt/sw_0410_2970.utt.csv
../swda/swda/sw04utt/sw_0411_2998.utt.csv
../swda/swda/sw04utt/sw_0412_3015.utt.csv
../swda/swda/sw04utt/sw_0413_3041.utt.csv
../swda/swda/sw04utt/sw_0414_3067.utt.csv
../swda/swda/sw04utt/sw_0415_3168.utt.csv
../swda/swda/sw04utt/sw_0416_3205.utt.csv
../swda/swda/sw04utt/sw_0417_3237.utt.csv
../swda/swda/sw04utt/sw_0418_3275.utt.csv
../swda/swda/sw04utt/sw_0419_3284.utt.csv
../swda/swda/sw04utt/sw_0420_3288.utt.csv
../swda/swda/sw04utt/sw_0421_3311.utt.csv
../swda/swda/sw04utt/sw_0422_3320.utt.csv
../swda/swda/sw04utt/sw_0423_3325.utt.csv
../swda/swda/sw04utt/sw_0424_3328.utt.csv
../swda/swda/sw04utt/sw_0425_3382.

../swda/swda/sw05utt/sw_0598_2858.utt.csv
../swda/swda/sw05utt/sw_0599_2870.utt.csv
../swda/swda/sw06utt/sw_0600_2883.utt.csv
../swda/swda/sw06utt/sw_0601_2893.utt.csv
../swda/swda/sw06utt/sw_0602_2938.utt.csv
../swda/swda/sw06utt/sw_0603_2962.utt.csv
../swda/swda/sw06utt/sw_0604_2969.utt.csv
../swda/swda/sw06utt/sw_0605_2989.utt.csv
../swda/swda/sw06utt/sw_0606_3011.utt.csv
../swda/swda/sw06utt/sw_0607_3012.utt.csv
../swda/swda/sw06utt/sw_0608_3030.utt.csv
../swda/swda/sw06utt/sw_0609_3049.utt.csv
../swda/swda/sw06utt/sw_0610_3056.utt.csv
../swda/swda/sw06utt/sw_0611_3072.utt.csv
../swda/swda/sw06utt/sw_0612_3090.utt.csv
../swda/swda/sw06utt/sw_0613_3096.utt.csv
../swda/swda/sw06utt/sw_0614_3097.utt.csv
../swda/swda/sw06utt/sw_0615_3131.utt.csv
../swda/swda/sw06utt/sw_0616_3283.utt.csv
../swda/swda/sw06utt/sw_0617_3353.utt.csv
../swda/swda/sw06utt/sw_0618_3368.utt.csv
../swda/swda/sw06utt/sw_0619_3399.utt.csv
../swda/swda/sw06utt/sw_0620_3408.utt.csv
../swda/swda/sw06utt/sw_0621_3449.

../swda/swda/sw08utt/sw_0813_2296.utt.csv
../swda/swda/sw08utt/sw_0814_2308.utt.csv
../swda/swda/sw08utt/sw_0815_2354.utt.csv
../swda/swda/sw08utt/sw_0816_2368.utt.csv
../swda/swda/sw08utt/sw_0817_2379.utt.csv
../swda/swda/sw08utt/sw_0818_2528.utt.csv
../swda/swda/sw08utt/sw_0819_2594.utt.csv
../swda/swda/sw08utt/sw_0820_2638.utt.csv
../swda/swda/sw08utt/sw_0821_2711.utt.csv
../swda/swda/sw08utt/sw_0822_2776.utt.csv
../swda/swda/sw08utt/sw_0823_2827.utt.csv
../swda/swda/sw08utt/sw_0824_2944.utt.csv
../swda/swda/sw08utt/sw_0825_2953.utt.csv
../swda/swda/sw08utt/sw_0826_2981.utt.csv
../swda/swda/sw08utt/sw_0827_3019.utt.csv
../swda/swda/sw08utt/sw_0828_3055.utt.csv
../swda/swda/sw08utt/sw_0829_3228.utt.csv
../swda/swda/sw08utt/sw_0830_3233.utt.csv
../swda/swda/sw08utt/sw_0831_3253.utt.csv
../swda/swda/sw08utt/sw_0832_3265.utt.csv
../swda/swda/sw08utt/sw_0833_3281.utt.csv
../swda/swda/sw08utt/sw_0834_3282.utt.csv
../swda/swda/sw08utt/sw_0835_3319.utt.csv
../swda/swda/sw08utt/sw_0836_3326.

../swda/swda/sw10utt/sw_1025_3250.utt.csv
../swda/swda/sw10utt/sw_1026_3280.utt.csv
../swda/swda/sw10utt/sw_1027_3463.utt.csv
../swda/swda/sw10utt/sw_1028_3496.utt.csv
../swda/swda/sw10utt/sw_1029_3774.utt.csv
../swda/swda/sw10utt/sw_1030_2064.utt.csv
../swda/swda/sw10utt/sw_1031_2386.utt.csv
../swda/swda/sw10utt/sw_1032_2557.utt.csv
../swda/swda/sw10utt/sw_1033_2723.utt.csv
../swda/swda/sw10utt/sw_1034_2924.utt.csv
../swda/swda/sw10utt/sw_1035_2957.utt.csv
../swda/swda/sw10utt/sw_1036_2960.utt.csv
../swda/swda/sw10utt/sw_1037_3054.utt.csv
../swda/swda/sw10utt/sw_1038_3061.utt.csv
../swda/swda/sw10utt/sw_1039_3077.utt.csv
../swda/swda/sw10utt/sw_1040_3244.utt.csv
../swda/swda/sw10utt/sw_1041_3290.utt.csv
../swda/swda/sw10utt/sw_1042_4078.utt.csv
../swda/swda/sw10utt/sw_1043_2293.utt.csv
../swda/swda/sw10utt/sw_1044_2457.utt.csv
../swda/swda/sw10utt/sw_1045_2495.utt.csv
../swda/swda/sw10utt/sw_1046_2621.utt.csv
../swda/swda/sw10utt/sw_1047_2754.utt.csv
../swda/swda/sw10utt/sw_1048_2794.

In [171]:
# 30% of utternaces in the final corpus will be fluent
fluent_corpus_size = int(0.3 * len(parallel_corpus) / 0.7)
parallel_corpus_fluent = []
for utt in reader.iter_utterances(display_progress=False):
    utt_filtered = filter_nonspeech(utt.text)
    utt_original, utt_clean = utt_filtered, utt_filtered
    fluent = True
    for extract_step, filter_step in pipeline:
        disfluencies = extract_step(utt_original)
        if not len(disfluencies):
            fluent = False
            break
    if fluent and len(re.findall('\w+', utt_clean)):
        parallel_corpus_fluent.append((utt_clean, utt_clean))
    if fluent_corpus_size == len(parallel_corpus_fluent):
        break

../swda/swda/sw00utt/sw_0001_4325.utt.csv
../swda/swda/sw00utt/sw_0002_4330.utt.csv
../swda/swda/sw00utt/sw_0003_4103.utt.csv
../swda/swda/sw00utt/sw_0004_4327.utt.csv
../swda/swda/sw00utt/sw_0005_4646.utt.csv
../swda/swda/sw00utt/sw_0006_4108.utt.csv
../swda/swda/sw00utt/sw_0007_4171.utt.csv
../swda/swda/sw00utt/sw_0008_4321.utt.csv
../swda/swda/sw00utt/sw_0009_4329.utt.csv
../swda/swda/sw00utt/sw_0010_4356.utt.csv
../swda/swda/sw00utt/sw_0011_4358.utt.csv
../swda/swda/sw00utt/sw_0012_4360.utt.csv
../swda/swda/sw00utt/sw_0013_4617.utt.csv
../swda/swda/sw00utt/sw_0014_4619.utt.csv
../swda/swda/sw00utt/sw_0015_4877.utt.csv
../swda/swda/sw00utt/sw_0016_3389.utt.csv
../swda/swda/sw00utt/sw_0017_4036.utt.csv
../swda/swda/sw00utt/sw_0018_4082.utt.csv
../swda/swda/sw00utt/sw_0019_4104.utt.csv
../swda/swda/sw00utt/sw_0020_4109.utt.csv
../swda/swda/sw00utt/sw_0021_4168.utt.csv
../swda/swda/sw00utt/sw_0022_4320.utt.csv
../swda/swda/sw00utt/sw_0023_4341.utt.csv
../swda/swda/sw00utt/sw_0024_4688.

../swda/swda/sw02utt/sw_0202_4376.utt.csv
../swda/swda/sw02utt/sw_0203_4603.utt.csv
../swda/swda/sw02utt/sw_0204_4698.utt.csv
../swda/swda/sw02utt/sw_0205_4725.utt.csv
../swda/swda/sw02utt/sw_0206_4859.utt.csv
../swda/swda/sw02utt/sw_0207_2039.utt.csv
../swda/swda/sw02utt/sw_0208_2094.utt.csv
../swda/swda/sw02utt/sw_0209_2102.utt.csv
../swda/swda/sw02utt/sw_0210_2113.utt.csv
../swda/swda/sw02utt/sw_0211_2163.utt.csv
../swda/swda/sw02utt/sw_0212_2275.utt.csv
../swda/swda/sw02utt/sw_0213_2285.utt.csv
../swda/swda/sw02utt/sw_0214_2302.utt.csv
../swda/swda/sw02utt/sw_0215_2314.utt.csv
../swda/swda/sw02utt/sw_0216_2336.utt.csv
../swda/swda/sw02utt/sw_0217_2421.utt.csv
../swda/swda/sw02utt/sw_0218_2465.utt.csv
../swda/swda/sw02utt/sw_0219_2472.utt.csv
../swda/swda/sw02utt/sw_0220_2549.utt.csv
../swda/swda/sw02utt/sw_0221_2566.utt.csv
../swda/swda/sw02utt/sw_0222_2676.utt.csv
../swda/swda/sw02utt/sw_0223_2703.utt.csv
../swda/swda/sw02utt/sw_0224_2818.utt.csv
../swda/swda/sw02utt/sw_0225_2877.

../swda/swda/sw04utt/sw_0403_2650.utt.csv
../swda/swda/sw04utt/sw_0404_2667.utt.csv
../swda/swda/sw04utt/sw_0405_2717.utt.csv
../swda/swda/sw04utt/sw_0406_2784.utt.csv
../swda/swda/sw04utt/sw_0407_2826.utt.csv
../swda/swda/sw04utt/sw_0408_2860.utt.csv
../swda/swda/sw04utt/sw_0409_2866.utt.csv
../swda/swda/sw04utt/sw_0410_2970.utt.csv
../swda/swda/sw04utt/sw_0411_2998.utt.csv
../swda/swda/sw04utt/sw_0412_3015.utt.csv
../swda/swda/sw04utt/sw_0413_3041.utt.csv
../swda/swda/sw04utt/sw_0414_3067.utt.csv
../swda/swda/sw04utt/sw_0415_3168.utt.csv
../swda/swda/sw04utt/sw_0416_3205.utt.csv
../swda/swda/sw04utt/sw_0417_3237.utt.csv
../swda/swda/sw04utt/sw_0418_3275.utt.csv
../swda/swda/sw04utt/sw_0419_3284.utt.csv
../swda/swda/sw04utt/sw_0420_3288.utt.csv
../swda/swda/sw04utt/sw_0421_3311.utt.csv
../swda/swda/sw04utt/sw_0422_3320.utt.csv
../swda/swda/sw04utt/sw_0423_3325.utt.csv
../swda/swda/sw04utt/sw_0424_3328.utt.csv
../swda/swda/sw04utt/sw_0425_3382.utt.csv
../swda/swda/sw04utt/sw_0426_3409.

../swda/swda/sw06utt/sw_0600_2883.utt.csv
../swda/swda/sw06utt/sw_0601_2893.utt.csv
../swda/swda/sw06utt/sw_0602_2938.utt.csv
../swda/swda/sw06utt/sw_0603_2962.utt.csv
../swda/swda/sw06utt/sw_0604_2969.utt.csv
../swda/swda/sw06utt/sw_0605_2989.utt.csv
../swda/swda/sw06utt/sw_0606_3011.utt.csv
../swda/swda/sw06utt/sw_0607_3012.utt.csv
../swda/swda/sw06utt/sw_0608_3030.utt.csv
../swda/swda/sw06utt/sw_0609_3049.utt.csv
../swda/swda/sw06utt/sw_0610_3056.utt.csv
../swda/swda/sw06utt/sw_0611_3072.utt.csv
../swda/swda/sw06utt/sw_0612_3090.utt.csv
../swda/swda/sw06utt/sw_0613_3096.utt.csv
../swda/swda/sw06utt/sw_0614_3097.utt.csv
../swda/swda/sw06utt/sw_0615_3131.utt.csv
../swda/swda/sw06utt/sw_0616_3283.utt.csv
../swda/swda/sw06utt/sw_0617_3353.utt.csv
../swda/swda/sw06utt/sw_0618_3368.utt.csv
../swda/swda/sw06utt/sw_0619_3399.utt.csv
../swda/swda/sw06utt/sw_0620_3408.utt.csv
../swda/swda/sw06utt/sw_0621_3449.utt.csv
../swda/swda/sw06utt/sw_0622_3473.utt.csv
../swda/swda/sw06utt/sw_0623_3537.

../swda/swda/sw08utt/sw_0826_2981.utt.csv
../swda/swda/sw08utt/sw_0827_3019.utt.csv
../swda/swda/sw08utt/sw_0828_3055.utt.csv
../swda/swda/sw08utt/sw_0829_3228.utt.csv
../swda/swda/sw08utt/sw_0830_3233.utt.csv
../swda/swda/sw08utt/sw_0831_3253.utt.csv
../swda/swda/sw08utt/sw_0832_3265.utt.csv
../swda/swda/sw08utt/sw_0833_3281.utt.csv
../swda/swda/sw08utt/sw_0834_3282.utt.csv
../swda/swda/sw08utt/sw_0835_3319.utt.csv
../swda/swda/sw08utt/sw_0836_3326.utt.csv
../swda/swda/sw08utt/sw_0837_3379.utt.csv
../swda/swda/sw08utt/sw_0838_3405.utt.csv
../swda/swda/sw08utt/sw_0839_3424.utt.csv
../swda/swda/sw08utt/sw_0840_3426.utt.csv
../swda/swda/sw08utt/sw_0841_3530.utt.csv
../swda/swda/sw08utt/sw_0842_3657.utt.csv
../swda/swda/sw08utt/sw_0843_3738.utt.csv
../swda/swda/sw08utt/sw_0844_3743.utt.csv
../swda/swda/sw08utt/sw_0845_3754.utt.csv
../swda/swda/sw08utt/sw_0846_3956.utt.csv
../swda/swda/sw08utt/sw_0847_3985.utt.csv
../swda/swda/sw08utt/sw_0848_4666.utt.csv
../swda/swda/sw08utt/sw_0849_4709.

../swda/swda/sw10utt/sw_1047_2754.utt.csv
../swda/swda/sw10utt/sw_1048_2794.utt.csv
../swda/swda/sw10utt/sw_1049_2889.utt.csv
../swda/swda/sw10utt/sw_1050_2968.utt.csv
../swda/swda/sw10utt/sw_1051_3170.utt.csv
../swda/swda/sw10utt/sw_1052_3198.utt.csv
../swda/swda/sw10utt/sw_1053_3203.utt.csv
../swda/swda/sw10utt/sw_1054_3208.utt.csv
../swda/swda/sw10utt/sw_1055_3272.utt.csv
../swda/swda/sw10utt/sw_1056_3276.utt.csv
../swda/swda/sw10utt/sw_1057_3293.utt.csv
../swda/swda/sw10utt/sw_1058_3711.utt.csv
../swda/swda/sw10utt/sw_1059_3764.utt.csv
../swda/swda/sw10utt/sw_1060_3852.utt.csv
../swda/swda/sw10utt/sw_1061_4770.utt.csv
../swda/swda/sw10utt/sw_1062_2190.utt.csv
../swda/swda/sw10utt/sw_1063_2432.utt.csv
../swda/swda/sw10utt/sw_1064_2478.utt.csv
../swda/swda/sw10utt/sw_1065_2675.utt.csv
../swda/swda/sw10utt/sw_1066_2679.utt.csv
../swda/swda/sw10utt/sw_1067_2689.utt.csv
../swda/swda/sw10utt/sw_1068_2834.utt.csv
../swda/swda/sw10utt/sw_1069_2884.utt.csv
../swda/swda/sw10utt/sw_1070_3013.

Corpus stats
==

In [186]:
final_corpus = [(nltk.word_tokenize(utt_from.translate(None, string.punctuation)), nltk.word_tokenize(utt_to.translate(None, string.punctuation)))
                for utt_from, utt_to in parallel_corpus + parallel_corpus_fluent]

In [188]:
print 'Total number of utterances with disfluencies: {}'.format(len(parallel_corpus))
print 'Total number of utterances without disfluencies: {}'.format(len(parallel_corpus_fluent))
print 'Mean utterance length (utterance_from): {:.3f}'.format(np.mean([len(utt_from) for utt_from, utt_to in final_corpus]))
print 'Mean utterance length (utterance_to): {:.3f}'.format(np.mean([len(utt_to) for utt_from, utt_to in final_corpus]))
print 'Dislfuency stats by type:'
for key, value in disfluency_stats.iteritems():
    print '{}:\t{}'.format(key, value)

Total number of utterances with disfluencies: 95907
Total number of utterances without disfluencies: 97
Mean utterance length (utterance_from): 10.828
Mean utterance length (utterance_to): 8.696
Dislfuency stats by type:
[RM + {} RR]:	6475
{E...}:	3256
{A...}:	296
{C...}:	48086
{D...}:	28699
{F...}:	39466
[RM + RR]:	30548
[RM +]:	2965


In [192]:
out_folder = 'swda_parallel_corpus'
if not os.path.exists(out_folder):
    os.makedirs(out_folder)
with open(os.path.join(out_folder, 'encoder.txt'), 'w') as encoder_out, \
     open(os.path.join(out_folder, 'decoder.txt'), 'w') as decoder_out:
    for utt_from, utt_to in final_corpus:
        print >>encoder_out, ' '.join(utt_from)
        print >>decoder_out, ' '.join(utt_to)