In [22]:
import pandas as pd
from collections import defaultdict
from operator import itemgetter

In [16]:
dataset = pd.read_json('deep_disfluency_dataset_timings/trainset.json')

In [45]:
def build_repair_freq_dict(in_utterances, in_tags, in_length):
    result = defaultdict(lambda: 0)

    for utterance_i, tags_i in zip(in_utterances, in_tags):
        idx = 0
        repair_start_idx = None
        while idx < len(utterance_i):
            token, tag = utterance_i[idx], tags_i[idx]
            if repair_start_idx is None:
                if tag.startswith('<rm-{}/>'.format(in_length)):
                    repair_start_idx = idx - in_length
            else:
                if 'rpEndSub' in tag:
                    repair_end_idx = idx + 1
                    result[tuple(utterance_i[repair_start_idx: repair_end_idx])] += 1
                    repair_start_idx = None
            idx += 1
    return result

In [65]:
def build_whitelist_repair_freq_dict(in_utterances, in_tags):
    result = defaultdict(lambda: 0)

    for utterance_i, tags_i in zip(in_utterances, in_tags):
        for idx, (token, tag) in enumerate(zip(utterance_i, tags_i)):
            if (utterance_i[idx] == 'sorry' or utterance_i[idx:idx + 1] == ['i', 'mean']) and tag.startswith('<rm'):
                repair = utterance_i[max(0, idx - 3): idx + 3]
                result[tuple(repair)] += 1
    return result

In [46]:
repair_freq_dict_1 = build_repair_freq_dict(dataset['utterance'], dataset['tags'], 1)
for repair, freq in sorted(repair_freq_dict_1.items(), key=itemgetter(1), reverse=True)[:100]:
    print ' '.join(repair), freq

i i i 139
the the the 33
and and and 31
it it it 29
its its its 26
a a a 18
and and then 17
it its its 14
and and so 13
i i dont 13
thats thats thats 12
i i uh i 11
that that that 11
it it its 11
you you you 10
i i im 10
i im im 9
i when i 9
in in in 9
to to to 9
they they they 8
im im im 7
its it is 7
i i think 7
i i guess 6
that thats thats 6
its i think its 6
i i just 6
i i ive 5
if if you 5
is is is 5
and and uh and 5
i i have 5
what what what 5
that that thats 5
if if if 5
i i know 5
i i havent 4
its it was 4
you youre youre 4
it i think it 4
thats those are 4
i i like 4
in in uh in 4
of of of 4
a just a 4
it it was 4
ive ive ive 4
the the uh the 4
he he he 4
they i think they 4
they i guess they 3
i ive ive 3
do do do 3
and and you know and 3
and and uh so 3
we we have 3
its it it 3
or or or 3
and and the 3
it it seems 3
my my my 3
i i had 3
thats that is 3
as as as 3
that that was 3
i i guess i i 3
its its a 3
we i guess we 3
and and its its 3
i im i 3
theres there is 3
im i i 3

In [39]:
repair_freq_dict_2 = build_repair_freq_dict(dataset['utterance'], dataset['tags'], 2)
for repair, freq in sorted(repair_freq_dict_2.items(), key=itemgetter(1), reverse=True)[:100]:
    print ' '.join(repair), freq

it was it was 67
i dont i dont 57
i think i think 44
in the in the 39
do you do you 23
i was i was 23
that was that was 20
if you if you 16
i would i would 13
i can i can 13
in a in a 13
i just i just 13
i have i have 13
of the of the 12
its a its a 11
and the and the 11
it is it is 11
i like i like 11
and i and i 10
we have we have 10
they were they were 10
its just its just 10
on the on the 9
you can you can 9
when they when they 9
kind of kind of 9
i know i know 8
when i when i 8
its not its not 8
is it is it 8
i i i i 8
when you when you 8
im a im a 8
have you have you 7
they just they just 7
i uh i i 7
i had i had 7
if i if i 6
i do i do 6
and then and then 6
im not im not 6
on a on a 6
at a at a 6
i guess i guess 6
thats a thats a 6
i got i got 6
that they that they 6
with the with the 6
i didnt i didnt 5
that i that i 5
they are they are 5
as a as a 5
you cant you cant 5
there is there is 5
i havent i havent 5
thats the thats the 5
is that is that 5
i never i never 5
its like it

In [40]:
repair_freq_dict_3 = build_repair_freq_dict(dataset['utterance'], dataset['tags'], 3)
for repair, freq in sorted(repair_freq_dict_3.items(), key=itemgetter(1), reverse=True)[:100]:
    print ' '.join(repair), freq

a lot of a lot of 7
that was uh that was 5
it was uh it was 5
what do you what do you 4
i i dont i dont 4
i i think i think 4
i think uh i think 4
i do not i do not 4
how can you how can you 4
i dont know i dont know 4
i think that i think 3
i have uh i have 3
ive got a ive got a 3
in the uh in the 3
and you know and then 3
its kind of its kind of 3
i think its i think its 3
it was um it was 3
how do you how do you 3
do you uh do you 3
i i have i have 3
it was oh it was 3
i used to i used to 3
if you were if you were 3
kind of uh kind of 2
and you know and and 2
we had to we had to 2
we got uh we got 2
if you do if you do 2
i do not i am not 2
its its not its not 2
i i do i do 2
on the uh on the 2
if you uh if you 2
what are you what are you 2
i you know i i 2
i dont uh i dont 2
i dont think i think 2
i think its i think it 2
are you uh are you 2
i dont think i dont think 2
its not a its not a 2
how much uh how much 2
they had well they had 2
is not uh is not 2
the you know the the 2
w

In [41]:
repair_freq_dict_4 = build_repair_freq_dict(dataset['utterance'], dataset['tags'], 4)
for repair, freq in sorted(repair_freq_dict_4.items(), key=itemgetter(1), reverse=True)[:100]:
    print ' '.join(repair), freq

a lot of uh a lot of 2
i dont you know i dont 2
it was you know it was 2
of the of the of the 2
we were you know we were 2
when i you know when i 2
it was i mean it was 2
i have you know i have 2
that that would be that would be 2
what we you know what we 1
im not you know i have 1
she can make uh she makes 1
were keeping you know were were 1
like that for um that you know where 1
that that that says it says 1
not enjoy it look not acting like shes enjoying 1
theyve never had well theyve had 1
i have uh no $unc$t $unc$i as far as i concerned theyve 1
shes got so many shes got just a gorgeous yard so many uh flowers 1
kind of you know different kinds of 1
patterns i mean like a book of patterns 1
youre not going to youre not going to 1
i im very uh im very 1
i know i oh i know i 1
at the store or at the theater 1
it completely you know the shell finished 1
i would i would it would 1
she always had uh yeah she always had 1
do you do you does your wife 1
that that the communication that n

In [42]:
repair_freq_dict_5 = build_repair_freq_dict(dataset['utterance'], dataset['tags'], 5)
for repair, freq in sorted(repair_freq_dict_5.items(), key=itemgetter(1), reverse=True)[:100]:
    print ' '.join(repair), freq

when you think of uh when i just think of 1
it took me i mean it took 1
there theres theres usually yeah theres usually 1
i have more or less i have my favorite shows 1
so does $unc$detroit i mean so does $unc$chicago 1
when i was you know when i was 1
if everyone would take it i think if everyone took it 1
we didnt even you know we didnt even 1
i do try and um i do try and 1
by $unc$four oclock oh actually about $unc$four $unc$thirty 1
what we would you know what you 1
what types of what type what type of 1
and um so uh uh and then 1
do it by uh by do it by 1
i like to read uh i like to read 1
it it might you know itll it the same thing 1
$unc$atlantic $unc$cafe is down on i believe its on 1
and the uh you know and uh that 1
that was the worst impulse that was that was 1
what do you think that what do you think 1
all their structured of theirs all the structures of their 1
because you can you know if if 1
i i go to a i have a club that 1
goes out then to uh works for 1
a full time you

In [66]:
repairs = build_whitelist_repair_freq_dict(dataset['utterance'], dataset['tags'])
for key, value in sorted(repairs.items(), key=itemgetter(1), reverse=True)[:100]:
    print ' '.join(key), value

In [71]:
for utterance, tags in zip(dataset['utterance'], dataset['tags']):
    if 'i i uh i' in ' '.join(utterance):
        print utterance, tags
        break

[u'i', u'i', u'uh', u'i', u'just', u'went', u'out', u'and', u'got', u'a', u'new', u'$unc$v', u'$unc$c', u'$unc$r', u'yesterday', u'with', u'cash'] [u'<f/>', u'<rm-1/><rpEndSub/>', u'<e/>', u'<rm-3/><rpEndSub/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>', u'<f/>']
