In [1]:
# Need to create a data structure allowing for easy lookup of common word overlaps
# First, how big is this this going to be?

from nltk.corpus import cmudict
from collections import Counter

d = cmudict.dict()

def n_head_graphs(k):
    return len(set([g[:k] for g in d.iterkeys() if len(g) >= k]))

def n_tail_graphs(k):
    return len(set([g[-k:] for g in d.iterkeys() if len(g) >= k]))

def n_head_phones(k):
    return len(set([tuple(p[0][:k]) for p in d.itervalues() if len(p) >= k]))

def n_tail_phones(k):
    return len(set([tuple(p[0][-k:]) for p in d.itervalues() if len(p) >= k]))
                   
print n_head_graphs(1), n_head_graphs(2), n_head_graphs(3), n_head_graphs(4), n_head_graphs(5)
print n_tail_graphs(1), n_tail_graphs(2), n_tail_graphs(3), n_tail_graphs(4), n_tail_graphs(5)
print n_head_phones(1), n_head_phones(2), n_head_phones(3), n_head_phones(4), n_head_phones(5)
print n_tail_phones(1), n_tail_phones(2), n_tail_phones(3), n_tail_phones(4), n_tail_phones(5)

43 564 4438 22111 48075
35 510 4306 20177 46468
65 951 543 159 1
65 794 350 129 2


In [2]:
# Need each of the following:
# 1) subgrapheme head frequency
# 2) subgrapheme tail frequency
# 3) (subgrapheme,subphoneme) head frequency
# 4) (subgrapheme,subphoneme) tail frequency
# 5) (subgrapheme,subphoneme) overall frequency

# This requires loading in alignments

import sys
sys.path.insert(0, '../code')

from word import Word
from pronunciation_dictionary import PronunciationDictionary
from sequence_alignment import SequenceAlignment

pd = PronunciationDictionary.load('../data/pronunciation_dictionary.pkl')

In [3]:
print pd.grapheme_to_word_dict['alligator'].grapheme_to_arpabet_phoneme_alignment.seq1
print pd.grapheme_to_word_dict['alligator'].grapheme_to_arpabet_phoneme_alignment.seq2

[('a',), ('l', 'l'), ('i',), ('g',), ('a',), ('t',), ('o', 'r')]
[(u'AE1',), (u'L',), (u'AH0',), (u'G',), (u'EY2',), (u'T',), (u'ER0',)]


In [4]:
def graph_chunks_to_key(g):
    return ''.join(sum(map(list, g), []))

def phone_chunks_to_key(p):
    return tuple(sum(map(list, p), []))

In [5]:
from collections import defaultdict

# smooth by using default value of 1 to avoid singularities when length > 5
# fine... what if I use a Counter instead?

subgrapheme_head_counts = defaultdict(lambda: 1)
subgrapheme_tail_counts = defaultdict(lambda: 1)
subgrapheme_counts = defaultdict(lambda: 1)

subphoneme_head_counts = defaultdict(lambda: 1)
subphoneme_tail_counts = defaultdict(lambda: 1)
subphoneme_counts = defaultdict(lambda: 1)

subword_head_counts = defaultdict(lambda: 1)
subword_tail_counts = defaultdict(lambda: 1)
subword_counts = defaultdict(lambda: 1)

vocab_size = 0

for grapheme,word in pd.grapheme_to_word_dict.iteritems():
    graph_chunks = word.grapheme_to_arpabet_phoneme_alignment.seq1
    phone_chunks = word.grapheme_to_arpabet_phoneme_alignment.seq2
    vocab_size += 1
    for k in range(1,6):
        for i in range(len(graph_chunks)-k+1):
            g = graph_chunks_to_key(graph_chunks[i:i+k])
            p = phone_chunks_to_key(phone_chunks[i:i+k])
            subgrapheme_counts[g] += 1
            subphoneme_counts[p] += 1
            subword_counts[(g,p)] += 1
            if i == 0: # head
                subgrapheme_head_counts[g] += 1
                subphoneme_head_counts[p] += 1
                subword_head_counts[(g,p)] += 1
            if i + k == len(graph_chunks): # tail
                subgrapheme_tail_counts[g] += 1
                subphoneme_tail_counts[p] += 1
                subword_tail_counts[(g,p)] += 1

In [6]:
# Define new class 'SubwordFrequencies'

from subword_frequency import SubwordFrequency

sf = SubwordFrequency(
        subgrapheme_head_counts, 
        subgrapheme_tail_counts, 
        subgrapheme_counts, 
        subphoneme_head_counts, 
        subphoneme_tail_counts, 
        subphoneme_counts, 
        subword_head_counts, 
        subword_tail_counts, 
        subword_counts,
        vocab_size)
sf.save('../data/subword_frequency.pkl')

In [10]:
print sf.get_subgrapheme_frequency('ammy', side='tail')
print subgrapheme_tail_counts['ammy']
print len([g for g in d.iterkeys() if g[-4:] == 'ammy'])


8
8
7


In [10]:
x = ('gat',(u'G',u'EY2',u'T'))
print sf.get_subword_frequency(*x, side='head')
print sf.get_subword_frequency(*x, side='tail')
print sf.get_subword_frequency(*x, side='all')


1
1
103


In [12]:
print pd.grapheme_to_word_dict['holgate'].grapheme_to_arpabet_phoneme_alignment.seq1
print pd.grapheme_to_word_dict['holgate'].grapheme_to_arpabet_phoneme_alignment.seq2

[('h',), ('o',), ('l',), ('g',), ('a',), ('t',), ('e',)]
[(u'HH',), (u'OW1',), (u'L',), (u'G',), (u'EY2',), (u'T',), ()]


In [11]:
for grapheme,word in pd.grapheme_to_word_dict.iteritems():
    graph_chunks = word.grapheme_to_arpabet_phoneme_alignment.seq1
    phone_chunks = word.grapheme_to_arpabet_phoneme_alignment.seq2
    for k in range(1,6):
        for i in range(len(graph_chunks)-k+1):
            g = graph_chunks_to_key(graph_chunks[i:i+k])
            p = phone_chunks_to_key(phone_chunks[i:i+k])
            if g == 'gat' and p == (u'G',u'EY2',u'T'):
                print graph_chunks_to_key(graph_chunks)
                print phone_chunks_to_key(phone_chunks)
                print 
#             subword_counts[(g,p)] += 1
#             if i == 0: # head
#                 subgrapheme_head_counts[g] += 1
#                 subword_head_counts[(g,p)] += 1
#             if i + k == len(graph_chunks): # tail
#                 subgrapheme_tail_counts[g] += 1
#                 subword_tail_counts[(g,p)] += 1

holgate
(u'HH', u'OW1', u'L', u'G', u'EY2', u'T')

castigating
(u'K', u'AE1', u'S', u'T', u'AH0', u'G', u'EY2', u'T', u'IH0', u'NG')

interrogators
(u'IH2', u'N', u'T', u'EH1', u'R', u'AH0', u'G', u'EY2', u'T', u'ER0', u'Z')

abrogating
(u'AE1', u'B', u'R', u'AH0', u'G', u'EY2', u'T', u'IH0', u'NG')

obligate
(u'AA1', u'B', u'L', u'AH0', u'G', u'EY2', u'T')

obligates
(u'AA1', u'B', u'L', u'AH0', u'G', u'EY2', u'T', u'S')

obligated
(u'AA1', u'B', u'L', u'AH0', u'G', u'EY2', u'T', u'IH0', u'D')

delegates
(u'D', u'EH1', u'L', u'AH0', u'G', u'EY2', u'T', u'S')

delegated
(u'D', u'EH1', u'L', u'AH0', u'G', u'EY2', u'T', u'AH0', u'D')

investigating
(u'IH0', u'N', u'V', u'EH1', u'S', u'T', u'AH0', u'G', u'EY2', u'T', u'IH0', u'NG')

floodgates
(u'F', u'L', u'AH1', u'D', u'G', u'EY2', u'T', u'S')

litigator
(u'L', u'IH1', u'T', u'AH0', u'G', u'EY2', u'T', u'ER0')

irrigate
(u'IH1', u'R', u'AH0', u'G', u'EY2', u'T')

seagate
(u'S', u'IY1', u'G', u'EY2', u'T')

digate
(u'D', u'AY1', u'G', u'

In [11]:
from collections import defaultdict
import dill
import cPickle as pkl

test_dict = defaultdict(lambda: 1)
test_func = lambda: 1
with open('../data/test.pkl', 'wb') as outfile:
    pkl.dump(test_func, outfile)

TypeError: can't pickle function objects