In [40]:
# Stretch the notebook's `.container` element to 95% of the browser width.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

For every word in the CMU pronunciation dict:
1. downcase
2. associate with its first pronunciation match, with emphases stripped
3. construct input file of the form: `"letter_1\sletter_2\s...letter_n\tphone_1\sphone_2\s...phone_m"`

In [2]:
import nltk

# Load the CMU pronouncing dictionary through NLTK. Later cells use it as a
# mapping from word to a list of pronunciations, where each pronunciation is
# a sequence of phone strings.
# NOTE(review): presumably requires the `cmudict` corpus to be installed
# locally (e.g. via nltk.download) -- confirm.
cmu_dict = nltk.corpus.cmudict.dict()

In [4]:
# Number of entries (distinct words) in the pronunciation dictionary.
len(cmu_dict)

123455

In [34]:
# took ~6sec to run ~1k pairs, therefore to run ~100k pairs will take ~600sec = 10min

# Write the aligner input file: one line per word, of the form
#   "l e t t e r s<TAB>P H O N E S"
# i.e. space-separated letters, a tab, then space-separated phones.
with open('data/m2m_preprocessed_cmudict.txt', 'w') as outfile:
    # .items() (rather than the Python-2-only .iteritems()) works on both
    # Python 2 and Python 3.
    for word, phoneme_list in cmu_dict.items():
        # leave in nonalpha characters of the word, the aligner can handle it
        # also don't need to downcase, because all words in cmudict are already downcased
        #
        # Only the first pronunciation is used. Each phone keeps just its
        # alphabetic characters, which strips the emphasis markers. The
        # explicit join is used because filter() returns an iterator (not a
        # string) on Python 3, which would make ' '.join() below fail.
        clean_phoneme = [''.join(ch for ch in phone if ch.isalpha())
                         for phone in phoneme_list[0]]
        outfile.write(' '.join(word) + '\t' + ' '.join(clean_phoneme) + '\n')

In [43]:
# run m2m-aligner in command line...

# failed to align 35 words out of the total set of 100k, I'd say that's pretty good
# especially considering that the failures are extreme edge-cases of written english

# Each line of the error file is tab-separated; the first field is the word
# written as space-separated letters, so removing the spaces restores the word.
with open('data/m2m_preprocessed_cmudict.txt.m-mAlign.2-2.delX.1-best.conYX.align.err') as infile:
    # Iterate over the file object directly instead of materialising readlines().
    failed_words = [line.strip().split('\t')[0].replace(' ', '') for line in infile]

# print with a single parenthesised argument behaves identically on Python 2
# and Python 3 (the bare `print x` statement is a SyntaxError on Python 3).
print('\n'.join(failed_words))

w's
a42128
c3
c5
c4
jr
m1
m3
mr
ltd
st
se
n92762
m5
m4
w.s
dwi
etc
feb
nov
cxc
dfw
rep.
wm
kwh
aaa
vs
w
c1
x
q
sr
w.
ws
bbq


In [44]:
# Syntax of the aligned pairs read below:
# 1) chunked graphemes/phonemes are divided by '|' symbols
# 2) two graphemes/phonemes which are chunked together in a mapping are separated by a ':'
# 3) graphemes mapping to null-phonemes are denoted by '_'

# Every line is split on tabs into its fields after surrounding whitespace is
# stripped; the result is one list of fields per line of the file.
with open('data/m2m_preprocessed_cmudict.txt.m-mAlign.2-2.delX.1-best.conYX.align') as infile:
    aligned_word_phoneme_pairs = []
    for line in infile:
        fields = line.strip().split('\t')
        aligned_word_phoneme_pairs.append(fields)

In [None]:
# two separate issues here:
# 1) finding the grapheme from the phoneme, and vice versa
# 2) finding the alignment, given that we already have the grapheme/phoneme
# 
# don't worry about the first part, just focus on the second part

In [None]:
# what are we doing with this data?
# want a data structure such that, given a grapheme+phoneme and the inds of a subphoneme, we can easily return the corresponding inds of the subgrapheme,
# and vice versa

# Alignment.grapheme_to_phoneme(grapheme) --> phoneme
# grapheme_to_alignment
# Alignment.grapheme_to_alignment(grapheme) --> alignment tuple list
# Alignment.subgrapheme_to_subphoneme(grapheme, i1, i2) --> subphoneme
# Alignment.subphoneme_to_subgrapheme(phoneme, i1, i2) --> subgrapheme

class GraphemePhonemeAlignment(object):
    '''
    A word (grapheme), its pronunciation (phoneme) and the chunk-by-chunk
    alignment between the two.

    Alignment chunks follow the aligner syntax used in this notebook:
    two letters or two phones that are chunked together are joined by ':',
    and a grapheme chunk that maps to no phone has the phone '_'.
    '''

    # Marker for a grapheme chunk that produces no phone.
    NULL_PHONE = '_'
    # Joins the members of a multi-letter / multi-phone chunk.
    CHUNK_SEPARATOR = ':'

    def __init__(self, grapheme, phoneme, aligned_pairs):
        '''
        grapheme:      the written word
        phoneme:       the pronunciation; its len() must be the number of phones
                       (e.g. a list of phone strings), because phone indexes are
                       validated against len(phoneme)
        aligned_pairs: a list of (graph, phone) pairs, where either entry may
                       contain a double-graph/phone (joined by ':'), and where
                       the phone may be null ('_' or None)
        '''
        self.grapheme = grapheme
        self.phoneme = phoneme
        self.aligned_pairs = aligned_pairs

    def subphoneme_to_subgrapheme(self, phone_i1, phone_i2):
        '''
        Given a pair of (inclusive) phone indexes, return the letters that
        correspond to that run of phones.

        If an index lands in the middle of a multi-phone chunk, the entire
        accompanying grapheme chunk IS returned.

        Null-phone (silent) grapheme chunks are included only when they lie
        between two selected chunks, so that the result is a contiguous piece
        of the word; silent chunks before the first or after the last selected
        phone are left out.

        Limitation: the ':' chunk separator is removed from grapheme chunks, so
        a word that itself contains a literal ':' cannot be represented
        faithfully.

        Raises ValueError if the indexes are out of range or reversed.
        '''
        if not (0 <= phone_i1 <= phone_i2 < len(self.phoneme)):
            raise ValueError('Indices are not permissible')

        pieces = []       # letters of the selected chunks, in word order
        silent_run = []   # silent letters seen since the last selected chunk
        phone_idx = 0     # index of the first phone of the current chunk
        for graph, phone in self.aligned_pairs:
            # Every remaining chunk starts after the requested range.
            if phone_idx > phone_i2:
                break

            letters = graph.replace(self.CHUNK_SEPARATOR, '')

            # null-phone: consumes no phone index; only keep it if a later
            # selected chunk shows that it is interior to the selection
            if phone is None or phone == self.NULL_PHONE:
                if pieces:
                    silent_run.append(letters)
                continue

            # a chunk of k phones joined by ':' covers k consecutive indexes
            width = phone.count(self.CHUNK_SEPARATOR) + 1
            chunk_last_idx = phone_idx + width - 1

            # The chunk starts at or before phone_i2 (checked above), so it
            # overlaps the requested range iff it ends at or after phone_i1.
            if chunk_last_idx >= phone_i1:
                pieces.extend(silent_run)
                silent_run = []
                pieces.append(letters)

            phone_idx += width

        return ''.join(pieces)

In [3]:
# List dictionary words that contain ':'. The aligner output uses ':' to join
# chunked graphemes, so such words are ambiguous in the aligned file.
for g in cmu_dict:
    if ':' in g:
        # parenthesised single-argument print works on Python 2 and Python 3
        print(g)

:colon
