In [1]:
# IPython.display is the public module for these helpers; importing them from
# IPython.core.display is deprecated in newer IPython releases.
from IPython.display import display, HTML

# Inject CSS that widens the notebook's ".container" element to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

For every word in the CMU-pronunciation dict:
1. remove word if it contains characters other than :alpha, :hyphen, :underscore
2. associate with its first phoneme match, with stresses stripped
3. construct an input file of the form: `"letter_1\sletter_2\s...letter_n\tphone_1\sphone_2\s...phone_m"`
4. call m2m-aligner on file (https://github.com/letter-to-phoneme/m2m-aligner)
5. process results into data structure that allows for easy portmanteau generation

In [2]:
import nltk

# Load the CMU Pronouncing Dictionary as a plain dict. Later cells iterate it
# as (word, list-of-pronunciations) pairs and only use the first pronunciation.
cmu_dict = nltk.corpus.cmudict.dict()

# Bare last expression: the notebook displays the number of entries.
len(cmu_dict)

123455

In [3]:
import re

# Accept only words made of a-z, '_' and '-'.
# A single quantified character class is sufficient. Wrapping it in another
# quantified group -- ([a-z_\-]+)+ -- matches exactly the same words but can
# backtrack exponentially on words that are rejected late (for example, words
# containing an apostrophe). The raw string keeps the '\-' escape literal.
pattern = re.compile(r"^[a-z_\-]+$")

# took ~6sec to run ~1k pairs, therefore to run ~100k pairs will take ~600sec = 10min
with open('../data/m2m_preprocessed_cmudict.txt', 'w') as outfile:
    # .items() exists on both Python 2 and Python 3 (.iteritems() is Python 2 only)
    for word, phoneme_list in cmu_dict.items():
        # don't need to downcase, because all words in cmudict are already downcased
        # only allow words comprised of:
        # 1) alpha
        # 2) hyphen '-'
        # 3) underscore '_'
        if pattern.match(word):
            # Use only the first pronunciation. Keeping just the alphabetic
            # characters of each phone removes the stress digit (e.g. 'AO1' -> 'AO').
            clean_phoneme = [
                ''.join(ch for ch in str(phone) if ch.isalpha())
                for phone in phoneme_list[0]
            ]
            # One line per word: space-separated letters, TAB, space-separated phones.
            outfile.write(' '.join(word) + '\t' + ' '.join(clean_phoneme) + '\n')

In [4]:
# run m2m-aligner in command line:
# > PATH/TO/M2M_ALIGNER/m2m-aligner --delX --maxX 2 --maxY 2 -i data/m2m_preprocessed_cmudict.txt

# failed to align 21 words out of the total set of 100k, I'd say that's pretty good
# especially considering that the failures are extreme edge-cases of written english

# Collect the words the aligner could not align. For every line of the .err
# file, take the first TAB-separated field and remove the spaces between its
# letters to recover the plain word.
failed_words = []
with open('../data/m2m_preprocessed_cmudict.txt.m-mAlign.2-2.delX.1-best.conYX.align.err') as infile:
    for line in infile:
        spaced_word = line.strip().split('\t')[0]
        failed_words.append(spaced_word.replace(' ', ''))

# A parenthesised single-argument print produces the same output on Python 2
# and Python 3 (the bare print statement is a SyntaxError on Python 3).
print('\n'.join(failed_words))

jr
mr
ltd
st
se
dwi
etc
feb
nov
cxc
dfw
wm
kwh
aaa
vs
w
x
q
sr
ws
bbq


In [5]:
# Syntax of the aligned pairs produced by the aligner:
# 1) chunks of graphemes/phonemes are separated by '|'
# 2) graphemes/phonemes grouped into one chunk of a mapping are joined by ':'
# 3) a grapheme that maps to no phoneme has '_' on the phoneme side

# Read the alignment file into a list of TAB-split fields, one entry per line.
aligned_word_phoneme_pairs = []
with open('../data/m2m_preprocessed_cmudict.txt.m-mAlign.2-2.delX.1-best.conYX.align') as infile:
    for line in infile:
        fields = line.strip().split('\t')
        aligned_word_phoneme_pairs.append(fields)

In [6]:
# Worked example: parse one aligned grapheme/phoneme pair.
seq1 = 'i|m|p|e|l:l|e|d|'
seq2 = 'IH|M|P|EH|L|_|D|'

divider_char = '|'
concat_char = ':'
# '_' is an allowed word character, so on the grapheme side it is kept as-is;
# it is only interpreted as "no phoneme" on the phoneme side.
null_char = '_'

# The dictionary key is the plain spelling: the grapheme sequence with the
# chunk dividers and the in-chunk joiners removed.
hash_key = seq1.replace(divider_char, '').replace(concat_char, '')

# Grapheme chunks: one tuple of letters per chunk (null_char is NOT filtered).
seq1_chunks = seq1.strip(divider_char).split(divider_char)
new_seq1 = [tuple(chunk.split(concat_char)) for chunk in seq1_chunks]

# Phoneme chunks: one tuple of phones per chunk; a null chunk becomes ().
seq2_chunks = seq2.strip(divider_char).split(divider_char)
new_seq2 = [
    () if chunk == null_char else tuple(chunk.split(concat_char))
    for chunk in seq2_chunks
]

# Parenthesised single-argument prints give the same output on Python 2 and 3.
print(hash_key)
print(new_seq1)
print(new_seq2)

impelled
[('i',), ('m',), ('p',), ('e',), ('l', 'l'), ('e',), ('d',)]
[('IH',), ('M',), ('P',), ('EH',), ('L',), (), ('D',)]


In [7]:
# Turn every aligned pair into: plain word -> (grapheme chunks, phoneme chunks),
# where both values are lists of tuples of equal chunk count per the aligner output.
divider_char = '|'
concat_char = ':'
# '_' is kept on the grapheme side and only means "no phoneme" on the phoneme side
null_char = '_'

grapheme_phoneme_alignment_dict = {}

for grapheme_field, phoneme_field in aligned_word_phoneme_pairs:
    # key: the spelling with chunk dividers and in-chunk joiners removed
    word = grapheme_field.replace(divider_char, '').replace(concat_char, '')

    # grapheme side: every chunk becomes a tuple of its letters (nothing filtered)
    grapheme_chunks = [
        tuple(piece.split(concat_char))
        for piece in grapheme_field.strip(divider_char).split(divider_char)
    ]

    # phoneme side: a null chunk becomes the empty tuple
    phoneme_chunks = [
        tuple(piece.split(concat_char)) if piece != null_char else ()
        for piece in phoneme_field.strip(divider_char).split(divider_char)
    ]

    grapheme_phoneme_alignment_dict[word] = (grapheme_chunks, phoneme_chunks)

In [8]:
# Preview the first five entries of the alignment dict: the word, its grapheme
# chunks, its phoneme chunks, then a blank separator line.
# .items() and parenthesised prints work on both Python 2 and Python 3;
# print('') emits the same empty line that a bare Python 2 `print` did.
for shown, (key, val) in enumerate(grapheme_phoneme_alignment_dict.items(), 1):
    print(key)
    print(val[0])
    print(val[1])
    print('')
    if shown == 5:
        break

fawn
[('f',), ('a', 'w'), ('n',)]
[('F',), ('AO',), ('N',)]

sermersheim
[('s',), ('e', 'r'), ('m',), ('e', 'r'), ('s',), ('h',), ('e', 'i'), ('m',)]
[('S',), ('ER',), ('M',), ('ER',), ('S',), ('HH',), ('AY',), ('M',)]

sonji
[('s',), ('o',), ('n',), ('j',), ('i',)]
[('S',), ('AO',), ('N',), ('JH',), ('IY',)]

scheuring
[('s', 'c'), ('h',), ('e',), ('u', 'r'), ('i',), ('n', 'g')]
[('SH',), (), (), ('ER',), ('IH',), ('NG',)]

nunnery
[('n',), ('u',), ('n', 'n'), ('e', 'r'), ('y',)]
[('N',), ('AH',), ('N',), ('ER',), ('IY',)]



In [9]:
import numpy as np

# Worked example: compute, for each phoneme chunk, the [start, end) slice it
# occupies in the flat (un-chunked) phoneme sequence.
graph_chunks, phone_chunks = grapheme_phoneme_alignment_dict['scheuring']

# A list comprehension yields a real list on both Python 2 and Python 3
# (map() returns a one-shot iterator on Python 3).
chunk_lengths = [len(chunk) for chunk in phone_chunks]
# Running total of chunk sizes = exclusive end index of each chunk ...
valid_end_inds = np.cumsum(chunk_lengths)
# ... and the end index minus the chunk's own size is its start index.
# Empty chunks therefore get start == end.
valid_start_inds = valid_end_inds - chunk_lengths

print(phone_chunks)
# list(...) so the pairs are shown on Python 3 too, where zip() is lazy
print(list(zip(valid_start_inds, valid_end_inds)))

[('SH',), (), (), ('ER',), ('IH',), ('NG',)]
[(0, 1), (1, 1), (1, 1), (1, 2), (2, 3), (3, 4)]


In [10]:
import numpy as np

# Map stress-less phones to stressed phones.
# The alignment was computed on phones with the stress digits stripped, so the
# chunk boundaries are re-applied here to the original (stressed) first
# pronunciation from cmu_dict.
grapheme_to_stressed_phoneme_alignment_dict = {}
# .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only)
for grapheme, (graph_chunks, phone_chunks) in grapheme_phoneme_alignment_dict.items():
    # real list on both Python versions (map() is lazy on Python 3)
    chunk_lengths = [len(chunk) for chunk in phone_chunks]
    # exclusive end index of every chunk in the flat phoneme sequence
    valid_end_inds = np.cumsum(chunk_lengths)
    # start index = end index minus the chunk's own length
    valid_start_inds = valid_end_inds - chunk_lengths

    stressed_phoneme = cmu_dict[grapheme][0]
    # slice the stressed phones with the same boundaries; empty chunks stay ()
    phone_chunks_stressed = [
        tuple(stressed_phoneme[start_idx:end_idx])
        for start_idx, end_idx in zip(valid_start_inds, valid_end_inds)
    ]
    grapheme_to_stressed_phoneme_alignment_dict[grapheme] = (graph_chunks, phone_chunks_stressed)

In [11]:
# Preview the first five entries of the stressed alignment dict: the word, its
# grapheme chunks, its stressed phoneme chunks, then a blank separator line.
# .items() and parenthesised prints work on both Python 2 and Python 3;
# print('') emits the same empty line that a bare Python 2 `print` did.
for shown, (key, val) in enumerate(grapheme_to_stressed_phoneme_alignment_dict.items(), 1):
    print(key)
    print(val[0])
    print(val[1])
    print('')
    if shown == 5:
        break

fawn
[('f',), ('a', 'w'), ('n',)]
[(u'F',), (u'AO1',), (u'N',)]

sermersheim
[('s',), ('e', 'r'), ('m',), ('e', 'r'), ('s',), ('h',), ('e', 'i'), ('m',)]
[(u'S',), (u'ER1',), (u'M',), (u'ER0',), (u'S',), (u'HH',), (u'AY0',), (u'M',)]

sonji
[('s',), ('o',), ('n',), ('j',), ('i',)]
[(u'S',), (u'AO1',), (u'N',), (u'JH',), (u'IY0',)]

scheuring
[('s', 'c'), ('h',), ('e',), ('u', 'r'), ('i',), ('n', 'g')]
[(u'SH',), (), (), (u'ER1',), (u'IH0',), (u'NG',)]

nunnery
[('n',), ('u',), ('n', 'n'), ('e', 'r'), ('y',)]
[(u'N',), (u'AH1',), (u'N',), (u'ER0',), (u'IY0',)]



In [12]:
import sys
sys.path.insert(0, '../code')

import re
from global_constants import ARPABET_PHONE_TO_PHONOLOGICAL_PHONE_DICT as a2p_dict

# Build additional arpabet_phoneme to pronunciation_phoneme alignment_dict

# Accept only words made of a-z, '_' and '-'. A single quantified character
# class is used on purpose: nesting it in another quantified group matches the
# same words but can backtrack exponentially on rejected ones.
pattern = re.compile(r"^[a-z_\-]+$")

# maps graphemes to 2-lists of (1) arpa-phone-tuples and (2) feature-phone-tuples
arpabet_phoneme_to_feature_phoneme_alignment_dict = {}
# maps graphemes to the flat list of feature phones of the first pronunciation
grapheme_to_feature_phoneme_dict = {}
# .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only)
for grapheme, phoneme_list in cmu_dict.items():
    # don't need to downcase, because all graphemes in cmudict are already downcased
    # only allow graphemes comprised of:
    # 1) alpha
    # 2) hyphen '-'
    # 3) underscore '_'
    if pattern.match(grapheme):
        first_pronunciation = phoneme_list[0]
        # comma notation is used to create length-1 tuples
        arpabet_phone_chunks = [(phone,) for phone in first_pronunciation]
        # each ARPABET phone is looked up in a2p_dict and its value stored as one tuple
        feature_phone_chunks = [tuple(a2p_dict[phone]) for phone in first_pronunciation]
        # flatten the chunks in order (linear, unlike sum(list_of_lists, []))
        feature_phoneme = [feature for chunk in feature_phone_chunks for feature in chunk]

        arpabet_phoneme_to_feature_phoneme_alignment_dict[grapheme] = (arpabet_phone_chunks, feature_phone_chunks)
        grapheme_to_feature_phoneme_dict[grapheme] = feature_phoneme

In [13]:
import sys
sys.path.insert(0, '../code')

# imported here so this cell does not rely on an earlier cell having imported re
import re

from pronunciation_dictionary import PronunciationDictionary

# Accept only words made of a-z, '_' and '-'. A single quantified character
# class is used on purpose: nesting it in another quantified group matches the
# same words but can backtrack exponentially on rejected ones.
pattern = re.compile(r"^[a-z_\-]+$")

# maps every accepted grapheme to its first (stressed) pronunciation
grapheme_to_stressed_phoneme_dict = {}
# .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only)
for grapheme, phoneme_list in cmu_dict.items():
    # don't need to downcase, because all graphemes in cmudict are already downcased
    # only allow graphemes comprised of:
    # 1) alpha
    # 2) hyphen '-'
    # 3) underscore '_'
    if pattern.match(grapheme):
        grapheme_to_stressed_phoneme_dict[grapheme] = phoneme_list[0]

# Disabled: builds and saves a PronunciationDictionary directly from the two dicts.
# grapheme_to_stressed_phoneme_pronunciation_dictionary = PronunciationDictionary(grapheme_to_stressed_phoneme_dict, grapheme_to_stressed_phoneme_alignment_dict)

# grapheme_to_stressed_phoneme_pronunciation_dictionary.save('../data/grapheme_to_stressed_phoneme_pronunciation_dictionary.pkl')

In [15]:
# Create a list of Word objects, and use it to initialize a PronunciationDictionary
# Requires turning the 'alignment_dict' variables into proper SequenceAlignment objects
# ...
# TODO: BADLY need to clean up this script to make it comprehensible
# Since it really does contain ALL of the preprocessing steps, should really make it a stand-alone .py file
# which can be easily run from the terminal

import sys
sys.path.insert(0, '../code')

# imported here so this cell does not rely on an earlier cell having imported re
import re

from word import Word
from pronunciation_dictionary import PronunciationDictionary
from sequence_alignment import SequenceAlignment

# Accept only words made of a-z, '_' and '-'. A single quantified character
# class is used on purpose: nesting it in another quantified group matches the
# same words but can backtrack exponentially on rejected ones.
pattern = re.compile(r"^[a-z_\-]+$")

# set membership is O(1); the word list is checked once per dictionary entry
failed_word_set = set(failed_words)

word_list = []
# .items() works on both Python 2 and Python 3 (.iteritems() is Python 2 only)
for grapheme, phoneme_list in cmu_dict.items():
    # don't need to downcase, because all words in cmudict are already downcased
    # only allow words comprised of:
    # 1) alpha
    # 2) hyphen '-'
    # 3) underscore '_'
    # words the aligner failed on have no entry in the alignment dict, so skip them
    if pattern.match(grapheme) and grapheme not in failed_word_set:
        # grapheme <-> stressed ARPABET phoneme alignment
        stressed_arpabet_phoneme = grapheme_to_stressed_phoneme_dict[grapheme]
        graph_tuples, stressed_arpabet_phone_tuples = grapheme_to_stressed_phoneme_alignment_dict[grapheme]
        grapheme_to_stressed_arpabet_phoneme_alignment = SequenceAlignment(graph_tuples, stressed_arpabet_phone_tuples)

        # ARPABET phoneme <-> feature phoneme alignment
        feature_phoneme = grapheme_to_feature_phoneme_dict[grapheme]
        arpabet_phone_tuples, feature_phone_tuples = arpabet_phoneme_to_feature_phoneme_alignment_dict[grapheme]
        arpabet_phoneme_to_feature_phoneme_alignment = SequenceAlignment(arpabet_phone_tuples, feature_phone_tuples)

        new_word = Word(
            grapheme,
            stressed_arpabet_phoneme,
            feature_phoneme,
            grapheme_to_stressed_arpabet_phoneme_alignment,
            arpabet_phoneme_to_feature_phoneme_alignment,
        )
        word_list.append(new_word)

PronunciationDictionary(word_list).save('../data/pronunciation_dictionary.pkl')