In [39]:
from pathlib import Path
from prosody.aligner import G2PAligner

In [12]:
aligner = G2PAligner('g2p_dict.txt', 'g2p_letters.txt')

In [21]:
# Simply call aligner to get a list of (graphemes, phonemes)
aligner(word='whereby', pron='W EH0 R B AY1'.split())

[('wh', ('W',)),
 ('e', ('EH0',)),
 ('r', ('R',)),
 ('e', ()),
 ('b', ('B',)),
 ('y', ('AY1',))]

In [25]:
# If the word cannot be aligned, the default behaviour is to fall back to inserting silent graphemes or phonemes and substituting mismatched ones
aligner(word='wherebok', pron='W Z EH0 R B AY1'.split())
# in this example, nothing maps to phoneme Z, 'o' gets mapped to AY1 (though invalid) and 'k' maps to nothing

[('wh', ('W',)),
 ('', 'Z'),
 ('e', ('EH0',)),
 ('r', ('R',)),
 ('e', ()),
 ('b', ('B',)),
 ('o', ('AY1',)),
 ('k', ())]

In [27]:
# If you need to align words in which some letters are spelled out by name (e.g. the 'd' in 'three-d' pronounced "D IY")
alignment, valid, traceback = aligner.align_spell_letters(word='three-d', pron='TH R IY1 D IY2'.split())
alignment

[('th', ('TH',)),
 ('r', ('R',)),
 ('ee', ('IY1',)),
 ('-', ()),
 ('d', ('D', 'IY2'))]

In [36]:
# If you need to check whether the graphemes can be aligned or not
alignment, valid, traceback = aligner.align(word='wherebok', pron='W Z EH0 R B AY1'.split())
valid

False

In [38]:
# If you need to check what failed during alignment, pass a list to collect the traceback. Here we expect the alignment to fail on reaching phoneme Z
alignment, valid, traceback = aligner.align(
    word='wherebok', pron='W Z EH0 R B AY1'.split(), traceback=[]
)
traceback

[('wherebok', ['W', 'Z', 'EH0', 'R', 'B', 'AY1']),
 ('erebok', ['Z', 'EH0', 'R', 'B', 'AY1']),
 ('rebok', ['Z', 'EH0', 'R', 'B', 'AY1']),
 ('bok', ['Z', 'EH0', 'R', 'B', 'AY1']),
 ('herebok', ['Z', 'EH0', 'R', 'B', 'AY1']),
 ('erebok', ['Z', 'EH0', 'R', 'B', 'AY1']),
 ('rebok', ['Z', 'EH0', 'R', 'B', 'AY1']),
 ('bok', ['Z', 'EH0', 'R', 'B', 'AY1'])]

In [195]:
# Check the CMU dictionary for words that cannot be aligned;
# for each such word, either our G2P dictionary or the CMU dictionary entry is wrong
import re

def read_word_prons(cmudict_path, encoding='latin-1'):
    """Load ``(word, phonemes)`` pairs from a CMU-format pronunciation dictionary.

    Each entry line has the form ``WORD  PH1 PH2 ...`` (word and pronunciation
    separated by two spaces). Lines starting with ``;`` are comments and are
    skipped, as are blank lines. Entries whose word contains a ``.`` are
    dropped. The word is lower-cased and every character other than ``a-z``
    and ``'`` is removed, which also strips variant markers such as ``(1)``.

    Args:
        cmudict_path: Path to the dictionary file.
        encoding: Text encoding used to read the file. The default, latin-1,
            can decode any byte sequence, so stray non-UTF-8 bytes cannot abort
            the load; non ``a-z`` characters are stripped from words anyway.

    Returns:
        A list of ``(word, phonemes)`` tuples in file order, where ``phonemes``
        is a list of phoneme strings.

    Raises:
        ValueError: If a non-blank, non-comment line has no two-space separator.
    """
    word_prons = []
    with open(cmudict_path, encoding=encoding) as dic:
        for line in dic:
            line = line.rstrip()
            # Blank lines would otherwise break the two-way unpack below.
            if not line or line.startswith(';'):
                continue
            # Split on the first separator only so the pronunciation stays intact.
            word, pron = line.split('  ', maxsplit=1)
            if '.' in word:
                continue
            word = re.sub(r"[^a-z']", '', word.lower())
            # split() without arguments drops empty tokens from repeated spaces.
            word_prons.append((word, pron.split()))
    return word_prons


In [196]:
# Generate tracebacks for any word that can't be aligned
def check_invalids(word_prons, out_path):
    """Write a traceback report for every entry that cannot be aligned.

    Each ``(word, pron)`` pair is first tried with ``aligner.align``; if that
    is not valid it is retried with ``aligner.align_spell_letters``. Only when
    both attempts are invalid is the entry reported: its index in
    ``word_prons``, followed by one tab-prefixed ``subword subpron`` line per
    traceback item from the second attempt, then a blank line.

    Args:
        word_prons: Iterable of ``(word, pron)`` pairs.
        out_path: Path of the report file to (over)write.
    """
    with open(out_path, 'w') as report:
        for index, (word, pron) in enumerate(word_prons):
            # First attempt: plain alignment; its traceback is not reported.
            _, aligned, _ = aligner.align(word, pron, [])
            if aligned:
                continue

            # Second attempt with a fresh traceback list.
            _, aligned, failures = aligner.align_spell_letters(word, pron, [])
            if aligned:
                continue

            report.write(str(index))
            report.writelines(
                f'\t{subword} {subpron}\n' for subword, subpron in failures
            )
            report.write('\n')

In [None]:
word_prons = read_word_prons('cmudict-0.7b.txt')
check_invalids(word_prons, 'invalids')

In [292]:
# If we want to check the penalty for invalid alignments (substitution = 1.5, addition/deletion = 1)
import ast

with open('invalids') as f:
    for line in f:
        # check_invalids writes every traceback entry as
        # "<index>\t<subword> <phoneme list repr>"; only the first entry of a
        # word carries the index, the following ones start directly with the tab.
        # Splitting on the tab (not on two spaces) is what isolates the entry.
        split_line = line.rstrip('\n').split('\t', maxsplit=1)
        if len(split_line) == 2:
            word, pron = split_line[1].split(' ', maxsplit=1)
            # The phonemes were written as a Python list repr; parse them back
            # into a list. Assumes align_fallback takes a phoneme list like the
            # other aligner entry points used in this notebook — TODO confirm.
            alignment, penalty = aligner.align_fallback(word, ast.literal_eval(pron))
            print(alignment, penalty)

In [296]:
# Log any changes made in the updated CMU dictionary to a new file
def _load_cmudict(path):
    """Read a CMU-format dictionary into a ``{word: pronunciation}`` mapping.

    Lines starting with ``;`` are comments and are skipped, as are blank
    lines. Every other line is split on its first two-space separator; the
    word is kept exactly as written and the pronunciation is kept as the raw
    string. A later duplicate word overwrites an earlier one.

    Raises:
        ValueError: If a non-blank, non-comment line has no two-space separator.
    """
    entries = {}
    with open(path) as dic:
        for line in dic:
            line = line.rstrip()
            # Blank lines would otherwise break the two-way unpack below.
            if not line or line.startswith(';'):
                continue
            word, pron = line.split('  ', maxsplit=1)
            entries[word] = pron
    return entries


word_prons = _load_cmudict('cmudict-0.7c.txt')
old_word_prons = _load_cmudict('cmudict-0.7b.txt')

with open('dict_changes.txt', 'w') as f:
    # Words present only in the new dictionary.
    f.write('**** ADDITIONS ****\n\n')
    for word in sorted(word_prons.keys() - old_word_prons.keys()):
        pron = word_prons[word]
        f.write(f'{word}  {pron}\n')
    # Words present only in the old dictionary.
    f.write('\n**** DELETIONS ****\n\n')
    for word in sorted(old_word_prons.keys() - word_prons.keys()):
        old_pron = old_word_prons[word]
        f.write(f'{word}  {old_pron}\n')
    # Words in both dictionaries whose pronunciation string changed.
    f.write('\n**** CORRECTIONS ****\n\n')
    for word in sorted(old_word_prons.keys() & word_prons.keys()):
        old_pron = old_word_prons[word]
        pron = word_prons[word]
        if old_pron != pron:
            f.write(f'{word}  {old_pron} >> {pron}\n')

In [301]:
old_word_prons.keys() & word_prons.keys()

{'BABLER',
 "RISSOLI'S",
 'NISWANDER',
 'SUMMONSES',
 'WHANG',
 'PIGEONHOLED',
 'OAKLEY',
 'FRITCHMAN',
 'INTEGER',
 "DARROW'S",
 'VOMIT',
 'HUGUET',
 'CLASSY',
 'QUES(1)',
 'NONSTARTER',
 'MONTRONE',
 'VINDICATION',
 'FILION',
 'GRANDBABY',
 'RICHCO',
 'AZALIA',
 'FRIL',
 'SELF-SUFFICIENCY',
 'RESPEAK',
 'PATRICE',
 'EURO(1)',
 'MANIC',
 'BROSHEARS',
 'DROMEDARY',
 'QUIZZICAL',
 'TOTMAN',
 'HONKY',
 'TANGUAY',
 'BALDERSON',
 'SCAM',
 "NATIONSBANC'S(1)",
 'CELIS',
 'KOFFLER(1)',
 'DECHERD',
 'KLUMB',
 'HARSHA',
 'ISLER',
 'VISITS',
 'COMMUNIQUE(1)',
 'STRIPER',
 'UREMIA',
 'FEARLESS',
 'JANINA',
 'MAGLICA',
 'POLITICAL',
 'GARSIDE',
 'SCHAEFFER',
 'TWAT',
 'RUDDICK',
 'INFECTING',
 "CUNNINGHAM'S",
 "SOUP'S",
 'GIROUARD',
 'FERNANDES',
 'SCHNIER',
 'PRONG',
 'LOBE',
 'KIS',
 'REVOKE(1)',
 'CRITICALLY',
 'TOWNSON',
 'VANDERWOUDE',
 'CIST',
 'JUNETTE',
 'CONROW',
 'ENIX',
 'OPPORTUNIST',
 'ALVERTON',
 'BUFFETING(1)',
 'LETTER',
 'HASER',
 'FARING',
 'INCUR',
 'MCKENNEY',
 'MOHLER',
 'MORP