# Creation of rhyming dictionary

**Àlex R. Atrio and Andrei Popescu-Belis, HEIG-VD/HES-SO and EPFL, 2023**

This notebook contains the code that creates a rhyming dictionary from a phonetic dictionary provided by CMU Sphinx at: http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40.

We already provide the rhyming dictionary (in fact, three Python dictionaries bundled as `rhyming_dictionaries.pickle`) for use in `rhyme-counter.ipynb`. It can, however, be re-created by running this notebook on the file `cmudict_SPHINX_40` obtained from [CMU Sphinx](http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/sphinxdict/cmudict_SPHINX_40).

In [1]:
import pickle
import os
import numpy as np

In [2]:
# Load the CMU Sphinx phonetic dictionary. Each kept line is split on tabs:
# field 0 is the word, field 1 its whitespace-separated phonemes.
with open("./cmudict_SPHINX_40", "r") as f:
    all_lines = f.read().split("\n")

# Drop the last element of the split (presumably the empty string after the
# file's final newline -- TODO confirm), then keep lines 64 .. -5 of the rest
# to avoid getting punctuation mark spellings.
raw = all_lines[:-1][64:-5]

# Map each lower-cased word to the list of its lower-cased phonemes.
phonetic_dictionary = {
    fields[0].lower(): fields[1].lower().split()
    for fields in (entry.split("\t") for entry in raw)
}

# Sanity check: number of entries, followed by a small slice of them.
entries = list(phonetic_dictionary.items())
print(len(entries))
print(entries[100:110])

132962
[('abdicate', ['ae', 'b', 'd', 'ah', 'k', 'ey', 't']), ('abdicated', ['ae', 'b', 'd', 'ah', 'k', 'ey', 't', 'ah', 'd']), ('abdicates', ['ae', 'b', 'd', 'ah', 'k', 'ey', 't', 's']), ('abdicating', ['ae', 'b', 'd', 'ih', 'k', 'ey', 't', 'ih', 'ng']), ('abdication', ['ae', 'b', 'd', 'ih', 'k', 'ey', 'sh', 'ah', 'n']), ('abdnor', ['ae', 'b', 'd', 'n', 'er']), ('abdo', ['ae', 'b', 'd', 'ow']), ('abdollah', ['ae', 'b', 'd', 'aa', 'l', 'ah']), ('abdomen', ['ae', 'b', 'd', 'ow', 'm', 'ah', 'n']), ('abdomen(2)', ['ae', 'b', 'd', 'ah', 'm', 'ah', 'n'])]


In [3]:
# Upper-case phoneme symbols that are treated as vowels when locating the
# rhyming part of a word. Note that "W", "Y" and "ER" are part of this list.
phonemic_vowels = [
    "AA", "AE", "AH", "AO", "AW", "AY",
    "EH", "EY",
    "IH", "IY",
    "OW", "OY",
    "UH", "UW",
    "W", "Y",
] + ["ER"]
# phonemic_consonants = ["B","CH","D","DH","F","G","HH","K","L","M","N","NG","JH","P","R","S","SH","T","TH","V","Z","ZH","SIL"]

In [4]:
# We consider the following definition of rhyme from Tim Van de Cruys,
# "Automatic poetry generation from prosaic text" (Proc. of ACL 2020):
#
# identity of "final group of vowels, optionally followed by a group of consonants,
# as well as the group of consonants that precedes the group of vowels."
#
# What is computed below for each word:
#   - assonant key: the last phoneme of the word that is listed in `phonemic_vowels`;
#   - perfect key:  that phoneme and every phoneme after it, concatenated.
# Only the last vowel phoneme is used; the consonants preceding it are not included.
# A word with no phoneme from `phonemic_vowels` falls back to position 0, so its
# whole pronunciation becomes the perfect key and its first phoneme the assonant key.

word2rhymes = {}

# Set membership instead of scanning the list once per phoneme.
vowel_symbols = frozenset(phonemic_vowels)

for key, phonemes in phonetic_dictionary.items():
    # Scan backwards and stop at the first hit: this is the last vowel position.
    final_pos = next(
        (pos for pos in reversed(range(len(phonemes)))
         if phonemes[pos].upper() in vowel_symbols),
        0,
    )
    # The temporaries deliberately do not reuse the name `assonant_rhyme`:
    # that name is bound later in the notebook to a dict that gets pickled, and
    # re-running this cell out of order would otherwise overwrite it with a string.
    asson_key = phonemes[final_pos]
    perf_key = "".join(phonemes[final_pos:])
    word2rhymes[key] = [perf_key, asson_key]

In [5]:
# Print the rhyme keys of a few randomly sampled words as a sanity check.
# A seeded generator keeps the sample identical across "Restart & Run All",
# and sampling without replacement avoids printing the same word twice.
sample_rng = np.random.default_rng(0)
all_words = list(word2rhymes.keys())
# Never request more words than exist (required when replace=False).
for key in sample_rng.choice(all_words, size=min(10, len(all_words)), replace=False):
    print(key, word2rhymes[key])

figallo ['ow', 'ow']
sub ['ahb', 'ah']
fertitta ['ah', 'ah']
mongold ['owld', 'ow']
vigilant ['ahnt', 'ah']
prudhoe ['ow', 'ow']
birden ['ahn', 'ah']
adventuresome ['ahm', 'ah']
naming ['ihng', 'ih']
willem ['ahm', 'ah']


In [6]:
# Drop every key containing a parenthesis, e.g. numbered entries such as
# 'abdomen(2)'. The entry count is printed before and after the filter.
print(len(word2rhymes))
word2rhymes = {
    word: rhymes
    for word, rhymes in word2rhymes.items()
    if not ("(" in word or ")" in word)
}
print(len(word2rhymes))

# Optional: remove keys with fewer than 3 characters:
# word2rhymes = {key: val for key, val in word2rhymes.items() if len(key) >= 3}
# print(len(word2rhymes))

132962
123631


In [8]:
# Invert word2rhymes into two lookup tables, each mapping a rhyme key to the
# list of words that share it. This accelerates lookup when testing rhymes.
perfect_rhyme = {}
assonant_rhyme = {}

for word, rhyme_keys in word2rhymes.items():
    perfect_key = rhyme_keys[0]   # last vowel phoneme plus everything after it
    assonant_key = rhyme_keys[1]  # last vowel phoneme only

    # setdefault creates the word list the first time a key is seen.
    perfect_rhyme.setdefault(perfect_key, []).append(word)
    assonant_rhyme.setdefault(assonant_key, []).append(word)

# Show the first few words of one perfect-rhyme and one assonant-rhyme class.
print(perfect_rhyme["erz"][:10])
print(assonant_rhyme["ah"][:10])

['abductors', 'absorbers', 'abusers', 'accelerators', 'accelerometers', "accor's", 'accumulators', 'accusers', 'achievers', "acker's"]
['a', 'aachen', 'aamodt', 'aardema', 'aaron', "aaron's", 'aarons', 'aaronson', "aaronson's", 'aasen']


In [9]:
# Number of distinct (phonetic) endings obtained with the rhyme keys computed
# above: first for perfect rhymes, then for assonant rhymes, followed by the
# full list of assonant keys.
n_perfect_endings = len(perfect_rhyme)
n_assonant_endings = len(assonant_rhyme)
print(n_perfect_endings)
print(n_assonant_endings)
print(list(assonant_rhyme))

1356
19
['ah', 'ey', 'er', 'eh', 'ao', 'aa', 'iy', 'ae', 'ow', 'ih', 'aw', 'uw', 'ay', 'oy', 'uh', 'w', 'f', 'y', 'th']


In [10]:
# Bundle the three dictionaries, in this order, into a single pickle file:
# word -> rhyme keys, perfect key -> words, assonant key -> words.
rhyming_dictionaries = [word2rhymes, perfect_rhyme, assonant_rhyme]
with open("./rhyming_dictionaries.pickle", "wb") as pickle_file:
    pickle.dump(
        rhyming_dictionaries,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )