In [1]:
from collections import defaultdict, Counter
import re

import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Make the directory two levels up importable; presumably this is the
# repository root that contains the `berp` package -- TODO confirm.
sys.path.append("../..")
from berp.languages import english

In [8]:
!wget http://raw.githubusercontent.com/Alexir/CMUdict/master/cmudict-0.7b

--2023-05-08 16:11:45--  http://raw.githubusercontent.com/Alexir/CMUdict/master/cmudict-0.7b
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://raw.githubusercontent.com/Alexir/CMUdict/master/cmudict-0.7b [following]
--2023-05-08 16:11:45--  https://raw.githubusercontent.com/Alexir/CMUdict/master/cmudict-0.7b
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3865710 (3.7M) [text/plain]
Saving to: ‘cmudict-0.7b’


2023-05-08 16:11:46 (36.4 MB/s) - ‘cmudict-0.7b’ saved [3865710/3865710]



## Syllabifier

From https://github.com/georgiee/lip-sync-lpc/blob/master/sources/p2tk/python/syllabify/syllabifier.py

In [29]:
# IPA stress annotations. stringify() emits one of these in front of a
# syllable whose stress level is 1 (primary) or 2 (secondary); the conversion
# loop passes them through unchanged, and they are stripped again when the
# plain "pronunciation" column is derived.
ipa_stress_primary = "ˈ"
ipa_stress_secondary = "ˌ"

In [41]:
# This is the P2TK automated syllabifier. Given a string of phonemes,
# it automatically divides the phonemes into syllables.
#
# By Joshua Tauberer, based on code originally written by Charles Yang.
#
# The syllabifier requires a language configuration which specifies
# the set of phonemes which are consonants and vowels (syllable nuclei),
# as well as the set of permissible onsets.
#
# Then call syllabify with a language configuration object and a word
# represented as a string (or list) of phonemes.
#
# Returned is a data structure representing the syllabification.
# What you get is a list of syllables. Each syllable is a tuple
# of (stress, onset, nucleus, coda). stress is None or an integer stress
# level attached to the nucleus phoneme on input. onset, nucleus,
# and coda are lists of phonemes.
#
# Example:
#
# import syllabifier
# language = syllabifier.English # or: syllabifier.loadLanguage("english.cfg")
# syllables = syllabifier.syllabify(language, "AO2 R G AH0 N AH0 Z EY1 SH AH0 N Z")
#
# The syllables variable then holds the following:
# [ (2, [],     ['AO'], ['R']),
#   (0, ['G'],  ['AH'], []),
#   (0, ['N'],  ['AH'], []),
#   (1, ['Z'],  ['EY'], []),
#   (0, ['SH'], ['AH'], ['N', 'Z'])]
#
# You could process that result with this type of loop:
#
# for stress, onset, nucleus, coda in syllables :
#   print(" ".join(onset), " ".join(nucleus), " ".join(coda))
#
# You can also pass the result to stringify to get a printable
# representation of the syllables, in which every syllable is preceded by
# a marker: a stress mark if the syllable is stressed, otherwise a period:
#
# print(syllabifier.stringify(syllables))
#
#########################################################################

# Built-in language configuration for English, in the format expected by
# syllabify(): three lists of phoneme symbols (without stress digits).
English = dict(
	# Phonemes that may only appear in onsets and codas.
	consonants = "B CH D DH F G HH JH K L M N NG P R S SH T TH V W Y Z ZH".split(),
	# Phonemes that act as syllable nuclei.
	vowels = "AA AE AH AO AW AY EH ER EY IH IY OW OY UH UW".split(),
	# Permissible onsets; a cluster is written as its phonemes joined by
	# single spaces, which is how syllabify() looks candidates up.
	onsets = [
		'P', 'T', 'K', 'B', 'D', 'G', 'F', 'V', 'TH', 'DH',
		'S', 'Z', 'SH', 'CH', 'JH', 'M', 'N', 'R', 'L', 'HH',
		'W', 'Y', 'P R', 'T R', 'K R', 'B R', 'D R', 'G R', 'F R', 'TH R',
		'SH R', 'P L', 'K L', 'B L', 'G L', 'F L', 'S L', 'T W', 'K W', 'D W',
		'S W', 'S P', 'S T', 'S K', 'S F', 'S M', 'S N', 'G W', 'SH W', 'S P R',
		'S P L', 'S T R', 'S K R', 'S K W', 'S K L', 'TH W', 'ZH', 'P Y', 'K Y', 'B Y',
		'F Y', 'HH Y', 'V Y', 'TH Y', 'M Y', 'S P Y', 'S K Y', 'G Y', 'HH W',
		# The empty string allows a syllable with no onset at all.
		'',
	],
)

def loadLanguage(filename) :
	'''Load a language configuration file and return the configuration to be
	passed to the syllabify function.

	The file consists of the section headers [consonants], [vowels] and
	[onsets], each followed by one entry per line. Lines are stripped of
	surrounding whitespace; a blank line inside a section is kept as an
	empty-string entry (which, in [onsets], means "no onset").

	Returns a dict with the keys "consonants", "vowels" and "onsets", each
	mapping to the list of entries of that section in file order.

	Raises ValueError if an entry appears before the first section header,
	if a bracketed header names an unknown section, or if any of the three
	sections ends up empty.'''

	L = { "consonants" : [], "vowels" : [], "onsets" : [] }

	section = None
	# The context manager guarantees the file is closed, including when one
	# of the ValueErrors below is raised mid-file.
	with open(filename, "r") as f :
		for line in f :
			line = line.strip()
			if line.startswith("[") and line.endswith("]") :
				# Any bracketed line is a section header. Reject unknown
				# names here instead of storing the header as a phoneme.
				name = line[1:-1]
				if name not in L :
					raise ValueError("Invalid section: " + name)
				section = name
			elif section is None :
				raise ValueError("File must start with a section header such as [consonants].")
			else :
				L[section].append(line)

	for section in "consonants", "vowels", "onsets" :
		if len(L[section]) == 0 :
			raise ValueError("File does not contain any consonants, vowels, or onsets.")

	return L

def syllabify(language, word) :
	'''Syllabifies the word, given a language configuration (the English
	   dict or one loaded with loadLanguage).

	   word is either a string of space-separated phonemes from the CMU
	   pronouncing dictionary set (with optional stress digits after
	   vowels), or a Python list of phonemes, e.g. "B AE1 T" or
	   ["B", "AE1", "T"]. A "." between phonemes forces a syllable boundary
	   at that position.

	   Returns a list of (stress, onset, nucleus, coda) tuples, one per
	   syllable. stress is None or the integer digit that followed the
	   nucleus; onset, nucleus and coda are lists of phonemes with stress
	   digits removed. The "." boundary marker never appears in the output.
	   A word with consonants but no vowel yields a single syllable whose
	   onset holds all the consonants.

	   Raises ValueError for a phoneme that is neither a vowel nor a
	   consonant of the language.'''

	# isinstance (rather than an exact type comparison) also accepts str subclasses.
	if isinstance(word, str) :
		word = word.split()

	syllables = [] # This is the returned data structure.

	internuclei = [] # Consonants (and "." markers) seen since the last nucleus.

	for phoneme in word :

		phoneme = phoneme.strip()
		if phoneme == "" :
			continue
		stress = None
		if phoneme[-1].isdigit() :
			stress = int(phoneme[-1])
			phoneme = phoneme[0:-1]

		if phoneme in language["vowels"] :
			# Split the consonants seen since the last nucleus into coda and onset.

			coda = None
			onset = None

			# If there is a period in the input, split at the first one.
			if "." in internuclei :
				period = internuclei.index(".")
				coda = internuclei[:period]
				# Any further periods carry no extra information; keep them
				# out of the phoneme lists.
				onset = [p for p in internuclei[period+1:] if p != "."]
				if len(syllables) == 0 :
					# There is no previous syllable to receive a coda. Keep
					# those consonants in the onset rather than dropping
					# them, as the automatic split below also does at the
					# start of a word.
					onset = coda + onset
					coda = []

			else :
				# Make the largest onset we can. The 'split' variable marks the break point.
				for split in range(0, len(internuclei)+1) :
					coda = internuclei[:split]
					onset = internuclei[split:]

					# Stop at the first (i.e. longest) valid onset. Also stop
					# immediately at the start of the word, where an invalid
					# onset is better than a coda that doesn't follow a
					# nucleus, and stop when the onset has shrunk to nothing.
					if " ".join(onset) in language["onsets"] \
					   or len(syllables) == 0 \
					   or len(onset) == 0 :
						break

			# Tack the coda onto the coda of the last syllable. Can't do it if this
			# is the first syllable.
			if len(syllables) > 0 :
				syllables[-1][3].extend(coda)

			# Make a new syllable out of the onset and nucleus.
			syllables.append( (stress, onset, [phoneme], []) )

			# At this point we've processed the internuclei list.
			internuclei = []

		elif phoneme not in language["consonants"] and phoneme != "." :
			raise ValueError("Invalid phoneme: " + phoneme)

		else : # a consonant or a "." boundary marker
			internuclei.append(phoneme)

	# Done looping through phonemes. We may have consonants left at the end,
	# and we may not have found a nucleus at all. Boundary markers left over
	# here delimit nothing, so they are discarded rather than emitted as phonemes.
	leftover = [p for p in internuclei if p != "."]
	if len(leftover) > 0 :
		if len(syllables) == 0 :
			syllables.append( (None, leftover, [], []) )
		else :
			syllables[-1][3].extend(leftover)

	return syllables

def stringify(syllables) :
	'''Render a syllabification returned by syllabify as a single string.

	   Every syllable contributes two space-separated pieces: a marker
	   (the IPA primary stress mark for stress 1, the IPA secondary stress
	   mark for stress 2, a period for anything else) followed by the
	   syllable's phonemes (onset, nucleus, coda) separated by spaces.'''
	marker_for_stress = { 1 : ipa_stress_primary, 2 : ipa_stress_secondary }
	pieces = []
	for stress, onset, nucleus, coda in syllables :
		# Unstressed syllables (stress 0 or None) get the plain "." marker.
		pieces.append(marker_for_stress.get(stress, "."))
		pieces.append(" ".join(onset + nucleus + coda))
	return " ".join(pieces)

## Go

In [32]:
# CMUdict file read by the conversion loop (the name matches the file fetched
# by the wget cell), and the CSV file the resulting table is written to.
input_path = "cmudict-0.7b"
output_path = "cmudict_ipa.csv"

In [33]:
# A parenthesised number such as "(1)"; removed from each dictionary headword
# in the conversion loop.
brackets_re = re.compile(r"\(\d+\)")
# A single stress digit 0, 1 or 2. NOTE(review): not referenced in the cells
# shown here -- confirm whether it is still needed.
stress_re = re.compile(r"[012]")
# Optional whitespace followed by "#" and the rest of the line.
# NOTE(review): not referenced in the cells shown here either.
comments_re = re.compile(r"\s*#.+$")

In [66]:
# word -> list of pronunciation strings, in insertion order. Re-run this cell
# before re-running the cells that fill it, otherwise entries are appended twice.
mapping = defaultdict(list)

In [67]:
# Seed the mapping with the project's override pronunciations first, so they
# come before the CMUdict entries in each word's list (and therefore get the
# lowest pronunciation_idx when the table is built).
# NOTE(review): assumes each override value is a single pronunciation string
# in the same format the conversion loop produces -- confirm.
for word, pron in english.cmudict_overrides.items():
    mapping[word].append(pron)

In [68]:
# Convert every CMUdict entry to a syllabified IPA pronunciation and collect
# the results per word in `mapping`.
with open(input_path, encoding="latin-1") as f:
    # Tokens emitted by stringify() that are syllable/stress markers rather
    # than phonemes; they are copied to the output unchanged.
    syllable_markers = [".", ipa_stress_primary, ipa_stress_secondary]

    # Read all lines up front so that tqdm knows the total for its progress bar.
    for line in tqdm(f.readlines()):
        line = line.strip()
        # Skip ";;;" comment lines, and blank lines (which would otherwise
        # break the two-way unpacking below).
        if not line or line.startswith(";;;"):
            continue

        # The headword is everything up to the first space; the remainder is
        # the ARPABET pronunciation.
        word, arpa = line.split(" ", 1)
        # Drop a parenthesised numeric suffix such as "(1)" so that all
        # variants of a word are collected under the same key.
        word = brackets_re.sub("", word)
        arpa_with_stress = stringify(syllabify(English, arpa)).strip().split(" ")

        # Map each ARPABET phoneme to IPA; let syllable annotations pass through.
        result = [part if part in syllable_markers else english.cmu_ipa_mapping[part]
                  for part in arpa_with_stress]

        mapping[word].append(" ".join(result))

  0%|          | 0/134429 [00:00<?, ?it/s]

In [70]:
# Prepare dataframe
rows = [(word.lower(), i, pron) for word, prons in mapping.items()
        for i, pron in enumerate(prons)]
df = pd.DataFrame(rows, columns=["word", "pronunciation_idx", "pronunciation_syllable"])
df["pronunciation"] = df.pronunciation_syllable.str.replace(f"[.{ipa_stress_primary}{ipa_stress_secondary}]\s+", "", regex=True)
df

Unnamed: 0,word,pronunciation_idx,pronunciation_syllable,pronunciation
0,was,0,ˈ w ʌ z,w ʌ z
1,was,1,ˈ w ɑ z,w ɑ z
2,was,2,. w ɑ z,w ɑ z
3,wind,0,ˈ w ɪ n d,w ɪ n d
4,wind,1,ˈ w aɪ n d,w aɪ n d
...,...,...,...,...
134431,{brace,0,ˈ b ɹ ɛɪ s,b ɹ ɛɪ s
134432,{left-brace,0,ˈ l ɛ f t ˈ b ɹ ɛɪ s,l ɛ f t b ɹ ɛɪ s
134433,{open-brace,0,ˈ oʊ . p ɛ n ˈ b ɹ ɛɪ s,oʊ p ɛ n b ɹ ɛɪ s
134434,}close-brace,0,ˈ k l oʊ z ˈ b ɹ ɛɪ s,k l oʊ z b ɹ ɛɪ s


In [71]:
# Write the table to output_path, leaving out the DataFrame's index column.
df.to_csv(output_path, index=False)