In [60]:
from collections import defaultdict, Counter
import re

import pandas as pd
from tqdm.auto import tqdm

In [44]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../..")
from berp.languages import english

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
# Path of the raw CMUdict file that is opened and parsed further down.
input_path = "../../cmudict-0.7b"
# Destination of the converted word/pronunciation table (written with DataFrame.to_csv).
output_path = "cmudict_ipa.csv"

In [46]:
# Matches a parenthesised number such as "(1)"; it is removed from headwords when parsing.
brackets_re = re.compile(r"\(\d+\)")
# Matches the digits 0, 1 and 2, which are stripped from the phone string before lookup.
stress_re = re.compile(r"[012]")
# Matches a "#..." tail (and the whitespace before it) running to the end of the line.
# NOTE(review): not referenced by any cell visible here -- confirm whether it is still needed.
comments_re = re.compile(r"\s*#.+$")

In [133]:
# word -> list of pronunciation strings (space-separated symbols), in insertion order.
mapping = defaultdict(list)

In [134]:
# Manual insertions of preferred pronunciations over what is in CMUdict.
# Keys are upper-case words; values are space-separated phoneme symbols.
# When the cells are run in order these are appended to `mapping` before the
# CMUdict entries, so they end up first in each word's pronunciation list.
preferred = {
    "WAS": "w ʌ z",
    "WIND": "w ɪ n d",  # CMUdict's own entry for this word is "w aɪ n d"

    "SALAO": "s æ l ɑ u",
    "FURLED": "f ɚ l d",
    "CREASED": "k ɹ i s t",
    "EROSIONS": "ɪ ɹ oʊ ʒ ʌ n",  # NOTE(review): no final "z" although the word is plural -- confirm
    "FISHLESS": "f ɪ ʃ l ʌ s",
    "BUDSHIELDS": "b ʌ d ʃ i l d z",
    "FIBERED": "f aɪ b ɚ d",
    "PERICO": "p v ɹ ɪ k oʊ",  # NOTE(review): the "v" looks like a typo for a vowel -- confirm
    "BAREFOOTED": "b ɛ ɹ f ʊ t ʌ d",
    "HATUEY": "h æ t j ʌ i",
    "SISLER'S": "s ɪ s l ɚ z",
    "JOTA": "h oʊ t ʌ",
    "VA": "v ɑ",
    "OAKUM": "oʊ k ʌ m",
    "HARBOURS": "h ɑ ɹ b ɚ z",
    "ROADSTEADS": "ɹ oʊ d s t ɛ d z",
    "URINATED": "j ɚ ʌ n ɛɪ t ɪ d",
    "MANOLIN": "m æ n ʌ l ɪ n",
    "PEBBLED": "p ɛ b ʌ l d",
    "MOTORBOATS": "m oʊ t ɚ b oʊ t z",
    "ALBACORES": "æ l b ʌ k ɔ ɹ z",
    "INEFFECTUALLY": "ɪ n ʌ f ɛ k tʃ u ʌ l i", # NB: not the same as the annotation
    "WELTS": "w ɛ l t z",
    "FALSEST": "f ɑ l s ʌ s t",
    "CARAPACED": "k ɛ ɹ ʌ p ɛɪ s t",
    "GRIPPES": "g ɹ ɪ p z",
    "TUNA'S": "t u n ʌ z",
    "UNINTELLIGENT": "ʌ n ɪ n t ɛ l ʌ dʒ ʌ n t",
    "SARDINE'S": "s ɑ ɹ d i n z",
    "BITT": "b ɪ t",
    "PHOSPHORESCENT": "f ɑ s f ʌ ɹ ɛ s ʌ n t",
    "GAFFED": "g æ f t",
    "TREACHERIES": "t ɹ ɛ tʃ ɚ i s",
    "GUNWALE": "g ʌ n w ɛɪ l",
    "CARDEL": "k ɑ ɹ d ʌ l", # TODO: possible typo in the annotation -- should this be "cordel"?
    "BROADBILL": "b ɹ ɔ d b ɪ l",
    "COAGULATED": "k oʊ æ g j ʌ l ɛɪ t ʌ d",
    "WINDLESS": "w ɪ n d l ʌ s",
    "LONGITUDINALLY": "l ɑ n dʒ ʌ t u d ʌ n ʌ l i",
}

# Register every preferred pronunciation in `mapping`.
# Each run of this loop appends again, so re-create `mapping` before re-running it.
for word, pron in preferred.items():
    mapping[word].append(pron)

In [135]:
# Parse the CMUdict file and append one IPA pronunciation per entry to `mapping`.
# Entries are appended after whatever `mapping` already holds for that word.
with open(input_path, encoding="latin-1") as f:
    # readlines() materialises the file so tqdm knows the total and can show progress.
    for line in tqdm(f.readlines()):
        line = line.strip()
        # Skip blank lines (they would break the unpacking below) and ";;;" lines.
        if not line or line.startswith(";;;"):
            continue

        # Everything before the first space is the headword; the rest is the phone string.
        word, arpa = line.split(" ", 1)
        # Drop an alternate-pronunciation marker such as "(1)" so variants share one key.
        word = brackets_re.sub("", word)
        # Remove stress digits, then split into individual phone symbols.
        arpa = stress_re.sub("", arpa).strip().split(" ")

        # Translate each symbol through the project's lookup table; a symbol missing
        # from the table raises here instead of being silently dropped.
        mapping[word].append(" ".join(english.cmu_ipa_mapping[part] for part in arpa))

  0%|          | 0/134429 [00:00<?, ?it/s]

In [136]:
# Flatten `mapping` into one row per (word, pronunciation) pair.
# The word is lower-cased; pronunciation_idx is the position within that word's list.
rows = [
    (headword.lower(), position, ipa_string)
    for headword, variants in mapping.items()
    for position, ipa_string in enumerate(variants)
]
df = pd.DataFrame(rows, columns=["word", "pronunciation_idx", "pronunciation"])
df

Unnamed: 0,word,pronunciation_idx,pronunciation
0,was,0,w ʌ z
1,was,1,w ɑ z
2,was,2,w ɑ z
3,wind,0,w ɪ n d
4,wind,1,w aɪ n d
...,...,...,...
134409,{brace,0,b ɹ ɛɪ s
134410,{left-brace,0,l ɛ f t b ɹ ɛɪ s
134411,{open-brace,0,oʊ p ɛ n b ɹ ɛɪ s
134412,}close-brace,0,k l oʊ z b ɹ ɛɪ s


In [137]:
# Write the table to `output_path`; index=False keeps the row index out of the CSV.
df.to_csv(output_path, index=False)

## DEV: check match with Heilbron

In [138]:
# Load the phoneme annotations, using the first two CSV columns as a two-level row index.
# NOTE(review): later cells group by "run" and "word_idx" -- presumably these two levels; confirm.
phonemes_df = pd.read_csv("phoneme.csv", index_col=[0, 1])

In [139]:
import itertools

# Inventory of every distinct symbol that occurs in any pronunciation string of `df`.
cmu_phons = set(
    itertools.chain.from_iterable(
        entry.split(" ") for entry in df["pronunciation"]
    )
)

In [140]:
# Inventory of distinct values in the annotation table's `phoneme` column.
heilbron_phons = set(phonemes_df["phoneme"])

In [141]:
# Symbols used by the converted dictionary that never occur in the annotation.
cmu_phons.difference(heilbron_phons)

set()

In [142]:
# Symbols used by the annotation that the converted dictionary never produces.
heilbron_phons.difference(cmu_phons)

{'ɝ'}

### Check individual words

In [143]:
# Compare every annotated word token with the dictionary pronunciations of that word.
misses = Counter()              # word -> tokens for which the dictionary has no pronunciation
mismatches = defaultdict(list)  # word -> annotated pronunciations not found in the dictionary
matches = 0                     # tokens whose annotation equals one dictionary pronunciation
for _, word_data in tqdm(phonemes_df.groupby(["run", "word_idx"])):
    word = word_data.word.iloc[0]
    # Join the token's phoneme rows into the same space-separated form used in `mapping`.
    heilbron_pron = word_data.phoneme.str.cat(sep=" ")

    # .get() does not trigger the defaultdict factory, so unknown words are not
    # inserted into `mapping` as a side effect of this check.
    cmu_prons = mapping.get(word)
    if not cmu_prons:
        # No dictionary entry at all: count it as a miss only, not also as a mismatch.
        misses[word] += 1
        continue

    if heilbron_pron in cmu_prons:
        matches += 1
    else:
        mismatches[word].append(heilbron_pron)

  0%|          | 0/10737 [00:00<?, ?it/s]

In [144]:
# Rank words by how many of their annotated tokens failed to match, most frequent first.
sorted(
    ((word, len(unmatched)) for word, unmatched in mismatches.items()),
    key=lambda entry: -entry[1],
)

[('THE', 204),
 ('AND', 197),
 ('HE', 65),
 ('OF', 34),
 ('TO', 30),
 ('WAS', 29),
 ('I', 27),
 ('IT', 24),
 ('OLD', 22),
 ('THEN', 22),
 ('IN', 20),
 ('AS', 19),
 ('WITH', 19),
 ('FROM', 18),
 ('THAT', 18),
 ('HIS', 18),
 ('THEY', 17),
 ('YOU', 17),
 ('MAN', 14),
 ('FISH', 13),
 ('THEM', 12),
 ('A', 11),
 ('NOT', 11),
 ('WILL', 11),
 ('SAID', 11),
 ('FOR', 10),
 ('IS', 10),
 ('BUT', 10),
 ('LINE', 10),
 ('HAD', 9),
 ('HIM', 9),
 ('JUST', 9),
 ('ALMOST', 9),
 ('ON', 9),
 ('WERE', 8),
 ('WATER', 8),
 ('ALL', 8),
 ('MUST', 7),
 ('MADE', 7),
 ('WHAT', 7),
 ('TWO', 7),
 ('CAN', 7),
 ('NOW', 7),
 ('TOO', 7),
 ('BOY', 7),
 ('HAVE', 7),
 ('STILL', 6),
 ('WHEN', 6),
 ('TODAY', 6),
 ('THOUGHT', 6),
 ('LONG', 6),
 ('BY', 6),
 ('ONLY', 6),
 ('GREAT', 6),
 ('WOULD', 6),
 ('COULD', 6),
 ('WHERE', 5),
 ('DID', 5),
 ('AT', 5),
 ('ARE', 5),
 ('STRANGE', 5),
 ('THERE', 5),
 ('DO', 5),
 ('OUT', 5),
 ('SO', 5),
 ('EAT', 5),
 ('STARTED', 5),
 ('LIKE', 5),
 ('HURT', 5),
 ('AGAINST', 5),
 ('HER', 5),
 ('NO'

In [145]:
# Display the per-word count of tokens that had no dictionary pronunciation.
misses

Counter()

In [146]:
# Display, per word, the annotated pronunciations that matched no dictionary entry.
mismatches

defaultdict(list,
            {'WAS': ['w ʌ z ʌ',
              'ʌ z',
              'ʌ z',
              'ʌ z',
              'ʌ z',
              'w ʌ z t',
              'ʌ z',
              'ʌ z',
              'ʌ z',
              'w ʌ z ɔ',
              'ʌ z',
              'ʌ z',
              'w ʌ z h',
              'w ʌ z n',
              'ʌ z',
              'w ʌ z n',
              'w ʌ z oʊ',
              'w ʌ z ʌ',
              'ʌ z',
              'w ʌ z j',
              'ʌ z',
              'w ʌ z s',
              'w ʌ z ʃ',
              'ʌ z t',
              'w ʌ z ð',
              'ʌ z',
              'ʌ z',
              'ʌ z',
              'w ʌ z oʊ'],
             'AN': ['n', 'ʌ n ɚ', 'ʌ n oʊ'],
             'PARENTS': ['p ɛ ɹ ʌ n t s h'],
             'HAD': ['æ d',
              'æ d',
              'h æ d p',
              'h æ d ɔ',
              'h æ d θ',
              'h æ d f',
              'h æ d s',
              'h æ d n',
              'æ d']

In [152]:
# Spot-check the pronunciations stored for a single word.
mapping["BEER"]

['b ɪ ɹ']

In [150]:
# TODO fix this -- lots of incorrect annotations of "from"
# One entry per (run, word_idx) token of "FROM": its phoneme rows concatenated without a separator.
(
    phonemes_df[phonemes_df["word"] == "FROM"]
    .groupby(["run", "word_idx"])
    .apply(lambda token: token["phoneme"].str.cat())
)

run  word_idx
1    176          fɹʌm
     205           fɚm
     262            ɚm
2    13            fɚm
     240           fɚm
     305          fɚmt
3    51             ɚm
     118          fɹʌm
     153          fɹʌm
     515           fɚm
4    108          fɹʌm
     371          fɹʌm
     484          fɹʌm
6    468            ɚm
7    48           fɚmð
     82            ɹʌm
     224          fɹʌm
     504          fɹʌm
9    295           fɚm
10   155           fɚm
     366           fɚm
11   7            fɹʌm
     108          fɹʌm
     152          fɹʌm
     375          fɹʌm
12   398          fɹʌm
13   139           fɚm
     161          fɹʌm
     251           fɚm
14   311          fɹʌm
15   118          fɹʌm
     238           fɚm
     417          fɹʌm
17   153           fɚm
     162          fɹʌm
18   306          fɹʌm
19   472         fɹʌmð
     490          fɹʌm
dtype: object