In [221]:
from collections import defaultdict, Counter
import re

import pandas as pd
from tqdm.auto import tqdm

In [222]:
# Enable IPython's autoreload extension (mode 2) so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Add the directory two levels up to the import path; presumably that is where
# the `berp` package lives -- confirm against the repository layout.
sys.path.append("../..")
from berp.languages import english

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [223]:
# Location of the CMUdict source file that is parsed below.
input_path = "../../cmudict-0.7b"
# Destination CSV for the word / pronunciation table built by this notebook.
output_path = "cmudict_ipa.csv"

In [224]:
# Matches a parenthesised number such as "(1)"; removed from dictionary words.
brackets_re = re.compile(r"\(\d+\)")
# Matches the digits 0, 1 and 2; removed from pronunciation strings
# (presumably the ARPAbet stress markers -- confirm against the CMUdict docs).
stress_re = re.compile(r"[012]")
# Matches optional whitespace, a "#", and everything up to the end of the string.
comments_re = re.compile(r"\s*#.+$")

In [225]:
# word -> list of pronunciation strings; a defaultdict so that appending to an
# unseen word starts from an empty list.
mapping = defaultdict(list)

In [226]:
# Seed the mapping with the project's override pronunciations. Because this runs
# before the dictionary file is parsed, overrides come first in each word's list.
for word, pron in english.cmudict_overrides.items():
    mapping[word].append(pron)

In [227]:
# Parse the CMUdict file: every entry's pronunciation is converted symbol by
# symbol through english.cmu_ipa_mapping and appended to mapping[word] as a
# space-separated string.
with open(input_path, encoding="latin-1") as f:
    # readlines() materialises the file so tqdm can display a total.
    for line in tqdm(f.readlines()):
        line = line.strip()
        # Skip blank lines and ";;;" comment lines.
        if not line or line.startswith(";;;"):
            continue

        # Split on the first run of whitespace, whatever its width, so the
        # separator may be one space, several spaces, or a tab.
        fields = line.split(None, 1)
        if len(fields) != 2:
            raise ValueError(f"Malformed CMUdict line (no pronunciation): {line!r}")
        word, arpa = fields

        # Remove any "(digits)" marker from the word so that numbered variants
        # collapse onto the same key.
        word = brackets_re.sub("", word)
        # Strip a trailing "# ..." comment from the pronunciation field only.
        # The word field can begin with punctuation (e.g. "{brace"), and
        # presumably with "#" as well, so the whole line must not be filtered.
        arpa = comments_re.sub("", arpa)
        # Drop the stress digits, then split on any whitespace so repeated
        # spaces never produce empty symbols.
        arpa = stress_re.sub("", arpa).split()
        if not arpa:
            raise ValueError(f"Malformed CMUdict line (empty pronunciation): {line!r}")

        # An unknown symbol raises KeyError here, which is intentional: a
        # silent skip would hide gaps in the conversion table.
        result = [english.cmu_ipa_mapping[part] for part in arpa]
        mapping[word].append(" ".join(result))

  0%|          | 0/134429 [00:00<?, ?it/s]

In [228]:
# Flatten the word -> pronunciations mapping into one row per pronunciation:
# (lower-cased word, position of the pronunciation within the word's list, pronunciation).
rows = [
    (entry.lower(), variant_idx, variant)
    for entry, variants in mapping.items()
    for variant_idx, variant in enumerate(variants)
]
df = pd.DataFrame(
    rows,
    columns=["word", "pronunciation_idx", "pronunciation"],
)
df

Unnamed: 0,word,pronunciation_idx,pronunciation
0,was,0,w ʌ z
1,was,1,w ɑ z
2,was,2,w ɑ z
3,wind,0,w ɪ n d
4,wind,1,w aɪ n d
...,...,...,...
134415,{brace,0,b ɹ ɛɪ s
134416,{left-brace,0,l ɛ f t b ɹ ɛɪ s
134417,{open-brace,0,oʊ p ɛ n b ɹ ɛɪ s
134418,}close-brace,0,k l oʊ z b ɹ ɛɪ s


In [229]:
# Write the table to output_path, omitting the DataFrame's integer row index.
df.to_csv(output_path, index=False)

## DEV: check match with Heilbron

In [237]:
# Load the phoneme annotations, using the first two CSV columns as a MultiIndex.
# Later cells group by "run" and "word_idx", so presumably those are the two
# index columns -- confirm against phoneme.csv.
phonemes_df = pd.read_csv("phoneme.csv", index_col=[0, 1])

In [238]:
import itertools
# Inventory of distinct phoneme symbols across all exported pronunciations:
# split every pronunciation on single spaces, then flatten into one set.
cmu_phons = set(itertools.chain.from_iterable(df.pronunciation.str.split(" ")))

In [239]:
# Set of distinct values in the annotation table's "phoneme" column.
heilbron_phons = set(phonemes_df.phoneme)

In [242]:
# The two phoneme inventories must be identical; on failure, report the symbols
# that appear on only one side so the discrepancy can be located immediately.
assert cmu_phons == heilbron_phons, (
    f"phoneme inventories differ -- only in CMUdict: {cmu_phons - heilbron_phons}; "
    f"only in Heilbron: {heilbron_phons - cmu_phons}"
)

### Check individual words

In [243]:
# Compare every annotated word token against the dictionary-derived mapping.
#   misses     -- words with no pronunciation in `mapping` (word -> count)
#   mismatches -- annotated pronunciations not listed for the word (word -> list)
#   matches    -- number of tokens whose annotated pronunciation is listed
misses = Counter()
mismatches = defaultdict(list)
matches = 0
for _, word_data in tqdm(phonemes_df.groupby(["run", "word_idx"])):
    word = word_data.word.iloc[0]
    # Join the token's phonemes with single spaces, the same format used for
    # the pronunciations stored in `mapping`.
    heilbron_pron = word_data.phoneme.str.cat(sep=" ")

    # .get() rather than indexing: indexing a defaultdict would insert an empty
    # entry for every unknown word as a side effect.
    cmu_prons = mapping.get(word)
    if not cmu_prons:
        # A word with no known pronunciation is a miss only; without this
        # `continue` it would also be recorded as a mismatch.
        misses[word] += 1
        continue

    if heilbron_pron in cmu_prons:
        matches += 1
    else:
        mismatches[word].append(heilbron_pron)

  0%|          | 0/10763 [00:00<?, ?it/s]

In [244]:
# (word, number of mismatching tokens) pairs, most frequent first; ties keep
# the order in which the words were first recorded.
sorted(
    ((token, len(observed)) for token, observed in mismatches.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

[('AND', 148),
 ('THE', 71),
 ('THEN', 19),
 ('FROM', 16),
 ('TO', 11),
 ('JUST', 9),
 ('THEY', 8),
 ('NOT', 6),
 ('TOO', 5),
 ('YEARS', 4),
 ('IT', 4),
 ('STRANGE', 4),
 ('AFRICA', 4),
 ('THAT', 4),
 ('JUMP', 4),
 ('OF', 3),
 ('TAKE', 3),
 ('WITH', 3),
 ('I', 3),
 ('HIM', 3),
 ('GENTLY', 3),
 ('THEIR', 2),
 ('EDGE', 2),
 ('YOU', 2),
 ('TELL', 2),
 ('YOUR', 2),
 ('ARE', 2),
 ('A', 2),
 ('WE', 2),
 ('PUT', 2),
 ('DIMAGGIO', 2),
 ('WANTED', 2),
 ('GREAT', 2),
 ('HE', 2),
 ('URINATED', 2),
 ('TODAY', 2),
 ('DROPPED', 2),
 ('WOULD', 2),
 ('DREW', 2),
 ('WERE', 1),
 ('BEER', 1),
 ('CARRIED', 1),
 ('ODOUR', 1),
 ('PLAY', 1),
 ('BANGING', 1),
 ('FEELING', 1),
 ("I'D", 1),
 ('GAMBLE', 1),
 ('CONFIDENCE', 1),
 ('THIS', 1),
 ('GET', 1),
 ('OLD', 1),
 ('THERE', 1),
 ('GO', 1),
 ('OTHER', 1),
 ('PLACE', 1),
 ('SACRED', 1),
 ('WILL', 1),
 ('HALF', 1),
 ('SPREAD', 1),
 ('SHOULDERS', 1),
 ('CARE', 1),
 ('SENT', 1),
 ('TWO', 1),
 ('EAT', 1),
 ('VILLAGE', 1),
 ('WATER', 1),
 ('ANOTHER', 1),
 ('THOSE', 

In [245]:
# Display the words that had no pronunciation in the mapping, with their counts.
misses

Counter()

In [246]:
# Total number of word tokens whose annotated pronunciation was not listed.
sum(map(len, mismatches.values()))

486

In [247]:
# Display every mismatching annotated pronunciation, grouped by word.
mismatches

defaultdict(list,
            {'AND': ['ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
              'æ n',
              'ʌ n',
              'ʌ n',
              'ʌ n',
         

In [248]:
# TODO: fix this -- many of the "from" annotations look incorrect.
# List the concatenated phoneme string of every annotated "FROM" token.
is_from = phonemes_df.word == "FROM"
phonemes_df.loc[is_from].groupby(["run", "word_idx"]).apply(
    lambda token_rows: token_rows.phoneme.str.cat()
)

run  word_idx
1    176         fɹʌm
     205          fɚm
     262          fɚm
2    13           fɚm
     240          fɚm
     305          fɚm
3    51           fɚm
     118         fɹʌm
     153         fɹʌm
     515          fɚm
4    108         fɹʌm
     371         fɹʌm
     484         fɹʌm
6    468          fɚm
7    48           fɚm
     82          fɹʌm
     224         fɹʌm
     504         fɹʌm
9    295          fɚm
10   155          fɚm
     366          fɚm
11   7           fɹʌm
     108         fɹʌm
     152         fɹʌm
     375         fɹʌm
12   398         fɹʌm
13   139          fɚm
     161         fɹʌm
     251          fɚm
14   311         fɹʌm
15   118         fɹʌm
     238          fɚm
     417         fɹʌm
17   153          fɚm
     162         fɹʌm
18   306         fɹʌm
19   472         fɹʌm
     490         fɹʌm
dtype: object