The code here is good enough to support a vanilla run, but not checked closely enough to support a good Berp run. Things we need to work out:

1. mismatches between IPA transcription in CMUdict and IPA transcription in Heilbron (collected in middle of this notebook)
2. some clear mistakes (??) in CMUPhonemizer, see phonemization of "first" as /twɛntifɚst/
3. we should do a final check that, for all words in the annotated Heilbron, their annotated pronunciations match those in cmudict

In [1]:
from collections import Counter
from pathlib import Path
import pickle
import sys
from typing import List

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

%load_ext autoreload
%autoreload 2
sys.path.append(str(Path(".").resolve().parent.parent))
from berp.datasets import NaturalLanguageStimulusProcessor
from berp.languages import english

In [2]:
tokenized_path = "oms.txt"
aligned_words_path = "word.csv"
aligned_phonemes_path = "phoneme.csv"
story_name = "old-man-and-the-sea"

cmu_ipa_dict_path = "../../cmudict-0.7b-ipa.txt"
vocab_path = "../../workflow/heilbron2022/data/frequency/subtlexus2.csv"

output_dir = "old-man-and-the-sea"

model = "distilgpt2"
n_candidates = 10

In [3]:
Path(output_dir).mkdir(exist_ok=True)

## Prepare tokenized data and aligned data

In [4]:
tokens = Path(tokenized_path).read_text().split(" ")

In [5]:
words_df = pd.read_csv(aligned_words_path, index_col=[0, 1])
phonemes_df = pd.read_csv(aligned_phonemes_path, index_col=[0, 1])

In [6]:
phonemes_df.phoneme.value_counts()

ʌ     3630
t     2377
n     2369
d     1958
ɪ     1832
ð     1530
l     1388
i     1342
s     1270
ɹ     1140
h     1015
m      962
ɛ      948
z      935
k      931
w      926
ɚ      762
b      717
æ      715
f      697
aɪ     609
oʊ     596
ɛɪ     526
ɔ      506
v      498
ɑ      493
p      440
ŋ      388
u      368
g      292
ʃ      287
aʊ     263
ʊ      232
θ      204
j      171
ɔɪ     117
tʃ     116
dʒ      79
ɝ        2
ʒ        2
Name: phoneme, dtype: int64

In [7]:
set(phonemes_df.phoneme) - english.ipa_chars

{'ɝ'}

## Prepare frequency data

In [8]:
frequency_df = pd.read_csv(vocab_path, sep="\t")

frequency_df["Word"] = frequency_df.Word.str.lower()
assert frequency_df.Word.value_counts().max() == 1

frequency_df["log_freq"] = -np.log2(frequency_df.FREQcount / frequency_df.FREQcount.sum())

In [9]:
words_df["word_lower"] = words_df.word.str.lower()
old_size = len(words_df)
words_df = pd.merge(words_df.reset_index(), frequency_df[["Word", "log_freq"]], left_on="word_lower", right_on="Word",
                    how="left")
assert len(words_df) == old_size

In [10]:
# Put words with missing frequency in the lowest 2 percentile.
missing_freq = words_df.log_freq.isna()
print(f"{missing_freq.sum()} ({int(missing_freq.mean() * 1000) / 1000}%) words missing frequency values.")
oov_freq = pd.qcut(words_df.log_freq, 50, retbins=True, duplicates="drop")[1][-1]
print(f"Replacing with 2-percentile log-frequency: {oov_freq}")
words_df.loc[missing_freq, "log_freq"] = oov_freq

244 (0.021%) words missing frequency values.
Replacing with 2-percentile log-frequency: 25.56731019333269


---

In [11]:
cmu_phonemizer = english.CMUPhonemizer(cmu_ipa_dict_path)
ipa_chars = set(char for word in cmu_phonemizer.mapping.values() for char in word)

In [12]:
# NB here English "r" is transcribed /r/, because it wasn't distinguished in CMU presumably
cmu_phonemizer.mapping["abbreviation"]

'əbrivieɪʃən'

In [13]:
cmu_phonemizer.mapping["father"]

'fɑðɚ'

In [14]:
dict_ipa_chars = set(phoneme for word in cmu_phonemizer.mapping.values()
                     for phoneme in cmu_phonemizer.ipa_phonemes_re.findall(word))

In [15]:
dict_ipa_chars - set(phonemes_df.phoneme)

{'ə', 'ər'}

In [25]:
[w for w, mapping in cmu_phonemizer.mapping.items() if "ər" in mapping][:20]

['acreage',
 'adarand',
 'agriculturally',
 'ahranat',
 'alfareda',
 'amaretto',
 'anamaria',
 'anchorage',
 'arabia',
 'arabian',
 'arabians',
 'arabichon',
 'arabicon',
 'arachnid',
 'arachnids',
 'arantza',
 'ararat',
 'arau',
 'arida',
 'arkarow']

In [27]:
# TODO this is a weird annotation....
cmu_phonemizer("around")

'ɚaʊnd'

In [16]:
phonemes_df[phonemes_df.phoneme == "ɚ"]

Unnamed: 0_level_0,Unnamed: 1_level_0,phoneme,onset,offset,word_idx,offset_word,word
run,phoneme_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,84,ɚ,8.852,8.969,29,9.09,FIRST
1,116,ɚ,11.531,11.586,39,11.59,AFTER
1,202,ɚ,18.453,18.516,64,18.66,WORST
1,238,ɚ,21.859,21.938,75,22.04,ORDERS
1,246,ɚ,22.422,22.469,77,22.47,ANOTHER
...,...,...,...,...,...,...,...
19,1343,ɚ,132.961,133.055,432,133.06,SHOULDER
19,1496,ɚ,146.945,146.969,482,146.97,WERE
19,1622,ɚ,156.859,156.945,524,157.07,TROUSERS
19,1670,ɚ,160.430,160.484,538,160.87,OVERBOARD


In [21]:
phonemes_df[phonemes_df.phoneme == "ɝ"]

Unnamed: 0_level_0,Unnamed: 1_level_0,phoneme,onset,offset,word_idx,offset_word,word
run,phoneme_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,868,ɝ,88.195,88.258,282,88.26,ARE
19,1268,ɝ,126.984,127.164,410,127.28,STEM


In [26]:
phonemes_df[phonemes_df.word.str.startswith("AR")]

Unnamed: 0_level_0,Unnamed: 1_level_0,phoneme,onset,offset,word_idx,offset_word,word
run,phoneme_idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,396,ɚ,36.961,36.984,129,37.11,AROUND
1,397,aʊ,36.992,37.047,129,37.11,AROUND
1,398,n,37.055,37.078,129,37.11,AROUND
1,399,d,37.086,37.102,129,37.11,AROUND
1,400,ð,37.109,37.156,129,37.11,AROUND
...,...,...,...,...,...,...,...
19,478,ɑ,48.531,48.602,154,48.64,ARE
19,479,ɹ,48.609,48.633,154,48.64,ARE
19,1373,ɑ,135.273,135.531,441,135.65,ARM
19,1374,ɹ,135.539,135.594,441,135.65,ARM


## Note to self

There's more work to do here harmonizing the annotations. That being said, for testing the vanilla model, none of this is necessary. So let's move forward for now!

In [17]:
# TODO ??????
cmu_phonemizer("first")

'twɛntifɚst'

In [19]:
english.cmu_ipa_mapping["ER0"]

'ɚ'

In [22]:
set(phonemes_df.phoneme) - dict_ipa_chars

{'ɛɪ', 'ɝ', 'ɹ'}

In [15]:
PAD_PHONEME = "_"
proc = NaturalLanguageStimulusProcessor(phonemes=list(ipa_chars) + [PAD_PHONEME],
                                        hf_model=model,
                                        num_candidates=n_candidates,
                                        phonemizer=cmu_phonemizer)

Using pad_token, but it is not set yet.


In [18]:
%pdb 1
for run, run_words in tqdm(words_df.groupby("run"), unit="run"):
    run_phonemes = phonemes_df.loc[run]
    
    # Prepare proc metadata input.
    word_to_token = run_words.groupby("word_idx") \
        .apply(lambda x: list(x.token_idx)).to_dict()
    
    ground_truth_phonemes = run_phonemes.groupby("word_idx") \
        .apply(lambda xs: list(xs.phoneme)).to_dict()
    
    # Prepare word-level features.
    word_features = dict(run_words.groupby("word_idx").apply(lambda xs: torch.tensor(xs.iloc[0].log_freq).unsqueeze(0)))
    word_feature_names = ["word_frequency"]
    
    # TODO
    phoneme_features = {idx: torch.zeros((0, 0)) for idx in ground_truth_phonemes}
    phoneme_feature_names = []
    
    # NB `tokens` contains tokens from all runs. so we'll actually be 
    # processing way too much per run. but it's okay I think
    
    stim = proc(story_name, tokens, word_to_token,
                word_features, word_feature_names,
                phoneme_features, phoneme_feature_names,
                ground_truth_phonemes)
    
    with (Path(output_dir) / f"run{run}.pkl").open("wb") as f:
        pickle.dump(stim, f)

Automatic pdb calling has been turned ON


  0%|          | 0/19 [00:00<?, ?run/s]

  0%|          | 0/53 [00:00<?, ?batch/s]

['', '', '', '', '', '', '', '', '', '']
['Ġa', 'Ġin', 'Ġof', 'Ġnear', 'Ġand', 'Ġthat', 'Ġwhere', 'Ġwith', 'Ġon', 'Ġarea']
['ĠWhen', 'Ġat', 'Ġabout', 'Ġwith', 'Ġand', 'Ġor', 'Ġenough', 'Ġbecause', 'Ġthat', 'Ġwhen']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġmade', 'ĠThe', 'ĠA', 'Ġthe', 'Ġand', 'The', 'Ġto', 'Ġof', 'ĠIn', 'ĠThis']
['Ġa', 'ĠThe', 'ĠA', 'Ġthe', 'ĠThis', 'ĠI', 'The', 'Ġto', 'Ġand', 'ĠIt']
['ĠI', 'Ġand', 'Ġwith', 'Ġthat', 'Ġto', 'Ġof', 'Ġfor', 'Ġin', 'Ġride', 'Ġsituation']
['Ġa', 'Ġone', 'Ġmatter', 'Ġway', 'Ġmore', 'Ġdoubt', 'Ġneed', 'Ġproblem', 'Ġcomment', 'Ġother']


  0%|          | 0/53 [00:00<?, ?batch/s]

['No', 'ĠThe', 'The', 'I', 'ĠA', 'A', 'It', 'We', 'In', 'This']
['Ġa', 'ĠI', 'Ġthey', 'Ġthat', 'Ġthe', 'Ġit', 'Ġwe', 'Ġfor', 'Ġtheir', 'Ġnot']
['', '', '', '', '', '', '', '', '', '']
['I', 'ĠThe', 'The', 'ĠA', 'I', 'A', 'ĠThis', 'ĠI', 'We', 'It']


  0%|          | 0/53 [00:00<?, ?batch/s]

['But', 'ĠThe', 'The', 'ĠA', 'I', 'A', 'ĠThis', 'ĠI', 'We', 'It']
['', '', '', '', '', '', '', '', '', '']
['I', 'Ġit', 'Ġme', 'Ġyour', 'Ġthe', 'Ġthis', 'Ġmy', 'Ġa', 'Ġthat', 'Ġsome']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Come', 'ĠThe', 'The', 'I', 'ĠA', 'A', 'It', 'We', 'In', 'This']
['', '', '', '', '', '', '', '', '', '']
['ĠI', 'Ġand', 'Ġthat', 'Ġto', 'Ġwhere', 'Ġwith', 'Ġin', 'Ġare', 'Ġas', 'Ġor']
['He', 'Ġof', 'Ġand', 'Ġin', 'Ġago', 'Ġto', 'Ġafter', 'Ġor', 'Ġbefore', 'Ġare']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġdays', 'ĠThe', 'ĠA', 'Ġthe', 'Ġand', 'Ġto', 'The', 'ĠI', 'Ġof', 'ĠIn']
['', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '']
['ĠThe', 'Ġthe', 'Ġaround', 'Ġto', 'Ġin', 'Ġand', 'Ġthrough', 'Ġa', 'Ġon', 'Ġup']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġbe', 'ĠThe', 'ĠA', 'Ġthe', 'Ġto', 'The', 'ĠI', 'I', 'Ġa', 'ĠThis']
['', '', '', '', '', '', '', '', '', '']
['ĠSometimes', 'y', 'ing', 'er', 'ers', 'Ġand', 'iest', 'ies', 'Ġline', "'s"]


  0%|          | 0/53 [00:00<?, ?batch/s]

['its', 'ĠThe', 'ĠA', 'I', 'The', 'Ġthe', 'A', 'B', 'S', 'Ġto']
['', '', '', '', '', '', '', '', '', '']
['Ġa', 'Ġand', 'Ġso', 'Ġin', 'Ġfor', 'Ġas', 'Ġbut', 'Ġmore', 'Ġbecause', 'Ġtoo']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġher', 'ĠThe', 'ĠA', 'Ġthe', 'The', 'Ġto', 'I', 'A', 'Ġand', 'Ġof']
['Ġa', 'Ġin', 'Ġand', 'Ġof', 'Ġthat', 'Ġas', 'Ġat', 'Ġon', 'Ġso', 'Ġto']
['', '', '', '', '', '', '', '', '', '']
['ĠJust', 'Ġand', 'Ġin', 'Ġas', 'Ġbecause', 'Ġfor', 'Ġwith', 'Ġon', 'Ġto', 'Ġbut']


  0%|          | 0/53 [00:00<?, ?batch/s]

['ĠI', 'ĠThe', 'ĠA', 'Ġthe', 'Ġand', 'I', 'ĠIt', 'The', 'ĠThis', 'ĠIn']
['Ġa', 'Ġto', 'Ġas', "'s", 'Ġfor', 'Ġin', 'Ġand', 'Ġwith', 'Ġor', 'Ġthat']
['', '', '', '', '', '', '', '', '', '']
['A', 'Ġand', 'Ġpaint', 'Ġin', 'Ġon', 'Ġto', 'Ġthat', 'ish', 'Ġlight', 'ing']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġsome', 'ĠThe', 'ĠA', 'Ġthe', 'ĠI', 'The', 'I', 'ĠIt', 'ĠThis', 'Ġto']
['Ġa', 'ĠThe', 'ĠA', 'Ġthe', 'Ġto', 'Ġand', 'The', 'Ġa', 'Ġof', 'ĠI']
['The', 'Ġin', 'Ġand', 'Ġfor', 'Ġon', 'Ġup', 'Ġat', 'Ġout', 'Ġto', 'Ġwhile']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġliver', 'ĠThe', 'ĠA', 'Ġthe', 'The', 'Ġand', 'I', 'Ġto', 'ĠThis', 'S']
['', '', '', '', '', '', '', '', '', '']
['ĠI', 'Ġthe', 'Ġwhat', 'Ġhis', 'Ġhow', 'Ġwhen', 'Ġthat', 'Ġhim', 'Ġit', 'Ġher']
['ĠI', 'ing', 'fl', 'ed', 'Ġand', 'Ġcovered', 'sh', 'balls', 'Ġin', 'board']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġblue', 'ĠThe', 'ĠA', 'Ġthe', 'The', 'ĠI', 'I', 'ĠIn', 'ĠIt', 'ĠThis']
['', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '']
['ĠThen', 'Ġand', 'Ġto', 'Ġin', 'Ġas', 'Ġso', 'Ġagain', 'Ġall', 'Ġnow', 'Ġlike']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġmouth', 'ĠThe', 'ĠA', 'Ġthe', 'The', 'ĠI', 'Ġto', 'I', 'Ġand', 'ĠIn']
['ĠI', 'Ġto', 'Ġfor', 'Ġand', 'Ġwith', 'Ġin', 'Ġif', 'Ġon', 'Ġnow', 'Ġor']
['ĠI', 'Ġthe', 'Ġto', 'Ġa', 'Ġand', 'Ġin', 'Ġon', 'Ġthere', 'Ġwith', 'Ġhere']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġdown', 'ĠThe', 'ĠA', 'Ġto', 'Ġthe', 'The', 'Ġa', 'ĠThis', 'Ġand', 'ĠIn']
['Ġa', 'Ġin', 'Ġto', 'Ġon', 'Ġat', 'Ġor', 'Ġfrom', 'Ġand', 'Ġfor', 'Ġuntil']
['', '', '', '', '', '', '', '', '', '']
['ĠDuring', 'Ġand', 'Ġin', 'Ġfor', 'Ġenough', 'Ġfood', 'Ġon', 'Ġhis', 'Ġthe', 'Ġwith']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġeat', 'ĠThe', 'Ġthe', 'ĠA', 'The', 'Ġa', 'Ġto', 'ĠThis', 'ĠIt', 'ĠIn']
['Ġa', 'ĠThe', 'ĠA', 'Ġthe', 'Ġto', 'The', 'ĠI', 'I', 'Ġand', 'ĠIt']
['Ġa', 'Ġoff', 'Ġup', 'Ġthe', 'Ġa', 'Ġhis', 'Ġhim', 'Ġthat', 'Ġhow', 'Ġsome']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġhis', 'ĠThe', 'Ġthe', 'ĠA', 'Ġof', 'Ġto', 'The', 'ĠI', 'Ġand', 'ĠThis']
['ĠI', 'y', 'er', 'ed', 'ing', 'ers', 'Ġand', 'Ġis', "'s", 'es']
['Ġa', 'Ġwas', "'s", 'Ġwho', 'Ġis', 'Ġand', 'Ġin', 'Ġhad', 'Ġhas', 'Ġsaid']
['He', 'Ġand', 'Ġas', 'Ġto', 'Ġon', 'Ġin', 'Ġbut', 'Ġwith', 'Ġthe', 'Ġat']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġsoftly', 'ĠThe', 'Ġthe', 'ĠA', 'The', 'Ġto', 'ĠI', 'I', 'ĠIn', 'ĠThis']
['', '', '', '', '', '', '', '', '', '']
['ĠJust', 'Ġthan', 'Ġwith', 'Ġand', 'Ġin', 'Ġon', 'Ġplaying', 'Ġat', 'Ġto', 'Ġgoing']


  0%|          | 0/53 [00:00<?, ?batch/s]

['Ġon', 'ĠThe', 'ĠA', 'Ġthe', 'Ġto', 'Ġand', 'The', 'ĠThis', 'Ġa', 'Ġof']
['', '', '', '', '', '', '', '', '', '']
['', '', '', '', '', '', '', '', '', '']
['I', 'Ġof', 'Ġend', 'Ġand', 'Ġto', 'gate', 'Ġsection', 'Ġside', 'Ġin', 'Ġwith']


In [None]:
# TODO for words with ground-truth pronunciations in CMUdict, check proportion which
# agree with the corpus. Maybe 100% because this is automatic and the forced aligner
# draws on Kaldi/CMU reprs?