In [2]:
from collections import Counter
from pathlib import Path
import pickle
import sys
from typing import List

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

# Re-import edited project modules automatically, so changes to local source
# files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Put the directory two levels above the working directory on the import path.
# Presumably this is the repository root containing the `berp` package — confirm.
sys.path.append(str(Path(".").resolve().parent.parent))
from berp.datasets import NaturalLanguageStimulusProcessor
from berp.languages import english

In [112]:
# --- Story inputs (all paths relative to the working directory) ---
story_name = "lw1"
tokenized_path = "lw1.tokenized.txt"   # read and split into `tokens`
aligned_words_path = "word.csv"        # loaded as `words_df`
aligned_phonemes_path = "phoneme.csv"  # loaded as `phonemes_df`

# --- Pronunciation dictionary handed to the CMU phonemizer ---
cmu_ipa_dict_path = "../../cmudict-0.7b-ipa.txt"

# --- Stimulus processor settings ---
model, n_candidates = "distilgpt2", 10

# --- Destination of the pickled stimulus ---
output_path = "lw1.pkl"

In [90]:
# Load the tokenized story and cut it on single space characters.
# NOTE(review): splitting on " " rather than on any whitespace keeps newline
# characters inside tokens (e.g. a trailing "\n" on the last one) — confirm this
# matches how the token indices in the alignment tables were produced.
raw_text = Path(tokenized_path).read_text()
tokens = raw_text.split(" ")

In [28]:
# Load the word-level and phoneme-level tables; in both files the first
# CSV column is used as the row index.
words_df, phonemes_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (aligned_words_path, aligned_phonemes_path)
)

In [29]:
# The phonemes in this corpus were produced by Gentle. Each phoneme string has the form
#
#     <CMUDict_phoneme>_{B,I,E,S}
#
# The suffix marks the phoneme's position within its word: B = beginning, I = middle
# (inside), E = end, and S = singleton (the only phoneme of a single-phoneme word).
phonemes_df["phoneme"].value_counts()

ah_I    119
ih_I     98
n_I      87
t_E      77
r_I      68
       ... 
zh_I      1
hh_I      1
iy_B      1
oy_E      1
ch_B      1
Name: phoneme, Length: 101, dtype: int64

In [30]:
# Drop the Gentle word-position suffix ("_B", "_I", "_E" or "_S") from every phoneme.
# `str.rstrip` treats its argument as a *set of characters*, not as a pattern, so
# `rstrip(r"_[BIES]")` would also eat any trailing "_", "[", "]", "B", "I", "E" or "S"
# that belongs to the phoneme itself. An anchored regex removes exactly one suffix.
phonemes_df["phoneme"] = phonemes_df["phoneme"].str.replace(r"_[BIES]$", "", regex=True)

In [31]:
# Sanity check: list the distinct phoneme labels left after suffix removal.
phonemes_df["phoneme"].unique()

array(['t', 'eh', 'r', 'ah', 's', 'uh', 'd', 'aa', 'k', 'ih', 'l', 'w',
       'ey', 'ng', 'f', 'er', 'dh', 'ay', 'n', 'iy', 'g', 'm', 'aw', 'ae',
       'uw', 'p', 'v', 'jh', 'b', 'z', 'oy', 'hh', 'ow', 'sh', 'ao', 'th',
       'ch', 'zh', 'y'], dtype=object)

---

In [81]:
# Build the dictionary-backed phonemizer, then collect every individual character
# that occurs in any of its IPA transcriptions.
cmu_phonemizer = english.CMUPhonemizer(cmu_ipa_dict_path)
ipa_chars = {
    symbol
    for transcription in cmu_phonemizer.mapping.values()
    for symbol in transcription
}

In [72]:
# NB: English "r" is transcribed here as plain /r/ — presumably because CMUdict
# does not distinguish it from other rhotic sounds.
cmu_phonemizer.mapping["abbreviation"]

'əbrivieɪʃən'

In [87]:
# Characters used by the dictionary transcriptions that are missing from
# `english.ipa_chars`.
# NOTE(review): transcriptions are iterated character by character, so a
# multi-character phoneme such as "eɪ" contributes "e" and "ɪ" separately —
# presumably the source of any leftover plain vowels; confirm.
dict_ipa_chars = {
    symbol
    for transcription in cmu_phonemizer.mapping.values()
    for symbol in transcription
}
dict_ipa_chars.difference(english.ipa_chars)

{'a', 'e', 'o'}

In [89]:
# Symbol appended to the phoneme inventory as padding.
PAD_PHONEME = "_"

# `ipa_chars` is a set of strings, whose iteration order changes between Python
# processes (string hashing is randomized per process). Sorting gives the phoneme
# inventory a stable order, so re-running the notebook in a fresh kernel
# passes the same phoneme list to the processor every time.
phoneme_inventory = sorted(ipa_chars) + [PAD_PHONEME]

proc = NaturalLanguageStimulusProcessor(phonemes=phoneme_inventory,
                                        hf_model=model,
                                        num_candidates=n_candidates,
                                        phonemizer=cmu_phonemizer)

Using pad_token, but it is not set yet.


In [95]:
# Prepare proc metadata input: map each word index to the list of token indices
# aligned with that word. Selecting the `token_idx` column before applying `list`
# keeps the grouping column out of the applied function (avoiding pandas'
# deprecation warning for `DataFrameGroupBy.apply`) and always yields a Series.
word_to_token = (
    words_df.groupby("word_idx")["token_idx"]
    .apply(list)
    .to_dict()
)

In [99]:
# Collect, for every word index, its phonemes in corpus (CMU) notation. The column
# is selected before `apply` so the grouping column is not passed to the function.
ground_truth_phonemes_cmu = (
    phonemes_df.groupby("word_idx")["phoneme"]
    .apply(list)
    .to_dict()
)
# Convert CMU representation to IPA representation. The mapping is keyed by
# upper-case CMU symbols, while the corpus stores them in lower case.
ground_truth_phonemes = {
    idx: [english.cmu_ipa_mapping[cmu_phon.upper()] for cmu_phon in cmu_phons]
    for idx, cmu_phons in ground_truth_phonemes_cmu.items()
}

In [101]:
# TODO: for words that have a ground-truth pronunciation in CMUdict, check what
# proportion of those pronunciations agree with the corpus. It may be 100%, because
# the alignment is automatic and the forced aligner draws on Kaldi/CMU representations.

In [104]:
# Peek at the first five (position, (word index, IPA phonemes)) entries.
list(enumerate(ground_truth_phonemes.items()))[:5]

[(0, (0, ['t', 'ɛ', 'r', 'ʌ'])),
 (1, (1, ['s', 't', 'ʊ', 'd'])),
 (2, (2, ['s', 't', 'ɑ', 'k'])),
 (3, (3, ['s', 't', 'ɪ', 'l'])),
 (4, (4, ['w', 'eɪ', 't', 'ɪ', 'ŋ']))]

In [105]:
# Prepare word-level features.
# TODO: placeholder only — every word currently gets its own constant scalar tensor.
word_features = {
    word_idx: torch.tensor(1.)
    for word_idx in word_to_token
}

In [106]:
# Run the stimulus processor on the story, passing the tokens, the word-to-token
# map, the word features and the IPA ground-truth phonemes.
stim = proc(
    story_name,
    tokens,
    word_to_token,
    word_features,
    ground_truth_phonemes,
)

  0%|          | 0/4 [00:00<?, ?batch/s]

In [111]:
# Serialize the processed stimulus to disk in binary mode.
with open(output_path, "wb") as out_file:
    pickle.dump(stim, out_file)