In [149]:
from collections import Counter
from pathlib import Path
import pickle
import sys
from typing import List

import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm

%load_ext autoreload
%autoreload 2
sys.path.append(str(Path(".").resolve().parent.parent))
from berp.datasets import NaturalLanguageStimulusProcessor
from berp.languages import english

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [150]:
tokenized_path = "lw1.tokenized.txt"
aligned_words_path = "word.csv"
aligned_phonemes_path = "phoneme.csv"
story_name = "lw1"

cmu_ipa_dict_path = "../../cmudict-0.7b-ipa.txt"
vocab_path = "../../workflow/meg-masc/data/frequency/subtlexus2.csv"

output_path = "lw1.pkl"

model = "distilgpt2"
n_candidates = 10

## Prepare tokenized data and aligned data

In [151]:
tokens = Path(tokenized_path).read_text().split(" ")

In [152]:
words_df = pd.read_csv(aligned_words_path, index_col=0)
phonemes_df = pd.read_csv(aligned_phonemes_path, index_col=0)

In [153]:
# The phonemes in this corpus are output from Gentle. The phoneme string representation is
#
# `<CMUDict_phoneme>_{B,I,E,S}`
#
# where B,I,E indicate that the phoneme is at the beginning, middle, and end of a word. S
# indicates singleton (single-phoneme words).
phonemes_df.phoneme.value_counts()

ah_I    119
ih_I     98
n_I      87
t_E      77
r_I      68
       ... 
zh_I      1
hh_I      1
iy_B      1
oy_E      1
ch_B      1
Name: phoneme, Length: 101, dtype: int64

In [154]:
phonemes_df["phoneme"] = phonemes_df.phoneme.str.rstrip(r"_[BIES]")

In [155]:
phonemes_df.phoneme.unique()

array(['t', 'eh', 'r', 'ah', 's', 'uh', 'd', 'aa', 'k', 'ih', 'l', 'w',
       'ey', 'ng', 'f', 'er', 'dh', 'ay', 'n', 'iy', 'g', 'm', 'aw', 'ae',
       'uw', 'p', 'v', 'jh', 'b', 'z', 'oy', 'hh', 'ow', 'sh', 'ao', 'th',
       'ch', 'zh', 'y'], dtype=object)

## Prepare frequency data

In [156]:
frequency_df = pd.read_csv(vocab_path, sep="\t")

frequency_df["Word"] = frequency_df.Word.str.lower()
assert frequency_df.Word.value_counts().max() == 1

frequency_df["log_freq"] = -np.log2(frequency_df.FREQcount / frequency_df.FREQcount.sum())

In [157]:
words_df["word_lower"] = words_df.word.str.lower()
old_size = len(words_df)
words_df = pd.merge(words_df, frequency_df[["Word", "log_freq"]], left_on="word_lower", right_on="Word",
                    how="left")
assert len(words_df) == old_size

In [158]:
# Put words with missing frequency in the lowest 2 percentile.
missing_freq = words_df.log_freq.isna()
print(f"{missing_freq.sum()} ({int(missing_freq.mean() * 1000) / 1000}%) words missing frequency values.")
oov_freq = pd.qcut(words_df.log_freq, 50, retbins=True, duplicates="drop")[1][-1]
print(f"Replacing with 2-percentile log-frequency: {oov_freq}")
words_df.loc[missing_freq, "log_freq"] = oov_freq

4 (0.006%) words missing frequency values.
Replacing with 2-percentile log-frequency: 24.56731019333269


---

In [159]:
cmu_phonemizer = english.CMUPhonemizer(cmu_ipa_dict_path)
ipa_chars = set(char for word in cmu_phonemizer.mapping.values() for char in word)

In [160]:
# NB here English "r" is transcribed /r/, because it wasn't distinguished in CMU presumably
cmu_phonemizer.mapping["abbreviation"]

'əbrivieɪʃən'

In [161]:
dict_ipa_chars = set(char for word in cmu_phonemizer.mapping.values() for char in word)
dict_ipa_chars - set(english.ipa_chars)

{'a', 'e', 'o'}

In [162]:
PAD_PHONEME = "_"
proc = NaturalLanguageStimulusProcessor(phonemes=list(ipa_chars) + [PAD_PHONEME],
                                        hf_model=model,
                                        num_candidates=n_candidates,
                                        phonemizer=cmu_phonemizer)

In [163]:
# Prepare proc metadata input.
word_to_token = words_df.groupby("word_idx") \
    .apply(lambda x: list(x.token_idx)).to_dict()

In [164]:
ground_truth_phonemes = phonemes_df.groupby("word_idx") \
    .apply(lambda xs: list(xs.phoneme)).to_dict()
# Convert CMU representation to IPA representation
ground_truth_phonemes = {
    idx: [english.cmu_ipa_mapping[cmu_phon.upper()] for cmu_phon in cmu_phons]
    for idx, cmu_phons in ground_truth_phonemes.items()
}

In [165]:
# TODO for words with ground-truth pronunciations in CMUdict, check proportion which
# agree with the corpus. Maybe 100% because this is automatic and the forced aligner
# draws on Kaldi/CMU reprs?

In [166]:
list(zip(range(5), ground_truth_phonemes.items()))

[(0, (0, ['t', 'ɛ', 'r', 'ʌ'])),
 (1, (1, ['s', 't', 'ʊ', 'd'])),
 (2, (2, ['s', 't', 'ɑ', 'k'])),
 (3, (3, ['s', 't', 'ɪ', 'l'])),
 (4, (4, ['w', 'eɪ', 't', 'ɪ', 'ŋ']))]

In [167]:
# Prepare word-level features.
word_features = dict(words_df.groupby("word_idx").apply(lambda xs: torch.tensor(xs.iloc[0].log_freq).unsqueeze(0)))
list(zip(range(5), word_features.items()))

[(0, (0, tensor([17.5617], dtype=torch.float64))),
 (1, (1, tensor([15.2065], dtype=torch.float64))),
 (2, (2, tensor([15.2230], dtype=torch.float64))),
 (3, (3, tensor([10.2715], dtype=torch.float64))),
 (4, (4, tensor([12.1730], dtype=torch.float64)))]

In [168]:
stim = proc(story_name, tokens, word_to_token, word_features, ground_truth_phonemes)

  0%|          | 0/4 [00:00<?, ?batch/s]

In [169]:
with Path(output_path).open("wb") as f:
    pickle.dump(stim, f)