This notebook converts the raw TIMIT corpus (as represented in Huggingface `timit_asr`) into a format friendly for our modeling and analysis.

Our specific modifications to the annotation include
- mapping from TIMIT phonetic annotation into a CMUDICT phonemic annotation (available in item key `phonemic_detail`; `word_phonemic_detail` grouped by words)
- syllable annotation (available in item key `word_syllable_detail`)

Processes in this notebook
- run the conversion
- cross-check resulting lexical mappings with cmudict representation
- make sure we don't have overlapping phonemes
- visually check some waveforms and make sure they make sense
- check syllable annotations, paying particular attention to how syllabic consonants are represented. There are a lot of these in the TIMIT annotations, and we are removing the evidence for them before syllabification

In [None]:
from collections import Counter, defaultdict
import logging
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import datasets
import transformers
from tqdm.auto import tqdm, trange

# Configure the root logger so records at INFO level and above are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
# Project root; the notebook `cd`s here so the relative paths below resolve against it.
base_dir = "/userdata/jgauthier/projects/ideal-word-representations"

# input Huggingface-format `timit_asr` dataset, with train/test split
dataset_path = "data/timit_raw"

# Destination directory for the converted corpus (written by `save_to_disk` at the end).
out_path = "data/timit_syllables"

# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
model_name = "facebook/wav2vec2-base"

# Labels removed during conversion; both lists are concatenated and passed to
# `timit.prepare_corpus` as `drop_phones`.
drop_timit_phones = ["h#", "pau", "epi"]
drop_cmudict_phonemes = ["[SIL]"]

In [None]:
# Change into the project root so relative paths such as `dataset_path` resolve against it.
cd {base_dir}

In [None]:
# Global seaborn theme for every figure below: white-grid style with fonts scaled by 2.
sns.set_theme(style="whitegrid", font_scale=2)

In [None]:
# Turn off the Huggingface `datasets` cache for this session.
datasets.disable_caching()

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from src.utils import timit

In [None]:
# Tokenizer loaded from the `charsiu/tokenizer_en_cmu` checkpoint
# (presumably a CMU phone vocabulary — confirm against the checkpoint).
tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("charsiu/tokenizer_en_cmu")

# Feature extractor for single-channel audio sampled at 16 kHz, with normalization
# enabled and no attention mask returned.
feature_extractor = transformers.Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# Bundle both into the single processor object handed to `timit.prepare_corpus`.
processor = transformers.Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Load the raw TIMIT dataset from `dataset_path`; later cells index it by the
# "train" and "test" splits.
raw_corpus = datasets.load_dataset(dataset_path)

In [None]:
# Tools for introspecting on raw/resulting annotations, if necessary.
# Uncomment one of the snippets below to list (label, start, stop) triples for item `log_item`.

# Index of the item to inspect.
log_item = 25

# Raw TIMIT phonetic annotation:
# list(zip(raw_corpus["train"][log_item]["phonetic_detail"]["utterance"],
#          raw_corpus["train"][log_item]["phonetic_detail"]["start"],
#             raw_corpus["train"][log_item]["phonetic_detail"]["stop"]))

# Raw TIMIT word annotation:
# list(zip(raw_corpus["train"][log_item]["word_detail"]["utterance"],
#             raw_corpus["train"][log_item]["word_detail"]["start"],
#                 raw_corpus["train"][log_item]["word_detail"]["stop"]))

# Converted phonemic annotation (only usable once `corpus` has been built by the cells below):
# list(zip(corpus[log_item]["phonemic_detail"]["utterance"],
#          corpus[log_item]["phonemic_detail"]["start"],
#             corpus[log_item]["phonemic_detail"]["stop"]))

In [None]:
# Mark original indices in each item, then concatenate into a single dataset
def mark_item_indices(item, idx, split=None):
    """Record an item's index and split of origin, and return the updated item.

    Intended for `Dataset.map(..., with_indices=True)`. The item must be
    returned: when the mapped function returns None, `map` leaves the dataset
    unchanged and the new columns would be silently dropped.

    Args:
        item: Mapping for a single dataset example; updated in place.
        idx: Index of the example within its split.
        split: Name of the split the example comes from (e.g. "train").

    Returns:
        The same `item`, with `original_idx` and `original_split` set.
    """
    item["original_idx"] = idx
    item["original_split"] = split
    return item
# Run `mark_item_indices` over each split in turn, passing the split's own name along.
for split_name in ("train", "test"):
    raw_corpus[split_name] = raw_corpus[split_name].map(
        mark_item_indices,
        with_indices=True,
        fn_kwargs={"split": split_name},
    )

# Merge both splits into one dataset, train items first.
corpus = datasets.concatenate_datasets([raw_corpus["train"], raw_corpus["test"]])

def mark_item_indices_2(item, idx):
    """Record an item's index in the concatenated corpus and return the updated item.

    Intended for `Dataset.map(..., with_indices=True)`. The item must be
    returned: when the mapped function returns None, `map` leaves the dataset
    unchanged and the `idx` column would be silently dropped.

    Args:
        item: Mapping for a single dataset example; updated in place.
        idx: Index of the example within the dataset being mapped.

    Returns:
        The same `item`, with `idx` set.
    """
    item["idx"] = idx
    return item
# Run `mark_item_indices_2` over the concatenated corpus, passing each item's index.
corpus = corpus.map(mark_item_indices_2, with_indices=True)

In [None]:
# Run the conversion. `timit.prepare_corpus` receives the corpus, the processor, and the
# combined list of TIMIT and CMUDICT labels to drop.
corpus = timit.prepare_corpus(corpus, processor,
                              drop_phones=drop_timit_phones + drop_cmudict_phonemes)

In [None]:
def plot_item(item_idx, ax, plot_units="phoneme", viz_rate=1000, sample_rate=16000):
    """Plot one corpus item's waveform with word and phoneme/syllable boundaries.

    Args:
        item_idx: Index of the item in the global `corpus`.
        ax: Matplotlib axes to draw on.
        plot_units: Which sub-word units to annotate: "phoneme" or "syllable".
        viz_rate: Number of waveform points drawn per second of audio.
        sample_rate: Audio sampling rate in Hz, used to convert the annotation
            offsets (in samples) to seconds.

    Raises:
        ValueError: If `plot_units` is neither "phoneme" nor "syllable".
    """
    # Validate up front so a bad argument fails before anything is drawn,
    # including for items that contain no words.
    if plot_units not in ("phoneme", "syllable"):
        raise ValueError(f"Unknown plot_units: {plot_units}")

    item = corpus[item_idx]

    values = np.array(item["input_values"])
    duration = len(values) / sample_rate
    times = np.linspace(0, duration, int(duration * viz_rate))

    # normalize to [-1, 1]; a constant signal has zero range, so draw it as a
    # flat line instead of dividing by zero
    value_range = values.max() - values.min()
    if value_range > 0:
        values = (values - values.min()) / value_range * 2 - 1
    else:
        values = np.zeros(len(values))
    # resample to viz frame rate
    values = np.interp(times, np.arange(len(values)) / sample_rate, values)
    ax.plot(times, values, alpha=0.2)

    # plot word and phoneme boundaries
    for i, word in enumerate(item["word_phonemic_detail"]):
        if len(word) == 0:
            # A word without phonemes has no onset to anchor a boundary to.
            continue

        word_str = item["word_detail"]["utterance"][i]
        word_start = word[0]["start"] / sample_rate
        ax.axvline(word_start, color="black", linestyle="--")
        ax.text(word_start, 0.8, word_str, rotation=90, verticalalignment="bottom", alpha=0.7)

        if plot_units == "phoneme":
            for j, phoneme in enumerate(word):
                phoneme_start = phoneme["start"] / sample_rate

                # The word-initial boundary is already drawn as the word line.
                if j > 0:
                    # Syllable-initial phonemes get a darker boundary.
                    color = "black" if phoneme["idx_in_syllable"] == 0 else "gray"
                    ax.axvline(phoneme_start, color=color, linestyle=":", alpha=0.5)
                ax.text(phoneme_start + 0.01, -6, phoneme["phone"], rotation=90,
                        verticalalignment="bottom", fontdict={"size": 15})
        else:
            for j, syllable in enumerate(item["word_syllable_detail"][i]):
                syllable_str = " ".join(syllable["phones"])
                syllable_start = syllable["start"] / sample_rate

                if j > 0:
                    ax.axvline(syllable_start, color="black", linestyle=":", alpha=0.5)
                ax.text(syllable_start + 0.01, -6, syllable_str, rotation=90,
                        verticalalignment="bottom", fontdict={"size": 15})

    # align at origin
    ax.set_ylim((-8, 8))

    ax.set_title(f"{item['speaker_id']}_{item['id']}: {item['text']}")
    ax.set_yticks([])
    ax.grid(False)
    ax.axis("off")

In [None]:
# Visual sanity check on one randomly chosen item: phoneme boundaries in the top panel,
# syllable boundaries in the bottom panel.
f, axs = plt.subplots(2, 1, figsize=(25, 2 * 8))
idx = np.random.choice(len(corpus))
# Print the sampled index so the same item can be looked up again.
print(idx)
plot_item(idx, axs[0], plot_units="phoneme")
plot_item(idx, axs[1], plot_units="syllable")

## Check word-level correspondence with CMUdict

In [None]:
from tempfile import NamedTemporaryFile
from urllib.request import urlretrieve
import re

# Download and parse cmudict
def _parse_cmudict_lines(lines):
    """Parse lines in `cmudict.dict` format into a word -> pronunciations mapping.

    Each non-empty line is a word followed by its phones, separated by
    whitespace. Trailing `#` comments, the `(N)` suffix marking alternative
    pronunciations, and stress digits on phones are all stripped.

    Args:
        lines: Iterable of text lines.

    Returns:
        A `defaultdict(list)` mapping each word to its pronunciations, each a
        tuple of stress-free phones, in file order.
    """
    comment_re = re.compile(r"\s*#.*")
    variant_re = re.compile(r"\(\d+\)$")
    stress_re = re.compile(r"\d")

    entries = defaultdict(list)
    for line in lines:
        # remove comments
        fields = comment_re.sub("", line).split()
        if not fields:
            # Blank or comment-only line: skip it rather than record an empty "" entry.
            continue

        # remove word idx number, indicating secondary pronunciation
        word = variant_re.sub("", fields[0])
        # remove stress markers
        phones = tuple(stress_re.sub("", phone) for phone in fields[1:])

        entries[word].append(phones)
    return entries


with NamedTemporaryFile() as tmp_file:
    urlretrieve("https://github.com/cmusphinx/cmudict/raw/master/cmudict.dict", tmp_file.name)

    # Reopen by name in text mode, with an explicit encoding so parsing does not
    # depend on the locale.
    with open(tmp_file.name, "r", encoding="utf-8") as dict_file:
        cmudict_entries = _parse_cmudict_lines(dict_file)


In [None]:
# Track attested pronunciations of each word in TIMIT:
# word -> Counter over pronunciations (tuples of phones).
corpus_cmudict_mapping = defaultdict(Counter)


def process_item(item):
    """Tally the pronunciation of every word in `item` into `corpus_cmudict_mapping`.

    Returns None; the function is used only for its side effect on the
    module-level counter.
    """
    words = item["word_detail"]["utterance"]
    pronunciations = item["word_phonemic_detail"]
    for word, word_phonemes in zip(words, pronunciations):
        pronunciation = tuple(phoneme["phone"] for phoneme in word_phonemes)
        corpus_cmudict_mapping[word][pronunciation] += 1
# `process_item` returns nothing; this pass is run only to fill `corpus_cmudict_mapping`.
corpus.map(process_item)

In [None]:
# Keep the words attested with more than one distinct pronunciation in TIMIT.
multiple_pronunciations = {
    word: pronunciations
    for word, pronunciations in corpus_cmudict_mapping.items()
    if len(pronunciations) > 1
}
print(f"{len(multiple_pronunciations)} words ({len(multiple_pronunciations) / len(corpus_cmudict_mapping) * 100}%) have multiple pronunciations")

In [None]:
# Keep the TIMIT words that also appear as keys of the parsed CMUDICT.
has_cmudict = {
    word: pronunciations
    for word, pronunciations in corpus_cmudict_mapping.items()
    if word in cmudict_entries
}
print(f"{len(has_cmudict)} words ({len(has_cmudict) / len(corpus_cmudict_mapping) * 100}%) have CMUDICT pronunciations")

In [None]:
# For how many words does the majority pronunciation align with the CMUDICT pronunciation?
#
# `cmudict_entries` is a defaultdict, so indexing it with an out-of-vocabulary word would
# silently insert an empty entry and make every later `word in cmudict_entries` check succeed.
# Entries are therefore looked up with `.get` (which never inserts), and only words known to
# be present are indexed. Only the first-listed CMUDICT pronunciation is compared, and words
# without a CMUDICT entry land in neither dictionary.
majority_aligned = {k: v for k, v in corpus_cmudict_mapping.items()
                    if cmudict_entries.get(k) and v.most_common(1)[0][0] == cmudict_entries[k][0]}
majority_misaligned = {k: v for k, v in corpus_cmudict_mapping.items()
                       if cmudict_entries.get(k) and v.most_common(1)[0][0] != cmudict_entries[k][0]}
print(f"{len(majority_aligned)} words ({len(majority_aligned) / len(corpus_cmudict_mapping) * 100}%) have majority-aligned CMUDICT pronunciations")

In [None]:
# For misaligned majorities, compare with CMUDICT
# Each printed line shows the most frequent TIMIT pronunciation of a word next to the
# first-listed CMUDICT pronunciation of that word.
for word, counts in majority_misaligned.items():
    print(f"{word}: {' '.join(counts.most_common(1)[0][0])} (TIMIT) vs {' '.join(cmudict_entries[word][0])} (CMUDICT)")

In [None]:
# Spot check: TIMIT pronunciation counts for "success" (None if it is not among the
# misaligned words).
majority_misaligned.get("success")

In [None]:
# Spot check: TIMIT pronunciation counts for "provoked" (None if it is not among the
# misaligned words).
majority_misaligned.get("provoked")

In [None]:
# matches = [idx for idx in trange(len(corpus)) if "success" in corpus[idx]["word_detail"]["utterance"]]
# matches

## Syllable analysis

In [None]:
# Token counts of every syllable (a tuple of phones) across the corpus.
all_syllable_counts = Counter()
# For each word, counts of each attested syllabification (a tuple of syllables).
word_syllable_counts = defaultdict(Counter)


def process_item(item):
    """Tally the syllables and per-word syllabifications found in `item`.

    Updates the module-level `word_syllable_counts` and `all_syllable_counts`
    and returns None.
    """
    words = item["word_detail"]["utterance"]
    for word, syllables in zip(words, item["word_syllable_detail"]):
        syllabification = tuple(tuple(syllable["phones"]) for syllable in syllables)
        word_syllable_counts[word][syllabification] += 1
        # One more token for every syllable of this word.
        all_syllable_counts.update(syllabification)
# `process_item` returns nothing; this pass is run only to fill the two syllable counters.
corpus.map(process_item)

In [None]:
# Show the 20 most frequent syllables with their token counts.
all_syllable_counts.most_common(20)

In [None]:
# CMUDICT vowel symbols, without stress digits.
cmudict_vowels = {"AA", "AE", "AH", "AO", "AW", "AY", "EH", "ER", "EY", "IH", "IY", "OW", "OY", "UH", "UW"}

print("Syllabic consonant frequencies:")
# A syllable made up of a single phone that is not a vowel counts as a syllabic consonant.
syllabic_frequencies = Counter({
    syllable: count
    for syllable, count in all_syllable_counts.items()
    if len(syllable) == 1 and syllable[0] not in cmudict_vowels
})
pprint(syllabic_frequencies)

print("Proportion of total syllable tokens: ", sum(syllabic_frequencies.values()) / sum(all_syllable_counts.values()) * 100, "%")

In [None]:
# Keep the words attested with more than one distinct syllabification.
multiple_syllabification_words = Counter({
    word: syllabifications
    for word, syllabifications in word_syllable_counts.items()
    if len(syllabifications) > 1
})
print(f"{len(multiple_syllabification_words)} words ({len(multiple_syllabification_words) / len(word_syllable_counts) * 100}%) have multiple syllabifications")

In [None]:
# Show the ten words with multiple syllabifications that have the most tokens overall
sorted(
    multiple_syllabification_words.items(),
    key=lambda entry: sum(entry[1].values()),
    reverse=True,
)[:10]

In [None]:
# Syllables without any content
# This can emerge when, according to TIMIT annotation, a syllable is completely coarticulated
# with its preceding syllable. We arbitrarily assign phoneme annotations to the preceding syllable,
# leaving the latter syllable empty.
#
# NOTE(review): the keys of each `word_syllable_counts[word]` counter are whole-word
# syllabifications (tuples of syllables), so `()` below matches words attested with an empty
# syllable list. An empty syllable inside a longer word would appear as `()` *within* a key and
# be tallied in `all_syllable_counts[()]` instead. Confirm which of the two is intended.
# `len(empty_syllables)` is a number of word types, although the message calls them syllables.
empty_syllables = {word: counts[()] for word, counts in word_syllable_counts.items()
                   if () in counts and counts[()] > 0}
print(f"{len(empty_syllables)} syllables ({sum(empty_syllables.values())} tokens, {sum(empty_syllables.values()) / sum(all_syllable_counts.values()) * 100}%) are empty")

## Save to disk

In [None]:
# Write the converted corpus to `out_path`.
corpus.save_to_disk(out_path)