In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
from pathlib import Path

import datasets
import transformers
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
from models import Vocabulary
from models.frame_level import FrameLevelRNNClassifier
from models.transformer import TilingWordFeatureExtractor2, DataCollator
from utils.timit import group_phonetic_detail

## Inner model and corpus prep

In [7]:
device = "cuda:0"

In [8]:
tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("charsiu/tokenizer_en_cmu")
model = transformers.Wav2Vec2ForCTC.from_pretrained("charsiu/en_w2v2_ctc_libris_and_cv")
feature_extractor = transformers.Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = transformers.Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
corpus = datasets.load_dataset("timit_asr", data_dir="/userdata/jgauthier/data/TIMIT")

In [10]:
# TODO cross-check resulting mappings with cmudict
TIMIT_MAPPING = {
    'ax': 'AH',
    'ax-h': 'AH',
    'axr': 'ER',
    'dx': 'T',
    'el': ['AH', 'L'],
    'em': ['AH', 'M'],
    'en': ['AH', 'N'],
    'eng': ['IH', 'NG'],
    'hv': 'HH',
    'ix': 'IH',
    'nx': ['N', 'T'],
    'q': 'T',
    'pau': '[SIL]',
    'epi': '[SIL]',
    'ux': 'UW'
}

VOCAB = set(tokenizer.get_vocab().keys())
for src, tgt in TIMIT_MAPPING.items():
    if isinstance(tgt, str):
        tgt = [tgt]
    for t in tgt:
        assert t in VOCAB

In [11]:
TIMIT_IGNORE = ["h#"]
TIMIT_JOIN_CLOSURES = ['bcl', 'dcl', 'gcl', 'kcl', 'pcl', 'tcl']


def map_timit_phone_to_cmudict_phoneme(phone):
    if phone in TIMIT_IGNORE:
        return []
    elif phone in TIMIT_MAPPING:
        ret = TIMIT_MAPPING[phone]
        if isinstance(ret, str):
            ret = [ret]
        return ret
    elif not phone.upper() in VOCAB:
        raise ValueError(f"Invalid phone {phone.upper()}")
    return [phone.upper()]


def map_timit_to_cmudict(timit_item):
    phonemes = []

    i = 0
    phonetic_detail = timit_item["phonetic_detail"]
    num_phones = len(phonetic_detail["start"])
    while i < num_phones:
        phone = phonetic_detail["utterance"][i]
        start = phonetic_detail["start"][i]
        stop = phonetic_detail["stop"][i]

        if phone in TIMIT_IGNORE:
            i += 1
        elif phone in TIMIT_JOIN_CLOSURES:
            release_phone = phone[:-len("cl")]
            if phonetic_detail["utterance"][i + 1] == release_phone:
                phoneme_start = start
                phoneme_end = phonetic_detail["stop"][i + 1]
                phoneme_label = map_timit_phone_to_cmudict_phoneme(release_phone)

                for phoneme in phoneme_label:
                    phonemes.append((phoneme_start, phoneme_end, phoneme))
                i += 2
            else:
                phoneme_label = map_timit_phone_to_cmudict_phoneme(release_phone)
                for phoneme in phoneme_label:
                    phonemes.append((start, stop, phoneme))
                i += 1
        else:
            for phoneme in map_timit_phone_to_cmudict_phoneme(phone):
                phonemes.append((start, stop, phoneme))
            i += 1

    return phonemes


def add_phonemic_detail(item):
    phonemes = map_timit_to_cmudict(item)
    # TODO group by word

    starts, stops, utterances = zip(*phonemes)
    item["phonemic_detail"] = {
        "start": starts,
        "stop": stops,
        "utterance": utterances
    }

    return item

    
corpus = corpus.map(add_phonemic_detail, batched=False, load_from_cache_file=False)

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

In [12]:
def prepare_corpus(processor, drop_phones=None):
    corpus = datasets.load_dataset("timit_asr", data_dir="/userdata/jgauthier/data/TIMIT")

    corpus = corpus.map(add_phonemic_detail, batched=False)

    # Compute phonetic and phonemic details grouped by word span
    corpus = corpus.map(group_phonetic_detail, batched=False,
                        fn_kwargs=dict(drop_phones=drop_phones))
    corpus = corpus.map(group_phonetic_detail, batched=False,
                        fn_kwargs=dict(key="phonemic_detail"))
    
    def prepare_audio(batch):
        audio = batch["audio"]
        batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
        return batch
    corpus = corpus.map(prepare_audio)

    twfe = TilingWordFeatureExtractor2(tokenizer, item_key="word_phonemic_detail")
    def add_features(example):
        example["phone_targets"] = twfe(example)
        return example
    corpus = corpus.map(add_features, load_from_cache_file=False)
    
    return corpus

In [13]:
def load_corpus(processor, corpus_path="timit_phoneme_corpus"):
    if not Path(corpus_path).exists():
        corpus = prepare_corpus(processor)
        corpus.save_to_disk(corpus_path)
    else:
        corpus = datasets.load_from_disk(corpus_path)

    return corpus

In [14]:
corpus = load_corpus(processor)

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

## Model prep

In [127]:
def make_model_init(model_name_or_path, config, device="cpu"):
    def model_init(trial):
        model = FrameLevelRNNClassifier.from_pretrained(
            model_name_or_path, config=config).to(device)

        model.freeze_feature_extractor()

        if hasattr(config, "drop_layers"):
            model.wav2vec2 = drop_wav2vec_layers(model.wav2vec2, config.drop_layers)

        # Freeze all model weights.
        for param in model.wav2vec2.parameters():
            param.requires_grad = False
        
        return model
    return model_init

In [128]:
def compute_metrics(p: transformers.EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    label_mask, labels = p.label_ids

    def evaluate_label(j):
        preds_j = preds[:, :, j]
        labels_j = labels[:, :, j]

        preds_j = preds_j[label_mask == 1]
        labels_j = labels_j[label_mask == 1]
        if labels_j.std() == 0:
            # Only one class. Quit
            return None
        return roc_auc_score(labels_j, preds_j)

    roc_auc_scores = [evaluate_label(j) for j in range(preds.shape[-1])]
    return {"roc_auc": np.mean([score for score in roc_auc_scores if score is not None])}

In [129]:
model_name_or_path = "charsiu/en_w2v2_ctc_libris_and_cv"
config = transformers.AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=tokenizer.vocab_size)
setattr(config, "classifier_bias", False)
setattr(config, "rnn_hidden_size", 128)
# setattr(config, "drop_layers", drop_layers)
model_init = make_model_init(model_name_or_path, config, device=device)



In [130]:
coll = DataCollator(processor=processor, model=model_init(None), padding=True,
                    num_labels=tokenizer.vocab_size)

Some weights of FrameLevelRNNClassifier were not initialized from the model checkpoint at charsiu/en_w2v2_ctc_libris_and_cv and are newly initialized: ['rnn.weight_hh_l0', 'rnn.bias_hh_l0', 'rnn.bias_ih_l0', 'classifier.weight', 'rnn.weight_ih_l0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [131]:
training_args = TrainingArguments(
    output_dir=f"out/rnn/testrun",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=50,
    gradient_accumulation_steps=2,
    save_steps=50,
    eval_steps=50,
    logging_steps=2,
    learning_rate=1e-2,
    save_total_limit=5,
    use_cpu=False,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    label_names=["label_mask", "labels"],
    disable_tqdm=True,
)

trainer = Trainer(
    model=None, model_init=model_init,
    data_collator=coll,
    args=training_args,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    compute_metrics=compute_metrics,
    train_dataset=corpus["train"],
    eval_dataset=corpus["test"],
    tokenizer=processor.feature_extractor,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Some weights of FrameLevelRNNClassifier were not initialized from the model checkpoint at charsiu/en_w2v2_ctc_libris_and_cv and are newly initialized: ['rnn.weight_hh_l0', 'rnn.bias_hh_l0', 'rnn.bias_ih_l0', 'classifier.weight', 'rnn.weight_ih_l0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [132]:
trainer.train()

Some weights of FrameLevelRNNClassifier were not initialized from the model checkpoint at charsiu/en_w2v2_ctc_libris_and_cv and are newly initialized: ['rnn.weight_hh_l0', 'rnn.bias_hh_l0', 'rnn.bias_ih_l0', 'classifier.weight', 'rnn.weight_ih_l0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Roc Auc
50,0.0095,0.009974,0.678679
100,0.0082,0.008518,0.967915
150,0.0045,0.004474,0.978912
200,0.0028,0.002593,0.983546


KeyboardInterrupt: 