In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
from pathlib import Path

import datasets
import transformers
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from src.models.frame_level import FrameLevelLexicalAccess, \
    LexicalAccessConfig, LexicalAccessDataCollator
from src.models.transformer import drop_wav2vec_layers

In [5]:
regressor_target_size = 32

In [6]:
def make_model_init(
        model_name_or_path,
        config, 
        word_vocabulary, word_representations,
        device="cpu"):
    def model_init(trial):
        encoder = transformers.Wav2Vec2Model.from_pretrained(
            model_name_or_path, config=config.encoder_config).to(device)
        model = FrameLevelLexicalAccess(
            config, word_vocabulary, word_representations,
            encoder=encoder).to(device)

        model.freeze_feature_extractor()

        if hasattr(config, "drop_layers"):
            model.encoder = drop_wav2vec_layers(model.encoder, config.drop_layers)

        if getattr(config, "reinit_feature_extractor_weights", False):
            model.encoder.feature_extractor.apply(lambda x: model.encoder._init_weights(x))
        if getattr(config, "reinit_encoder_weights", False):
            model.encoder.encoder.apply(lambda x: model.encoder._init_weights(x))

        # Freeze all model weights.
        for param in model.encoder.parameters():
            param.requires_grad = False
        
        return model
    return model_init

In [7]:
dataset = datasets.load_from_disk("./data/timit_phoneme/")
dataset_split = dataset["train"].train_test_split(test_size=0.1, shuffle=True)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

## Prepare semantic representations

In [8]:
all_words = set()
def update_all_words(item):
    all_words.update(set(item["word_detail"]["utterance"]))
    return None
dataset.map(update_all_words)

all_words = sorted(all_words)
all_words[:10]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

["'em",
 'a',
 'abbreviate',
 'abdomen',
 'abides',
 'ability',
 'able',
 'ably',
 'abolish',
 'aborigine']

In [9]:
word_representations = torch.randn(len(all_words), regressor_target_size)
word_representations /= word_representations.norm(dim=-1, keepdim=True)

## Run

In [11]:
tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("charsiu/tokenizer_en_cmu")
model = transformers.Wav2Vec2ForCTC.from_pretrained("charsiu/en_w2v2_ctc_libris_and_cv")
feature_extractor = transformers.Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = transformers.Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
encoder_config = transformers.AutoConfig.from_pretrained(
    "facebook/wav2vec2-base")
model_config = LexicalAccessConfig(
    encoder_config=encoder_config.to_dict(),
    num_labels=tokenizer.vocab_size,
    regressor_target_size=regressor_target_size)



NameError: name 'tokenizer' is not defined

In [None]:
model_init = make_model_init(
    "charsiu/en_w2v2_ctc_libris_and_cv", model_config,
    all_words, word_representations)

In [None]:
collator = LexicalAccessDataCollator(
    processor=processor,
    model=model_init(None),
    padding=True,
    num_labels=tokenizer.vocab_size,
    regression_target_size=regressor_target_size)

## Trainer loop

In [None]:
def compute_classifier_metrics(p: transformers.EvalPrediction) -> dict:
    assert isinstance(p.predictions, tuple)
    preds = p.predictions[0]
    label_mask, labels, _ = p.label_ids

    def evaluate_label(j):
        preds_j = preds[:, :, j]
        labels_j = labels[:, :, j]

        preds_j = preds_j[label_mask == 1]
        labels_j = labels_j[label_mask == 1]
        if labels_j.std() == 0:
            # Only one class. Quit
            return None
        return roc_auc_score(labels_j, preds_j)

    roc_auc_scores = [evaluate_label(j) for j in range(preds.shape[-1])]
    return {"roc_auc": np.mean([score for score in roc_auc_scores if score is not None])}


def compute_regressor_metrics(p: transformers.EvalPrediction) -> dict:
    assert isinstance(p.predictions, tuple)
    preds = p.predictions[1]
    target_mask, _, targets = p.label_ids

    preds = preds[target_mask]
    targets = targets[target_mask]

    return {"mse": ((preds - targets) ** 2).mean().item()}


def compute_metrics_dual_head(p: transformers.EvalPrediction):
    return {
        **compute_classifier_metrics(p),
        **compute_regressor_metrics(p)
    }

In [None]:
training_args = TrainingArguments(
    output_dir="./tst_dual_head",
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=5,
    gradient_accumulation_steps=2,
    save_steps=50,
    eval_steps=50,
    logging_steps=2,
    learning_rate=1e-2,
    save_total_limit=5,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    label_names=["target_mask", "classifier_labels", "regressor_targets"],
    disable_tqdm=True,
)

trainer = Trainer(
    args=training_args,
    data_collator=collator,
    model=None, model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_dual_head,
    tokenizer=processor.tokenizer,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


: 

In [None]:
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 0.3616, 'learning_rate': 9.984615384615385e-05, 'epoch': 0.01}
{'loss': 0.3601, 'learning_rate': 9.96923076923077e-05, 'epoch': 0.02}
{'loss': 0.3578, 'learning_rate': 9.953846153846155e-05, 'epoch': 0.02}
{'loss': 0.3569, 'learning_rate': 9.938461538461539e-05, 'epoch': 0.03}
{'loss': 0.3551, 'learning_rate': 9.923076923076923e-05, 'epoch': 0.04}
{'loss': 0.3526, 'learning_rate': 9.907692307692308e-05, 'epoch': 0.05}
{'loss': 0.3506, 'learning_rate': 9.892307692307693e-05, 'epoch': 0.05}
{'loss': 0.3504, 'learning_rate': 9.876923076923077e-05, 'epoch': 0.06}
{'loss': 0.3488, 'learning_rate': 9.861538461538462e-05, 'epoch': 0.07}
{'loss': 0.3452, 'learning_rate': 9.846153846153848e-05, 'epoch': 0.08}
{'loss': 0.3414, 'learning_rate': 9.830769230769231e-05, 'epoch': 0.08}
{'loss': 0.3395, 'learning_rate': 9.815384615384616e-05, 'epoch': 0.09}
{'loss': 0.3363, 'learning_rate': 9.8e-05, 'epoch': 0.1}
{'loss': 0.3326, 'learning_rate': 9.784615384615386e-05, 'epoch': 0.11}
{'loss':

KeyboardInterrupt: 