In [60]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [61]:
from pathlib import Path

import datasets
import transformers
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve

In [62]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
from src.models.frame_level import FrameLevelLexicalAccess, \
    LexicalAccessConfig, LexicalAccessDataCollator
from src.models.transformer import drop_wav2vec_layers

In [64]:
regressor_target_size = 32

In [65]:
def make_model_init(
        model_name_or_path,
        config, 
        word_vocabulary, word_representations,
        device="cpu"):
    def model_init(trial):
        encoder = transformers.Wav2Vec2Model.from_pretrained(
            model_name_or_path, config=config.encoder_config).to(device)
        model = FrameLevelLexicalAccess(
            config, word_vocabulary, word_representations,
            encoder=encoder).to(device)

        model.freeze_feature_extractor()

        if config.drop_encoder_layers is not None:
            model.encoder = drop_wav2vec_layers(model.encoder, config.drop_encoder_layers)

        if config.reinit_feature_extractor_weights:
            model.encoder.feature_extractor.apply(lambda x: model.encoder._init_weights(x))
        if config.reinit_encoder_weights:
            model.encoder.encoder.apply(lambda x: model.encoder._init_weights(x))

        # Freeze all model weights.
        for param in model.encoder.parameters():
            param.requires_grad = False
        
        return model
    return model_init

In [66]:
dataset = datasets.load_from_disk("./data/timit_phoneme/")
dataset_split = dataset["train"].train_test_split(test_size=0.1, shuffle=True)
train_dataset = dataset_split["train"]
eval_dataset = dataset_split["test"]

## Prepare semantic representations

In [67]:
all_words = set()
def update_all_words(item):
    all_words.update(set(item["word_detail"]["utterance"]))
    return None
dataset.map(update_all_words)

all_words = sorted(all_words)
all_words[:10]

Map:   0%|          | 0/4620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

["'em",
 'a',
 'abbreviate',
 'abdomen',
 'abides',
 'ability',
 'able',
 'ably',
 'abolish',
 'aborigine']

In [68]:
word_representations = torch.randn(len(all_words), regressor_target_size)
word_representations /= word_representations.norm(dim=-1, keepdim=True)

## Run

In [69]:
tokenizer = transformers.Wav2Vec2Tokenizer.from_pretrained("charsiu/tokenizer_en_cmu")
model = transformers.Wav2Vec2ForCTC.from_pretrained("charsiu/en_w2v2_ctc_libris_and_cv")
feature_extractor = transformers.Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
processor = transformers.Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [70]:
encoder_config = transformers.AutoConfig.from_pretrained(
    "facebook/wav2vec2-base")
model_config = LexicalAccessConfig(
    encoder_config=encoder_config.to_dict(),
    drop_encoder_layers=6,
    num_labels=tokenizer.vocab_size,
    regressor_target_size=regressor_target_size)

In [71]:
model_init = make_model_init(
    "charsiu/en_w2v2_ctc_libris_and_cv", model_config,
    all_words, word_representations)

In [72]:
collator = LexicalAccessDataCollator(
    processor=processor,
    model=model_init(None),
    padding=True,
    num_labels=tokenizer.vocab_size,
    regression_target_size=regressor_target_size)

## Trainer loop

In [73]:
def compute_classifier_metrics(p: transformers.EvalPrediction) -> dict:
    assert isinstance(p.predictions, tuple)
    preds = p.predictions[0]
    label_mask, labels, _ = p.label_ids

    def evaluate_label(j):
        preds_j = preds[:, :, j]
        labels_j = labels[:, :, j]

        preds_j = preds_j[label_mask == 1]
        labels_j = labels_j[label_mask == 1]
        if labels_j.std() == 0:
            # Only one class. Quit
            return None
        return roc_auc_score(labels_j, preds_j)

    roc_auc_scores = [evaluate_label(j) for j in range(preds.shape[-1])]
    return {"roc_auc": np.mean([score for score in roc_auc_scores if score is not None])}


def compute_regressor_metrics(p: transformers.EvalPrediction) -> dict:
    assert isinstance(p.predictions, tuple)
    preds = p.predictions[1]
    target_mask, _, targets = p.label_ids

    preds = preds[target_mask == 1]
    targets = targets[target_mask == 1]

    return {"mse": ((preds - targets) ** 2).mean().item()}


def compute_metrics_dual_head(p: transformers.EvalPrediction):
    return {
        **compute_classifier_metrics(p),
        **compute_regressor_metrics(p)
    }

In [74]:
training_args = TrainingArguments(
    output_dir="./tst_dual_head",
    per_device_train_batch_size=8,
    evaluation_strategy="steps",
    num_train_epochs=5,
    gradient_accumulation_steps=2,
    save_steps=50,
    eval_steps=50,
    logging_steps=2,
    learning_rate=1e-2,
    save_total_limit=5,
    remove_unused_columns=False,
    load_best_model_at_end=True,
    label_names=["target_mask", "classifier_labels", "regressor_targets"],
    disable_tqdm=True,
)

trainer = Trainer(
    args=training_args,
    data_collator=collator,
    model=None, model_init=model_init,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_dual_head,
    tokenizer=processor.tokenizer,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [75]:
trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


{'loss': 0.3496, 'learning_rate': 0.006666666666666666, 'epoch': 0.31}
{'eval_loss': 0.26008492708206177, 'eval_roc_auc': 0.4858707499653592, 'eval_mse': 0.10612313449382782, 'eval_runtime': 14.3662, 'eval_samples_per_second': 32.159, 'eval_steps_per_second': 4.037, 'epoch': 0.31}
{'loss': 0.232, 'learning_rate': 0.003333333333333333, 'epoch': 0.62}
{'eval_loss': 0.181578129529953, 'eval_roc_auc': 0.49654516446804225, 'eval_mse': 0.04297766461968422, 'eval_runtime': 13.9539, 'eval_samples_per_second': 33.109, 'eval_steps_per_second': 4.157, 'epoch': 0.62}
{'loss': 0.1735, 'learning_rate': 0.0, 'epoch': 0.92}
{'eval_loss': 0.1679603010416031, 'eval_roc_auc': 0.5146918944178374, 'eval_mse': 0.041869502514600754, 'eval_runtime': 13.8624, 'eval_samples_per_second': 33.328, 'eval_steps_per_second': 4.184, 'epoch': 0.92}


KeyboardInterrupt: 

## Roll out on test

In [None]:
def estimate_decision_thresholds(trainer: transformers.Trainer):
    assert trainer.eval_dataset is not None

    trainer.model.eval()
    with torch.no_grad():
        preds = trainer.predict(trainer.eval_dataset)
        label_mask, labels, _ = preds.label_ids
        preds = preds.predictions[0] if isinstance(preds.predictions, tuple) else preds.predictions

    # Get optimal cut-off for each label
    optimal_thresholds = []
    fpr, tpr, thresholds = [], [], []
    # roc_aucs = []
    for j in range(preds.shape[-1]):
        preds_j = preds[:, :, j]
        labels_j = labels[:, :, j]

        mask = label_mask == 1
        preds_j = preds_j[mask]
        labels_j = labels_j[mask]

        fpr_j, tpr_j, thresholds_j = roc_curve(labels_j, preds_j, pos_label=1)
        fpr.append(fpr_j)
        tpr.append(tpr_j)
        thresholds.append(thresholds_j)

        optimal_thresholds.append(thresholds_j[np.argmax(tpr_j - fpr_j)])

    return torch.tensor(optimal_thresholds)

def rollout_dataset(trainer, dataset, optimal_thresholds: torch.FloatTensor) -> datasets.Dataset:
    """
    Roll out on a new dataset; return a new dataset containing both the original
    inputs and the internal states, predictions, and accuracies produced
    during the model evaluation.
    """
    trainer.model.eval()
    with torch.no_grad():
        dataset_preds = trainer.predict(dataset)

    def add_model_outputs(batch, idxs):
        eval_logits, eval_targets, rnn_states = dataset_preds.predictions
        logits = eval_logits[idxs]
        preds = (logits > optimal_thresholds.numpy()).astype(int)

        # Internals
        batch["rnn_hidden_states"] = rnn_states[idxs]

        # Classifier outputs
        batch["logits"] = logits
        batch["distance_from_decision_threshold"] = logits - optimal_thresholds.numpy()
        batch["predicted_labels"] = preds

        # Regressor outputs
        batch["regression_output"] = eval_targets[idxs]

        return batch

    result = dataset.map(add_model_outputs, batched=True, batch_size=8, with_indices=True)

    def compute_accuracy(item, idx):
        label_mask, labels, _ = dataset_preds.label_ids
        label_mask = label_mask[idx] == 1
        labels = labels[idx]

        item["real_frames"] = label_mask.sum()
        item["labels"] = labels[label_mask]
        item["compression_ratio"] = item["real_frames"] / len(item["input_values"])
        item["correct"] = (np.array(item["predicted_labels"])[label_mask] == labels[label_mask])
        item["fp"] = (np.array(item["predicted_labels"])[label_mask] == 1) & (labels[label_mask] == 0)
        item["fn"] = (np.array(item["predicted_labels"])[label_mask] == 0) & (labels[label_mask] == 1)
        item["tp"] = (np.array(item["predicted_labels"])[label_mask] == 1) & (labels[label_mask] == 1)
        item["tn"] = (np.array(item["predicted_labels"])[label_mask] == 0) & (labels[label_mask] == 0)
        item["accuracy"] = item["correct"].mean()
        return item

    result = result.map(compute_accuracy, batched=False, with_indices=True)
    return result

In [None]:
optimal_thresholds = estimate_decision_thresholds(trainer)



In [None]:
test_result = rollout_dataset(trainer, dataset["test"], optimal_thresholds)

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

In [None]:
# Remove columns which are redundant and take up lots of space
test_result = test_result.remove_columns(["audio", "input_values", "phone_targets"])

In [None]:
test_result.save_to_disk(Path(trainer.args.output_dir) / "test_result")

Saving the dataset (0/2 shards):   0%|          | 0/1680 [00:00<?, ? examples/s]