In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
from datasets import load_from_disk
import distribution_inference.models.asr as models_asr
import evaluate
from distribution_inference.training.utils import load_model
import numpy as np
import torch as ch
from tqdm import tqdm

from dataclasses import dataclass
from typing import Any, List, Dict, Union

import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 150

In [None]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizerFast
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration

import evaluate

In [None]:
# Root directory of the LibriSpeech data (absolute, machine-specific path)
base_data_dir = "/p/adversarialml/as9rw/datasets/librispeech/"
# Dataset stored under processed/adv/audit_subjects, read back with datasets.load_from_disk
small_data_sample = load_from_disk(os.path.join(base_data_dir, "processed", "adv", "audit_subjects"))

In [None]:
# Fast tokenizer and seq2seq model from the "openai/whisper-tiny.en" checkpoint
tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

In [None]:
# First record of the dataset, used as a single-example probe
sam = small_data_sample[0]

In [None]:
# Token ids of that record's reference transcript.
# NOTE(review): `z` is rebound by the generate() call in the next cell, so this
# value is never read by any later cell.
z = tokenizer(sam['text']).input_ids

In [None]:
# Generate token ids for the first 10 records from their stored `input_features`
z = model.generate(input_features=ch.Tensor(small_data_sample[:10]['input_features']))

In [None]:
# Decode the generated ids to text, skipping special tokens and requesting normalization
pred_str = tokenizer.batch_decode(z, skip_special_tokens=True, normalize=True)

In [None]:
# Round-trip the reference transcripts through the tokenizer with the same decode
# options as pred_str, so both sides are formatted the same way
ids = tokenizer(small_data_sample[:10]['text']).input_ids
label_str = tokenizer.batch_decode(ids, skip_special_tokens=True, normalize=True)

In [None]:
# Display the predicted transcriptions
pred_str

In [None]:
# Display the reference transcriptions
label_str

In [None]:
from audiomentations import Compose, AddGaussianNoise, PitchShift, AirAbsorption, TanhDistortion
import IPython.display as ipd

In [None]:
# Raw waveform of the first record, played back at a 16 kHz rate
modify = small_data_sample[0]['audio']['array']
ipd.Audio(modify, rate=16_000, autoplay=False)

In [None]:
# NOTE: the four assignments below all bind the same name `transform`, so each one
# overwrites the previous; only the last (TanhDistortion) is applied further down.
# The same four parameter settings reappear in metric_changes_under_augs.
# Approved
transform = AddGaussianNoise(
    min_amplitude=0.01,
    max_amplitude=0.015,
    p=1.0
)
# Approved
transform = PitchShift(
    min_semitones=-4.0,
    max_semitones=4.0,
    p=1.0
)
# Approved
transform = AirAbsorption(
    min_distance=100,
    max_distance=500,
    p=1.0)
# Approved
transform = TanhDistortion(
    min_distortion=0.1,
    max_distortion=0.7,
    p=1.0)

# Apply the currently bound transform to the waveform and play the result
augmented_sound = transform(modify, sample_rate=16_000)
ipd.Audio(augmented_sound, rate=16_000, autoplay=False)

In [None]:
# Feature extractor from the same "openai/whisper-tiny.en" checkpoint as the model
fe = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny.en")

In [None]:
# Features of the augmented clip; wrapped in a list below to form a batch of one
aug_feature = fe(augmented_sound, sampling_rate=16_000).input_features[0]
z = model.generate(input_features=ch.Tensor([aug_feature]))
# NOTE: this rebinds `pred_str` to the transcription of this single augmented clip,
# replacing the predictions for the first 10 records computed earlier
pred_str = tokenizer.batch_decode(z, skip_special_tokens=True, normalize=True)

In [None]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech seq2seq examples into a single padded batch.

    Each example is a mapping with an "input_features" entry (audio features)
    and a "labels" entry (token ids). Audio and labels are padded separately
    through ``processor.feature_extractor.pad`` and ``processor.tokenizer.pad``.
    """
    # Object exposing `.feature_extractor` and `.tokenizer`, both with a `pad` method
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], ch.Tensor]]]) -> Dict[str, ch.Tensor]:
        """Return the padded audio batch with a "labels" entry added."""
        # Audio side: hand the features to the extractor's pad, asking for torch tensors.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        collated = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        # Text side: labels have their own lengths, so they are padded independently.
        label_items = [{"input_ids": example["labels"]} for example in features]
        text_tokenizer = self.processor.tokenizer
        padded = text_tokenizer.pad(label_items, return_tensors="pt")

        # Positions the attention mask marks as padding become -100 so that the
        # loss ignores them.
        token_ids = padded["input_ids"]
        is_padding = padded.attention_mask.ne(1)
        target_ids = token_ids.masked_fill(is_padding, -100)

        # When every sequence already starts with BOS (added during tokenization),
        # drop that column since it is appended again later.
        leading_tokens = target_ids[:, 0]
        if (leading_tokens == text_tokenizer.bos_token_id).all().item():
            target_ids = target_ids[:, 1:]

        collated["labels"] = target_ids
        return collated

In [None]:
# Collator built around the processor loaded from the same checkpoint
collator = DataCollatorSpeechSeq2SeqWithPadding(processor=WhisperProcessor.from_pretrained("openai/whisper-tiny.en", language="en", task="transcribe"))

In [None]:
# Forward pass on the augmented clip, with the tokenized reference transcript of
# record 0 supplied as labels
zz = model(**collator([{"input_features": aug_feature, "labels": tokenizer(small_data_sample[0]['text']).input_ids}]))

In [None]:
# Scalar loss the model reports for that single example
zz.loss.item()

In [None]:
# Word error rate metric loaded through the `evaluate` library
metric = evaluate.load("wer")

In [None]:
# `pred_str` can hold fewer transcriptions than `label_str`: the augmentation cell
# rebinds it to a single prediction for record 0 while `label_str` still covers the
# first 10 records. The metric needs equally long lists, so score the predictions
# against the matching leading references.
metric.compute(predictions=pred_str, references=label_str[:len(pred_str)])

In [None]:
def metric_changes_under_augs(data, tokenizer, batch_size: int, sample_rate: int = 16000):
    """Measure WER and loss on augmented copies of every record in `data`.

    Each record's waveform is passed through each of the four approved
    augmentations, converted to model features, and then scored in batches.

    Relies on notebook-level names: `model` (Hugging Face Whisper model), `fe`
    (feature extractor), `collator` (padding collator) and `metric` (WER metric).

    Args:
        data: Dataset whose records provide ``x['audio']['array']`` and whose
            ``data['text']`` column holds the reference transcripts.
        tokenizer: Tokenizer used to encode references and decode predictions.
        batch_size: Number of augmented clips scored per model call.
        sample_rate: Sample rate passed to the augmentations and the feature
            extractor.

    Returns:
        Tuple ``(wers, losses)`` of float arrays with one row per record and one
        column per augmentation, in the order of the `transforms` list.
    """
    # Approved
    transforms = [
        AddGaussianNoise(
            min_amplitude=0.01,
            max_amplitude=0.015,
            p=1.0),
        PitchShift(
            min_semitones=-4.0,
            max_semitones=4.0,
            p=1.0),
        AirAbsorption(
            min_distance=100,
            max_distance=500,
            p=1.0),
        TanhDistortion(
            min_distortion=0.1,
            max_distortion=0.7,
            p=1.0)
    ]
    num_augs = len(transforms)

    # Augment and featurize clip by clip. Waveforms differ in length, so they are
    # kept as a list of per-clip features rather than concatenated into one array.
    aug_features = []
    for x in tqdm(data, "Generating augmented data"):
        for transform in transforms:
            augmented = transform(x['audio']['array'], sample_rate=sample_rate)
            aug_features.append(fe(augmented, sampling_rate=sample_rate).input_features[0])

    # Get encodings for text in data
    all_text = data['text']
    encodings = tokenizer(all_text).input_ids
    # References go through the same decode/normalize path as the predictions
    references = tokenizer.batch_decode(encodings, skip_special_tokens=True, normalize=True)

    # Score on whatever device the model currently lives on
    device = next(model.parameters()).device

    # Get model outputs for augmented data
    wers, losses = [], []
    for i in range(0, len(aug_features), batch_size):
        batch = aug_features[i:i+batch_size]
        # Augmented clip (i + j) was produced from record (i + j) // num_augs
        source_idx = [(i + j) // num_augs for j in range(len(batch))]

        # Could make more efficient by only making forward call and using that to infer
        # Generated sequence, but following is more fool-proof

        collated_batch = collator([
            {"input_features": feature, "labels": encodings[k]}
            for feature, k in zip(batch, source_idx)
        ])
        collated_batch = {key: value.to(device) for key, value in collated_batch.items()}

        with ch.no_grad():
            logits = model(**collated_batch).logits
            # Get outputs (for WER computation)
            output = model.generate(input_features=collated_batch["input_features"])

        # Get loss values: per-clip mean cross-entropy over non-padded label positions
        labels = collated_batch["labels"]
        token_loss = ch.nn.functional.cross_entropy(
            logits.transpose(1, 2), labels, ignore_index=-100, reduction="none")
        num_tokens = labels.ne(-100).sum(dim=1).clamp(min=1)
        losses.extend((token_loss.sum(dim=1) / num_tokens).tolist())

        pred_str = tokenizer.batch_decode(output, skip_special_tokens=True, normalize=True)
        # The metric expects lists of strings, so score one (prediction, reference) pair at a time
        wers.extend(
            metric.compute(predictions=[pred], references=[references[k]])
            for pred, k in zip(pred_str, source_idx)
        )

    return (np.array(wers).reshape(-1, num_augs),
            np.array(losses).reshape(-1, num_augs))

In [None]:
# Load model
# NOTE(review): `model_path` is not assigned in any cell shown here; it has to be
# defined before this cell can run.
# This cell also rebinds `model`, replacing the Hugging Face model loaded earlier.
model = models_asr.WhisperTiny()
model, (train_ids, _) = load_model(model, model_path, on_cpu=False)
model.eval()
print("Loaded model!")

In [None]:
# Extract member speakers and their information
# NOTE(review): `members` is read here before any visible cell assigns it, and
# `non_members` (used later when picking speaker subsets) is never assigned in the
# visible cells; confirm where both are loaded.
pool_speakers = members["speaker_id"]
# Identify 'member' and 'non-member' data
# Keep only the rows whose speaker_id appears in `train_ids`
# (presumably the speaker ids used for training; verify against load_model)
members_mask = np.where(np.isin(pool_speakers, train_ids))[0]
members = members.select(members_mask)

In [None]:
# TODO: Pick only N speakers from both sets and focus on their metrics
def pick_speakers(ds, num: int, seed: Union[int, None] = None):
    """Return the rows of `ds` belonging to `num` randomly chosen speakers.

    Args:
        ds: Dataset exposing a "speaker_id" column via ``ds["speaker_id"]`` and a
            ``select(indices)`` method.
        num: Number of distinct speakers to draw (without replacement).
        seed: Optional seed for a reproducible draw. When None, the global
            ``np.random`` state is used, as before.

    Returns:
        Whatever ``ds.select`` returns for the indices of all rows whose speaker
        is among the chosen ones.

    Raises:
        ValueError: Raised by NumPy when `num` exceeds the number of distinct speakers.
    """
    # Read the column once; it is needed for both the unique set and the row mask
    speaker_ids = np.asarray(ds["speaker_id"])
    unique_speakers = np.unique(speaker_ids)
    if seed is None:
        picked_speakers = np.random.choice(unique_speakers, num, replace=False)
    else:
        picked_speakers = np.random.default_rng(seed).choice(unique_speakers, num, replace=False)
    mask = np.where(np.isin(speaker_ids, picked_speakers))[0]
    return ds.select(mask)

In [None]:
num_pick = 5

In [None]:
np.unique(subset_nonmembers["speaker_id"], return_counts=True)

In [None]:
subset_members    = pick_speakers(members, num_pick)
subset_nonmembers = pick_speakers(non_members, num_pick)

In [None]:
def recursive_to_device(data, device):
    """Move every tensor inside a (possibly nested) container to `device`.

    Args:
        data: A mapping (anything with ``.items()``), list, tuple, an object with
            a ``.to(device)`` method, or any other value.
        device: Target device, passed through to each ``.to`` call.

    Returns:
        Mappings come back as plain dicts and lists/tuples as lists/tuples, with
        their contents converted recursively. Objects with ``.to`` are moved;
        everything else is returned unchanged.
    """
    # Mappings are checked first so batch containers that expose both `.items()`
    # and `.to()` are still returned as a plain dict, as before.
    if callable(getattr(data, "items", None)):
        return {key: recursive_to_device(value, device) for key, value in data.items()}
    if isinstance(data, list):
        return [recursive_to_device(value, device) for value in data]
    if isinstance(data, tuple):
        return tuple(recursive_to_device(value, device) for value in data)
    if callable(getattr(data, "to", None)):
        return data.to(device)
    return data

In [None]:
# Move the model to "cuda:0"; get_batch_metrics sends its batches to the same device
model.to("cuda:0")

In [None]:
def get_batch_metrics(m, features, verbose: bool = True):
    """Compute per-example WER and cross-entropy loss for one batch.

    Args:
        m: Model wrapper exposing ``processor`` (with ``feature_extractor`` and
            ``tokenizer``) and ``model`` (callable, with a ``generate`` method).
        features: Mapping with "input_features" and "labels" entries, each a
            sequence with one item per example.
        verbose: When True (default), print each predicted/reference transcript pair.

    Returns:
        Tuple ``(wer, losses)``: two lists with one float per example.
    """
    wer_metric = evaluate.load("wer")
    # Use the tokenizer belonging to `m` throughout, not a notebook-level model
    tokenizer = m.processor.tokenizer

    input_features = [{"input_features": x} for x in features["input_features"]]
    batch = m.processor.feature_extractor.pad(input_features, return_tensors="pt")

    # get the tokenized label sequences
    label_features = [{"input_ids": x} for x in features["labels"]]
    # pad the labels to max length
    labels_batch = tokenizer.pad(label_features, return_tensors="pt")

    # replace padding with -100 to ignore loss correctly
    labels = labels_batch["input_ids"].masked_fill(
        labels_batch.attention_mask.ne(1), -100)

    # if bos token is appended in previous tokenization step,
    # cut bos token here as it's appended later anyways
    if (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item():
        labels = labels[:, 1:]

    batch["labels"] = labels

    # Get model output
    with ch.no_grad():
        batch_cuda = recursive_to_device(batch, "cuda:0")
        logits = m.model(**batch_cuda).logits.cpu()
        # Generation only needs the model inputs; labels are kept out of it
        generation_inputs = {key: value for key, value in batch_cuda.items() if key != "labels"}
        pred_ids = m.model.generate(**generation_inputs, max_length=225).cpu()

    # Decoding needs real token ids, so swap -100 for the pad id on a copy.
    # `labels` itself must keep its -100 entries for the loss below.
    decode_ids = labels.masked_fill(labels.eq(-100), tokenizer.pad_token_id)

    # we do not want to group tokens when computing the metrics
    pred_str = [x.strip() for x in tokenizer.batch_decode(pred_ids, skip_special_tokens=True)]
    label_str = tokenizer.batch_decode(decode_ids, skip_special_tokens=True)

    wer = [wer_metric.compute(predictions=[x], references=[y]) for (x, y) in zip(pred_str, label_str)]
    if verbose:
        for x, y in zip(pred_str, label_str):
            print(x)
            print(y)
            print()

    # Compute loss per example; CrossEntropyLoss skips targets equal to -100 by
    # default, so padded positions do not contribute.
    loss_function = ch.nn.CrossEntropyLoss()
    losses = [loss_function(x.reshape(-1, x.shape[-1]), y.reshape(-1)).item()
              for (x, y) in zip(logits, labels)]

    return wer, losses

In [None]:
def get_metrics(m, data, batch_size: int = 8, max_batches: Union[int, None] = None):
    """Collect per-example WER and loss over `data`, batch by batch.

    Args:
        m: Model wrapper forwarded to ``get_batch_metrics``.
        data: Dataset supporting ``len()`` and slice indexing; each slice is
            handed to ``get_batch_metrics``.
        batch_size: Number of examples per batch.
        max_batches: Optional cap on the number of batches processed, for quick
            debugging runs. None (default) processes everything.

    Returns:
        Float array of shape (num_examples, 2): column 0 is WER, column 1 is loss.
    """
    all_wer, all_losses = [], []
    for batch_idx, start in enumerate(tqdm(range(0, len(data), batch_size))):
        if max_batches is not None and batch_idx >= max_batches:
            break
        wer, losses = get_batch_metrics(m, data[start:start + batch_size])
        # Extend flat lists so a shorter final batch cannot break the stacking below
        all_wer.extend(wer)
        all_losses.extend(losses)
    return np.stack([np.asarray(all_wer, dtype=float),
                     np.asarray(all_losses, dtype=float)], axis=0).T

In [None]:
# losses_members = get_metrics(model, subset_members)
losses_nonmembers = get_metrics(model, subset_nonmembers)

In [None]:
losses_nonmembers[:, 0]

In [None]:
# Plot loss values, WER, and CER for both scenarios
plt.hist(losses_members, 21, alpha=0.5, label="members")
plt.hist(losses_nonmembers, 21, alpha=0.5, label="non-members")
plt.legend()

In [None]:
# Later: Consider adding noise/augmentations to input and measure robustness in model behavior