In [None]:
# Shell out to `nvidia-smi` to show the GPUs visible on this host before loading anything.
!nvidia-smi

In [2]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (renders a widget in the cell output), so no
# access token has to be written into the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
import matplotlib.pyplot as plt
import json

from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer

import IPython.display as ipd
import numpy as np
import random
import os
import torch
import re

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from datasets import load_metric
from datasets import load_dataset

from finetuning_util_hausa import preprocess_texts, ASRDataset
from finetuning_util_hausa import return_combined_dataset, return_fleurs_test, return_cv_test, return_bibletts

In [6]:
# --- Experiment configuration -------------------------------------------------
# Every path below is derived from `root`. Directory strings keep their trailing
# "/" because other cells build file paths by plain string concatenation
# (e.g. out_dir+"temp/").
root = "/data/users/kashrest/lrl-asr-experiments/"

# Hugging Face cache directory for FLEURS. The `load_dataset(..., cache_dir=cache_dir)`
# calls further down reference this name, so it must be defined (it used to be
# commented out, which made those cells fail on a fresh kernel).
cache_dir = root + "data/fleurs"

pretrained_model_card = "facebook/wav2vec2-xls-r-300m"

training_experiment_number = "combined-hausa_experiment_combined-1"

# <root>/<model card with "/" replaced by "_">/hausa/<experiment>/
out_dir = root + pretrained_model_card.replace("/", "_") + "/hausa/" + training_experiment_number + "/"

# Checkpoint directory that is evaluated in this notebook.
best_checkpoint = out_dir + "checkpoint-18100/"

hausa_vocab_file = out_dir + "vocab_hausa_combined_train_val_test.json"
hyperparameter_file = out_dir + "hyperparameters.jsonl"

In [4]:
# Load the three held-out test sets through the project helpers; each call is
# unpacked into an (audio, transcriptions) pair.
# NOTE(review): element types come from finetuning_util_hausa — confirm there.
fleurs_test_audio, fleurs_test_transcriptions = return_fleurs_test() 
cv_test_audio, cv_test_transcriptions = return_cv_test() 
bibletts_test_audio, bibletts_test_transcriptions = return_bibletts() 

# Pass every set of reference transcripts through the project's text preprocessing.
# NOTE(review): presumably the same normalisation that was applied at training time — verify.
fleurs_test_transcriptions = preprocess_texts(fleurs_test_transcriptions)
cv_test_transcriptions = preprocess_texts(cv_test_transcriptions)
bibletts_test_transcriptions = preprocess_texts(bibletts_test_transcriptions)

Found cached dataset fleurs (/data/users/kashrest/lrl-asr-experiments/data/fleurs/google___fleurs/ha_ng/2.0.0/af82dbec419a815084fa63ebd5d5a9f24a6e9acdf9887b9e3b8c6bbd64e0b7ac)
Found cached dataset common_voice_13_0 (/data/users/kashrest/lrl-asr-experiments/data/cv_13/mozilla-foundation___common_voice_13_0/ha/13.0.0/22809012aac1fc9803eaffc44122e4149043748e93933935d5ea19898587e4d7)


In [5]:
# Number of audio items in each test set, in the order (FLEURS, Common Voice, BibleTTS).
len(fleurs_test_audio), len(cv_test_audio), len(bibletts_test_audio)

(621, 659, 124)

In [7]:
# CTC tokenizer built from the experiment's vocabulary file.
tokenizer = Wav2Vec2CTCTokenizer(
    hausa_vocab_file,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for raw waveforms at `sampling_rate` Hz: pads with 0.0,
# normalises the input and returns an attention mask.
sampling_rate = 16000
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=sampling_rate,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both halves so a single object handles audio and text.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [9]:
# Wrap each test set in the project's ASRDataset so it can be handed to the Trainer.
# NOTE(review): ASRDataset lives in finetuning_util_hausa; presumably it applies
# `processor` to audio/text and yields `input_values`/`labels` items — confirm there.
fleurs_test_dataset = ASRDataset(fleurs_test_audio, fleurs_test_transcriptions, sampling_rate, processor)
cv_test_dataset = ASRDataset(cv_test_audio, cv_test_transcriptions, sampling_rate, processor)
bibletts_test_dataset = ASRDataset(bibletts_test_audio, bibletts_test_transcriptions, sampling_rate, processor)

In [11]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the label ids of a batch.

    Inputs and labels are padded separately because they have different lengths
    and are padded by different components: the feature extractor pads
    ``input_values`` and the tokenizer pads ``labels``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`):
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy applied to both inputs and labels:

            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding if the batch
              holds a single sequence).
            * :obj:`'max_length'`: pad to ``max_length`` / ``max_length_labels``, or to the maximum acceptable
              input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (the batch may contain sequences of different
              lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned ``input_values`` and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the returned ``labels`` and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pad ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of`` but applied to ``labels``.

    Calling the collator with a list of features (each holding ``input_values``
    and ``labels``) returns the padded input batch with an added ``labels``
    tensor in which padded positions are set to -100.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the label ids with the tokenizer directly. This is what
        # `processor.pad` dispatched to inside the deprecated
        # `as_target_processor()` context manager, without toggling processor
        # state for every batch.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch
    
# Collator instance handed to the Trainer; padding=True pads to the longest item of each batch.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Word- and character-error-rate metrics used by compute_metrics.
# NOTE(review): `datasets.load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm for the installed version.
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

def compute_metrics(pred):
    """
    Compute word and character error rate for a set of CTC predictions.

    Args:
        pred: object exposing ``predictions`` (logits; the argmax over the last
            axis is taken as the predicted token id) and ``label_ids``
            (reference token ids, with -100 marking positions to ignore), such
            as the value returned by ``trainer.predict``.

    Returns:
        dict: ``{"wer": <word error rate>, "cer": <character error rate>}``.

    The caller's ``pred.label_ids`` array is left unmodified.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # The tokenizer cannot decode -100, so map it back to the pad token id.
    # np.where builds a new array instead of overwriting the caller's labels.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

# Read the hyperparameters that were recorded for the training run.
with open(hyperparameter_file, "r") as f:
    obj = json.load(f)

# Optimisation schedule.
batch_size = obj["training batch size"]
learning_rate = obj["learning rate"]
num_train_epochs = obj["number of training epochs"]

# Regularisation / masking settings passed to the model config.
attention_dropout = obj["attention dropout probability"]
hidden_dropout = obj["hidden layer dropout probability"]
feat_proj_dropout = obj["feature projection layer dropout probability"]
mask_time_prob = obj["mask time probability"]
layerdrop = obj["layer dropout probability"]

warmup_steps = obj["warm up steps"]

# Load the fine-tuned CTC checkpoint, overriding the dropout/masking settings with
# the values read from the hyperparameter file and sizing the output layer and
# pad id from the tokenizer.
model = Wav2Vec2ForCTC.from_pretrained(
    best_checkpoint, 
    attention_dropout=attention_dropout,
    hidden_dropout=hidden_dropout,
    feat_proj_dropout=feat_proj_dropout,
    mask_time_prob=mask_time_prob,
    layerdrop=layerdrop,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# The return value is discarded here (this is not the last expression of the cell).
model.num_parameters() # facebook/wav2vec2-xls-r-300m
# NOTE(review): freezing and gradient checkpointing matter for training; the visible
# cells only call trainer.predict on this model, so these look redundant — confirm.
# NOTE(review): newer transformers releases expose this as freeze_feature_encoder() — confirm.
model.freeze_feature_extractor()
model.gradient_checkpointing_enable()


# Trainer configuration mirroring the fine-tuning run; scratch output goes to <out_dir>/temp/.
training_args = TrainingArguments(
    output_dir=out_dir+"temp/",
    # batching / optimisation
    group_by_length=True,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    fp16=True,
    # evaluation / checkpointing / logging cadence
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    # model selection: lowest WER wins
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

# Trainer wrapping the loaded checkpoint; the following cells use it via trainer.predict.
# NOTE(review): train_dataset is the FLEURS *test* set. No visible cell calls
# trainer.train(), but doing so would fine-tune on test data.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=fleurs_test_dataset,
    eval_dataset=fleurs_test_dataset, 
    tokenizer=processor.feature_extractor,
)



In [21]:
# Run the checkpoint over the Common Voice test dataset ...
preds = trainer.predict(cv_test_dataset)

# ... and display its WER/CER as the cell output.
compute_metrics(preds)

{'wer': 0.3690682036503362, 'cer': 0.09676691729323308}

In [22]:
# Greedy decoding: take the highest-scoring token id at every frame.
pred_logits = preds.predictions
pred_ids = np.argmax(pred_logits, axis=-1)

# -100 cannot be decoded, so swap it for the pad token id before decoding labels.
ignored_positions = preds.label_ids == -100
preds.label_ids[ignored_positions] = processor.tokenizer.pad_token_id

pred_strs = processor.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics
label_strs = processor.batch_decode(preds.label_ids, group_tokens=False)

# One JSON-encoded predicted transcript per line.
predictions_path = out_dir+"model_predicted_cv13_test.jsonl"
with open(predictions_path, "w") as f:
    for pred in pred_strs:
        f.write(json.dumps(pred) + "\n")

In [None]:
# CTC tokenizer built from the experiment's vocabulary file.
tokenizer = Wav2Vec2CTCTokenizer(
    hausa_vocab_file,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor for raw waveforms at `sampling_rate` Hz: pads with 0.0,
# normalises the input and returns an attention mask.
sampling_rate = 16000
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=sampling_rate,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both halves so a single object handles audio and text.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

def prepare_dataset(batch):
    """
    Turn one dataset example into model inputs.

    Args:
        batch: a single example (the function is mapped without ``batched=True``)
            with an ``"audio"`` dict holding ``"array"`` and ``"sampling_rate"``
            and a ``"transcription"`` string.

    Returns:
        The same mapping with ``"input_values"`` (processed audio) and
        ``"labels"`` (token ids of the transcription) added.
    """
    audio = batch["audio"]
    # The processor returns a batch of one; [0] "un-batches" it.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]

    # Tokenize the transcript with the tokenizer directly; this is what the
    # deprecated `processor.as_target_processor()` context manager dispatched to.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

# Replace the raw columns of every split with the model-ready fields produced by
# prepare_dataset, using 4 worker processes.
# NOTE(review): this rebinds each name to its processed version, so re-running the
# cell fails once the original columns are gone.
# NOTE(review): `fleurs_hausa_train` is not assigned in any visible cell, and the
# val/test splits are only loaded in a cell further down — check execution order.
fleurs_hausa_train = fleurs_hausa_train.map(prepare_dataset, remove_columns=fleurs_hausa_train.column_names, num_proc=4)
fleurs_hausa_val = fleurs_hausa_val.map(prepare_dataset, remove_columns=fleurs_hausa_val.column_names, num_proc=4)
fleurs_hausa_test = fleurs_hausa_test.map(prepare_dataset, remove_columns=fleurs_hausa_test.column_names, num_proc=4)

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the label ids of a batch.

    Inputs and labels are padded separately because they have different lengths
    and are padded by different components: the feature extractor pads
    ``input_values`` and the tokenizer pads ``labels``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`):
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy applied to both inputs and labels:

            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding if the batch
              holds a single sequence).
            * :obj:`'max_length'`: pad to ``max_length`` / ``max_length_labels``, or to the maximum acceptable
              input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (the batch may contain sequences of different
              lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the returned ``input_values`` and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the returned ``labels`` and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pad ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of`` but applied to ``labels``.

    Calling the collator with a list of features (each holding ``input_values``
    and ``labels``) returns the padded input batch with an added ``labels``
    tensor in which padded positions are set to -100.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the label ids with the tokenizer directly. This is what
        # `processor.pad` dispatched to inside the deprecated
        # `as_target_processor()` context manager, without toggling processor
        # state for every batch.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch
    
# Collator instance handed to the Trainer; padding=True pads to the longest item of each batch.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Word- and character-error-rate metrics used by compute_metrics.
# NOTE(review): `datasets.load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm for the installed version.
wer_metric = load_metric("wer")
cer_metric = load_metric("cer")

def compute_metrics(pred):
    """
    Compute word and character error rate for a set of CTC predictions.

    Args:
        pred: object exposing ``predictions`` (logits; the argmax over the last
            axis is taken as the predicted token id) and ``label_ids``
            (reference token ids, with -100 marking positions to ignore), such
            as the value returned by ``trainer.predict``.

    Returns:
        dict: ``{"wer": <word error rate>, "cer": <character error rate>}``.

    The caller's ``pred.label_ids`` array is left unmodified.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # The tokenizer cannot decode -100, so map it back to the pad token id.
    # np.where builds a new array instead of overwriting the caller's labels.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer, "cer": cer}

In [None]:
# make sure this preprocessing matches your checkpoint

# NOTE(review): `data` is not assigned in any visible cell; these two lines need it
# to exist in the kernel already. Neither resulting name is read by the visible cells.
hausa_train_audio = data["train"]["audio"]
hausa_train_transcriptions = data["train"]["transcriptions"]
# FLEURS Hausa ("ha_ng") validation and test splits; `cache_dir` must be defined
# by the configuration cell before this runs.
# NOTE(review): no train split is loaded here although later statements use
# `fleurs_hausa_train` — confirm where it is meant to come from.
fleurs_hausa_val = load_dataset("google/fleurs", "ha_ng", split="validation", cache_dir=cache_dir)
fleurs_hausa_test = load_dataset("google/fleurs", "ha_ng", split="test", cache_dir=cache_dir)


# Punctuation and quote characters stripped from transcripts before lower-casing.
# Written as a raw string so the literal no longer relies on invalid Python escape
# sequences such as "\," or "\?"; it matches the same set of characters as before.
chars_to_remove_regex = r"""[,?!\-;:"“%‘'”�$]"""

and_sym = '&'

# Square brackets, braces, back/forward slashes, ¥ £ ° ¾ ½ ² and + > <, removed
# in a single pass after lower-casing.
_extra_symbols_regex = r'[\[\]{}\\/¥£°¾½²+><]'

def remove_special_characters(batch):
    """
    Strip punctuation and symbols from ``batch["transcription"]`` and lower-case it.

    Args:
        batch: mapping with a ``"transcription"`` string.

    Returns:
        The same mapping (modified in place) with the cleaned transcription.
        Whitespace is left untouched, so removing a symbol that sat between two
        spaces leaves both spaces behind.
    """
    text = re.sub(chars_to_remove_regex, '', batch["transcription"]).lower()
    batch["transcription"] = re.sub(_extra_symbols_regex, '', text)
    #batch["transcription"] = re.sub(and_sym, "and", batch["transcription"])
    return batch

def normalize_diacritics(batch):
    """
    Fold accented letters in ``batch["transcription"]`` to their plain letter.

    Each rule is a regex character class paired with its replacement; the rules
    are applied one after another, in the order listed. The mapping is modified
    in place and returned.
    """
    folding_rules = (
        ('[āăáã]', "a"),
        ('[ūúü]', "u"),
        ('[öõó]', "o"),
        ('[ç]', "c"),
        ('[í]', "i"),
        ('[ş]', "s"),
        ('[é]', "e"),
    )

    text = batch["transcription"]
    for accented_class, plain_letter in folding_rules:
        text = re.sub(accented_class, plain_letter, text)
    batch["transcription"] = text

    return batch

# Clean every split: strip special characters first, then fold diacritics.
fleurs_hausa_train = fleurs_hausa_train.map(remove_special_characters).map(normalize_diacritics)
fleurs_hausa_val = fleurs_hausa_val.map(remove_special_characters).map(normalize_diacritics)
fleurs_hausa_test = fleurs_hausa_test.map(remove_special_characters).map(normalize_diacritics)


# Character vocabulary code from: https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_Tune_XLSR_Wav2Vec2_on_Turkish_ASR_with_%F0%9F%A4%97_Transformers.ipynb#scrollTo=_0kRndSvqaKk
def extract_all_chars(batch):
    """
    Collect the distinct characters used by a batch of transcriptions.

    All transcriptions are joined with single spaces; the result holds the
    joined text and the list of its unique characters, each wrapped in a
    one-element list.
    """
    joined_text = " ".join(batch["transcription"])
    unique_chars = list(set(joined_text))
    return {"vocab": [unique_chars], "all_text": [joined_text]}

# Run extract_all_chars once over each whole split (batch_size=-1 passes the full
# dataset as a single batch) and keep only its output columns.
vocab_train = fleurs_hausa_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=fleurs_hausa_train.column_names)
vocab_val = fleurs_hausa_val.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=fleurs_hausa_val.column_names)
vocab_test = fleurs_hausa_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=fleurs_hausa_test.column_names)

# Union of the characters seen in any split. sorted() makes the character -> id
# assignment reproducible: the iteration order of a set of strings can change
# between interpreter runs, which would otherwise shuffle the ids on every run.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]) | set(vocab_val["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

# for word delimiter, change " " --> "|" (ex. "Hello my name is Kaleen" --> "Hello|my|name|is|Kaleen")
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict) # this is for CTC to predict the end of a character (e.g. "hhh[PAD]iiiiii[PAD]" == "hi")
print(len(vocab_dict))

In [None]:
# Parse every line of the vocabulary file as JSON and keep the last one
# (None when the file has no lines).
with open(hausa_vocab_file, "r") as f:
    parsed_lines = [json.loads(line) for line in f]
actual_vocab_dict = parsed_lines[-1] if parsed_lines else None
actual_vocab_dict

In [None]:
# True when the rebuilt vocabulary has exactly the same symbols as the saved one (ids are not compared).
set(actual_vocab_dict) == set(vocab_dict)

# Evaluation

In [None]:
# set up Trainer for prediction
hyperparameters_file = out_dir+"hyperparameters.jsonl"

# Read the hyperparameters that were recorded for the training run.
with open(hyperparameters_file, "r") as f:
    obj = json.load(f)

# Optimisation schedule.
batch_size = obj["training batch size"]
learning_rate = obj["learning rate"]
num_train_epochs = obj["number of training epochs"]

# Regularisation / masking settings passed to the model config.
attention_dropout = obj["attention dropout probability"]
hidden_dropout = obj["hidden layer dropout probability"]
feat_proj_dropout = obj["feature projection layer dropout probability"]
mask_time_prob = obj["mask time probability"]
layerdrop = obj["layer dropout probability"]

warmup_steps = obj["warm up steps"]

# Load the fine-tuned CTC checkpoint, overriding the dropout/masking settings with
# the values read from the hyperparameter file and sizing the output layer and
# pad id from the tokenizer.
model = Wav2Vec2ForCTC.from_pretrained(
    best_checkpoint, 
    attention_dropout=attention_dropout,
    hidden_dropout=hidden_dropout,
    feat_proj_dropout=feat_proj_dropout,
    mask_time_prob=mask_time_prob,
    layerdrop=layerdrop,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# The return value is discarded here (this is not the last expression of the cell).
model.num_parameters() # facebook/wav2vec2-xls-r-300m
# NOTE(review): freezing and gradient checkpointing matter for training; the visible
# cells only call trainer.predict on this model, so these look redundant — confirm.
# NOTE(review): newer transformers releases expose this as freeze_feature_encoder() — confirm.
model.freeze_feature_extractor()
model.gradient_checkpointing_enable()


# Trainer configuration mirroring the fine-tuning run; scratch output goes to <out_dir>/temp/.
training_args = TrainingArguments(
    output_dir=out_dir+"temp/",
    # batching / optimisation
    group_by_length=True,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    fp16=True,
    # evaluation / checkpointing / logging cadence
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    # model selection: lowest WER wins
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

# Trainer wrapping the fine-tuned checkpoint, with the FLEURS train/validation
# splits attached; the following cells use it via trainer.predict.
# NOTE(review): `fleurs_hausa_train` is not assigned in any visible cell — confirm its origin.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=fleurs_hausa_train,
    eval_dataset=fleurs_hausa_val, 
    tokenizer=processor.feature_extractor,
)

In [None]:
# Predict on the FLEURS validation split ...
preds_val = trainer.predict(fleurs_hausa_val)

# ... and display its WER/CER as the cell output.
compute_metrics(preds_val)

In [None]:
# Predict on the FLEURS test split ...
preds_test = trainer.predict(fleurs_hausa_test)

# ... and display its WER/CER as the cell output.
compute_metrics(preds_test)

In [None]:
# Greedy decoding of the validation predictions: highest-scoring token id per frame.
pred_logits = preds_val.predictions
pred_ids = np.argmax(pred_logits, axis=-1)

# -100 cannot be decoded, so it is overwritten (in place) with the pad token id.
preds_val.label_ids[preds_val.label_ids == -100] = processor.tokenizer.pad_token_id

pred_strs = processor.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics
label_strs = processor.batch_decode(preds_val.label_ids, group_tokens=False)


# Print predicted vs. reference transcripts for examples 100..125.
# NOTE(review): the hard-coded range raises IndexError if the split has fewer than 126 examples.
for i in range(100, 126):
    print(i)
    print(f"Predicted transcript:\n{pred_strs[i]}\n\nOriginal:\n{label_strs[i]}")
    print("\n****\n")

In [None]:
# Play the waveform of test example 2 at 16 kHz (autoplay starts it on render).
# NOTE(review): `fleurs_hausa_test_orig` is not assigned in any visible cell — it
# presumably held the test split before its audio column was removed; confirm.
ipd.Audio(data=fleurs_hausa_test_orig[2]["audio"]["array"], autoplay=True, rate=16000)

In [None]:
# Spot check of the CER metric: the prediction has one extra character ("6")
# compared with the 4-character reference.
cer_metric.compute(predictions=["ciki6"], references=["ciki"])

## Sanity check: compare against a model that has not been fine-tuned


In [None]:
hyperparameters_file = out_dir+"hyperparameters.jsonl"

# Read the hyperparameters that were recorded for the training run.
with open(hyperparameters_file, "r") as f:
    obj = json.load(f)

# Optimisation schedule.
batch_size = obj["training batch size"]
learning_rate = obj["learning rate"]
num_train_epochs = obj["number of training epochs"]

# Regularisation / masking settings passed to the model config.
attention_dropout = obj["attention dropout probability"]
hidden_dropout = obj["hidden layer dropout probability"]
feat_proj_dropout = obj["feature projection layer dropout probability"]
mask_time_prob = obj["mask time probability"]
layerdrop = obj["layer dropout probability"]

warmup_steps = obj["warm up steps"]

# Baseline: load the pretrained model card itself (not the fine-tuned checkpoint)
# with the same config overrides and tokenizer-derived vocabulary size.
# NOTE(review): the pretrained card presumably ships no CTC head for this vocabulary,
# so the output layer would be freshly initialised — confirm from the load warnings.
baseline_model = Wav2Vec2ForCTC.from_pretrained(
    pretrained_model_card, 
    attention_dropout=attention_dropout,
    hidden_dropout=hidden_dropout,
    feat_proj_dropout=feat_proj_dropout,
    mask_time_prob=mask_time_prob,
    layerdrop=layerdrop,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

# The return value is discarded here (this is not the last expression of the cell).
baseline_model.num_parameters() # facebook/wav2vec2-xls-r-300m
# NOTE(review): newer transformers releases expose this as freeze_feature_encoder() — confirm.
baseline_model.freeze_feature_extractor()
baseline_model.gradient_checkpointing_enable()



# Same Trainer configuration as the fine-tuned run, minus the explicit
# best-model metric settings; scratch output goes to <out_dir>/temp/.
baseline_training_args = TrainingArguments(
    output_dir=out_dir+"temp/",
    # batching / optimisation
    group_by_length=True,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    fp16=True,
    # evaluation / checkpointing / logging cadence
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=10,
    load_best_model_at_end=True,
)

# Trainer wrapping the baseline (not fine-tuned) model; the following cells use it
# via baseline_trainer.predict.
# NOTE(review): `fleurs_hausa_train` is not assigned in any visible cell — confirm its origin.
baseline_trainer = Trainer(
    model=baseline_model,
    data_collator=data_collator,
    args=baseline_training_args,
    compute_metrics=compute_metrics,
    train_dataset=fleurs_hausa_train,
    eval_dataset=fleurs_hausa_val, 
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run the baseline model over the FLEURS test split.
baseline_preds = baseline_trainer.predict(fleurs_hausa_test)

In [None]:
# WER/CER of the baseline model, for comparison with the fine-tuned checkpoint.
compute_metrics(baseline_preds)

In [None]:
# Greedy decoding of the baseline predictions: highest-scoring token id per frame.
# NOTE: this rebinds the notebook-level names `pred_logits` and `pred_ids`.
pred_logits = baseline_preds.predictions
pred_ids = np.argmax(pred_logits, axis=-1)

# -100 cannot be decoded, so it is overwritten (in place) with the pad token id.
baseline_preds.label_ids[baseline_preds.label_ids == -100] = processor.tokenizer.pad_token_id

baseline_pred_strs = processor.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics
baseline_label_strs = processor.batch_decode(baseline_preds.label_ids, group_tokens=False)

In [None]:
# Show the first baseline prediction next to its reference transcript.
print(f"Predicted transcript:\n{baseline_pred_strs[0]}\n\nOriginal:\n{baseline_label_strs[0]}")

In [None]:
# Shell out to `nvidia-smi` again to check GPU state after the evaluation runs.
!nvidia-smi

In [None]:
# Shell out to `who` to list the users currently logged in to this host.
!who