## Housekeeping

### Check GPU

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Dec  8 01:16:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   40C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Install Packages

In [2]:
!pip install bitsandbytes
!pip install wandb
!pip install -q -r community-events/whisper-fine-tuning-event/requirements.txt
!pip install --quiet datasets git+https://github.com/huggingface/transformers evaluate huggingface_hub jiwer bitsandbytes accelerate

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


### Load Libraries

In [3]:
from datasets import Audio, interleave_datasets, IterableDataset, load_dataset, IterableDatasetDict
from typing import List, Optional
from tqdm.notebook import tqdm
from transformers import WhisperProcessor

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
class StreamDatasetLoader:
    """Load several datasets in streaming mode and interleave them into one stream.

    Every loaded split is reduced to the two columns "audio" and "sentence".
    """

    def __init__(self, dataset_names, dataset_config_names, dataset_splits, text_column_names, stopping_strategy="all_exhausted"):
        """
        Store the description of the datasets to load.

        Args:
            dataset_names (list): dataset names, one entry per dataset
            dataset_config_names (list): config name of each dataset
            dataset_splits (list or None): for each dataset, the splits to load, given
                either as a list of split names or as a single split name.
                ``None`` loads the "train" split of every dataset.
            text_column_names (list or None): name of the transcription column of each
                dataset. ``None`` means "text" for every dataset.
            stopping_strategy (str, optional): forwarded to ``interleave_datasets``. Defaults to "all_exhausted".
        """
        self.dataset_names = dataset_names
        self.dataset_config_names = dataset_config_names
        self.splits = dataset_splits
        self.text_column_names = text_column_names
        self.stopping_strategy = stopping_strategy

    def load_and_process_datasets(self):
        """
        Load every requested split and interleave the results.

        Returns:
            The value returned by ``interleave_datasets`` for all loaded splits.
        """
        num_datasets = len(self.dataset_names)
        # Each default entry must itself be a list of splits: iterating over a bare
        # "train" string would yield its characters instead of the split name.
        splits = [["train"]] * num_datasets if self.splits is None else self.splits
        text_column_names = ["text"] * num_datasets if self.text_column_names is None else self.text_column_names

        all_datasets = []
        # tqdm wraps the list (not the enumerate object) so it can see the total.
        for i, dataset_name in enumerate(tqdm(self.dataset_names)):
            dataset_splits = splits[i]
            if isinstance(dataset_splits, str):
                # Accept a single split name as shorthand for a one-element list.
                dataset_splits = [dataset_splits]
            for split in dataset_splits:
                dataset = load_dataset(path=dataset_name, name=self.dataset_config_names[i], split=split, streaming=True, use_auth_token=True)
                # cast the "audio" column to the `Audio` type with a sampling rate of 16 kHz
                dataset = dataset.cast_column("audio", Audio(16000))
                # rename the text column to "sentence" if not already named so
                if text_column_names[i] != "sentence":
                    dataset = dataset.rename_column(text_column_names[i], "sentence")
                # drop every column other than "audio" and "sentence"
                keys_to_remove = sorted(set(dataset.features.keys()) - {"audio", "sentence"})
                dataset = dataset.remove_columns(keys_to_remove)
                all_datasets.append(dataset)

        return interleave_datasets(all_datasets, stopping_strategy=self.stopping_strategy)


In [5]:
# Training sources. The four lists are aligned by position: one entry per dataset.
dataset_names = [
    "mozilla-foundation/common_voice_11_0",
    "google/fleurs",
]
dataset_config_names = ["sw", "sw_ke"]
dataset_splits = [
    ["train", "validation"],
    ["train", "validation", "test"],
]
text_column_names = ["sentence", "transcription"]

# Build the loader and obtain the interleaved streaming dataset.
dataset_loader = StreamDatasetLoader(
    dataset_names=dataset_names,
    dataset_config_names=dataset_config_names,
    dataset_splits=dataset_splits,
    text_column_names=text_column_names,
)
stream_dataset = dataset_loader.load_and_process_datasets()

# for i, sample in enumerate(stream_dataset):
#     print(i, sample["sentence"])
#     if i == 9:
#         break

0it [00:00, ?it/s]

Downloading builder script:   0%|          | 0.00/8.30k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/60.9k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/12.8k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

In [6]:
from transformers import WhisperProcessor
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

class NormalizeTranscriptions(object):
    """Callable that normalises an example's "sentence" and tokenises it into "labels"."""

    def __init__(self, do_lower_case, do_remove_punctuation, model_name="openai/whisper-large", language="sw", task="transcribe"):
        """
        Args:
            do_lower_case (bool): lower-case the transcription.
            do_remove_punctuation (bool): run the transcription through
                ``BasicTextNormalizer`` and strip surrounding whitespace.
            model_name (str, optional): checkpoint the processor is loaded from.
            language (str, optional): language passed to the processor.
            task (str, optional): task passed to the processor.
        """
        self.processor = WhisperProcessor.from_pretrained(model_name, language=language, task=task)
        self.normalizer = BasicTextNormalizer()
        self.do_lower_case = do_lower_case
        self.do_remove_punctuation = do_remove_punctuation

    def __call__(self, batch):
        """Normalise ``batch["sentence"]`` in place, add ``batch["labels"]`` and return ``batch``."""
        # Kept in a local variable: per-example data does not belong on the instance.
        transcription = batch["sentence"]
        # optional pre-processing steps
        if self.do_lower_case:
            transcription = transcription.lower()
        if self.do_remove_punctuation:
            transcription = self.normalizer(transcription).strip()
        batch["sentence"] = transcription
        # token ids of the (possibly normalised) transcription
        batch["labels"] = self.processor.tokenizer(transcription).input_ids

        return batch

# One shared normaliser instance: lower-casing and punctuation removal both enabled.
normalizer = NormalizeTranscriptions(do_lower_case=True, do_remove_punctuation=True)


def normalizing_function(x):
    """Apply the module-level `normalizer` to one example and return the result."""
    return normalizer(x)


# Normalise the transcriptions of the training stream.
ds = stream_dataset.map(normalizing_function)

# for i, sample in enumerate(ds):
#   print(i, sample['sentence'])
#   if i == 5:
#     break

# for i, sample in enumerate(ds):
#   print(i, sample['labels'])
#   if i == 5:
#     break

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [7]:
def prepare_dataset(batch):
    """Add "input_features" and "input_length" to one example and return it.

    Reads the module-level `processor` at call time.
    """
    sample = batch["audio"]
    waveform, rate = sample["array"], sample["sampling_rate"]

    # first (only) row of the features the extractor computes for this waveform
    extracted = processor.feature_extractor(waveform, sampling_rate=rate)
    batch["input_features"] = extracted.input_features[0]

    # duration of the clip in seconds: number of samples divided by samples per second
    batch["input_length"] = len(waveform) / rate

    return batch

In [8]:
# Processor of the "openai/whisper-small" checkpoint, set up for Swahili transcription.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Swahili",
    task="transcribe",
)
# Add the model inputs to every example, drop the raw columns and switch to torch format.
vectorized_datasets = (
    ds.map(prepare_dataset, remove_columns=['audio', 'sentence'])
    .with_format("torch")
)

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

In [9]:
# Shuffle the training stream using a 500-example buffer and a fixed seed.
vectorized_datasets = vectorized_datasets.shuffle(seed=0, buffer_size=500)

In [10]:
# Upper bound (exclusive) on "input_length", which prepare_dataset computes in seconds.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return whether `length` is strictly below the module-level `max_input_length`."""
    is_short_enough = length < max_input_length
    return is_short_enough

In [11]:
# Keep only the examples whose "input_length" passes is_audio_in_length_range.
vectorized_datasets = vectorized_datasets.filter(is_audio_in_length_range, input_columns=["input_length"])

In [12]:
# Evaluation source: a single dataset, config "sw", split "test", streamed.
dataset_names = ["mozilla-foundation/common_voice_11_0"]
dataset_config_names = ["sw"]
dataset_splits = [["test"]]
text_column_names = ["sentence"]

dataset_loader = StreamDatasetLoader(
    dataset_names=dataset_names,
    dataset_config_names=dataset_config_names,
    dataset_splits=dataset_splits,
    text_column_names=text_column_names,
)
stream_dataset = dataset_loader.load_and_process_datasets()

# Same normalisation and feature extraction as for the training stream.
stream_dataset = stream_dataset.map(normalizing_function)
vectorized_test_datasets = (
    stream_dataset
    .map(prepare_dataset, remove_columns=['audio', 'sentence'])
    .with_format("torch")
)

# Pull the first example to check which columns it carries.
next(iter(vectorized_test_datasets)).keys()

0it [00:00, ?it/s]

Reading metadata...: 10238it [00:00, 31549.93it/s]


dict_keys(['labels', 'input_features', 'input_length'])

In [13]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into one batch of padded input features and loss-masked labels."""

    # Object providing `feature_extractor.pad`, `tokenizer.pad` and `tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the "input_features" and "labels" of `features` and return the batch."""
        # Inputs and labels have different lengths and use different padding
        # methods, so they are gathered separately.
        audio_inputs = []
        token_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            token_inputs.append({"input_ids": example["labels"]})

        # the audio side only needs to be returned as torch tensors
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
        # the label sequences are padded to the longest one in the batch
        padded_labels = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # padded positions become -100 so that the loss ignores them
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # when every sequence starts with the BOS token, drop that column:
        # the token is appended again later anyway
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [14]:
# Batch collator built around the `processor` defined earlier in the notebook.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [15]:
import evaluate

# "wer" metric loaded through `evaluate`; compute_metrics multiplies its result by 100.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [16]:
# Passed as `normalize=` to both decode calls below, i.e. the 'normalised' WER is reported.
do_normalize_eval = True


def compute_metrics(pred):
    """Decode predictions and labels and return ``{"wer": <percentage>}``.

    Reads the module-level `processor`, `metric` and `do_normalize_eval`.
    Note that `pred.label_ids` is modified in place.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids
    tokenizer = processor.tokenizer

    # -100 marks ignored label positions; swap in the pad token id before decoding
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id

    decode_options = dict(skip_special_tokens=True, normalize=do_normalize_eval)
    hypotheses = tokenizer.batch_decode(predicted_ids, **decode_options)
    references = tokenizer.batch_decode(reference_ids, **decode_options)

    return {"wer": 100 * metric.compute(predictions=hypotheses, references=references)}

In [17]:
# Load pretrained checkpoint

from transformers import WhisperForConditionalGeneration

# Starting weights: "openai/whisper-small", the same checkpoint name the `processor` was loaded from.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/967M [00:00<?, ?B/s]

In [18]:
# Clear the forced decoder ids and the suppressed-token list stored in the model config.
# NOTE(review): presumably so that generation is not constrained during fine-tuning — confirm.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
# NOTE(review): presumably turned off because training uses gradient_checkpointing=True — confirm.
model.config.use_cache = False

In [19]:
import os

# Create the output folder used by the training arguments below.
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# as soon as the folder already exists.
os.makedirs('./whisper-small-sw', exist_ok=True)

In [28]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-sw",  # your repo name
    # --- optimisation ---
    learning_rate=1e-5,
    warmup_steps=400,
    max_steps=3000,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    gradient_checkpointing=True,
    fp16=True,
    # --- evaluation and generation ---
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # --- checkpoints and model selection (lower "wer" is better) ---
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
    # --- logging ---
    logging_steps=25,
    report_to=["wandb"],
)

PyTorch: setting up devices


In [29]:
from transformers import TrainerCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from torch.utils.data import IterableDataset

# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    """Advance the epoch of a streamed training dataset at the start of every epoch."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        dataset = train_dataloader.dataset
        if isinstance(dataset, IterableDatasetShard):
            # set_epoch() is handled by the Trainer
            return
        if isinstance(dataset, IterableDataset):
            # bump the dataset's own epoch counter by one
            dataset.set_epoch(dataset._epoch + 1)

In [30]:
from transformers import Seq2SeqTrainer

# Wire model, data streams, collator, metric function and shuffle callback together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=vectorized_datasets,
    eval_dataset=vectorized_test_datasets,
    data_collator=data_collator,
    tokenizer=processor,
    compute_metrics=compute_metrics,
    callbacks=[ShuffleCallback()],
)

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [31]:
# Write the model and the processor files into the training output directory
# before training is started.
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

Configuration saved in ./whisper-small-sw/config.json
Model weights saved in ./whisper-small-sw/pytorch_model.bin
Feature extractor saved in ./whisper-small-sw/preprocessor_config.json
tokenizer config file saved in ./whisper-small-sw/tokenizer_config.json
Special tokens file saved in ./whisper-small-sw/special_tokens_map.json
added tokens file saved in ./whisper-small-sw/added_tokens.json


In [32]:
import wandb

In [34]:
# W&B argument tracking.
# The tracked values are read from `training_args` rather than typed out a second
# time, so the logged configuration cannot drift from what the Trainer really uses.
TRACKED_ARGUMENTS = (
    "output_dir",
    "per_device_train_batch_size",
    "gradient_accumulation_steps",
    "learning_rate",
    "warmup_steps",
    "max_steps",
    "gradient_checkpointing",
    "fp16",
    "evaluation_strategy",
    "per_device_eval_batch_size",
    "predict_with_generate",
    "generation_max_length",
    "save_steps",
    "eval_steps",
    "logging_steps",
    "report_to",
    "load_best_model_at_end",
    "metric_for_best_model",
    "greater_is_better",
    "push_to_hub",
)
# to_dict() returns plain, serialisable values (enum members are replaced by their values)
all_training_args = training_args.to_dict()
config = {name: all_training_args[name] for name in TRACKED_ARGUMENTS}

# start a new W&B training run
wandb.init(
    project="whisper-small-sw",
    entity="mldude",
    # the model tag matches the project name (was "whisper-small-es")
    tags=["whisper-event", "whisper-small-sw", "Swahili"],
    config=config,
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668430100003205, max=1.0…

In [35]:
%%wandb

# Run fine-tuning; `max_steps` (3000 in the training arguments) bounds the run.
trainer.train()

***** Running training *****
  Num examples = 192000
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 2
  Total optimization steps = 3000
  Number of trainable parameters = 241734912
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Reading metadata...: 26614it [00:00, 47934.99it/s]
Reading metadata...: 10233it [00:00, 29387.17it/s]
The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
1000,0.099,0.412005,26.043352
2000,0.0391,0.366616,22.314021
3000,0.0167,0.355822,21.934203


Reading metadata...: 10233it [00:01, 5234.85it/s]
***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 8
Reading metadata...: 10238it [00:00, 29139.64it/s]
The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.
Saving model checkpoint to ./whisper-small-sw/checkpoint-1000
Configuration saved in ./whisper-small-sw/checkpoint-1000/config.json
Model weights saved in ./whisper-small-sw/checkpoint-1000/pytorch_model.bin
Feature extractor saved in ./whisper-small-sw/checkpoint-1000/preprocessor_config.json
tokenizer config file saved in ./whisper-small-sw/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./whisper-small-sw/checkpoint-1000/special_tokens_map.json
added tokens file saved in ./whisper-small-sw/checkpoint-1000/added_tokens.jso

TrainOutput(global_step=3000, training_loss=0.18269595941901207, metrics={'train_runtime': 49063.7464, 'train_samples_per_second': 3.913, 'train_steps_per_second': 0.061, 'total_flos': 5.540579959283712e+19, 'train_loss': 0.18269595941901207, 'epoch': 1.31})

In [1]:
## push weights into HuggingFace repository
from pathlib import Path
from types import SimpleNamespace

from huggingface_hub import HfApi
from transformers import WhisperForConditionalGeneration

# Upload settings, gathered in one namespace so the cell runs in a fresh kernel.
CN = SimpleNamespace()
CN.MODEL_NAME = "whisper-small-sw" # name of the fine-tuned model, also the repository name
# NOTE(review): machine-specific absolute path — point this at the folder that holds checkpoint-3000.
CN.MODEL_PATH = Path("C:/Users/Hedronstone/Desktop/whisper_event") / "pretrained_weights" / "whisper-small-sw" / "checkpoint-3000" # path to weights folder
CN.REPO_TYPE = "model" # repository type, `model` or `space`
CN.REPO_ID = "hedronstone/" + CN.MODEL_NAME # repository id

# load saved weights into model; this fails early if the checkpoint folder is unusable
model = WhisperForConditionalGeneration.from_pretrained(CN.MODEL_PATH)

# prepare upload pipeline
api = HfApi()

# launch upload task: the folder goes to "weights/pytorch" in the repo, as a pull request
api.upload_folder(
    folder_path=CN.MODEL_PATH,
    path_in_repo="weights/pytorch",
    repo_id=CN.REPO_ID,
    repo_type=CN.REPO_TYPE,
    create_pr=True,
)