In [None]:
# Ensure datasets is installed from main. Uncomment the following line if you face issues running this script:
# !pip install git+https://github.com/huggingface/datasets

In [None]:
from typing import List, Optional, Tuple

from datasets import Audio, interleave_datasets, IterableDataset, load_dataset, SplitDict

In [None]:
def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):
    """Stream one split, or several `+`-joined splits, of a dataset as a single dataset.

    Args:
        dataset_name: Dataset identifier forwarded to `load_dataset`.
        dataset_config_name: Configuration name forwarded to `load_dataset`.
        split: A split name, or several names joined by `+` (e.g. "train+validation").
        **kwargs: Extra keyword arguments forwarded unchanged to `load_dataset`.

    Returns:
        The streamed dataset returned by `load_dataset` when `split` names a single split,
        otherwise the result of `interleave_datasets` over the individually streamed splits.
    """
    if "+" not in split:
        # a single split can be streamed directly
        return load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)

    # stream every `+`-separated split on its own ...
    streamed_parts = []
    for part_name in split.split("+"):
        streamed_parts.append(
            load_dataset(dataset_name, dataset_config_name, split=part_name, streaming=True, **kwargs)
        )
    # ... then interleave the parts to form one dataset
    return interleave_datasets(streamed_parts)

### Define the dataset attributes

In this example, we'll show how to combine the Common Voice 11 and FLEURS datasets for Greek (el); the parameters for a Spanish setup that also includes VoxPopuli and Multilingual LibriSpeech are commented out. The resulting training corpus will be equal to the sum of the individual datasets and the test dataset will be equal to the Common Voice 11 test split only. This is particularly beneficial in low-resource settings, where any one of the datasets alone might have insufficient data to train a model.

The parameters required are shown below in lists of the same length, with one element per dataset:

-   `dataset_names` contains the Hugging Face Hub name of the datasets
-   `dataset_config_names` contains the respective language codes.
-   `text_column_names` contains the name used for the text feature (column) in each respective dataset
-   `train_splits` and `test_splits` contain the split names used for each respective dataset. If multiple splits need to be interleaved, their names are concatenated into one string separated by the `+` sign, e.g. to merge both test and validation the value should be `"test+validation"`. The split name `"-"` (dash) can be used to suppress a dataset. This is useful, for example, when you want the training data to come from two datasets but the test data to come only from Common Voice 11.

In [None]:
#dataset_names = ["mozilla-foundation/common_voice_11_0", "facebook/voxpopuli", "facebook/multilingual_librispeech", "google/fleurs"]
#dataset_config_names = ["es", "es", "spanish", "es_419"]
#text_column_names = ["sentence", "normalized_text", "text", "transcription"]

In [None]:
# One entry per dataset: the i-th element of every list below describes the same dataset.
dataset_names = [
    "mozilla-foundation/common_voice_11_0",
    "google/fleurs",
]
dataset_config_names = [
    "el",
    "el_gr",
]
text_column_names = [
    "sentence",
    "transcription",
]
# Split names joined by "+" are loaded one by one and interleaved.
train_splits = [
    "train+validation",
    "train+validation",
]
# "-" means that no test data is taken from that dataset.
test_splits = [
    "test",
    "-",
]

### Define the merging function

We define a function, `load_multiple_streaming_datasets`, that takes as arguments lists of dataset names, configs, train/test splits (optional) and text column names (optional). It resamples the audio of every dataset to a specified sampling rate and interleaves the datasets, giving one merged training dataset and one merged test dataset. This is all
done in _streaming mode_: as we iterate over a merged dataset we load samples one-by-one on the fly. No data is
saved to disk.

We can also specify our strategy for interleaving datasets. The default strategy, `all_exhausted`, is an oversampling
strategy. In this case, the dataset construction is stopped as soon as every sample in every dataset
has been added at least once. In practice, it means that if a dataset is exhausted, it will return to the
beginning of this dataset until the stop criterion has been reached. You can specify `stopping_strategy="first_exhausted"`
for a subsampling strategy, i.e. the dataset construction is stopped as soon as one of the datasets runs out of samples.

In [None]:
def _load_standardised_splits(
    dataset_name, dataset_config_name, split_spec, text_column_name, sampling_rate, **kwargs
):
    """Stream every split named in `split_spec` and reduce each one to `audio` and `text` columns."""
    columns_to_keep = ("audio", "text")
    prepared_splits = []
    for split_name in split_spec.split("+"):
        if split_name == "-":
            # "-" marks a dataset that contributes nothing to this side (train or test)
            continue
        ds = load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs)
        # resample the audio of every dataset to one common sampling rate
        ds = ds.cast_column("audio", Audio(sampling_rate))
        # a no-op rename is skipped: `datasets` refuses to rename onto a column name that already exists
        if text_column_name != "text":
            ds = ds.rename_column(text_column_name, "text")
        ds = ds.remove_columns([name for name in ds.features.keys() if name not in columns_to_keep])
        prepared_splits.append(ds)
    return prepared_splits


def load_multiple_streaming_datasets(
    dataset_names: List,
    dataset_config_names: List,
    train_splits: Optional[List] = None,
    test_splits: Optional[List] = None,
    text_column_names: Optional[List] = None,
    sampling_rate: Optional[int] = 16000,
    stopping_strategy: Optional[str] = "all_exhausted",
    **kwargs
) -> Tuple[IterableDataset, IterableDataset]:
    """Stream several datasets and interleave them into one train and one test dataset.

    Every selected split is loaded in streaming mode, its `audio` column is cast to
    `Audio(sampling_rate)`, its text column is renamed to `text`, and all other columns
    are removed before interleaving.

    Args:
        dataset_names: One dataset identifier per dataset.
        dataset_config_names: One configuration name per dataset.
        train_splits: One split specification per dataset; several split names can be joined
            with `+`, and `"-"` skips the dataset. Defaults to `"train"` for every dataset.
        test_splits: Same format as `train_splits`. Defaults to `"test"` for every dataset.
        text_column_names: Name of the transcription column of each dataset. Defaults to
            `"text"` for every dataset.
        sampling_rate: Sampling rate handed to `Audio` when casting the `audio` column.
        stopping_strategy: Forwarded to `interleave_datasets`.
        **kwargs: Extra keyword arguments forwarded to every `load_dataset` call.

    Returns:
        A tuple `(interleaved_train_dataset, interleaved_test_dataset)`.

    Raises:
        ValueError: If the argument lists do not all have one element per dataset, or if
            no train split or no test split is selected at all.
    """
    if len(dataset_names) != len(dataset_config_names):
        raise ValueError(
            f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
            f" {len(dataset_config_names)} configs."
        )

    if train_splits is not None and len(train_splits) != len(dataset_names):
        raise ValueError(
            f"Ensure one train_split is passed for each dataset, got {len(dataset_names)} datasets and {len(train_splits)} splits."
        )

    if test_splits is not None and len(test_splits) != len(dataset_names):
        raise ValueError(
            f"Ensure one test_split is passed for each dataset, got {len(dataset_names)} datasets and {len(test_splits)} splits."
        )

    if text_column_names is not None and len(text_column_names) != len(dataset_names):
        raise ValueError(
            f"Ensure one text column name is passed for each dataset, got {len(dataset_names)} datasets and"
            f" {len(text_column_names)} text column names."
        )

    # fill in the per-dataset defaults for every list that was not supplied
    num_datasets = len(dataset_names)
    if train_splits is None:
        train_splits = ["train"] * num_datasets
    if test_splits is None:
        test_splits = ["test"] * num_datasets
    if text_column_names is None:
        text_column_names = ["text"] * num_datasets

    all_train_splits = []
    all_test_splits = []
    # iterate over the datasets we want to interleave
    for dataset_name, config_name, train_spec, test_spec, text_column in zip(
        dataset_names, dataset_config_names, train_splits, test_splits, text_column_names
    ):
        all_train_splits += _load_standardised_splits(
            dataset_name, config_name, train_spec, text_column, sampling_rate, **kwargs
        )
        all_test_splits += _load_standardised_splits(
            dataset_name, config_name, test_spec, text_column, sampling_rate, **kwargs
        )

    # fail with an actionable message instead of handing an empty list to `interleave_datasets`
    if not all_train_splits:
        raise ValueError("No train split was selected: every entry of `train_splits` is '-'.")
    if not all_test_splits:
        raise ValueError("No test split was selected: every entry of `test_splits` is '-'.")

    interleaved_train_dataset = interleave_datasets(all_train_splits, stopping_strategy=stopping_strategy)
    interleaved_test_dataset = interleave_datasets(all_test_splits, stopping_strategy=stopping_strategy)

    return interleaved_train_dataset, interleaved_test_dataset

Let's apply this function to load and merge our two datasets:

In [None]:
# `use_auth_token` is not a named parameter of the loader: it travels through **kwargs to `load_dataset`
train_ds, test_ds = load_multiple_streaming_datasets(
    dataset_names=dataset_names,
    dataset_config_names=dataset_config_names,
    train_splits=train_splits,
    test_splits=test_splits,
    text_column_names=text_column_names,
    use_auth_token=True,
)

In [None]:
# Report the size recorded for each merged dataset.
# NOTE(review): `dataset_size` may be None for streamed datasets - confirm before relying on it.
print('train', train_ds.dataset_size)
print('test', test_ds.dataset_size)


### Iterate over the dataset

We iterate over the dataset, loading and merging samples on the fly. Let's print the transcriptions for the first 21 samples (indices 0 to 20) of our merged test dataset:

In [None]:
# Pair the stream with a bounded counter: iteration ends after index 20 without pulling a further sample
for position, example in zip(range(21), test_ds):
    print(position, example["text"])

We can see that the transcriptions take several different formats. Those from Common Voice 11 are cased and punctuated. Those from VoxPopuli are punctuated only. Those from Multilingual LibriSpeech and FLEURS are neither cased nor punctuated. We need to normalise the transcriptions to a uniform format before training our model.

The following code cell is lifted from the Whisper training notebook: https://github.com/huggingface/community-events/blob/main/whisper-fine-tuning-event/fine-tune-whisper-streaming.ipynb

In [None]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer

# Switches read by `normalize_transcriptions`
do_lower_case = True  # lower-case every transcription
do_remove_punctuation = True  # pass every transcription through `normalizer`

# single normaliser instance shared by all calls to `normalize_transcriptions`
normalizer = BasicTextNormalizer()

Now we define a function to normalise our transcriptions:

In [None]:
def normalize_transcriptions(batch, text_column_name=None):
    """Apply the optional lower-casing and normalisation steps to one example's transcription.

    Args:
        batch: Mapping that holds the transcription of one example; it is modified in place.
        text_column_name: Key of the transcription inside `batch`. When omitted, `"sentence"`
            is used if the example has that key, otherwise `"text"` (the column name the
            merged datasets use).

    Returns:
        The same `batch`, with the transcription replaced by its normalised form.
    """
    if text_column_name is None:
        # keep the historic "sentence" behaviour, but also accept examples that only carry "text"
        text_column_name = "sentence" if "sentence" in batch else "text"

    # optional pre-processing steps, controlled by the module-level switches
    transcription = batch[text_column_name]
    if do_lower_case:
        transcription = transcription.lower()
    if do_remove_punctuation:
        transcription = normalizer(transcription).strip()
    batch[text_column_name] = transcription
    return batch

Let's apply the data pre-processing steps to our dataset and view the first 10 samples again:

In [None]:
# Build `ds` from the merged training dataset (`ds` was never defined before this cell).
# The merged datasets store transcriptions under "text"; they are exposed as "sentence" here,
# the column that `normalize_transcriptions` reads and that the loop below prints.
# The same two calls can be applied to `test_ds`.
ds = train_ds.rename_column("text", "sentence").map(normalize_transcriptions)

# show the first 10 normalised transcriptions
for i, sample in zip(range(10), ds):
    print(i, sample["sentence"])

This time the transcriptions are in a consistent format. We can use this data to fine-tune our Whisper model. Note that since we've removed punctuation and casing, the Whisper model won't learn to predict these features.