In [1]:
from datasets import load_dataset, DatasetDict
 
# Build the Cantonese (zh-HK) splits of Mozilla Common Voice 11.
# The hub's "train" and "validation" splits are merged into one training set;
# the hub's "test" split is kept as-is for evaluation.
common_voice = DatasetDict(
    {
        "train": load_dataset(
            "mozilla-foundation/common_voice_11_0",
            "zh-HK",
            split="train+validation",
            use_auth_token=True,
        ),
        "test": load_dataset(
            "mozilla-foundation/common_voice_11_0",
            "zh-HK",
            split="test",
            use_auth_token=True,
        ),
    }
)

# Drop the metadata columns; the printed summary should list only
# "audio" and "sentence" as features.
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)
print(common_voice)

Using the latest cached version of the module from C:\Users\ken20\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_11_0\3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631 (last modified on Sat Nov 11 21:28:46 2023) since it couldn't be found locally at mozilla-foundation/common_voice_11_0., or remotely on the Hugging Face Hub.
  table = cls._concat_blocks(blocks, axis=0)
Using the latest cached version of the module from C:\Users\ken20\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_11_0\3f27acf10f303eac5b6fbbbe02495aeddb46ecffdb0a2fe3507fcfbf89094631 (last modified on Sat Nov 11 21:28:46 2023) since it couldn't be found locally at mozilla-foundation/common_voice_11_0., or remotely on the Hugging Face Hub.


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 14014
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 5591
    })
})


In [2]:
# Whisper pre-/post-processing objects for the "openai/whisper-small" checkpoint
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# converts raw audio arrays into the model's input features
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

# converts text to label ids and back, configured for Chinese transcription
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="chinese",
    task="transcribe",
)

In [3]:
# Sanity check: round-trip one training transcript through the tokenizer
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids

# decode the ids twice: first keeping the special tokens, then stripping them
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=strip) for strip in (False, True)
)

# a single print call that emits the same four lines of output
print(
    f"Input: {input_str}",
    f"Decoded w/ special: {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal: {input_str == decoded_str}",
    sep="\n",
)

Input: 才能勇往直前
Decoded w/ special: <|startoftranscript|><|zh|><|transcribe|><|notimestamps|>才能勇往直前<|endoftext|>
Decoded w/out special: 才能勇往直前
Are equal: True


In [4]:
# Cast the "audio" column to a 16 kHz Audio feature so that examples are
# delivered at the sampling rate passed on to the feature extractor.
from datasets import Audio

common_voice = common_voice.cast_column(
    "audio",
    Audio(sampling_rate=16_000),
)

# Print the first training example in full (path, waveform array,
# sampling_rate and sentence) to confirm the cast took effect.
print(common_voice["train"][0])


{'audio': {'path': 'C:\\Users\\ken20\\.cache\\huggingface\\datasets\\downloads\\extracted\\8df09a2f9ee0d7c086ec3aa9b6d8313e73dd3cd0cd3b31b2fe7fb9b62cc045ea\\zh-HK_train_0/common_voice_zh-HK_22942304.mp3', 'array': array([ 5.45696821e-12,  2.72848411e-12,  3.63797881e-12, ...,
        1.48210138e-05,  9.73203896e-07, -4.09249424e-06]), 'sampling_rate': 16000}, 'sentence': '才能勇往直前'}


In [5]:
# preprocess data set

def preprocess_data(batch):
    """Turn one Common Voice example into Whisper training inputs.

    Reads ``batch["audio"]`` (a dict providing ``"array"`` and
    ``"sampling_rate"``) and ``batch["sentence"]``, then adds two keys:

    * ``"input_features"`` - first element of the feature extractor's
      ``input_features`` for the audio array.
    * ``"labels"`` - the tokenizer's ``input_ids`` for the sentence.

    Uses the notebook-level ``feature_extractor`` and ``tokenizer`` objects.
    Returns the same ``batch`` mapping, updated in place.

    Note on parallel runs: to use this with more than one worker process, the
    feature extractor and tokenizer could instead be re-imported and re-created
    inside this function, but that is slow because they are then loaded on
    every call.
    """
    # the audio entry carries both the waveform and its sampling rate
    sample = batch["audio"]

    # log-Mel input features computed from the waveform
    extracted = feature_extractor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # target transcript encoded as label ids
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [7]:
# Apply preprocess_data to every example of both splits in a single process.
# The columns present before mapping are removed, leaving only the keys that
# preprocess_data adds ("input_features" and "labels").
common_voice = common_voice.map(
    preprocess_data,
    remove_columns=common_voice.column_names["train"],
    num_proc=1,
)

Map:   0%|          | 0/14014 [00:00<?, ? examples/s]

Map:   0%|          | 0/5591 [00:00<?, ? examples/s]

In [8]:
# Write the preprocessed splits to a local directory (relative to the
# notebook's working directory) so they can be reused later.
output_dir = "common_voice_zh-HK"
common_voice.save_to_disk(output_dir)

Saving the dataset (0/27 shards):   0%|          | 0/14014 [00:00<?, ? examples/s]

Saving the dataset (0/11 shards):   0%|          | 0/5591 [00:00<?, ? examples/s]