huggingface · lhoestq · Mar 24, 2022 · Mar 24, 2022 · Mar 24, 2022 · Mar 24, 2022
diff --git a/datasets/ami/dataset_infos.json b/datasets/ami/dataset_infos.json
diff --git a/datasets/arabic_speech_corpus/arabic_speech_corpus.py b/datasets/arabic_speech_corpus/arabic_speech_corpus.py
@@ -93,7 +93,7 @@ def _info(self):
             supervised_keys=("file", "text"),
             homepage=_URL,
             citation=_CITATION,
-            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
+            task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")],
         )
 
     def _split_generators(self, dl_manager):

diff --git a/datasets/arabic_speech_corpus/dataset_infos.json b/datasets/arabic_speech_corpus/dataset_infos.json
@@ -1 +1 @@
-{"clean": {"description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@phdthesis{halabi2016modern,\n  title={Modern standard Arabic phonetics for speech synthesis},\n  author={Halabi, Nawar},\n  year={2016},\n  school={University of Southampton}\n}\n", "homepage": "http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "phonetic": {"dtype": "string", "id": null, "_type": "Value"}, "orthographic": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "arabic_speech_corpus", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1002365, "num_examples": 1813, "dataset_name": "arabic_speech_corpus"}, "test": {"name": "test", "num_bytes": 65784, "num_examples": 100, "dataset_name": "arabic_speech_corpus"}}, "download_checksums": {"http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip": {"num_bytes": 1192302846, "checksum": "1df85219370fb1ebe8bfc46aa886265586411d04e7c1caa5a5b9847b3ad5f9de"}}, "download_size": 1192302846, "post_processing_size": null, "dataset_size": 1068149, "size_in_bytes": 1193370995}}
+{"clean": {"description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@phdthesis{halabi2016modern,\n  title={Modern standard Arabic phonetics for speech synthesis},\n  author={Halabi, Nawar},\n  year={2016},\n  school={University of Southampton}\n}\n", "homepage": "http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "phonetic": {"dtype": "string", "id": null, "_type": "Value"}, "orthographic": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "arabic_speech_corpus", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1002365, "num_examples": 1813, "dataset_name": "arabic_speech_corpus"}, "test": {"name": "test", "num_bytes": 65784, "num_examples": 100, "dataset_name": "arabic_speech_corpus"}}, "download_checksums": {"http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip": {"num_bytes": 1192302846, "checksum": "1df85219370fb1ebe8bfc46aa886265586411d04e7c1caa5a5b9847b3ad5f9de"}}, "download_size": 1192302846, "post_processing_size": null, "dataset_size": 1068149, "size_in_bytes": 1193370995}}
diff --git a/datasets/common_voice/common_voice.py b/datasets/common_voice/common_voice.py
@@ -652,9 +652,7 @@ def _info(self):
             homepage=_HOMEPAGE,
             license=_LICENSE,
             citation=_CITATION,
-            task_templates=[
-                AutomaticSpeechRecognition(audio_file_path_column="path", transcription_column="sentence")
-            ],
+            task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="sentence")],
         )
 
     def _split_generators(self, dl_manager):

diff --git a/datasets/librispeech_asr/dataset_infos.json b/datasets/librispeech_asr/dataset_infos.json
@@ -1 +1 @@
-{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_file_path_column": "file", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}}
+{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n    speech_array, _ = sf.read(batch[\"file\"])\n    batch[\"speech\"] = speech_array\n    return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n  title={Librispeech: an ASR corpus based on public domain audio books},\n  author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n  booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n  pages={5206--5210},\n  year={2015},\n  organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}}
diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py
@@ -112,7 +112,7 @@ def _info(self):
             supervised_keys=("file", "text"),
             homepage=_URL,
             citation=_CITATION,
-            task_templates=[AutomaticSpeechRecognition(audio_file_path_column="file", transcription_column="text")],
+            task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")],
         )
 
     def _split_generators(self, dl_manager):