Commit
update dataset_infos.json
lhoestq committed Mar 24, 2022
1 parent ed39389 commit 27fa5ef
Showing 6 changed files with 6 additions and 6 deletions.
2 changes: 1 addition & 1 deletion datasets/ami/dataset_infos.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion datasets/arabic_speech_corpus/dataset_infos.json
@@ -1 +1 @@
{"clean": {"description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@phdthesis{halabi2016modern,\n title={Modern standard Arabic phonetics for speech synthesis},\n author={Halabi, Nawar},\n year={2016},\n school={University of Southampton}\n}\n", "homepage": "http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "phonetic": {"dtype": "string", "id": null, "_type": "Value"}, "orthographic": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "arabic_speech_corpus", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1002365, "num_examples": 1813, "dataset_name": "arabic_speech_corpus"}, "test": {"name": "test", "num_bytes": 65784, "num_examples": 100, "dataset_name": "arabic_speech_corpus"}}, "download_checksums": {"http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip": {"num_bytes": 1192302846, "checksum": "1df85219370fb1ebe8bfc46aa886265586411d04e7c1caa5a5b9847b3ad5f9de"}}, "download_size": 1192302846, "post_processing_size": null, "dataset_size": 1068149, "size_in_bytes": 1193370995}}
{"clean": {"description": "This Speech corpus has been developed as part of PhD work carried out by Nawar Halabi at the University of Southampton.\nThe corpus was recorded in south Levantine Arabic\n(Damascian accent) using a professional studio. Synthesized speech as an output using this corpus has produced a high quality, natural voice.\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@phdthesis{halabi2016modern,\n title={Modern standard Arabic phonetics for speech synthesis},\n author={Halabi, Nawar},\n year={2016},\n school={University of Southampton}\n}\n", "homepage": "http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 48000, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "phonetic": {"dtype": "string", "id": null, "_type": "Value"}, "orthographic": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "arabic_speech_corpus", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1002365, "num_examples": 1813, "dataset_name": "arabic_speech_corpus"}, "test": {"name": "test", "num_bytes": 65784, "num_examples": 100, "dataset_name": "arabic_speech_corpus"}}, "download_checksums": {"http://en.arabicspeechcorpus.com/arabic-speech-corpus.zip": {"num_bytes": 1192302846, "checksum": "1df85219370fb1ebe8bfc46aa886265586411d04e7c1caa5a5b9847b3ad5f9de"}}, "download_size": 1192302846, "post_processing_size": null, "dataset_size": 1068149, "size_in_bytes": 1193370995}}
2 changes: 1 addition & 1 deletion datasets/lj_speech/dataset_infos.json
@@ -1 +1 @@
{"main": {"description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading \npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length \nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@misc{ljspeech17,\n author = {Keith Ito and Linda Johnson},\n title = {The LJ Speech Dataset},\n howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},\n year = 2017\n}\n", "homepage": "https://keithito.com/LJ-Speech-Dataset/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "normalized_text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "lj_speech", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4667022, "num_examples": 13100, "dataset_name": "lj_speech"}}, "download_checksums": {"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2": {"num_bytes": 2748572632, "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5"}}, "download_size": 2748572632, "post_processing_size": null, "dataset_size": 4667022, "size_in_bytes": 2753239654}}
{"main": {"description": "This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading \npassages from 7 non-fiction books in English. A transcription is provided for each clip. Clips vary in length \nfrom 1 to 10 seconds and have a total length of approximately 24 hours.\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .wav format and is not converted to a float32 array. To convert the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@misc{ljspeech17,\n author = {Keith Ito and Linda Johnson},\n title = {The LJ Speech Dataset},\n howpublished = {\\url{https://keithito.com/LJ-Speech-Dataset/}},\n year = 2017\n}\n", "homepage": "https://keithito.com/LJ-Speech-Dataset/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 22050, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "normalized_text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "lj_speech", "config_name": "main", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4667022, "num_examples": 13100, "dataset_name": "lj_speech"}}, "download_checksums": {"https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2": {"num_bytes": 2748572632, "checksum": "be1a30453f28eb8dd26af4101ae40cbf2c50413b1bb21936cbcdc6fae3de8aa5"}}, "download_size": 2748572632, "post_processing_size": null, "dataset_size": 4667022, "size_in_bytes": 2753239654}}
2 changes: 1 addition & 1 deletion datasets/superb/dataset_infos.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion datasets/superb/superb.py
@@ -147,7 +147,7 @@ class Superb(datasets.GeneratorBasedBuilder):
            supervised_keys=("file", "text"),
            url="http://www.openslr.org/12",
            data_url="http://www.openslr.org/resources/12/",
            task_templates=[AutomaticSpeechRecognition(audio="audio_column", transcription_column="text")],
            task_templates=[AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")],
        ),
        SuperbConfig(
            name="ks",
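The one-line fix above swaps the keyword `audio` (which was being passed the literal string "audio_column") for the template's actual parameter name `audio_column`. A minimal sketch of the corrected call, using only the names visible in the diff:

```python
from datasets.tasks import AutomaticSpeechRecognition

# Corrected keyword arguments, matching the added line in superb.py.
template = AutomaticSpeechRecognition(audio_column="audio", transcription_column="text")
print(template.task)  # "automatic-speech-recognition"
```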
2 changes: 1 addition & 1 deletion datasets/timit_asr/dataset_infos.json
@@ -1 +1 @@
{"clean": {"description": "The TIMIT corpus of reading speech has been developed to provide speech data for acoustic-phonetic research studies\nand for the evaluation of automatic speech recognition systems.\n\nTIMIT contains high quality recordings of 630 individuals/speakers with 8 different American English dialects,\nwith each individual reading upto 10 phonetically rich sentences.\n\nMore info on TIMIT dataset can be understood from the \"README\" which can be found here:\nhttps://catalog.ldc.upenn.edu/docs/LDC93S1/readme.txt\n", "citation": "@inproceedings{\n title={TIMIT Acoustic-Phonetic Continuous Speech Corpus},\n author={Garofolo, John S., et al},\n ldc_catalog_no={LDC93S1},\n DOI={https://doi.org/10.35111/17gk-bn40},\n journal={Linguistic Data Consortium, Philadelphia},\n year={1983}\n}\n", "homepage": "https://catalog.ldc.upenn.edu/LDC93S1", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "phonetic_detail": {"feature": {"start": {"dtype": "int64", "id": null, "_type": "Value"}, "stop": {"dtype": "int64", "id": null, "_type": "Value"}, "utterance": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "word_detail": {"feature": {"start": {"dtype": "int64", "id": null, "_type": "Value"}, "stop": {"dtype": "int64", "id": null, "_type": "Value"}, "utterance": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "dialect_region": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_type": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "timit_asr", "config_name": "clean", "version": {"version_str": "2.0.1", "description": "", "major": 2, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6076580, "num_examples": 4620, "dataset_name": "timit_asr"}, "test": {"name": "test", "num_bytes": 2202968, "num_examples": 1680, "dataset_name": "timit_asr"}}, "download_checksums": {"https://data.deepai.org/timit.zip": {"num_bytes": 869007403, "checksum": "b79af42068b53045510d86854e2239a13ff77c4bd27803b40c28dce4bb5aeb0d"}}, "download_size": 869007403, "post_processing_size": null, "dataset_size": 8279548, "size_in_bytes": 877286951}}
{"clean": {"description": "The TIMIT corpus of reading speech has been developed to provide speech data for acoustic-phonetic research studies\nand for the evaluation of automatic speech recognition systems.\n\nTIMIT contains high quality recordings of 630 individuals/speakers with 8 different American English dialects,\nwith each individual reading upto 10 phonetically rich sentences.\n\nMore info on TIMIT dataset can be understood from the \"README\" which can be found here:\nhttps://catalog.ldc.upenn.edu/docs/LDC93S1/readme.txt\n", "citation": "@inproceedings{\n title={TIMIT Acoustic-Phonetic Continuous Speech Corpus},\n author={Garofolo, John S., et al},\n ldc_catalog_no={LDC93S1},\n DOI={https://doi.org/10.35111/17gk-bn40},\n journal={Linguistic Data Consortium, Philadelphia},\n year={1983}\n}\n", "homepage": "https://catalog.ldc.upenn.edu/LDC93S1", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "phonetic_detail": {"feature": {"start": {"dtype": "int64", "id": null, "_type": "Value"}, "stop": {"dtype": "int64", "id": null, "_type": "Value"}, "utterance": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "word_detail": {"feature": {"start": {"dtype": "int64", "id": null, "_type": "Value"}, "stop": {"dtype": "int64", "id": null, "_type": "Value"}, "utterance": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "dialect_region": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_type": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "timit_asr", "config_name": "clean", "version": {"version_str": "2.0.1", "description": "", "major": 2, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6076580, "num_examples": 4620, "dataset_name": "timit_asr"}, "test": {"name": "test", "num_bytes": 2202968, "num_examples": 1680, "dataset_name": "timit_asr"}}, "download_checksums": {"https://data.deepai.org/timit.zip": {"num_bytes": 869007403, "checksum": "b79af42068b53045510d86854e2239a13ff77c4bd27803b40c28dce4bb5aeb0d"}}, "download_size": 869007403, "post_processing_size": null, "dataset_size": 8279548, "size_in_bytes": 877286951}}

1 comment on commit 27fa5ef

@github-actions


PyArrow==5.0.0


Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.011717 / 0.011353 (0.000364)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.004560 / 0.011008 (-0.006448)
read_batch_formatted_as_numpy after write_nested_sequence | 0.036596 / 0.038508 (-0.001912)
read_batch_unformated after write_array2d | 0.042104 / 0.023109 (0.018995)
read_batch_unformated after write_flattened_sequence | 0.360865 / 0.275898 (0.084967)
read_batch_unformated after write_nested_sequence | 0.371586 / 0.323480 (0.048106)
read_col_formatted_as_numpy after write_array2d | 0.009277 / 0.007986 (0.001292)
read_col_formatted_as_numpy after write_flattened_sequence | 0.003946 / 0.004328 (-0.000383)
read_col_formatted_as_numpy after write_nested_sequence | 0.010254 / 0.004250 (0.006004)
read_col_unformated after write_array2d | 0.047133 / 0.037052 (0.010081)
read_col_unformated after write_flattened_sequence | 0.335247 / 0.258489 (0.076758)
read_col_unformated after write_nested_sequence | 0.369114 / 0.293841 (0.075273)
read_formatted_as_numpy after write_array2d | 0.037401 / 0.128546 (-0.091146)
read_formatted_as_numpy after write_flattened_sequence | 0.011427 / 0.075646 (-0.064219)
read_formatted_as_numpy after write_nested_sequence | 0.301917 / 0.419271 (-0.117354)
read_unformated after write_array2d | 0.058767 / 0.043533 (0.015234)
read_unformated after write_flattened_sequence | 0.328711 / 0.255139 (0.073572)
read_unformated after write_nested_sequence | 0.366183 / 0.283200 (0.082984)
write_array2d | 0.120090 / 0.141683 (-0.021592)
write_flattened_sequence | 2.059229 / 1.452155 (0.607074)
write_nested_sequence | 2.108362 / 1.492716 (0.615646)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.346413 / 0.018006 (0.328407)
get_batch_of_1024_rows | 0.474576 / 0.000490 (0.474087)
get_first_row | 0.055094 / 0.000200 (0.054894)
get_last_row | 0.000688 / 0.000054 (0.000634)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.032690 / 0.037411 (-0.004722)
shard | 0.130407 / 0.014526 (0.115882)
shuffle | 0.133167 / 0.176557 (-0.043390)
sort | 0.177712 / 0.737135 (-0.559423)
train_test_split | 0.136912 / 0.296338 (-0.159427)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.485860 / 0.215209 (0.270651)
read 50000 | 4.897182 / 2.077655 (2.819528)
read_batch 50000 10 | 2.100872 / 1.504120 (0.596752)
read_batch 50000 100 | 1.859856 / 1.541195 (0.318662)
read_batch 50000 1000 | 1.897317 / 1.468490 (0.428827)
read_formatted numpy 5000 | 0.520519 / 4.584777 (-4.064258)
read_formatted pandas 5000 | 5.574757 / 3.745712 (1.829045)
read_formatted tensorflow 5000 | 2.547897 / 5.269862 (-2.721964)
read_formatted torch 5000 | 1.078347 / 4.565676 (-3.487330)
read_formatted_batch numpy 5000 10 | 0.063155 / 0.424275 (-0.361120)
read_formatted_batch numpy 5000 1000 | 0.014835 / 0.007607 (0.007228)
shuffled read 5000 | 0.601770 / 0.226044 (0.375725)
shuffled read 50000 | 6.055309 / 2.268929 (3.786380)
shuffled read_batch 50000 10 | 2.630866 / 55.444624 (-52.813759)
shuffled read_batch 50000 100 | 2.165453 / 6.876477 (-4.711024)
shuffled read_batch 50000 1000 | 2.282045 / 2.142072 (0.139973)
shuffled read_formatted numpy 5000 | 0.645869 / 4.805227 (-4.159358)
shuffled read_formatted_batch numpy 5000 10 | 0.144669 / 6.500664 (-6.355995)
shuffled read_formatted_batch numpy 5000 1000 | 0.070674 / 0.075469 (-0.004795)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 1.921933 / 1.841788 (0.080145)
map fast-tokenizer batched | 16.440658 / 8.074308 (8.366350)
map identity | 31.227352 / 10.191392 (21.035960)
map identity batched | 1.009870 / 0.680424 (0.329446)
map no-op batched | 0.623530 / 0.534201 (0.089329)
map no-op batched numpy | 0.575407 / 0.579283 (-0.003876)
map no-op batched pandas | 0.628287 / 0.434364 (0.193923)
map no-op batched pytorch | 0.380192 / 0.540337 (-0.160145)
map no-op batched tensorflow | 0.408861 / 1.386936 (-0.978075)
PyArrow==latest

Benchmark: benchmark_array_xd.json

metric | new / old (diff)
read_batch_formatted_as_numpy after write_array2d | 0.009570 / 0.011353 (-0.001783)
read_batch_formatted_as_numpy after write_flattened_sequence | 0.004189 / 0.011008 (-0.006819)
read_batch_formatted_as_numpy after write_nested_sequence | 0.035357 / 0.038508 (-0.003151)
read_batch_unformated after write_array2d | 0.038548 / 0.023109 (0.015439)
read_batch_unformated after write_flattened_sequence | 0.389192 / 0.275898 (0.113294)
read_batch_unformated after write_nested_sequence | 0.396917 / 0.323480 (0.073437)
read_col_formatted_as_numpy after write_array2d | 0.006975 / 0.007986 (-0.001011)
read_col_formatted_as_numpy after write_flattened_sequence | 0.005486 / 0.004328 (0.001157)
read_col_formatted_as_numpy after write_nested_sequence | 0.008652 / 0.004250 (0.004402)
read_col_unformated after write_array2d | 0.043614 / 0.037052 (0.006562)
read_col_unformated after write_flattened_sequence | 0.362723 / 0.258489 (0.104234)
read_col_unformated after write_nested_sequence | 0.399530 / 0.293841 (0.105689)
read_formatted_as_numpy after write_array2d | 0.035210 / 0.128546 (-0.093337)
read_formatted_as_numpy after write_flattened_sequence | 0.011193 / 0.075646 (-0.064454)
read_formatted_as_numpy after write_nested_sequence | 0.310754 / 0.419271 (-0.108518)
read_unformated after write_array2d | 0.056989 / 0.043533 (0.013456)
read_unformated after write_flattened_sequence | 0.397758 / 0.255139 (0.142619)
read_unformated after write_nested_sequence | 0.407610 / 0.283200 (0.124410)
write_array2d | 0.101421 / 0.141683 (-0.040262)
write_flattened_sequence | 2.026184 / 1.452155 (0.574029)
write_nested_sequence | 2.125905 / 1.492716 (0.633188)

Benchmark: benchmark_getitem_100B.json

metric | new / old (diff)
get_batch_of_1024_random_rows | 0.310601 / 0.018006 (0.292595)
get_batch_of_1024_rows | 0.477751 / 0.000490 (0.477261)
get_first_row | 0.020529 / 0.000200 (0.020329)
get_last_row | 0.000517 / 0.000054 (0.000463)

Benchmark: benchmark_indices_mapping.json

metric | new / old (diff)
select | 0.029053 / 0.037411 (-0.008359)
shard | 0.122348 / 0.014526 (0.107822)
shuffle | 0.131390 / 0.176557 (-0.045166)
sort | 0.188579 / 0.737135 (-0.548556)
train_test_split | 0.134976 / 0.296338 (-0.161362)

Benchmark: benchmark_iterating.json

metric | new / old (diff)
read 5000 | 0.492892 / 0.215209 (0.277683)
read 50000 | 4.946025 / 2.077655 (2.868371)
read_batch 50000 10 | 2.147864 / 1.504120 (0.643744)
read_batch 50000 100 | 1.908723 / 1.541195 (0.367529)
read_batch 50000 1000 | 1.974644 / 1.468490 (0.506154)
read_formatted numpy 5000 | 0.508285 / 4.584777 (-4.076491)
read_formatted pandas 5000 | 5.557338 / 3.745712 (1.811625)
read_formatted tensorflow 5000 | 2.452268 / 5.269862 (-2.817594)
read_formatted torch 5000 | 1.091136 / 4.565676 (-3.474541)
read_formatted_batch numpy 5000 10 | 0.062607 / 0.424275 (-0.361668)
read_formatted_batch numpy 5000 1000 | 0.014578 / 0.007607 (0.006971)
shuffled read 5000 | 0.614085 / 0.226044 (0.388040)
shuffled read 50000 | 6.087146 / 2.268929 (3.818218)
shuffled read_batch 50000 10 | 2.635586 / 55.444624 (-52.809038)
shuffled read_batch 50000 100 | 2.212791 / 6.876477 (-4.663685)
shuffled read_batch 50000 1000 | 2.299611 / 2.142072 (0.157538)
shuffled read_formatted numpy 5000 | 0.643121 / 4.805227 (-4.162106)
shuffled read_formatted_batch numpy 5000 10 | 0.141621 / 6.500664 (-6.359043)
shuffled read_formatted_batch numpy 5000 1000 | 0.067978 / 0.075469 (-0.007491)

Benchmark: benchmark_map_filter.json

metric | new / old (diff)
filter | 1.896066 / 1.841788 (0.054278)
map fast-tokenizer batched | 15.880933 / 8.074308 (7.806625)
map identity | 31.056633 / 10.191392 (20.865241)
map identity batched | 1.005944 / 0.680424 (0.325520)
map no-op batched | 0.612275 / 0.534201 (0.078074)
map no-op batched numpy | 0.583915 / 0.579283 (0.004631)
map no-op batched pandas | 0.628135 / 0.434364 (0.193771)
map no-op batched pytorch | 0.382537 / 0.540337 (-0.157801)
map no-op batched tensorflow | 0.392092 / 1.386936 (-0.994844)
