From fadc0a0a1cd9f65b57e87237a5ad13afc4ebd698 Mon Sep 17 00:00:00 2001
From: theo
Date: Mon, 22 Mar 2021 17:15:28 +0100
Subject: [PATCH 01/28] basic validation

---
 setup.py                                   |  23 +-
 src/datasets/utils/metadata_validator.py   | 103 +++++
 src/datasets/utils/resources/creators.json |  17 +
 src/datasets/utils/resources/licenses.json | 452 +++++++++++++++++++++
 src/datasets/utils/resources/tasks.json    |  86 ++++
 5 files changed, 672 insertions(+), 9 deletions(-)
 create mode 100644 src/datasets/utils/metadata_validator.py
 create mode 100644 src/datasets/utils/resources/creators.json
 create mode 100644 src/datasets/utils/resources/licenses.json
 create mode 100644 src/datasets/utils/resources/tasks.json

diff --git a/setup.py b/setup.py
index 78f75f07abc..6bc7007c22f 100644
--- a/setup.py
+++ b/setup.py
@@ -56,8 +56,8 @@
 import os
 import sys

-from setuptools import find_packages
-from setuptools import setup
+from setuptools import find_packages, setup
+

 DOCLINES = __doc__.split("\n")

@@ -140,6 +140,9 @@
     "texttable>=1.6.3",
     "s3fs>=0.4.2",
     "Werkzeug>=1.0.1",
+    # metadata validation
+    "langcodes[data]>=3.1.0",
+    "pydantic>=1.8.1",
 ]

 if os.name == "nt":  # windows
@@ -147,13 +150,15 @@
 else:
     # dependencies of unbabel-comet
     # only test if not on windows since there're issues installing fairseq on windows
-    TESTS_REQUIRE.extend([
-        "wget>=3.2",
-        "pytorch-nlp==0.5.0",
-        "pytorch_lightning",
-        "fastBPE==0.1.0",
-        "fairseq",
-    ])
+    TESTS_REQUIRE.extend(
+        [
+            "wget>=3.2",
+            "pytorch-nlp==0.5.0",
+            "pytorch_lightning",
+            "fastBPE==0.1.0",
+            "fairseq",
+        ]
+    )


 QUALITY_REQUIRE = [
diff --git a/src/datasets/utils/metadata_validator.py b/src/datasets/utils/metadata_validator.py
new file mode 100644
index 00000000000..80de1f55ed4
--- /dev/null
+++ b/src/datasets/utils/metadata_validator.py
@@ -0,0 +1,103 @@
+import json
+from pathlib import Path
+from typing import Dict, List
+
+import langcodes as lc
+import yaml
+from pydantic import BaseModel, validator
+
+
+def load_json_resource(resource: str) -> Dict:
+    utils_dir = Path(__file__).parent
+    with open(utils_dir / "resources" / resource) as fi:
+        return json.load(fi)
+
+
+known_licenses: Dict[str, str] = load_json_resource("licenses.json")
+known_task_ids: Dict[str, Dict] = load_json_resource("tasks.json")
+creator_set: Dict[str, List[str]] = load_json_resource("creators.json")
+known_size_categories = ["unknown", "n<1K", "1K<n<10K", "10K<n<100K", "100K<n<1M", "n>1M"]
+multilinguality_set = {
+    "monolingual": "contains a single language",
+    "multilingual": "contains multiple languages",
+    "translation": "contains translated or aligned text",
+    "other": "other type of language distribution",
+}
+
+
+def tagset_validator(values: List[str], reference_values: List[str], name: str) -> List[str]:
+    for v in values:
+        if v not in reference_values:
+            raise ValueError(f"'{v}' is not a registered tag for {name}.")
+    return values
+
+
+class DatasetMetadata(BaseModel):
+    annotations_creators: List[str]
+    language_creators: List[str]
+    languages: List[str]
+    licenses: List[str]
+    multilinguality: List[str]
+    size_categories: List[str]
+    source_datasets: List[str]
+    task_categories: List[str]
+    task_ids: List[str]
+
+    @classmethod
+    def from_readme(cls, f: Path) -> "DatasetMetadata":
+        with f.open() as fi:
+            content = [line.strip() for line in fi]
+
+        if content[0] == "---" and "---" in content[1:]:
+            yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
+            metada_dict = yaml.safe_load(yamlblock) or dict()
+            return cls(**metada_dict)
+        else:
+            raise ValueError(f"did not find a yaml block in '{f}'")
+
+    
@classmethod + def from_yaml_string(cls, string: str) -> "DatasetMetadata": + metada_dict = yaml.safe_load(string) or dict() + return cls(**metada_dict) + + @validator("annotations_creators") + def annotations_creators_must_be_in_known_set(cls, annotations_creators: List[str]) -> List[str]: + return tagset_validator(annotations_creators, creator_set["annotations"], "annotations") + + @validator("language_creators") + def language_creators_must_be_in_known_set(cls, language_creators: List[str]) -> List[str]: + return tagset_validator(language_creators, creator_set["language"], "annotations") + + @validator("languages") + def language_code_must_be_recognized(cls, languages: List[str]): + for code in languages: + try: + lc.get(code) + except lc.tag_parser.LanguageTagError: + raise ValueError(f"'{code}' is not recognised as a valid language code") + return languages + + @validator("licenses") + def licenses_must_be_in_known_set(cls, licenses: List[str]): + return tagset_validator(licenses, list(known_licenses.keys()), "licenses") + + @validator("task_categories") + def task_category_must_be_in_known_set(cls, task_categories: List[str]): + return tagset_validator(task_categories, list(known_task_ids.keys()), "taks_ids") + + @validator("task_ids") + def task_id_must_be_in_known_set(cls, task_ids: List[str]): + return tagset_validator( + task_ids, [tid for _cat, d in known_task_ids.items() for tid in d["options"]], "taks_ids" + ) + + +if __name__ == "__main__": + from argparse import ArgumentParser + + ap = ArgumentParser(usage="Validate the yaml metadata block of a README.md file.") + ap.add_argument("readme_filepath") + args = ap.parse_args() + + readme_filepath = Path(args.readme_filepath) + DatasetMetadata.from_readme(readme_filepath) diff --git a/src/datasets/utils/resources/creators.json b/src/datasets/utils/resources/creators.json new file mode 100644 index 00000000000..d9e15f0039c --- /dev/null +++ b/src/datasets/utils/resources/creators.json @@ -0,0 +1,17 @@ +{ + "language": [ + "found", + "crowdsourced", + "expert-generated", + "machine-generated", + "other" + ], + "annotations": [ + "found", + "crowdsourced", + "expert-generated", + "machine-generated", + "no-annotation", + "other" + ] +} diff --git a/src/datasets/utils/resources/licenses.json b/src/datasets/utils/resources/licenses.json new file mode 100644 index 00000000000..31e76c43d48 --- /dev/null +++ b/src/datasets/utils/resources/licenses.json @@ -0,0 +1,452 @@ +{ + "other": "Other license", + "unknown": "License information unavailable", + "0bsd": "BSD Zero Clause License", + "aal": "Attribution Assurance License", + "abstyles": "Abstyles License", + "adobe-2006": "Adobe Systems Incorporated Source Code License Agreement", + "adobe-glyph": "Adobe Glyph List License", + "adsl": "Amazon Digital Services License", + "afl-1.1": "Academic Free License v1.1", + "afl-1.2": "Academic Free License v1.2", + "afl-2.0": "Academic Free License v2.0", + "afl-2.1": "Academic Free License v2.1", + "afl-3.0": "Academic Free License v3.0", + "afmparse": "Afmparse License", + "agpl-1.0": "Affero General Public License v1.0", + "agpl-1.0-only": "Affero General Public License v1.0 only", + "agpl-1.0-or-later": "Affero General Public License v1.0 or later", + "agpl-3.0": "GNU Affero General Public License v3.0", + "agpl-3.0-only": "GNU Affero General Public License v3.0 only", + "agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later", + "aladdin": "Aladdin Free Public License", + "amdplpa": "AMD's plpa_map.c License", + "aml": "Apple 
MIT License", + "ampas": "Academy of Motion Picture Arts and Sciences BSD", + "antlr-pd": "ANTLR Software Rights Notice", + "antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback", + "apache-1.0": "Apache License 1.0", + "apache-1.1": "Apache License 1.1", + "apache-2.0": "Apache License 2.0", + "apafml": "Adobe Postscript AFM License", + "apl-1.0": "Adaptive Public License 1.0", + "apsl-1.0": "Apple Public Source License 1.0", + "apsl-1.1": "Apple Public Source License 1.1", + "apsl-1.2": "Apple Public Source License 1.2", + "apsl-2.0": "Apple Public Source License 2.0", + "artistic-1.0": "Artistic License 1.0", + "artistic-1.0-cl8": "Artistic License 1.0 w/clause 8", + "artistic-1.0-perl": "Artistic License 1.0 (Perl)", + "artistic-2.0": "Artistic License 2.0", + "bahyph": "Bahyph License", + "barr": "Barr License", + "beerware": "Beerware License", + "bittorrent-1.0": "BitTorrent Open Source License v1.0", + "bittorrent-1.1": "BitTorrent Open Source License v1.1", + "blessing": "SQLite Blessing", + "blueoak-1.0.0": "Blue Oak Model License 1.0.0", + "borceux": "Borceux license", + "bsd-1-clause": "BSD 1-Clause License", + "bsd-2-clause": "BSD 2-Clause \"Simplified\" License", + "bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License", + "bsd-2-clause-netbsd": "BSD 2-Clause NetBSD License", + "bsd-2-clause-patent": "BSD-2-Clause Plus Patent License", + "bsd-2-clause-views": "BSD 2-Clause with views sentence", + "bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License", + "bsd-3-clause-attribution": "BSD with attribution", + "bsd-3-clause-clear": "BSD 3-Clause Clear License", + "bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license", + "bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License", + "bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014", + "bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty", + "bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant", + "bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License", + "bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)", + "bsd-protection": "BSD Protection License", + "bsd-source-code": "BSD Source Code Attribution", + "bsl-1.0": "Boost Software License 1.0", + "busl-1.1": "Business Source License 1.1", + "bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5", + "bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6", + "cal-1.0": "Cryptographic Autonomy License 1.0", + "cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)", + "caldera": "Caldera License", + "catosl-1.1": "Computer Associates Trusted Open Source License 1.1", + "cc-by-1.0": "Creative Commons Attribution 1.0 Generic", + "cc-by-2.0": "Creative Commons Attribution 2.0 Generic", + "cc-by-2.5": "Creative Commons Attribution 2.5 Generic", + "cc-by-3.0": "Creative Commons Attribution 3.0 Unported", + "cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria", + "cc-by-3.0-us": "Creative Commons Attribution 3.0 United States", + "cc-by-4.0": "Creative Commons Attribution 4.0 International", + "cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic", + "cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic", + "cc-by-nc-2.5": "Creative Commons Attribution Non Commercial 2.5 Generic", + "cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported", + "cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International", + "cc-by-nc-nd-1.0": "Creative Commons Attribution 
Non Commercial No Derivatives 1.0 Generic", + "cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic", + "cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic", + "cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported", + "cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO", + "cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International", + "cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic", + "cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic", + "cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic", + "cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported", + "cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International", + "cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic", + "cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic", + "cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic", + "cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported", + "cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International", + "cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic", + "cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic", + "cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales", + "cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic", + "cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 Unported", + "cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria", + "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International", + "cc-pddc": "Creative Commons Public Domain Dedication and Certification", + "cc0-1.0": "Creative Commons Zero v1.0 Universal", + "cddl-1.0": "Common Development and Distribution License 1.0", + "cddl-1.1": "Common Development and Distribution License 1.1", + "cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0", + "cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0", + "cecill-1.0": "CeCILL Free Software License Agreement v1.0", + "cecill-1.1": "CeCILL Free Software License Agreement v1.1", + "cecill-2.0": "CeCILL Free Software License Agreement v2.0", + "cecill-2.1": "CeCILL Free Software License Agreement v2.1", + "cecill-b": "CeCILL-B Free Software License Agreement", + "cecill-c": "CeCILL-C Free Software License Agreement", + "cern-ohl-1.1": "CERN Open Hardware Licence v1.1", + "cern-ohl-1.2": "CERN Open Hardware Licence v1.2", + "cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive", + "cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal", + "cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal", + "clartistic": "Clarified Artistic License", + "cnri-jython": "CNRI Jython License", + "cnri-python": "CNRI Python License", + "cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement", + "condor-1.1": "Condor Public License v1.1", + "copyleft-next-0.3.0": "copyleft-next 0.3.0", + "copyleft-next-0.3.1": "copyleft-next 0.3.1", + "cpal-1.0": "Common Public Attribution License 1.0", + "cpl-1.0": "Common Public License 1.0", + "cpol-1.02": "Code Project Open License 1.02", + 
"crossword": "Crossword License", + "crystalstacker": "CrystalStacker License", + "cua-opl-1.0": "CUA Office Public License v1.0", + "cube": "Cube License", + "curl": "curl License", + "d-fsl-1.0": "Deutsche Freie Software Lizenz", + "diffmark": "diffmark license", + "doc": "DOC License", + "dotseqn": "Dotseqn License", + "dsdp": "DSDP License", + "dvipdfm": "dvipdfm License", + "ecl-1.0": "Educational Community License v1.0", + "ecl-2.0": "Educational Community License v2.0", + "ecos-2.0": "eCos license version 2.0", + "efl-1.0": "Eiffel Forum License v1.0", + "efl-2.0": "Eiffel Forum License v2.0", + "egenix": "eGenix.com Public License 1.1.0", + "entessa": "Entessa Public License v1.0", + "epics": "EPICS Open License", + "epl-1.0": "Eclipse Public License 1.0", + "epl-2.0": "Eclipse Public License 2.0", + "erlpl-1.1": "Erlang Public License v1.1", + "etalab-2.0": "Etalab Open License 2.0", + "eudatagrid": "EU DataGrid Software License", + "eupl-1.0": "European Union Public License 1.0", + "eupl-1.1": "European Union Public License 1.1", + "eupl-1.2": "European Union Public License 1.2", + "eurosym": "Eurosym License", + "fair": "Fair License", + "frameworx-1.0": "Frameworx Open License 1.0", + "freeimage": "FreeImage Public License v1.0", + "fsfap": "FSF All Permissive License", + "fsful": "FSF Unlimited License", + "fsfullr": "FSF Unlimited License (with License Retention)", + "ftl": "Freetype Project License", + "gfdl-1.1": "GNU Free Documentation License v1.1", + "gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants", + "gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants", + "gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants", + "gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants", + "gfdl-1.1-only": "GNU Free Documentation License v1.1 only", + "gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later", + "gfdl-1.2": "GNU Free Documentation License v1.2", + "gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants", + "gfdl-1.2-invariants-or-later": "GNU Free Documentation License v1.2 or later - invariants", + "gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants", + "gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants", + "gfdl-1.2-only": "GNU Free Documentation License v1.2 only", + "gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later", + "gfdl-1.3": "GNU Free Documentation License v1.3", + "gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants", + "gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants", + "gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants", + "gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants", + "gfdl-1.3-only": "GNU Free Documentation License v1.3 only", + "gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later", + "giftware": "Giftware License", + "gl2ps": "GL2PS License", + "glide": "3dfx Glide License", + "glulxe": "Glulxe License", + "glwtpl": "Good Luck With That Public License", + "gnuplot": "gnuplot License", + "gpl-1.0": "GNU General Public License v1.0 only", + "gpl-1.0+": "GNU General Public License v1.0 or later", + "gpl-1.0-only": "GNU General Public License v1.0 only", + "gpl-1.0-or-later": "GNU General Public License 
v1.0 or later", + "gpl-2.0": "GNU General Public License v2.0 only", + "gpl-2.0+": "GNU General Public License v2.0 or later", + "gpl-2.0-only": "GNU General Public License v2.0 only", + "gpl-2.0-or-later": "GNU General Public License v2.0 or later", + "gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception", + "gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception", + "gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception", + "gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception", + "gpl-2.0-with-gcc-exception": "GNU General Public License v2.0 w/GCC Runtime Library exception", + "gpl-3.0": "GNU General Public License v3.0 only", + "gpl-3.0+": "GNU General Public License v3.0 or later", + "gpl-3.0-only": "GNU General Public License v3.0 only", + "gpl-3.0-or-later": "GNU General Public License v3.0 or later", + "gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception", + "gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception", + "gsoap-1.3b": "gSOAP Public License v1.3b", + "haskellreport": "Haskell Language Report License", + "hippocratic-2.1": "Hippocratic License 2.1", + "hpnd": "Historical Permission Notice and Disclaimer", + "hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant", + "htmltidy": "HTML Tidy License", + "ibm-pibs": "IBM PowerPC Initialization and Boot Software", + "icu": "ICU License", + "ijg": "Independent JPEG Group License", + "imagemagick": "ImageMagick License", + "imatix": "iMatix Standard Function Library Agreement", + "imlib2": "Imlib2 License", + "info-zip": "Info-ZIP License", + "intel": "Intel Open Source License", + "intel-acpi": "Intel ACPI Software License Agreement", + "interbase-1.0": "Interbase Public License v1.0", + "ipa": "IPA Font License", + "ipl-1.0": "IBM Public License v1.0", + "isc": "ISC License", + "jasper-2.0": "JasPer License", + "jpnic": "Japan Network Information Center License", + "json": "JSON License", + "lal-1.2": "Licence Art Libre 1.2", + "lal-1.3": "Licence Art Libre 1.3", + "latex2e": "Latex2e License", + "leptonica": "Leptonica License", + "lgpl-2.0": "GNU Library General Public License v2 only", + "lgpl-2.0+": "GNU Library General Public License v2 or later", + "lgpl-2.0-only": "GNU Library General Public License v2 only", + "lgpl-2.0-or-later": "GNU Library General Public License v2 or later", + "lgpl-2.1": "GNU Lesser General Public License v2.1 only", + "lgpl-2.1+": "GNU Library General Public License v2.1 or later", + "lgpl-2.1-only": "GNU Lesser General Public License v2.1 only", + "lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later", + "lgpl-3.0": "GNU Lesser General Public License v3.0 only", + "lgpl-3.0+": "GNU Lesser General Public License v3.0 or later", + "lgpl-3.0-only": "GNU Lesser General Public License v3.0 only", + "lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later", + "lgpllr": "Lesser General Public License For Linguistic Resources", + "libpng": "libpng License", + "libpng-2.0": "PNG Reference Library version 2", + "libselinux-1.0": "libselinux public domain notice", + "libtiff": "libtiff License", + "liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1", + "liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1", + "liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 
1.1", + "linux-openib": "Linux Kernel Variant of OpenIB.org license", + "lpl-1.0": "Lucent Public License Version 1.0", + "lpl-1.02": "Lucent Public License v1.02", + "lppl-1.0": "LaTeX Project Public License v1.0", + "lppl-1.1": "LaTeX Project Public License v1.1", + "lppl-1.2": "LaTeX Project Public License v1.2", + "lppl-1.3a": "LaTeX Project Public License v1.3a", + "lppl-1.3c": "LaTeX Project Public License v1.3c", + "makeindex": "MakeIndex License", + "miros": "The MirOS Licence", + "mit": "MIT License", + "mit-0": "MIT No Attribution", + "mit-advertising": "Enlightenment License (e16)", + "mit-cmu": "CMU License", + "mit-enna": "enna License", + "mit-feh": "feh License", + "mit-open-group": "MIT Open Group variant", + "mitnfa": "MIT +no-false-attribs license", + "motosoto": "Motosoto License", + "mpich2": "mpich2 License", + "mpl-1.0": "Mozilla Public License 1.0", + "mpl-1.1": "Mozilla Public License 1.1", + "mpl-2.0": "Mozilla Public License 2.0", + "mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)", + "ms-pl": "Microsoft Public License", + "ms-rl": "Microsoft Reciprocal License", + "mtll": "Matrix Template Library License", + "mulanpsl-1.0": "Mulan Permissive Software License, Version 1", + "mulanpsl-2.0": "Mulan Permissive Software License, Version 2", + "multics": "Multics License", + "mup": "Mup License", + "nasa-1.3": "NASA Open Source Agreement 1.3", + "naumen": "Naumen Public License", + "nbpl-1.0": "Net Boolean Public License v1", + "ncgl-uk-2.0": "Non-Commercial Government Licence", + "ncsa": "University of Illinois/NCSA Open Source License", + "net-snmp": "Net-SNMP License", + "netcdf": "NetCDF license", + "newsletr": "Newsletr License", + "ngpl": "Nethack General Public License", + "nist-pd": "NIST Public Domain Notice", + "nist-pd-fallback": "NIST Public Domain Notice with license fallback", + "nlod-1.0": "Norwegian Licence for Open Government Data", + "nlpl": "No Limit Public License", + "nokia": "Nokia Open Source License", + "nosl": "Netizen Open Source License", + "noweb": "Noweb License", + "npl-1.0": "Netscape Public License v1.0", + "npl-1.1": "Netscape Public License v1.1", + "nposl-3.0": "Non-Profit Open Software License 3.0", + "nrl": "NRL License", + "ntp": "NTP License", + "ntp-0": "NTP No Attribution", + "nunit": "Nunit License", + "o-uda-1.0": "Open Use of Data Agreement v1.0", + "occt-pl": "Open CASCADE Technology Public License", + "oclc-2.0": "OCLC Research Public License 2.0", + "odbl-1.0": "ODC Open Database License v1.0", + "odc-by-1.0": "Open Data Commons Attribution License v1.0", + "ofl-1.0": "SIL Open Font License 1.0", + "ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name", + "ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name", + "ofl-1.1": "SIL Open Font License 1.1", + "ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name", + "ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name", + "ogc-1.0": "OGC Software License, Version 1.0", + "ogl-canada-2.0": "Open Government Licence - Canada", + "ogl-uk-1.0": "Open Government Licence v1.0", + "ogl-uk-2.0": "Open Government Licence v2.0", + "ogl-uk-3.0": "Open Government Licence v3.0", + "ogtsl": "Open Group Test Suite License", + "oldap-1.1": "Open LDAP Public License v1.1", + "oldap-1.2": "Open LDAP Public License v1.2", + "oldap-1.3": "Open LDAP Public License v1.3", + "oldap-1.4": "Open LDAP Public License v1.4", + "oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)", + 
"oldap-2.0.1": "Open LDAP Public License v2.0.1", + "oldap-2.1": "Open LDAP Public License v2.1", + "oldap-2.2": "Open LDAP Public License v2.2", + "oldap-2.2.1": "Open LDAP Public License v2.2.1", + "oldap-2.2.2": "Open LDAP Public License 2.2.2", + "oldap-2.3": "Open LDAP Public License v2.3", + "oldap-2.4": "Open LDAP Public License v2.4", + "oldap-2.5": "Open LDAP Public License v2.5", + "oldap-2.6": "Open LDAP Public License v2.6", + "oldap-2.7": "Open LDAP Public License v2.7", + "oldap-2.8": "Open LDAP Public License v2.8", + "oml": "Open Market License", + "openssl": "OpenSSL License", + "opl-1.0": "Open Public License v1.0", + "oset-pl-2.1": "OSET Public License version 2.1", + "osl-1.0": "Open Software License 1.0", + "osl-1.1": "Open Software License 1.1", + "osl-2.0": "Open Software License 2.0", + "osl-2.1": "Open Software License 2.1", + "osl-3.0": "Open Software License 3.0", + "parity-6.0.0": "The Parity Public License 6.0.0", + "parity-7.0.0": "The Parity Public License 7.0.0", + "pddl-1.0": "ODC Public Domain Dedication & License 1.0", + "php-3.0": "PHP License v3.0", + "php-3.01": "PHP License v3.01", + "plexus": "Plexus Classworlds License", + "polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0", + "polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0", + "postgresql": "PostgreSQL License", + "psf-2.0": "Python Software Foundation License 2.0", + "psfrag": "psfrag License", + "psutils": "psutils License", + "python-2.0": "Python License 2.0", + "qhull": "Qhull License", + "qpl-1.0": "Q Public License 1.0", + "rdisc": "Rdisc License", + "rhecos-1.1": "Red Hat eCos Public License v1.1", + "rpl-1.1": "Reciprocal Public License 1.1", + "rpl-1.5": "Reciprocal Public License 1.5", + "rpsl-1.0": "RealNetworks Public Source License v1.0", + "rsa-md": "RSA Message-Digest License", + "rscpl": "Ricoh Source Code Public License", + "ruby": "Ruby License", + "sax-pd": "Sax Public Domain Notice", + "saxpath": "Saxpath License", + "scea": "SCEA Shared Source License", + "sendmail": "Sendmail License", + "sendmail-8.23": "Sendmail License 8.23", + "sgi-b-1.0": "SGI Free Software License B v1.0", + "sgi-b-1.1": "SGI Free Software License B v1.1", + "sgi-b-2.0": "SGI Free Software License B v2.0", + "shl-0.5": "Solderpad Hardware License v0.5", + "shl-0.51": "Solderpad Hardware License, Version 0.51", + "simpl-2.0": "Simple Public License 2.0", + "sissl": "Sun Industry Standards Source License v1.1", + "sissl-1.2": "Sun Industry Standards Source License v1.2", + "sleepycat": "Sleepycat License", + "smlnj": "Standard ML of New Jersey License", + "smppl": "Secure Messaging Protocol Public License", + "snia": "SNIA Public License 1.1", + "spencer-86": "Spencer License 86", + "spencer-94": "Spencer License 94", + "spencer-99": "Spencer License 99", + "spl-1.0": "Sun Public License v1.0", + "ssh-openssh": "SSH OpenSSH license", + "ssh-short": "SSH short notice", + "sspl-1.0": "Server Side Public License, v 1", + "standardml-nj": "Standard ML of New Jersey License", + "sugarcrm-1.1.3": "SugarCRM Public License v1.1.3", + "swl": "Scheme Widget Library (SWL) Software License Agreement", + "tapr-ohl-1.0": "TAPR Open Hardware License v1.0", + "tcl": "TCL/TK License", + "tcp-wrappers": "TCP Wrappers License", + "tmate": "TMate Open Source License", + "torque-1.1": "TORQUE v2.5+ Software License v1.1", + "tosl": "Trusster Open Source License", + "tu-berlin-1.0": "Technische Universitaet Berlin License 1.0", + "tu-berlin-2.0": "Technische Universitaet Berlin 
License 2.0", + "ucl-1.0": "Upstream Compatibility License v1.0", + "unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)", + "unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)", + "unicode-tou": "Unicode Terms of Use", + "unlicense": "The Unlicense", + "upl-1.0": "Universal Permissive License v1.0", + "vim": "Vim License", + "vostrom": "VOSTROM Public License for Open Source", + "vsl-1.0": "Vovida Software License v1.0", + "w3c": "W3C Software Notice and License (2002-12-31)", + "w3c-19980720": "W3C Software Notice and License (1998-07-20)", + "w3c-20150513": "W3C Software Notice and Document License (2015-05-13)", + "watcom-1.0": "Sybase Open Watcom Public License 1.0", + "wsuipa": "Wsuipa License", + "wtfpl": "Do What The F*ck You Want To Public License", + "wxwindows": "wxWindows Library License", + "x11": "X11 License", + "xerox": "Xerox License", + "xfree86-1.1": "XFree86 License 1.1", + "xinetd": "xinetd License", + "xnet": "X.Net License", + "xpp": "XPP License", + "xskat": "XSkat License", + "ypl-1.0": "Yahoo! Public License v1.0", + "ypl-1.1": "Yahoo! Public License v1.1", + "zed": "Zed License", + "zend-2.0": "Zend License v2.0", + "zimbra-1.3": "Zimbra Public License v1.3", + "zimbra-1.4": "Zimbra Public License v1.4", + "zlib": "zlib License", + "zlib-acknowledgement": "zlib/libpng License with Acknowledgement", + "zpl-1.1": "Zope Public License 1.1", + "zpl-2.0": "Zope Public License 2.0", + "zpl-2.1": "Zope Public License 2.1" +} \ No newline at end of file diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json new file mode 100644 index 00000000000..966ba106c9d --- /dev/null +++ b/src/datasets/utils/resources/tasks.json @@ -0,0 +1,86 @@ +{ + "conditional-text-generation": { + "description": "data-to-text and text transduction tasks such as translation or summarization", + "options": [ + "machine-translation", + "sentence-splitting-fusion", + "summarization", + "table-to-text", + "text-simplification", + "explanation-generation", + "other-stuctured-to-text", + "other" + ] + }, + "question-answering": { + "description": "question answering tasks", + "options": [ + "open-domain-qa", + "closed-domain-qa", + "multiple-choice-qa", + "extractive-qa", + "abstractive-qa", + "other" + ] + }, + "sequence-modeling": { + "description": "such as language modeling or dialogue", + "options": [ + "dialogue-modeling", + "language-modeling", + "other-multi-turn", + "slot-filling", + "other" + ] + }, + "structure-prediction": { + "description": "predicting structural properties of the text, such as syntax", + "options": [ + "coreference-resolution", + "named-entity-recognition", + "part-of-speech-tagging", + "parsing", + "other" + ] + }, + "text-classification": { + "description": "predicting a class index or boolean value", + "options": [ + "acceptability-classification", + "entity-linking-classification", + "fact-checking", + "intent-classification", + "multi-class-classification", + "multi-label-classification", + "natural-language-inference", + "semantic-similarity-classification", + "sentiment-classification", + "topic-classification", + "other" + ] + }, + "text-retrieval": { + "description": "information or text retrieval tasks", + "options": [ + "document-retrieval", + "utterance-retrieval", + "entity-linking-retrieval", + "fact-checking-retrieval", + "other" + ] + }, + "text-scoring": { + "description": "text scoring tasks, predicting a real valued score for some text", + "options": [ + 
"semantic-similarity-scoring", + "sentiment-scoring", + "other" + ] + }, + "other": { + "description": "other task family not mentioned here", + "options": [ + "other" + ] + } +} From 7a4b594a549bd5dbdc4bf800e6a2b20562559f01 Mon Sep 17 00:00:00 2001 From: theo Date: Tue, 23 Mar 2021 14:50:02 +0100 Subject: [PATCH 02/28] ci script and test change --- datasets/spanish_billion_words/README.md | 2 +- scripts/ci-metadata-validator.py | 53 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100755 scripts/ci-metadata-validator.py diff --git a/datasets/spanish_billion_words/README.md b/datasets/spanish_billion_words/README.md index 113e86665bb..6b87bab370b 100644 --- a/datasets/spanish_billion_words/README.md +++ b/datasets/spanish_billion_words/README.md @@ -4,7 +4,7 @@ annotations_creators: language_creators: - expert-generated languages: -- es +- esovefoi licenses: - cc-by-sa-4.0 multilinguality: diff --git a/scripts/ci-metadata-validator.py b/scripts/ci-metadata-validator.py new file mode 100755 index 00000000000..b2346cbee5c --- /dev/null +++ b/scripts/ci-metadata-validator.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python + +""" This script will run in CI and make sure all new changes to datasets readme files have valid metadata yaml headers. + +""" + +from pathlib import Path +from subprocess import check_call, check_output +from typing import List + +from pydantic import ValidationError + +from datasets.utils.metadata_validator import DatasetMetadata + + +def get_changed_files(repo_path: Path) -> List[Path]: + # check_call(["git", "fetch"], cwd=repo_path) + diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path) + changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()] + return changed_files + + +if __name__ == "__main__": + import logging + from argparse import ArgumentParser + + logging.basicConfig(level=logging.DEBUG) + + ap = ArgumentParser() + ap.add_argument("--repo_path", type=Path, default=Path.cwd()) + args = ap.parse_args() + + cwd = args.repo_path + changed_files = get_changed_files(cwd) + changed_readmes = [ + f for f in changed_files if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" + ] + + failed: List[Path] = [] + for readme in changed_readmes: + try: + DatasetMetadata.from_readme(readme) + logging.debug(f"✔️ Validated '{readme.absolute()}'") + except ValidationError as e: + failed.append(readme) + logging.warning(f"❌ Failed to validate '{readme.absolute()}':\n{e}") + + if len(failed) > 0: + logging.info(f"❌ Failed on {len(failed)} files.") + exit(1) + else: + logging.info("All is well, keep up the good work 🤗!") + exit(0) From c3c97ea326e0413bee8f27734f973d4ab85976b0 Mon Sep 17 00:00:00 2001 From: theo Date: Tue, 23 Mar 2021 18:12:11 +0100 Subject: [PATCH 03/28] color is better --- scripts/ci-metadata-validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci-metadata-validator.py b/scripts/ci-metadata-validator.py index b2346cbee5c..629baede65b 100755 --- a/scripts/ci-metadata-validator.py +++ b/scripts/ci-metadata-validator.py @@ -40,7 +40,7 @@ def get_changed_files(repo_path: Path) -> List[Path]: for readme in changed_readmes: try: DatasetMetadata.from_readme(readme) - logging.debug(f"✔️ Validated '{readme.absolute()}'") + logging.debug(f"✅️ Validated '{readme.absolute()}'") except ValidationError as e: failed.append(readme) logging.warning(f"❌ Failed to validate '{readme.absolute()}':\n{e}") From 
2fe5787161d5608349256b57962ddf0f84e6dacd Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 09:16:48 +0100 Subject: [PATCH 04/28] check all option --- datasets/spanish_billion_words/README.md | 2 +- scripts/ci-metadata-validator.py | 25 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/datasets/spanish_billion_words/README.md b/datasets/spanish_billion_words/README.md index 6b87bab370b..113e86665bb 100644 --- a/datasets/spanish_billion_words/README.md +++ b/datasets/spanish_billion_words/README.md @@ -4,7 +4,7 @@ annotations_creators: language_creators: - expert-generated languages: -- esovefoi +- es licenses: - cc-by-sa-4.0 multilinguality: diff --git a/scripts/ci-metadata-validator.py b/scripts/ci-metadata-validator.py index 629baede65b..22771c3e7fd 100755 --- a/scripts/ci-metadata-validator.py +++ b/scripts/ci-metadata-validator.py @@ -28,22 +28,31 @@ def get_changed_files(repo_path: Path) -> List[Path]: ap = ArgumentParser() ap.add_argument("--repo_path", type=Path, default=Path.cwd()) + ap.add_argument("--check_all", action="store_true") args = ap.parse_args() - cwd = args.repo_path - changed_files = get_changed_files(cwd) - changed_readmes = [ - f for f in changed_files if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" - ] + repo_path: Path = args.repo_path + if args.check_all: + readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()] + else: + changed_files = get_changed_files(repo_path) + readmes = [ + f + for f in changed_files + if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" + ] failed: List[Path] = [] - for readme in changed_readmes: + for readme in sorted(readmes): try: DatasetMetadata.from_readme(readme) - logging.debug(f"✅️ Validated '{readme.absolute()}'") + logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") except ValidationError as e: failed.append(readme) - logging.warning(f"❌ Failed to validate '{readme.absolute()}':\n{e}") + logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") + except Exception as e: + failed.append(readme) + logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") if len(failed) > 0: logging.info(f"❌ Failed on {len(failed)} files.") From 0f68ce4635c2ed4afcefdd354a38160336f99f95 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 09:17:15 +0100 Subject: [PATCH 05/28] validate size cats & multiling, point to reference file urls on error --- src/datasets/utils/metadata_validator.py | 61 ++++++++++++++++++------ 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/src/datasets/utils/metadata_validator.py b/src/datasets/utils/metadata_validator.py index 80de1f55ed4..41a11a8c1dd 100644 --- a/src/datasets/utils/metadata_validator.py +++ b/src/datasets/utils/metadata_validator.py @@ -1,23 +1,27 @@ import json from pathlib import Path -from typing import Dict, List +from typing import Dict, List, Tuple import langcodes as lc import yaml from pydantic import BaseModel, validator -def load_json_resource(resource: str) -> Dict: +BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" +this_url = f"{BASE_REF_URL}/{__file__}" + + +def load_json_resource(resource: str) -> Tuple[Dict, str]: utils_dir = Path(__file__).parent with open(utils_dir / "resources" / resource) as fi: - return json.load(fi) + return json.load(fi), f"{BASE_REF_URL}/resources/{resource}" -known_licenses: Dict[str, str] = 
load_json_resource("licenses.json") -known_task_ids: Dict[str, Dict] = load_json_resource("tasks.json") -creator_set: Dict[str, List[str]] = load_json_resource("creators.json") +known_licenses, known_licenses_url = load_json_resource("licenses.json") +known_task_ids, known_task_ids_url = load_json_resource("tasks.json") +known_creators, known_creators_url = load_json_resource("creators.json") known_size_categories = ["unknown", "n<1K", "1K1M"] -multilinguality_set = { +known_multilingualities = { "monolingual": "contains a single language", "multilingual": "contains multiple languages", "translation": "contains translated or aligned text", @@ -25,10 +29,10 @@ def load_json_resource(resource: str) -> Dict: } -def tagset_validator(values: List[str], reference_values: List[str], name: str) -> List[str]: +def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> List[str]: for v in values: if v not in reference_values: - raise ValueError(f"'{v}' is not a registered tag for {name}.") + raise ValueError(f"'{v}' is not a registered tag for '{name}', reference at {url}") return values @@ -60,13 +64,27 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata": metada_dict = yaml.safe_load(string) or dict() return cls(**metada_dict) + @classmethod + def empty(cls) -> "DatasetMetadata": + return cls( + annotations_creators=list(), + language_creators=list(), + languages=list(), + licenses=list(), + multilinguality=list(), + size_categories=list(), + source_datasets=list(), + task_categories=list(), + task_ids=list(), + ) + @validator("annotations_creators") def annotations_creators_must_be_in_known_set(cls, annotations_creators: List[str]) -> List[str]: - return tagset_validator(annotations_creators, creator_set["annotations"], "annotations") + return tagset_validator(annotations_creators, known_creators["annotations"], "annotations", known_creators_url) @validator("language_creators") def language_creators_must_be_in_known_set(cls, language_creators: List[str]) -> List[str]: - return tagset_validator(language_creators, creator_set["language"], "annotations") + return tagset_validator(language_creators, known_creators["language"], "annotations", known_creators_url) @validator("languages") def language_code_must_be_recognized(cls, languages: List[str]): @@ -74,23 +92,36 @@ def language_code_must_be_recognized(cls, languages: List[str]): try: lc.get(code) except lc.tag_parser.LanguageTagError: - raise ValueError(f"'{code}' is not recognised as a valid language code") + raise ValueError( + f"'{code}' is not recognised as a valid language code (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes" + ) return languages @validator("licenses") def licenses_must_be_in_known_set(cls, licenses: List[str]): - return tagset_validator(licenses, list(known_licenses.keys()), "licenses") + return tagset_validator(licenses, list(known_licenses.keys()), "licenses", known_licenses_url) @validator("task_categories") def task_category_must_be_in_known_set(cls, task_categories: List[str]): - return tagset_validator(task_categories, list(known_task_ids.keys()), "taks_ids") + return tagset_validator(task_categories, list(known_task_ids.keys()), "tasks_ids", known_task_ids_url) @validator("task_ids") def task_id_must_be_in_known_set(cls, task_ids: List[str]): return tagset_validator( - task_ids, [tid for _cat, d in known_task_ids.items() for tid in d["options"]], "taks_ids" + task_ids, + [tid for _cat, d in known_task_ids.items() for tid in d["options"]], + 
"tasks_ids", + known_task_ids_url, ) + @validator("multilinguality") + def multilinguality_must_be_in_known_set(cls, multilinguality: List[str]): + return tagset_validator(multilinguality, list(known_multilingualities.keys()), "multilinguality", this_url) + + @validator("size_categories") + def size_categories_must_be_in_known_set(cls, size_cats: List[str]): + return tagset_validator(size_cats, known_size_categories, "size_categories", this_url) + if __name__ == "__main__": from argparse import ArgumentParser From 2d264e8d4b694af802f2373fdc59196bab383aa2 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 09:43:53 +0100 Subject: [PATCH 06/28] add validation to ci and rename files --- .circleci/config.yml | 1 + ...ator.py => datasets_metadata_validator.py} | 5 +- .../{metadata_validator.py => metadata.py} | 51 +++++++++++++------ 3 files changed, 39 insertions(+), 18 deletions(-) rename scripts/{ci-metadata-validator.py => datasets_metadata_validator.py} (92%) rename src/datasets/utils/{metadata_validator.py => metadata.py} (71%) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2aaaa787912..624479f3b09 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -81,6 +81,7 @@ jobs: - run: black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics - run: isort --check-only tests src benchmarks datasets metrics - run: flake8 tests src benchmarks datasets metrics + - run: ./scripts/datasets_metadata_validator.py build_doc: working_directory: ~/datasets diff --git a/scripts/ci-metadata-validator.py b/scripts/datasets_metadata_validator.py similarity index 92% rename from scripts/ci-metadata-validator.py rename to scripts/datasets_metadata_validator.py index 22771c3e7fd..9bb50baa8ca 100755 --- a/scripts/ci-metadata-validator.py +++ b/scripts/datasets_metadata_validator.py @@ -5,16 +5,15 @@ """ from pathlib import Path -from subprocess import check_call, check_output +from subprocess import check_output from typing import List from pydantic import ValidationError -from datasets.utils.metadata_validator import DatasetMetadata +from datasets.utils.metadata import DatasetMetadata def get_changed_files(repo_path: Path) -> List[Path]: - # check_call(["git", "fetch"], cwd=repo_path) diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path) changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()] return changed_files diff --git a/src/datasets/utils/metadata_validator.py b/src/datasets/utils/metadata.py similarity index 71% rename from src/datasets/utils/metadata_validator.py rename to src/datasets/utils/metadata.py index 41a11a8c1dd..cb767c53379 100644 --- a/src/datasets/utils/metadata_validator.py +++ b/src/datasets/utils/metadata.py @@ -1,6 +1,7 @@ import json +import logging from pathlib import Path -from typing import Dict, List, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import langcodes as lc import yaml @@ -9,6 +10,17 @@ BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" this_url = f"{BASE_REF_URL}/{__file__}" +logger = logging.getLogger(__name__) + + +def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]: + with f.open() as fi: + content = [line.strip() for line in fi] + + if content[0] == "---" and "---" in content[1:]: + yamlblock = "\n".join(content[1 : content[1:].index("---") + 1]) + metada_dict = yaml.safe_load(yamlblock) or dict() + return metada_dict def load_json_resource(resource: str) -> 
Tuple[Dict, str]: @@ -36,6 +48,16 @@ def tagset_validator(values: List[str], reference_values: List[str], name: str, return values +def splitter(values: List[Any], predicate_fn: Callable[[Any], bool]) -> Tuple[List[Any], List[Any]]: + trues, falses = list(), list() + for v in values: + if predicate_fn(v): + trues.append(v) + else: + falses.append(v) + return trues, falses + + class DatasetMetadata(BaseModel): annotations_creators: List[str] language_creators: List[str] @@ -49,13 +71,9 @@ class DatasetMetadata(BaseModel): @classmethod def from_readme(cls, f: Path) -> "DatasetMetadata": - with f.open() as fi: - content = [line.strip() for line in fi] - - if content[0] == "---" and "---" in content[1:]: - yamlblock = "\n".join(content[1 : content[1:].index("---") + 1]) - metada_dict = yaml.safe_load(yamlblock) or dict() - return cls(**metada_dict) + metadata_dict = dict_from_readme(f) + if metadata_dict is not None: + return cls(**metadata_dict) else: raise ValueError(f"did not find a yaml block in '{f}'") @@ -103,16 +121,19 @@ def licenses_must_be_in_known_set(cls, licenses: List[str]): @validator("task_categories") def task_category_must_be_in_known_set(cls, task_categories: List[str]): - return tagset_validator(task_categories, list(known_task_ids.keys()), "tasks_ids", known_task_ids_url) + # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change + # in the near future and we don't want to waste energy in tagging against a moving taxonomy. + known_set = list(known_task_ids.keys()) + others, to_validate = splitter(task_categories, lambda e: e.startswith("other")) + return [*tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url), *others] @validator("task_ids") def task_id_must_be_in_known_set(cls, task_ids: List[str]): - return tagset_validator( - task_ids, - [tid for _cat, d in known_task_ids.items() for tid in d["options"]], - "tasks_ids", - known_task_ids_url, - ) + # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change + # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
+        known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
+        others, to_validate = splitter(task_ids, lambda e: e.startswith("other"))
+        return [*tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url), *others]
 
     @validator("multilinguality")
     def multilinguality_must_be_in_known_set(cls, multilinguality: List[str]):

From fc46ec3117ca72ae68ef888781c53ada3695d0b5 Mon Sep 17 00:00:00 2001
From: theo
Date: Wed, 24 Mar 2021 09:44:14 +0100
Subject: [PATCH 07/28] spurious change to trigger CI

---
 datasets/spanish_billion_words/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/spanish_billion_words/README.md b/datasets/spanish_billion_words/README.md
index 113e86665bb..12fdd31a8ae 100644
--- a/datasets/spanish_billion_words/README.md
+++ b/datasets/spanish_billion_words/README.md
@@ -4,7 +4,7 @@ annotations_creators:
 language_creators:
 - expert-generated
 languages:
-- es
+- esXXX_CI_SHOULD_FAIL_HERE
 licenses:
 - cc-by-sa-4.0
 multilinguality:

From 58763d21cb861a3fff3c9fb95b102df2b66ea6f6 Mon Sep 17 00:00:00 2001
From: theo
Date: Wed, 24 Mar 2021 09:54:03 +0100
Subject: [PATCH 08/28] add qa reqs

---
 setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6bc7007c22f..7ac7d1705f1 100644
--- a/setup.py
+++ b/setup.py
@@ -163,8 +163,10 @@
 
 QUALITY_REQUIRE = [
     "black",
-    "isort",
     "flake8==3.7.9",
+    "isort",
+    "langcodes[data]>=3.1.0",
+    "pydantic>=1.8.1",
 ]
 
 
From 115d252d40e6b78d6ab6e3423210eadf0b8689ff Mon Sep 17 00:00:00 2001
From: theo
Date: Wed, 24 Mar 2021 09:57:14 +0100
Subject: [PATCH 09/28] disallow empty lists

---
 src/datasets/utils/metadata.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index cb767c53379..bcf927f67a2 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -5,7 +5,7 @@
 
 import langcodes as lc
 import yaml
-from pydantic import BaseModel, validator
+from pydantic import BaseModel, conlist, validator
 
 
 BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
@@ -59,15 +59,15 @@ def splitter(values: List[Any], predicate_fn: Callable[[Any], bool]) -> Tuple[Li
 
 
 class DatasetMetadata(BaseModel):
-    annotations_creators: List[str]
-    language_creators: List[str]
-    languages: List[str]
-    licenses: List[str]
-    multilinguality: List[str]
-    size_categories: List[str]
-    source_datasets: List[str]
-    task_categories: List[str]
-    task_ids: List[str]
+    annotations_creators: conlist(str, min_items=1)
+    language_creators: conlist(str, min_items=1)
+    languages: conlist(str, min_items=1)
+    licenses: conlist(str, min_items=1)
+    multilinguality: conlist(str, min_items=1)
+    size_categories: conlist(str, min_items=1)
+    source_datasets: conlist(str, min_items=1)
+    task_categories: conlist(str, min_items=1)
+    task_ids: conlist(str, min_items=1)
 
     @classmethod
     def from_readme(cls, f: Path) -> "DatasetMetadata":

From 9ae048eb11b36002aeed3761842e0868c2a7e0db Mon Sep 17 00:00:00 2001
From: theo
Date: Wed, 24 Mar 2021 10:04:21 +0100
Subject: [PATCH 10/28] better error msg: show all invalid values rather than first one

---
 src/datasets/utils/metadata.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
index bcf927f67a2..a967df16d8e 100644
--- a/src/datasets/utils/metadata.py
+++ b/src/datasets/utils/metadata.py
@@ -42,9 +42,9 @@ def 
load_json_resource(resource: str) -> Tuple[Dict, str]: def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> List[str]: - for v in values: - if v not in reference_values: - raise ValueError(f"'{v}' is not a registered tag for '{name}', reference at {url}") + invalid_values = [v for v in values if v not in reference_values] + if len(invalid_values) > 0: + raise ValueError(f"'{invalid_values}' is not a registered tag for '{name}', reference at {url}") return values From 299e907333abb0514a4112966423084c152dca0e Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 10:27:56 +0100 Subject: [PATCH 11/28] some code shuffling & better error msg for langcodes --- src/datasets/utils/metadata.py | 45 +++++++++++++--------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index a967df16d8e..e374e4b0424 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -13,16 +13,6 @@ logger = logging.getLogger(__name__) -def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]: - with f.open() as fi: - content = [line.strip() for line in fi] - - if content[0] == "---" and "---" in content[1:]: - yamlblock = "\n".join(content[1 : content[1:].index("---") + 1]) - metada_dict = yaml.safe_load(yamlblock) or dict() - return metada_dict - - def load_json_resource(resource: str) -> Tuple[Dict, str]: utils_dir = Path(__file__).parent with open(utils_dir / "resources" / resource) as fi: @@ -41,10 +31,20 @@ def load_json_resource(resource: str) -> Tuple[Dict, str]: } +def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]: + with f.open() as fi: + content = [line.strip() for line in fi] + + if content[0] == "---" and "---" in content[1:]: + yamlblock = "\n".join(content[1 : content[1:].index("---") + 1]) + metada_dict = yaml.safe_load(yamlblock) or dict() + return metada_dict + + def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> List[str]: invalid_values = [v for v in values if v not in reference_values] if len(invalid_values) > 0: - raise ValueError(f"'{invalid_values}' is not a registered tag for '{name}', reference at {url}") + raise ValueError(f"{invalid_values} are not registered tags for '{name}', reference at {url}") return values @@ -82,20 +82,6 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata": metada_dict = yaml.safe_load(string) or dict() return cls(**metada_dict) - @classmethod - def empty(cls) -> "DatasetMetadata": - return cls( - annotations_creators=list(), - language_creators=list(), - languages=list(), - licenses=list(), - multilinguality=list(), - size_categories=list(), - source_datasets=list(), - task_categories=list(), - task_ids=list(), - ) - @validator("annotations_creators") def annotations_creators_must_be_in_known_set(cls, annotations_creators: List[str]) -> List[str]: return tagset_validator(annotations_creators, known_creators["annotations"], "annotations", known_creators_url) @@ -106,13 +92,16 @@ def language_creators_must_be_in_known_set(cls, language_creators: List[str]) -> @validator("languages") def language_code_must_be_recognized(cls, languages: List[str]): + invalid_values = [] for code in languages: try: lc.get(code) except lc.tag_parser.LanguageTagError: - raise ValueError( - f"'{code}' is not recognised as a valid language code (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes" - ) + invalid_values.append(code) + if 
len(invalid_values) > 0: + raise ValueError( + f"{invalid_values} are not recognised as valid language codes (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes" + ) return languages @validator("licenses") From b4a06657273e85f41d857d8c1861afad1513a0cd Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 10:31:51 +0100 Subject: [PATCH 12/28] add pyyaml to qa reqs --- setup.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/setup.py b/setup.py index 7ac7d1705f1..21a07674552 100644 --- a/setup.py +++ b/setup.py @@ -161,13 +161,7 @@ ) -QUALITY_REQUIRE = [ - "black", - "flake8==3.7.9", - "isort", - "langcodes[data]>=3.1.0", - "pydantic>=1.8.1", -] +QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "langcodes[data]>=3.1.0", "pydantic>=1.8.1", "pyyaml>=5.3.1"] EXTRAS_REQUIRE = { From 7eeb647fb9f2b20562556e0139b09440700c7bf8 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 11:05:08 +0100 Subject: [PATCH 13/28] fix package file loading --- src/datasets/utils/metadata.py | 15 ++++++++++++--- src/datasets/utils/resources/__init__.py | 0 2 files changed, 12 insertions(+), 3 deletions(-) create mode 100644 src/datasets/utils/resources/__init__.py diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index e374e4b0424..d4523638f64 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -3,10 +3,20 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple + +# loading package files: https://stackoverflow.com/a/20885799 +try: + import importlib.resources as pkg_resources +except ImportError: + # Try backported to PY<37 `importlib_resources`. + import importlib_resources as pkg_resources + import langcodes as lc import yaml from pydantic import BaseModel, conlist, validator +from . 
import resources + BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" this_url = f"{BASE_REF_URL}/{__file__}" @@ -14,9 +24,8 @@ def load_json_resource(resource: str) -> Tuple[Dict, str]: - utils_dir = Path(__file__).parent - with open(utils_dir / "resources" / resource) as fi: - return json.load(fi), f"{BASE_REF_URL}/resources/{resource}" + content = pkg_resources.read_text(resources, resource) + return json.loads(content), f"{BASE_REF_URL}/resources/{resource}" known_licenses, known_licenses_url = load_json_resource("licenses.json") diff --git a/src/datasets/utils/resources/__init__.py b/src/datasets/utils/resources/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From 3a940865d89f7f6899526f5923e872d5ff6923e2 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 11:14:08 +0100 Subject: [PATCH 14/28] include json resources --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 21a07674552..205ce9b8c29 100644 --- a/setup.py +++ b/setup.py @@ -201,9 +201,7 @@ package_dir={"": "src"}, packages=find_packages("src"), package_data={ - "datasets": [ - "scripts/templates/*", - ], + "datasets": ["scripts/templates/*", "*.json"], }, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, install_requires=REQUIRED_PKGS, From e4409a97490f1bdd1d5d81fd006ba07a3ce3f283 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 11:21:38 +0100 Subject: [PATCH 15/28] reflect changes to size cats from https://github.com/huggingface/datasets-tagging/pull/11 --- src/datasets/utils/metadata.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index d4523638f64..f26ecb1978a 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -31,7 +31,20 @@ def load_json_resource(resource: str) -> Tuple[Dict, str]: known_licenses, known_licenses_url = load_json_resource("licenses.json") known_task_ids, known_task_ids_url = load_json_resource("tasks.json") known_creators, known_creators_url = load_json_resource("creators.json") -known_size_categories = ["unknown", "n<1K", "1K1M"] +known_size_categories = [ + "unknown", + "n<1K", + "1K1T", +] known_multilingualities = { "monolingual": "contains a single language", "multilingual": "contains multiple languages", From 9450b5fcde3465c7bf43133db6e7253514045269 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 11:24:09 +0100 Subject: [PATCH 16/28] trying another format for package_data --- setup.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 205ce9b8c29..08f82b4d2e2 100644 --- a/setup.py +++ b/setup.py @@ -200,9 +200,7 @@ license="Apache 2.0", package_dir={"": "src"}, packages=find_packages("src"), - package_data={ - "datasets": ["scripts/templates/*", "*.json"], - }, + package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]}, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, From 58709bfe7a8ed3ce55f01420f482d5a4fcebaee5 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 11:29:29 +0100 Subject: [PATCH 17/28] =?UTF-8?q?ci=20works!=20fixing=20the=20readme=20lik?= =?UTF-8?q?e=20a=20good=20citizen=20=F0=9F=A4=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
datasets/spanish_billion_words/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/spanish_billion_words/README.md b/datasets/spanish_billion_words/README.md index 12fdd31a8ae..113e86665bb 100644 --- a/datasets/spanish_billion_words/README.md +++ b/datasets/spanish_billion_words/README.md @@ -4,7 +4,7 @@ annotations_creators: language_creators: - expert-generated languages: -- esXXX_CI_SHOULD_FAIL_HERE +- es licenses: - cc-by-sa-4.0 multilinguality: From 702a8a15df3c029d925bee364b1557b45cda000f Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 24 Mar 2021 12:05:06 +0100 Subject: [PATCH 18/28] escape validation everywhere it's allowed in the tagging app --- src/datasets/utils/metadata.py | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index f26ecb1978a..ca689408415 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -70,13 +70,17 @@ def tagset_validator(values: List[str], reference_values: List[str], name: str, return values -def splitter(values: List[Any], predicate_fn: Callable[[Any], bool]) -> Tuple[List[Any], List[Any]]: +def escape_validation_for_predicate( + values: List[Any], predicate_fn: Callable[[Any], bool] +) -> Tuple[List[Any], List[Any]]: trues, falses = list(), list() for v in values: if predicate_fn(v): trues.append(v) else: falses.append(v) + if len(trues) > 0: + logger.warning(f"The following values will escape validation: {trues}") return trues, falses @@ -128,14 +132,15 @@ def language_code_must_be_recognized(cls, languages: List[str]): @validator("licenses") def licenses_must_be_in_known_set(cls, licenses: List[str]): - return tagset_validator(licenses, list(known_licenses.keys()), "licenses", known_licenses_url) + others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e) + return [*tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url), *others] @validator("task_categories") def task_category_must_be_in_known_set(cls, task_categories: List[str]): # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = list(known_task_ids.keys()) - others, to_validate = splitter(task_categories, lambda e: e.startswith("other")) + others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other")) return [*tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url), *others] @validator("task_ids") @@ -143,17 +148,34 @@ def task_id_must_be_in_known_set(cls, task_ids: List[str]): # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
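        # Illustrative behaviour of the new helper, with hypothetical tag values:
        #   escape_validation_for_predicate(["other-my-task", "question-answering"], lambda e: e.startswith("other"))
        #   returns (["other-my-task"], ["question-answering"]) and logs a warning for the escaped values,
        #   so only the second list is validated against the known taxonomy below.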
known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] - others, to_validate = splitter(task_ids, lambda e: e.startswith("other")) + others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e) return [*tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url), *others] @validator("multilinguality") def multilinguality_must_be_in_known_set(cls, multilinguality: List[str]): - return tagset_validator(multilinguality, list(known_multilingualities.keys()), "multilinguality", this_url) + others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other")) + return [ + *tagset_validator(to_validate, list(known_multilingualities.keys()), "multilinguality", this_url), + *others, + ] @validator("size_categories") def size_categories_must_be_in_known_set(cls, size_cats: List[str]): return tagset_validator(size_cats, known_size_categories, "size_categories", this_url) + @validator("source_datasets") + def source_datasets_must_be_in_known_set(cls, sources: List[str]): + invalid_values = [] + for src in sources: + is_ok = src in ["original", "extended"] or src.startswith("extended|") + if not is_ok: + invalid_values.append(src) + if len(invalid_values) > 0: + raise ValueError( + f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}" + ) + return sources + if __name__ == "__main__": from argparse import ArgumentParser From d3eec3c58836804049de997b33e3fa4d177b83d8 Mon Sep 17 00:00:00 2001 From: theo Date: Thu, 25 Mar 2021 17:06:06 +0100 Subject: [PATCH 19/28] code review: more json files, conditional import --- setup.py | 1 + src/datasets/utils/metadata.py | 30 +++++-------------- .../utils/resources/multilingualities.json | 6 ++++ .../utils/resources/size_categories.json | 14 +++++++++ 4 files changed, 28 insertions(+), 23 deletions(-) create mode 100644 src/datasets/utils/resources/multilingualities.json create mode 100644 src/datasets/utils/resources/size_categories.json diff --git a/setup.py b/setup.py index 08f82b4d2e2..a2e39273add 100644 --- a/setup.py +++ b/setup.py @@ -143,6 +143,7 @@ # metadata validation "langcodes[data]>=3.1.0", "pydantic>=1.8.1", + "importlib_resources;python_version<'3.7'", ] if os.name == "nt": # windows diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index ca689408415..5f8922b1a72 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) -def load_json_resource(resource: str) -> Tuple[Dict, str]: +def load_json_resource(resource: str) -> Tuple[Any, str]: content = pkg_resources.read_text(resources, resource) return json.loads(content), f"{BASE_REF_URL}/resources/{resource}" @@ -31,26 +31,8 @@ def load_json_resource(resource: str) -> Tuple[Dict, str]: known_licenses, known_licenses_url = load_json_resource("licenses.json") known_task_ids, known_task_ids_url = load_json_resource("tasks.json") known_creators, known_creators_url = load_json_resource("creators.json") -known_size_categories = [ - "unknown", - "n<1K", - "1K1T", -] -known_multilingualities = { - "monolingual": "contains a single language", - "multilingual": "contains multiple languages", - "translation": "contains translated or aligned text", - "other": "other type of language distribution", -} +known_size_categories, known_size_categories_url = load_json_resource("size_categories.json") +known_multilingualities, known_multilingualities_url = 
load_json_resource("multilingualities.json") def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]: @@ -155,13 +137,15 @@ def task_id_must_be_in_known_set(cls, task_ids: List[str]): def multilinguality_must_be_in_known_set(cls, multilinguality: List[str]): others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other")) return [ - *tagset_validator(to_validate, list(known_multilingualities.keys()), "multilinguality", this_url), + *tagset_validator( + to_validate, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url + ), *others, ] @validator("size_categories") def size_categories_must_be_in_known_set(cls, size_cats: List[str]): - return tagset_validator(size_cats, known_size_categories, "size_categories", this_url) + return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url) @validator("source_datasets") def source_datasets_must_be_in_known_set(cls, sources: List[str]): diff --git a/src/datasets/utils/resources/multilingualities.json b/src/datasets/utils/resources/multilingualities.json new file mode 100644 index 00000000000..a35c79f03df --- /dev/null +++ b/src/datasets/utils/resources/multilingualities.json @@ -0,0 +1,6 @@ +{ + "monolingual": "contains a single language", + "multilingual": "contains multiple languages", + "translation": "contains translated or aligned text", + "other": "other type of language distribution" +} diff --git a/src/datasets/utils/resources/size_categories.json b/src/datasets/utils/resources/size_categories.json new file mode 100644 index 00000000000..983ce0c10db --- /dev/null +++ b/src/datasets/utils/resources/size_categories.json @@ -0,0 +1,14 @@ +[ + "unknown", + "n<1K", + "1K1T" +] From 84de0135a91f7be9113b662b272d1c065c130138 Mon Sep 17 00:00:00 2001 From: theo Date: Mon, 29 Mar 2021 11:01:10 +0200 Subject: [PATCH 20/28] pointers to integrate readme metadata in class (wip) --- src/datasets/builder.py | 1 + src/datasets/info.py | 4 ++++ src/datasets/load.py | 1 + 3 files changed, 6 insertions(+) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index b90c77c8fc6..b0c7e457f37 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -292,6 +292,7 @@ def get_all_exported_dataset_infos(cls) -> dict: """Empty dict if doesn't exist""" dset_infos_file_path = os.path.join(cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME) if os.path.exists(dset_infos_file_path): + # todo load readme return DatasetInfosDict.from_directory(cls.get_imported_module_dir()) return {} diff --git a/src/datasets/info.py b/src/datasets/info.py index d9c862e0616..a2db8cd622b 100644 --- a/src/datasets/info.py +++ b/src/datasets/info.py @@ -40,6 +40,7 @@ from .splits import SplitDict from .utils import Version from .utils.logging import get_logger +from .utils.metadata import DatasetMetadata logger = get_logger(__name__) @@ -123,6 +124,7 @@ class DatasetInfo: features: Optional[Features] = None post_processed: Optional[PostProcessedInfo] = None supervised_keys: Optional[SupervisedKeysData] = None + metadata: DatasetMetadata = None # Set later by the builder builder_name: Optional[str] = None @@ -183,6 +185,7 @@ def from_merge(cls, dataset_infos: List["DatasetInfo"]): citation = "\n\n".join([info.citation for info in dataset_infos]) homepage = "\n\n".join([info.homepage for info in dataset_infos]) license = "\n\n".join([info.license for info in dataset_infos]) + # todo extend metadata fields of one another features = None supervised_keys = 
None @@ -214,6 +217,7 @@ def from_directory(cls, dataset_info_dir: str) -> "DatasetInfo": with open(os.path.join(dataset_info_dir, DATASET_INFO_FILENAME), "r", encoding="utf-8") as f: dataset_info_dict = json.load(f) + # todo load readme to populate metadata field return cls.from_dict(dataset_info_dict) @classmethod diff --git a/src/datasets/load.py b/src/datasets/load.py index 20fc396d089..0baaaa8ae88 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -515,6 +515,7 @@ def _get_modification_time(module_hash): shutil.copyfile(local_dataset_infos_path, dataset_infos_path) else: logger.info("Found dataset infos file from %s to %s", dataset_infos, dataset_infos_path) + # todo load readme # Record metadata associating original dataset path with local unique folder meta_path = local_file_path.split(".py")[0] + ".json" From 7fbd51d95dd146b3cade934f1ab4694880f5b9ab Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 31 Mar 2021 13:26:38 +0200 Subject: [PATCH 21/28] no pydantic --- scripts/datasets_metadata_validator.py | 4 +- setup.py | 3 +- src/datasets/utils/metadata.py | 150 +++++++++++++++++-------- 3 files changed, 103 insertions(+), 54 deletions(-) diff --git a/scripts/datasets_metadata_validator.py b/scripts/datasets_metadata_validator.py index 9bb50baa8ca..857d11c116e 100755 --- a/scripts/datasets_metadata_validator.py +++ b/scripts/datasets_metadata_validator.py @@ -8,8 +8,6 @@ from subprocess import check_output from typing import List -from pydantic import ValidationError - from datasets.utils.metadata import DatasetMetadata @@ -46,7 +44,7 @@ def get_changed_files(repo_path: Path) -> List[Path]: try: DatasetMetadata.from_readme(readme) logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") - except ValidationError as e: + except TypeError as e: failed.append(readme) logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") except Exception as e: diff --git a/setup.py b/setup.py index a2e39273add..dc2e509a367 100644 --- a/setup.py +++ b/setup.py @@ -142,7 +142,6 @@ "Werkzeug>=1.0.1", # metadata validation "langcodes[data]>=3.1.0", - "pydantic>=1.8.1", "importlib_resources;python_version<'3.7'", ] @@ -162,7 +161,7 @@ ) -QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "langcodes[data]>=3.1.0", "pydantic>=1.8.1", "pyyaml>=5.3.1"] +QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "langcodes[data]>=3.1.0", "pyyaml>=5.3.1"] EXTRAS_REQUIRE = { diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 5f8922b1a72..0a240ef4190 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -1,5 +1,6 @@ import json import logging +from dataclasses import dataclass from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple @@ -13,7 +14,6 @@ import langcodes as lc import yaml -from pydantic import BaseModel, conlist, validator from . 
import resources @@ -45,11 +45,14 @@ def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]: return metada_dict -def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> List[str]: +ValidatorOutput = Tuple[List[str], Optional[str]] + + +def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> ValidatorOutput: invalid_values = [v for v in values if v not in reference_values] if len(invalid_values) > 0: - raise ValueError(f"{invalid_values} are not registered tags for '{name}', reference at {url}") - return values + return [], f"{invalid_values} are not registered tags for '{name}', reference at {url}" + return values, None def escape_validation_for_predicate( @@ -66,16 +69,61 @@ def escape_validation_for_predicate( return trues, falses -class DatasetMetadata(BaseModel): - annotations_creators: conlist(str, min_items=1) - language_creators: conlist(str, min_items=1) - languages: conlist(str, min_items=1) - licenses: conlist(str, min_items=1) - multilinguality: conlist(str, min_items=1) - size_categories: conlist(str, min_items=1) - source_datasets: conlist(str, min_items=1) - task_categories: conlist(str, min_items=1) - task_ids: conlist(str, min_items=1) +@dataclass +class DatasetMetadata: + annotations_creators: List[str] + language_creators: List[str] + languages: List[str] + licenses: List[str] + multilinguality: List[str] + size_categories: List[str] + source_datasets: List[str] + task_categories: List[str] + task_ids: List[str] + + def __post_init__(self): + basic_typing_errors = { + name: value + for name, value in vars(self).items() + if not isinstance(value, list) or len(value) == 0 or not isinstance(value[0], str) + } + if len(basic_typing_errors) > 0: + raise TypeError(f"Found fields that are not non-empty list of strings: {basic_typing_errors}") + + self.annotations_creators, annotations_creators_errors = self.annotations_creators_must_be_in_known_set( + self.annotations_creators + ) + self.language_creators, language_creators_errors = self.language_creators_must_be_in_known_set( + self.language_creators + ) + self.languages, languages_errors = self.language_code_must_be_recognized(self.language_creators) + self.licenses, licenses_errors = self.licenses_must_be_in_known_set(self.licenses) + self.multilinguality, multilinguality_errors = self.multilinguality_must_be_in_known_set(self.multilinguality) + self.size_categories, size_categories_errors = self.size_categories_must_be_in_known_set(self.size_categories) + self.source_datasets, source_datasets_errors = self.source_datasets_must_be_in_known_set(self.source_datasets) + self.task_categories, task_categories_errors = self.task_category_must_be_in_known_set(self.task_categories) + self.task_ids, task_ids_errors = self.task_id_must_be_in_known_set(self.task_ids) + + errors = { + "annotations_creators": annotations_creators_errors, + "language_creators": language_creators_errors, + "licenses": licenses_errors, + "multilinguality": multilinguality_errors, + "size_categories": size_categories_errors, + "source_datasets": source_datasets_errors, + "task_categories": task_categories_errors, + "task_ids": task_ids_errors, + } + + exception_msg_dict = dict() + for field, errs in errors.items(): + if errs is not None: + exception_msg_dict[field] = errs + if len(exception_msg_dict) > 0: + raise TypeError( + "Could not validate the metada, found the following errors:\n" + + "\n".join(f"* field '{fieldname}':\n\t{err}" for fieldname, err in 
exception_msg_dict.items()) + ) @classmethod def from_readme(cls, f: Path) -> "DatasetMetadata": @@ -83,23 +131,23 @@ def from_readme(cls, f: Path) -> "DatasetMetadata": if metadata_dict is not None: return cls(**metadata_dict) else: - raise ValueError(f"did not find a yaml block in '{f}'") + raise TypeError(f"did not find a yaml block in '{f}'") @classmethod def from_yaml_string(cls, string: str) -> "DatasetMetadata": metada_dict = yaml.safe_load(string) or dict() return cls(**metada_dict) - @validator("annotations_creators") - def annotations_creators_must_be_in_known_set(cls, annotations_creators: List[str]) -> List[str]: + @staticmethod + def annotations_creators_must_be_in_known_set(annotations_creators: List[str]) -> ValidatorOutput: return tagset_validator(annotations_creators, known_creators["annotations"], "annotations", known_creators_url) - @validator("language_creators") - def language_creators_must_be_in_known_set(cls, language_creators: List[str]) -> List[str]: + @staticmethod + def language_creators_must_be_in_known_set(language_creators: List[str]) -> ValidatorOutput: return tagset_validator(language_creators, known_creators["language"], "annotations", known_creators_url) - @validator("languages") - def language_code_must_be_recognized(cls, languages: List[str]): + @staticmethod + def language_code_must_be_recognized(languages: List[str]) -> ValidatorOutput: invalid_values = [] for code in languages: try: @@ -107,58 +155,62 @@ def language_code_must_be_recognized(cls, languages: List[str]): except lc.tag_parser.LanguageTagError: invalid_values.append(code) if len(invalid_values) > 0: - raise ValueError( - f"{invalid_values} are not recognised as valid language codes (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes" + return ( + [], + f"{invalid_values} are not recognised as valid language codes (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes", ) - return languages + return languages, None - @validator("licenses") - def licenses_must_be_in_known_set(cls, licenses: List[str]): + @staticmethod + def licenses_must_be_in_known_set(licenses: List[str]) -> ValidatorOutput: others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e) - return [*tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url), *others] + validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url) + return [*validated, *others], error - @validator("task_categories") - def task_category_must_be_in_known_set(cls, task_categories: List[str]): + @staticmethod + def task_category_must_be_in_known_set(task_categories: List[str]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
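        # Without pydantic, validators return a ValidatorOutput pair instead of raising.
        # Illustrative call, with a made-up tag:
        #   tagset_validator(["bogus-task"], known_set, "tasks_ids", known_task_ids_url)
        #   -> ([], "['bogus-task'] are not registered tags for 'tasks_ids', reference at <url>")
        # __post_init__ later aggregates the non-None error strings into a single TypeError.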
known_set = list(known_task_ids.keys()) others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other")) - return [*tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url), *others] + validated, error = tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url) + return [*validated, *others], error - @validator("task_ids") - def task_id_must_be_in_known_set(cls, task_ids: List[str]): + @staticmethod + def task_id_must_be_in_known_set(task_ids: List[str]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e) - return [*tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url), *others] + validated, error = tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url) + return [*validated, *others], error - @validator("multilinguality") - def multilinguality_must_be_in_known_set(cls, multilinguality: List[str]): + @staticmethod + def multilinguality_must_be_in_known_set(multilinguality: List[str]) -> ValidatorOutput: others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other")) - return [ - *tagset_validator( - to_validate, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url - ), - *others, - ] - - @validator("size_categories") - def size_categories_must_be_in_known_set(cls, size_cats: List[str]): + validated, error = tagset_validator( + to_validate, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url + ) + return [*validated, *others], error + + @staticmethod + def size_categories_must_be_in_known_set(size_cats: List[str]) -> ValidatorOutput: return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url) - @validator("source_datasets") - def source_datasets_must_be_in_known_set(cls, sources: List[str]): + @staticmethod + def source_datasets_must_be_in_known_set(sources: List[str]) -> ValidatorOutput: invalid_values = [] for src in sources: is_ok = src in ["original", "extended"] or src.startswith("extended|") if not is_ok: invalid_values.append(src) if len(invalid_values) > 0: - raise ValueError( - f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}" + return ( + [], + f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}", ) - return sources + + return sources, None if __name__ == "__main__": From ab82a6cbb1dd5fbc7f0ea70e98156d7419c54bf1 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 31 Mar 2021 18:22:15 +0200 Subject: [PATCH 22/28] fix docs? --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index eb0ba6dca3f..92b96b833c5 100644 --- a/setup.py +++ b/setup.py @@ -186,6 +186,7 @@ "sphinx-rtd-theme==0.4.3", "sphinx-copybutton", "fsspec[s3]", + "langcodes[data]", ], } From a4953db6c85900f738db01383f4ae8d657a86d20 Mon Sep 17 00:00:00 2001 From: theo Date: Wed, 31 Mar 2021 18:33:19 +0200 Subject: [PATCH 23/28] Revert "fix docs?" This reverts commit ab82a6cbb1dd5fbc7f0ea70e98156d7419c54bf1. 
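With the docs tweak reverted, the validation stack from PATCH 21 stands on its own: DatasetMetadata is now a plain dataclass whose __post_init__ collects per-field error strings and raises a single TypeError. A minimal sketch of the intended behaviour follows; the YAML values are illustrative, either picked from the bundled JSON resources or shaped to hit the "other" escape hatches:

    from datasets.utils.metadata import DatasetMetadata

    yaml_block = """
    annotations_creators: [expert-generated]
    language_creators: [found]
    languages: [en]
    licenses: [not-a-real-license]
    multilinguality: [monolingual]
    size_categories: [n<1K]
    source_datasets: [original]
    task_categories: [other-demo]
    task_ids: [demo-other-subtask]
    """

    try:
        DatasetMetadata.from_yaml_string(yaml_block)
    except TypeError as err:
        # Expect at least the 'licenses' entry in the aggregated message; a
        # 'languages' error may surface as well, since __post_init__ passes
        # self.language_creators to the language validator (an apparent slip).
        print(err)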
--- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 92b96b833c5..eb0ba6dca3f 100644 --- a/setup.py +++ b/setup.py @@ -186,7 +186,6 @@ "sphinx-rtd-theme==0.4.3", "sphinx-copybutton", "fsspec[s3]", - "langcodes[data]", ], } From e63d32553abb23969174fdbe7ac6ced883e64029 Mon Sep 17 00:00:00 2001 From: theo Date: Thu, 1 Apr 2021 12:29:34 +0200 Subject: [PATCH 24/28] remove pointers to add readme to loader --- src/datasets/builder.py | 1 - src/datasets/info.py | 4 ---- src/datasets/load.py | 1 - 3 files changed, 6 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 31a7eb0b61e..d5564ffe2e2 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -292,7 +292,6 @@ def get_all_exported_dataset_infos(cls) -> dict: """Empty dict if doesn't exist""" dset_infos_file_path = os.path.join(cls.get_imported_module_dir(), DATASET_INFOS_DICT_FILE_NAME) if os.path.exists(dset_infos_file_path): - # todo load readme return DatasetInfosDict.from_directory(cls.get_imported_module_dir()) return {} diff --git a/src/datasets/info.py b/src/datasets/info.py index a2db8cd622b..d9c862e0616 100644 --- a/src/datasets/info.py +++ b/src/datasets/info.py @@ -40,7 +40,6 @@ from .splits import SplitDict from .utils import Version from .utils.logging import get_logger -from .utils.metadata import DatasetMetadata logger = get_logger(__name__) @@ -124,7 +123,6 @@ class DatasetInfo: features: Optional[Features] = None post_processed: Optional[PostProcessedInfo] = None supervised_keys: Optional[SupervisedKeysData] = None - metadata: DatasetMetadata = None # Set later by the builder builder_name: Optional[str] = None @@ -185,7 +183,6 @@ def from_merge(cls, dataset_infos: List["DatasetInfo"]): citation = "\n\n".join([info.citation for info in dataset_infos]) homepage = "\n\n".join([info.homepage for info in dataset_infos]) license = "\n\n".join([info.license for info in dataset_infos]) - # todo extend metadata fields of one another features = None supervised_keys = None @@ -217,7 +214,6 @@ def from_directory(cls, dataset_info_dir: str) -> "DatasetInfo": with open(os.path.join(dataset_info_dir, DATASET_INFO_FILENAME), "r", encoding="utf-8") as f: dataset_info_dict = json.load(f) - # todo load readme to populate metadata field return cls.from_dict(dataset_info_dict) @classmethod diff --git a/src/datasets/load.py b/src/datasets/load.py index e8ec136159d..298d74456dc 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -515,7 +515,6 @@ def _get_modification_time(module_hash): shutil.copyfile(local_dataset_infos_path, dataset_infos_path) else: logger.info("Found dataset infos file from %s to %s", dataset_infos, dataset_infos_path) - # todo load readme # Record metadata associating original dataset path with local unique folder meta_path = local_file_path.split(".py")[0] + ".json" From 3102ccfb363dc7b8628327aca0a71e66ed97a089 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Fri, 23 Apr 2021 11:54:25 +0200 Subject: [PATCH 25/28] Get rid of langcodes, some refactor --- setup.py | 3 +- src/datasets/utils/metadata.py | 94 +-- src/datasets/utils/resources/languages.json | 637 ++++++++++++++++++++ 3 files changed, 695 insertions(+), 39 deletions(-) create mode 100644 src/datasets/utils/resources/languages.json diff --git a/setup.py b/setup.py index c4bf167de43..817c06189a9 100644 --- a/setup.py +++ b/setup.py @@ -154,7 +154,6 @@ "texttable>=1.6.3", "Werkzeug>=1.0.1", # metadata validation - "langcodes[data]>=3.1.0", "importlib_resources;python_version<'3.7'", 
] @@ -174,7 +173,7 @@ ) -QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "langcodes[data]>=3.1.0", "pyyaml>=5.3.1"] +QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "pyyaml>=5.3.1"] EXTRAS_REQUIRE = { diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 0a240ef4190..312a7b090ac 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -12,7 +12,6 @@ # Try backported to PY<37 `importlib_resources`. import importlib_resources as pkg_resources -import langcodes as lc import yaml from . import resources @@ -28,6 +27,9 @@ def load_json_resource(resource: str) -> Tuple[Any, str]: return json.loads(content), f"{BASE_REF_URL}/resources/{resource}" +# Source of languages.json: +# https://github.com/unicode-org/cldr-json/blob/9f6b2f0c5eb3aabaa97343cd1ee431a3badc4851/cldr-json/cldr-localenames-full/main/en/languages.json +known_language_codes, known_language_codes_url = load_json_resource("languages.json") known_licenses, known_licenses_url = load_json_resource("licenses.json") known_task_ids, known_task_ids_url = load_json_resource("tasks.json") known_creators, known_creators_url = load_json_resource("creators.json") @@ -35,14 +37,16 @@ def load_json_resource(resource: str) -> Tuple[Any, str]: known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json") -def dict_from_readme(f: Path) -> Optional[Dict[str, List[str]]]: - with f.open() as fi: - content = [line.strip() for line in fi] +def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: + """"Loads a dataset's metadata from the dataset card (REAMDE.md), as a Python dict""" + with path.open() as readme_file: + content = [line.strip() for line in readme_file] if content[0] == "---" and "---" in content[1:]: yamlblock = "\n".join(content[1 : content[1:].index("---") + 1]) metada_dict = yaml.safe_load(yamlblock) or dict() return metada_dict + return None ValidatorOutput = Tuple[List[str], Optional[str]] @@ -96,13 +100,13 @@ def __post_init__(self): self.language_creators, language_creators_errors = self.language_creators_must_be_in_known_set( self.language_creators ) - self.languages, languages_errors = self.language_code_must_be_recognized(self.language_creators) - self.licenses, licenses_errors = self.licenses_must_be_in_known_set(self.licenses) - self.multilinguality, multilinguality_errors = self.multilinguality_must_be_in_known_set(self.multilinguality) - self.size_categories, size_categories_errors = self.size_categories_must_be_in_known_set(self.size_categories) - self.source_datasets, source_datasets_errors = self.source_datasets_must_be_in_known_set(self.source_datasets) - self.task_categories, task_categories_errors = self.task_category_must_be_in_known_set(self.task_categories) - self.task_ids, task_ids_errors = self.task_id_must_be_in_known_set(self.task_ids) + self.languages, languages_errors = self.validate_language_codes(self.language_creators) + self.licenses, licenses_errors = self.validate_licences(self.licenses) + self.multilinguality, multilinguality_errors = self.validate_mulitlinguality(self.multilinguality) + self.size_categories, size_categories_errors = self.validate_size_catgeories(self.size_categories) + self.source_datasets, source_datasets_errors = self.validate_source_datasets(self.source_datasets) + self.task_categories, task_categories_errors = self.validate_task_categories(self.task_categories) + self.task_ids, task_ids_errors = self.validate_task_ids(self.task_ids) errors = { 
"annotations_creators": annotations_creators_errors, @@ -113,6 +117,7 @@ def __post_init__(self): "source_datasets": source_datasets_errors, "task_categories": task_categories_errors, "task_ids": task_ids_errors, + "languages": languages_errors, } exception_msg_dict = dict() @@ -126,67 +131,82 @@ def __post_init__(self): ) @classmethod - def from_readme(cls, f: Path) -> "DatasetMetadata": - metadata_dict = dict_from_readme(f) + def from_readme(cls, path: Path) -> "DatasetMetadata": + """Loads and validates the dataset metadat from its dataset card (README.md) + + Args: + path (:obj:`Path`): Path to the dataset card (its README.md file) + + Returns: + :class:`DatasetMetadata`: The dataset's metadata + + Raises: + :obj:`TypeError`: If the dataset card has no metadata (no YAML header) + """ + metadata_dict = metadata_dict_from_readme(path) if metadata_dict is not None: return cls(**metadata_dict) else: - raise TypeError(f"did not find a yaml block in '{f}'") + raise TypeError(f"did not find a yaml block in '{path}'") @classmethod def from_yaml_string(cls, string: str) -> "DatasetMetadata": + """Loads and validates the dataset metadat from a YAML string + + Args: + string (:obj:`str`): The YAML string + + Returns: + :class:`DatasetMetadata`: The dataset's metadata + """ metada_dict = yaml.safe_load(string) or dict() return cls(**metada_dict) @staticmethod def annotations_creators_must_be_in_known_set(annotations_creators: List[str]) -> ValidatorOutput: - return tagset_validator(annotations_creators, known_creators["annotations"], "annotations", known_creators_url) + return tagset_validator( + annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url + ) @staticmethod def language_creators_must_be_in_known_set(language_creators: List[str]) -> ValidatorOutput: - return tagset_validator(language_creators, known_creators["language"], "annotations", known_creators_url) + return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url) @staticmethod - def language_code_must_be_recognized(languages: List[str]) -> ValidatorOutput: - invalid_values = [] - for code in languages: - try: - lc.get(code) - except lc.tag_parser.LanguageTagError: - invalid_values.append(code) - if len(invalid_values) > 0: - return ( - [], - f"{invalid_values} are not recognised as valid language codes (BCP47 norm), you can refer to https://github.com/LuminosoInsight/langcodes", - ) - return languages, None + def validate_language_codes(languages: List[str]) -> ValidatorOutput: + return tagset_validator( + values=languages, + reference_values=known_language_codes.keys(), + name="languages", + url=known_language_codes_url, + ) @staticmethod - def licenses_must_be_in_known_set(licenses: List[str]) -> ValidatorOutput: + def validate_licences(licenses: List[str]) -> ValidatorOutput: others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e) validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url) return [*validated, *others], error @staticmethod - def task_category_must_be_in_known_set(task_categories: List[str]) -> ValidatorOutput: + def validate_task_categories(task_categories: List[str]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. 
known_set = list(known_task_ids.keys()) others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other")) - validated, error = tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url) + validated, error = tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url) return [*validated, *others], error @staticmethod - def task_id_must_be_in_known_set(task_ids: List[str]) -> ValidatorOutput: + def validate_task_ids(task_ids: List[str]) -> ValidatorOutput: # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change # in the near future and we don't want to waste energy in tagging against a moving taxonomy. known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]] others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e) - validated, error = tagset_validator(to_validate, known_set, "tasks_ids", known_task_ids_url) + validated, error = tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url) return [*validated, *others], error @staticmethod - def multilinguality_must_be_in_known_set(multilinguality: List[str]) -> ValidatorOutput: + def validate_mulitlinguality(multilinguality: List[str]) -> ValidatorOutput: others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other")) validated, error = tagset_validator( to_validate, list(known_multilingualities.keys()), "multilinguality", known_size_categories_url @@ -194,11 +214,11 @@ def multilinguality_must_be_in_known_set(multilinguality: List[str]) -> Validato return [*validated, *others], error @staticmethod - def size_categories_must_be_in_known_set(size_cats: List[str]) -> ValidatorOutput: + def validate_size_catgeories(size_cats: List[str]) -> ValidatorOutput: return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url) @staticmethod - def source_datasets_must_be_in_known_set(sources: List[str]) -> ValidatorOutput: + def validate_source_datasets(sources: List[str]) -> ValidatorOutput: invalid_values = [] for src in sources: is_ok = src in ["original", "extended"] or src.startswith("extended|") diff --git a/src/datasets/utils/resources/languages.json b/src/datasets/utils/resources/languages.json new file mode 100644 index 00000000000..03e5d037af1 --- /dev/null +++ b/src/datasets/utils/resources/languages.json @@ -0,0 +1,637 @@ +{ + "aa": "Afar", + "ab": "Abkhazian", + "ace": "Achinese", + "ach": "Acoli", + "ada": "Adangme", + "ady": "Adyghe", + "ae": "Avestan", + "aeb": "Tunisian Arabic", + "af": "Afrikaans", + "afh": "Afrihili", + "agq": "Aghem", + "ain": "Ainu", + "ak": "Akan", + "akk": "Akkadian", + "akz": "Alabama", + "ale": "Aleut", + "aln": "Gheg Albanian", + "alt": "Southern Altai", + "am": "Amharic", + "an": "Aragonese", + "ang": "Old English", + "anp": "Angika", + "ar": "Arabic", + "ar-001": "Modern Standard Arabic", + "arc": "Aramaic", + "arn": "Mapuche", + "aro": "Araona", + "arp": "Arapaho", + "arq": "Algerian Arabic", + "ars": "Najdi Arabic", + "ars-alt-menu": "Arabic, Najdi", + "arw": "Arawak", + "ary": "Moroccan Arabic", + "arz": "Egyptian Arabic", + "as": "Assamese", + "asa": "Asu", + "ase": "American Sign Language", + "ast": "Asturian", + "av": "Avaric", + "avk": "Kotava", + "awa": "Awadhi", + "ay": "Aymara", + "az": "Azerbaijani", + "az-alt-short": "Azeri", + "ba": "Bashkir", + "bal": "Baluchi", + "ban": "Balinese", + "bar": "Bavarian", + "bas": "Basaa", + "bax": 
"Bamun", + "bbc": "Batak Toba", + "bbj": "Ghomala", + "be": "Belarusian", + "bej": "Beja", + "bem": "Bemba", + "bew": "Betawi", + "bez": "Bena", + "bfd": "Bafut", + "bfq": "Badaga", + "bg": "Bulgarian", + "bgn": "Western Balochi", + "bho": "Bhojpuri", + "bi": "Bislama", + "bik": "Bikol", + "bin": "Bini", + "bjn": "Banjar", + "bkm": "Kom", + "bla": "Siksika", + "bm": "Bambara", + "bn": "Bangla", + "bo": "Tibetan", + "bpy": "Bishnupriya", + "bqi": "Bakhtiari", + "br": "Breton", + "bra": "Braj", + "brh": "Brahui", + "brx": "Bodo", + "bs": "Bosnian", + "bss": "Akoose", + "bua": "Buriat", + "bug": "Buginese", + "bum": "Bulu", + "byn": "Blin", + "byv": "Medumba", + "ca": "Catalan", + "cad": "Caddo", + "car": "Carib", + "cay": "Cayuga", + "cch": "Atsam", + "ccp": "Chakma", + "ce": "Chechen", + "ceb": "Cebuano", + "cgg": "Chiga", + "ch": "Chamorro", + "chb": "Chibcha", + "chg": "Chagatai", + "chk": "Chuukese", + "chm": "Mari", + "chn": "Chinook Jargon", + "cho": "Choctaw", + "chp": "Chipewyan", + "chr": "Cherokee", + "chy": "Cheyenne", + "cic": "Chickasaw", + "ckb": "Central Kurdish", + "ckb-alt-menu": "Kurdish, Central", + "ckb-alt-variant": "Kurdish, Sorani", + "co": "Corsican", + "cop": "Coptic", + "cps": "Capiznon", + "cr": "Cree", + "crh": "Crimean Turkish", + "crs": "Seselwa Creole French", + "cs": "Czech", + "csb": "Kashubian", + "cu": "Church Slavic", + "cv": "Chuvash", + "cy": "Welsh", + "da": "Danish", + "dak": "Dakota", + "dar": "Dargwa", + "dav": "Taita", + "de": "German", + "de-AT": "Austrian German", + "de-CH": "Swiss High German", + "del": "Delaware", + "den": "Slave", + "dgr": "Dogrib", + "din": "Dinka", + "dje": "Zarma", + "doi": "Dogri", + "dsb": "Lower Sorbian", + "dtp": "Central Dusun", + "dua": "Duala", + "dum": "Middle Dutch", + "dv": "Divehi", + "dyo": "Jola-Fonyi", + "dyu": "Dyula", + "dz": "Dzongkha", + "dzg": "Dazaga", + "ebu": "Embu", + "ee": "Ewe", + "efi": "Efik", + "egl": "Emilian", + "egy": "Ancient Egyptian", + "eka": "Ekajuk", + "el": "Greek", + "elx": "Elamite", + "en": "English", + "en-AU": "Australian English", + "en-CA": "Canadian English", + "en-GB": "British English", + "en-GB-alt-short": "UK English", + "en-US": "American English", + "en-US-alt-short": "US English", + "enm": "Middle English", + "eo": "Esperanto", + "es": "Spanish", + "es-419": "Latin American Spanish", + "es-ES": "European Spanish", + "es-MX": "Mexican Spanish", + "esu": "Central Yupik", + "et": "Estonian", + "eu": "Basque", + "ewo": "Ewondo", + "ext": "Extremaduran", + "fa": "Persian", + "fa-AF": "Dari", + "fan": "Fang", + "fat": "Fanti", + "ff": "Fulah", + "fi": "Finnish", + "fil": "Filipino", + "fit": "Tornedalen Finnish", + "fj": "Fijian", + "fo": "Faroese", + "fon": "Fon", + "fr": "French", + "fr-CA": "Canadian French", + "fr-CH": "Swiss French", + "frc": "Cajun French", + "frm": "Middle French", + "fro": "Old French", + "frp": "Arpitan", + "frr": "Northern Frisian", + "frs": "Eastern Frisian", + "fur": "Friulian", + "fy": "Western Frisian", + "ga": "Irish", + "gaa": "Ga", + "gag": "Gagauz", + "gan": "Gan Chinese", + "gay": "Gayo", + "gba": "Gbaya", + "gbz": "Zoroastrian Dari", + "gd": "Scottish Gaelic", + "gez": "Geez", + "gil": "Gilbertese", + "gl": "Galician", + "glk": "Gilaki", + "gmh": "Middle High German", + "gn": "Guarani", + "goh": "Old High German", + "gom": "Goan Konkani", + "gon": "Gondi", + "gor": "Gorontalo", + "got": "Gothic", + "grb": "Grebo", + "grc": "Ancient Greek", + "gsw": "Swiss German", + "gu": "Gujarati", + "guc": "Wayuu", + "gur": "Frafra", + "guz": "Gusii", + 
"gv": "Manx", + "gwi": "Gwichʼin", + "ha": "Hausa", + "hai": "Haida", + "hak": "Hakka Chinese", + "haw": "Hawaiian", + "he": "Hebrew", + "hi": "Hindi", + "hif": "Fiji Hindi", + "hil": "Hiligaynon", + "hit": "Hittite", + "hmn": "Hmong", + "ho": "Hiri Motu", + "hr": "Croatian", + "hsb": "Upper Sorbian", + "hsn": "Xiang Chinese", + "ht": "Haitian Creole", + "hu": "Hungarian", + "hup": "Hupa", + "hy": "Armenian", + "hz": "Herero", + "ia": "Interlingua", + "iba": "Iban", + "ibb": "Ibibio", + "id": "Indonesian", + "ie": "Interlingue", + "ig": "Igbo", + "ii": "Sichuan Yi", + "ik": "Inupiaq", + "ilo": "Iloko", + "inh": "Ingush", + "io": "Ido", + "is": "Icelandic", + "it": "Italian", + "iu": "Inuktitut", + "izh": "Ingrian", + "ja": "Japanese", + "jam": "Jamaican Creole English", + "jbo": "Lojban", + "jgo": "Ngomba", + "jmc": "Machame", + "jpr": "Judeo-Persian", + "jrb": "Judeo-Arabic", + "jut": "Jutish", + "jv": "Javanese", + "ka": "Georgian", + "kaa": "Kara-Kalpak", + "kab": "Kabyle", + "kac": "Kachin", + "kaj": "Jju", + "kam": "Kamba", + "kaw": "Kawi", + "kbd": "Kabardian", + "kbl": "Kanembu", + "kcg": "Tyap", + "kde": "Makonde", + "kea": "Kabuverdianu", + "ken": "Kenyang", + "kfo": "Koro", + "kg": "Kongo", + "kgp": "Kaingang", + "kha": "Khasi", + "kho": "Khotanese", + "khq": "Koyra Chiini", + "khw": "Khowar", + "ki": "Kikuyu", + "kiu": "Kirmanjki", + "kj": "Kuanyama", + "kk": "Kazakh", + "kkj": "Kako", + "kl": "Kalaallisut", + "kln": "Kalenjin", + "km": "Khmer", + "kmb": "Kimbundu", + "kn": "Kannada", + "ko": "Korean", + "koi": "Komi-Permyak", + "kok": "Konkani", + "kos": "Kosraean", + "kpe": "Kpelle", + "kr": "Kanuri", + "krc": "Karachay-Balkar", + "kri": "Krio", + "krj": "Kinaray-a", + "krl": "Karelian", + "kru": "Kurukh", + "ks": "Kashmiri", + "ksb": "Shambala", + "ksf": "Bafia", + "ksh": "Colognian", + "ku": "Kurdish", + "kum": "Kumyk", + "kut": "Kutenai", + "kv": "Komi", + "kw": "Cornish", + "ky": "Kyrgyz", + "ky-alt-variant": "Kirghiz", + "la": "Latin", + "lad": "Ladino", + "lag": "Langi", + "lah": "Lahnda", + "lam": "Lamba", + "lb": "Luxembourgish", + "lez": "Lezghian", + "lfn": "Lingua Franca Nova", + "lg": "Ganda", + "li": "Limburgish", + "lij": "Ligurian", + "liv": "Livonian", + "lkt": "Lakota", + "lmo": "Lombard", + "ln": "Lingala", + "lo": "Lao", + "lol": "Mongo", + "lou": "Louisiana Creole", + "loz": "Lozi", + "lrc": "Northern Luri", + "lt": "Lithuanian", + "ltg": "Latgalian", + "lu": "Luba-Katanga", + "lua": "Luba-Lulua", + "lui": "Luiseno", + "lun": "Lunda", + "luo": "Luo", + "lus": "Mizo", + "luy": "Luyia", + "lv": "Latvian", + "lzh": "Literary Chinese", + "lzz": "Laz", + "mad": "Madurese", + "maf": "Mafa", + "mag": "Magahi", + "mai": "Maithili", + "mak": "Makasar", + "man": "Mandingo", + "mas": "Masai", + "mde": "Maba", + "mdf": "Moksha", + "mdr": "Mandar", + "men": "Mende", + "mer": "Meru", + "mfe": "Morisyen", + "mg": "Malagasy", + "mga": "Middle Irish", + "mgh": "Makhuwa-Meetto", + "mgo": "Metaʼ", + "mh": "Marshallese", + "mi": "Maori", + "mic": "Mi'kmaq", + "min": "Minangkabau", + "mk": "Macedonian", + "ml": "Malayalam", + "mn": "Mongolian", + "mnc": "Manchu", + "mni": "Manipuri", + "moh": "Mohawk", + "mos": "Mossi", + "mr": "Marathi", + "mrj": "Western Mari", + "ms": "Malay", + "mt": "Maltese", + "mua": "Mundang", + "mul": "Multiple languages", + "mus": "Muscogee", + "mwl": "Mirandese", + "mwr": "Marwari", + "mwv": "Mentawai", + "my": "Burmese", + "my-alt-variant": "Myanmar Language", + "mye": "Myene", + "myv": "Erzya", + "mzn": "Mazanderani", + "na": "Nauru", + "nan": "Min 
Nan Chinese", + "nap": "Neapolitan", + "naq": "Nama", + "nb": "Norwegian Bokmål", + "nd": "North Ndebele", + "nds": "Low German", + "nds-NL": "Low Saxon", + "ne": "Nepali", + "new": "Newari", + "ng": "Ndonga", + "nia": "Nias", + "niu": "Niuean", + "njo": "Ao Naga", + "nl": "Dutch", + "nl-BE": "Flemish", + "nmg": "Kwasio", + "nn": "Norwegian Nynorsk", + "nnh": "Ngiemboon", + "no": "Norwegian", + "nog": "Nogai", + "non": "Old Norse", + "nov": "Novial", + "nqo": "N’Ko", + "nr": "South Ndebele", + "nso": "Northern Sotho", + "nus": "Nuer", + "nv": "Navajo", + "nwc": "Classical Newari", + "ny": "Nyanja", + "nym": "Nyamwezi", + "nyn": "Nyankole", + "nyo": "Nyoro", + "nzi": "Nzima", + "oc": "Occitan", + "oj": "Ojibwa", + "om": "Oromo", + "or": "Odia", + "os": "Ossetic", + "osa": "Osage", + "ota": "Ottoman Turkish", + "pa": "Punjabi", + "pag": "Pangasinan", + "pal": "Pahlavi", + "pam": "Pampanga", + "pap": "Papiamento", + "pau": "Palauan", + "pcd": "Picard", + "pcm": "Nigerian Pidgin", + "pdc": "Pennsylvania German", + "pdt": "Plautdietsch", + "peo": "Old Persian", + "pfl": "Palatine German", + "phn": "Phoenician", + "pi": "Pali", + "pl": "Polish", + "pms": "Piedmontese", + "pnt": "Pontic", + "pon": "Pohnpeian", + "prg": "Prussian", + "pro": "Old Provençal", + "ps": "Pashto", + "ps-alt-variant": "Pushto", + "pt": "Portuguese", + "pt-BR": "Brazilian Portuguese", + "pt-PT": "European Portuguese", + "qu": "Quechua", + "quc": "Kʼicheʼ", + "qug": "Chimborazo Highland Quichua", + "raj": "Rajasthani", + "rap": "Rapanui", + "rar": "Rarotongan", + "rgn": "Romagnol", + "rif": "Riffian", + "rm": "Romansh", + "rn": "Rundi", + "ro": "Romanian", + "ro-MD": "Moldavian", + "rof": "Rombo", + "rom": "Romany", + "root": "Root", + "rtm": "Rotuman", + "ru": "Russian", + "rue": "Rusyn", + "rug": "Roviana", + "rup": "Aromanian", + "rw": "Kinyarwanda", + "rwk": "Rwa", + "sa": "Sanskrit", + "sad": "Sandawe", + "sah": "Sakha", + "sam": "Samaritan Aramaic", + "saq": "Samburu", + "sas": "Sasak", + "sat": "Santali", + "saz": "Saurashtra", + "sba": "Ngambay", + "sbp": "Sangu", + "sc": "Sardinian", + "scn": "Sicilian", + "sco": "Scots", + "sd": "Sindhi", + "sdc": "Sassarese Sardinian", + "sdh": "Southern Kurdish", + "se": "Northern Sami", + "see": "Seneca", + "seh": "Sena", + "sei": "Seri", + "sel": "Selkup", + "ses": "Koyraboro Senni", + "sg": "Sango", + "sga": "Old Irish", + "sgs": "Samogitian", + "sh": "Serbo-Croatian", + "shi": "Tachelhit", + "shn": "Shan", + "shu": "Chadian Arabic", + "si": "Sinhala", + "sid": "Sidamo", + "sk": "Slovak", + "sl": "Slovenian", + "sli": "Lower Silesian", + "sly": "Selayar", + "sm": "Samoan", + "sma": "Southern Sami", + "smj": "Lule Sami", + "smn": "Inari Sami", + "sms": "Skolt Sami", + "sn": "Shona", + "snk": "Soninke", + "so": "Somali", + "sog": "Sogdien", + "sq": "Albanian", + "sr": "Serbian", + "sr-ME": "Montenegrin", + "srn": "Sranan Tongo", + "srr": "Serer", + "ss": "Swati", + "ssy": "Saho", + "st": "Southern Sotho", + "stq": "Saterland Frisian", + "su": "Sundanese", + "suk": "Sukuma", + "sus": "Susu", + "sux": "Sumerian", + "sv": "Swedish", + "sw": "Swahili", + "sw-CD": "Congo Swahili", + "swb": "Comorian", + "syc": "Classical Syriac", + "syr": "Syriac", + "szl": "Silesian", + "ta": "Tamil", + "tcy": "Tulu", + "te": "Telugu", + "tem": "Timne", + "teo": "Teso", + "ter": "Tereno", + "tet": "Tetum", + "tg": "Tajik", + "th": "Thai", + "ti": "Tigrinya", + "tig": "Tigre", + "tiv": "Tiv", + "tk": "Turkmen", + "tkl": "Tokelau", + "tkr": "Tsakhur", + "tl": "Tagalog", + "tlh": "Klingon", + "tli": 
"Tlingit", + "tly": "Talysh", + "tmh": "Tamashek", + "tn": "Tswana", + "to": "Tongan", + "tog": "Nyasa Tonga", + "tpi": "Tok Pisin", + "tr": "Turkish", + "tru": "Turoyo", + "trv": "Taroko", + "ts": "Tsonga", + "tsd": "Tsakonian", + "tsi": "Tsimshian", + "tt": "Tatar", + "ttt": "Muslim Tat", + "tum": "Tumbuka", + "tvl": "Tuvalu", + "tw": "Twi", + "twq": "Tasawaq", + "ty": "Tahitian", + "tyv": "Tuvinian", + "tzm": "Central Atlas Tamazight", + "udm": "Udmurt", + "ug": "Uyghur", + "ug-alt-variant": "Uighur", + "uga": "Ugaritic", + "uk": "Ukrainian", + "umb": "Umbundu", + "und": "Unknown language", + "ur": "Urdu", + "uz": "Uzbek", + "vai": "Vai", + "ve": "Venda", + "vec": "Venetian", + "vep": "Veps", + "vi": "Vietnamese", + "vls": "West Flemish", + "vmf": "Main-Franconian", + "vo": "Volapük", + "vot": "Votic", + "vro": "Võro", + "vun": "Vunjo", + "wa": "Walloon", + "wae": "Walser", + "wal": "Wolaytta", + "war": "Waray", + "was": "Washo", + "wbp": "Warlpiri", + "wo": "Wolof", + "wuu": "Wu Chinese", + "xal": "Kalmyk", + "xh": "Xhosa", + "xmf": "Mingrelian", + "xog": "Soga", + "yao": "Yao", + "yap": "Yapese", + "yav": "Yangben", + "ybb": "Yemba", + "yi": "Yiddish", + "yo": "Yoruba", + "yrl": "Nheengatu", + "yue": "Cantonese", + "yue-alt-menu": "Chinese, Cantonese", + "za": "Zhuang", + "zap": "Zapotec", + "zbl": "Blissymbols", + "zea": "Zeelandic", + "zen": "Zenaga", + "zgh": "Standard Moroccan Tamazight", + "zh": "Chinese", + "zh-alt-long": "Mandarin Chinese", + "zh-alt-menu": "Chinese, Mandarin", + "zh-Hans": "Simplified Chinese", + "zh-Hans-alt-long": "Simplified Mandarin Chinese", + "zh-Hant": "Traditional Chinese", + "zh-Hant-alt-long": "Traditional Mandarin Chinese", + "zu": "Zulu", + "zun": "Zuni", + "zxx": "No linguistic content", + "zza": "Zaza" +} \ No newline at end of file From a9846fd58fc2c94625ac44bbc90fda1d7e59d501 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Fri, 23 Apr 2021 12:14:45 +0200 Subject: [PATCH 26/28] Update languages.json --- src/datasets/utils/metadata.py | 3 +- src/datasets/utils/resources/languages.json | 1009 +++++++++++-------- 2 files changed, 584 insertions(+), 428 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 312a7b090ac..44e7b6be338 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -28,7 +28,8 @@ def load_json_resource(resource: str) -> Tuple[Any, str]: # Source of languages.json: -# https://github.com/unicode-org/cldr-json/blob/9f6b2f0c5eb3aabaa97343cd1ee431a3badc4851/cldr-json/cldr-localenames-full/main/en/languages.json +# https://datahub.io/core/language-codes/r/ietf-language-tags.csv +# Language names were obtained with langcodes: https://github.com/LuminosoInsight/langcodes known_language_codes, known_language_codes_url = load_json_resource("languages.json") known_licenses, known_licenses_url = load_json_resource("licenses.json") known_task_ids, known_task_ids_url = load_json_resource("tasks.json") diff --git a/src/datasets/utils/resources/languages.json b/src/datasets/utils/resources/languages.json index 03e5d037af1..d0c77f66f11 100644 --- a/src/datasets/utils/resources/languages.json +++ b/src/datasets/utils/resources/languages.json @@ -1,637 +1,792 @@ { - "aa": "Afar", - "ab": "Abkhazian", - "ace": "Achinese", - "ach": "Acoli", - "ada": "Adangme", - "ady": "Adyghe", - "ae": "Avestan", - "aeb": "Tunisian Arabic", "af": "Afrikaans", - "afh": "Afrihili", + "af-NA": "Afrikaans (Namibia)", + "af-ZA": "Afrikaans (South Africa)", "agq": "Aghem", - "ain": "Ainu", + "agq-CM": 
"Aghem (Cameroon)", "ak": "Akan", - "akk": "Akkadian", - "akz": "Alabama", - "ale": "Aleut", - "aln": "Gheg Albanian", - "alt": "Southern Altai", + "ak-GH": "Akan (Ghana)", "am": "Amharic", - "an": "Aragonese", - "ang": "Old English", - "anp": "Angika", + "am-ET": "Amharic (Ethiopia)", "ar": "Arabic", - "ar-001": "Modern Standard Arabic", - "arc": "Aramaic", - "arn": "Mapuche", - "aro": "Araona", - "arp": "Arapaho", - "arq": "Algerian Arabic", - "ars": "Najdi Arabic", - "ars-alt-menu": "Arabic, Najdi", - "arw": "Arawak", - "ary": "Moroccan Arabic", - "arz": "Egyptian Arabic", + "ar-001": "Arabic (World)", + "ar-AE": "Arabic (United Arab Emirates)", + "ar-BH": "Arabic (Bahrain)", + "ar-DJ": "Arabic (Djibouti)", + "ar-DZ": "Arabic (Algeria)", + "ar-EG": "Arabic (Egypt)", + "ar-EH": "Arabic (Western Sahara)", + "ar-ER": "Arabic (Eritrea)", + "ar-IL": "Arabic (Israel)", + "ar-IQ": "Arabic (Iraq)", + "ar-JO": "Arabic (Jordan)", + "ar-KM": "Arabic (Comoros)", + "ar-KW": "Arabic (Kuwait)", + "ar-LB": "Arabic (Lebanon)", + "ar-LY": "Arabic (Libya)", + "ar-MA": "Arabic (Morocco)", + "ar-MR": "Arabic (Mauritania)", + "ar-OM": "Arabic (Oman)", + "ar-PS": "Arabic (Palestinian Territories)", + "ar-QA": "Arabic (Qatar)", + "ar-SA": "Arabic (Saudi Arabia)", + "ar-SD": "Arabic (Sudan)", + "ar-SO": "Arabic (Somalia)", + "ar-SS": "Arabic (South Sudan)", + "ar-SY": "Arabic (Syria)", + "ar-TD": "Arabic (Chad)", + "ar-TN": "Arabic (Tunisia)", + "ar-YE": "Arabic (Yemen)", "as": "Assamese", + "as-IN": "Assamese (India)", "asa": "Asu", - "ase": "American Sign Language", + "asa-TZ": "Asu (Tanzania)", "ast": "Asturian", - "av": "Avaric", - "avk": "Kotava", - "awa": "Awadhi", - "ay": "Aymara", + "ast-ES": "Asturian (Spain)", "az": "Azerbaijani", - "az-alt-short": "Azeri", - "ba": "Bashkir", - "bal": "Baluchi", - "ban": "Balinese", - "bar": "Bavarian", + "az-Cyrl": "Azerbaijani (Cyrillic)", + "az-Cyrl-AZ": "Azerbaijani (Cyrillic, Azerbaijan)", + "az-Latn": "Azerbaijani (Latin)", + "az-Latn-AZ": "Azerbaijani (Latin, Azerbaijan)", "bas": "Basaa", - "bax": "Bamun", - "bbc": "Batak Toba", - "bbj": "Ghomala", + "bas-CM": "Basaa (Cameroon)", "be": "Belarusian", - "bej": "Beja", + "be-BY": "Belarusian (Belarus)", "bem": "Bemba", - "bew": "Betawi", + "bem-ZM": "Bemba (Zambia)", "bez": "Bena", - "bfd": "Bafut", - "bfq": "Badaga", + "bez-TZ": "Bena (Tanzania)", "bg": "Bulgarian", - "bgn": "Western Balochi", - "bho": "Bhojpuri", - "bi": "Bislama", - "bik": "Bikol", - "bin": "Bini", - "bjn": "Banjar", - "bkm": "Kom", - "bla": "Siksika", + "bg-BG": "Bulgarian (Bulgaria)", "bm": "Bambara", + "bm-ML": "Bambara (Mali)", "bn": "Bangla", + "bn-BD": "Bangla (Bangladesh)", + "bn-IN": "Bangla (India)", "bo": "Tibetan", - "bpy": "Bishnupriya", - "bqi": "Bakhtiari", + "bo-CN": "Tibetan (China)", + "bo-IN": "Tibetan (India)", "br": "Breton", - "bra": "Braj", - "brh": "Brahui", + "br-FR": "Breton (France)", "brx": "Bodo", + "brx-IN": "Bodo (India)", "bs": "Bosnian", - "bss": "Akoose", - "bua": "Buriat", - "bug": "Buginese", - "bum": "Bulu", - "byn": "Blin", - "byv": "Medumba", + "bs-Cyrl": "Bosnian (Cyrillic)", + "bs-Cyrl-BA": "Bosnian (Cyrillic, Bosnia & Herzegovina)", + "bs-Latn": "Bosnian", + "bs-Latn-BA": "Bosnian (Bosnia & Herzegovina)", "ca": "Catalan", - "cad": "Caddo", - "car": "Carib", - "cay": "Cayuga", - "cch": "Atsam", + "ca-AD": "Catalan (Andorra)", + "ca-ES": "Catalan (Spain)", + "ca-ES-valencia": "Catalan (Spain)", + "ca-FR": "Catalan (France)", + "ca-IT": "Catalan (Italy)", "ccp": "Chakma", + "ccp-BD": "Chakma (Bangladesh)", 
+ "ccp-IN": "Chakma (India)", "ce": "Chechen", + "ce-RU": "Chechen (Russia)", "ceb": "Cebuano", + "ceb-PH": "Cebuano (Philippines)", "cgg": "Chiga", - "ch": "Chamorro", - "chb": "Chibcha", - "chg": "Chagatai", - "chk": "Chuukese", - "chm": "Mari", - "chn": "Chinook Jargon", - "cho": "Choctaw", - "chp": "Chipewyan", + "cgg-UG": "Chiga (Uganda)", "chr": "Cherokee", - "chy": "Cheyenne", - "cic": "Chickasaw", + "chr-US": "Cherokee (United States)", "ckb": "Central Kurdish", - "ckb-alt-menu": "Kurdish, Central", - "ckb-alt-variant": "Kurdish, Sorani", - "co": "Corsican", - "cop": "Coptic", - "cps": "Capiznon", - "cr": "Cree", - "crh": "Crimean Turkish", - "crs": "Seselwa Creole French", + "ckb-IQ": "Central Kurdish (Iraq)", + "ckb-IR": "Central Kurdish (Iran)", "cs": "Czech", - "csb": "Kashubian", + "cs-CZ": "Czech (Czechia)", "cu": "Church Slavic", - "cv": "Chuvash", + "cu-RU": "Church Slavic (Russia)", "cy": "Welsh", + "cy-GB": "Welsh (United Kingdom)", "da": "Danish", - "dak": "Dakota", - "dar": "Dargwa", + "da-DK": "Danish (Denmark)", + "da-GL": "Danish (Greenland)", "dav": "Taita", + "dav-KE": "Taita (Kenya)", "de": "German", - "de-AT": "Austrian German", - "de-CH": "Swiss High German", - "del": "Delaware", - "den": "Slave", - "dgr": "Dogrib", - "din": "Dinka", + "de-AT": "German (Austria)", + "de-BE": "German (Belgium)", + "de-CH": "German (Switzerland)", + "de-DE": "German (Germany)", + "de-IT": "German (Italy)", + "de-LI": "German (Liechtenstein)", + "de-LU": "German (Luxembourg)", "dje": "Zarma", - "doi": "Dogri", + "dje-NE": "Zarma (Niger)", "dsb": "Lower Sorbian", - "dtp": "Central Dusun", + "dsb-DE": "Lower Sorbian (Germany)", "dua": "Duala", - "dum": "Middle Dutch", - "dv": "Divehi", + "dua-CM": "Duala (Cameroon)", "dyo": "Jola-Fonyi", - "dyu": "Dyula", + "dyo-SN": "Jola-Fonyi (Senegal)", "dz": "Dzongkha", - "dzg": "Dazaga", + "dz-BT": "Dzongkha (Bhutan)", "ebu": "Embu", + "ebu-KE": "Embu (Kenya)", "ee": "Ewe", - "efi": "Efik", - "egl": "Emilian", - "egy": "Ancient Egyptian", - "eka": "Ekajuk", + "ee-GH": "Ewe (Ghana)", + "ee-TG": "Ewe (Togo)", "el": "Greek", - "elx": "Elamite", + "el-CY": "Greek (Cyprus)", + "el-GR": "Greek (Greece)", "en": "English", - "en-AU": "Australian English", - "en-CA": "Canadian English", - "en-GB": "British English", - "en-GB-alt-short": "UK English", - "en-US": "American English", - "en-US-alt-short": "US English", - "enm": "Middle English", + "en-001": "English (World)", + "en-150": "English (Europe)", + "en-AE": "English (United Arab Emirates)", + "en-AG": "English (Antigua & Barbuda)", + "en-AI": "English (Anguilla)", + "en-AS": "English (American Samoa)", + "en-AT": "English (Austria)", + "en-AU": "English (Australia)", + "en-BB": "English (Barbados)", + "en-BE": "English (Belgium)", + "en-BI": "English (Burundi)", + "en-BM": "English (Bermuda)", + "en-BS": "English (Bahamas)", + "en-BW": "English (Botswana)", + "en-BZ": "English (Belize)", + "en-CA": "English (Canada)", + "en-CC": "English (Cocos (Keeling) Islands)", + "en-CH": "English (Switzerland)", + "en-CK": "English (Cook Islands)", + "en-CM": "English (Cameroon)", + "en-CX": "English (Christmas Island)", + "en-CY": "English (Cyprus)", + "en-DE": "English (Germany)", + "en-DG": "English (Diego Garcia)", + "en-DK": "English (Denmark)", + "en-DM": "English (Dominica)", + "en-ER": "English (Eritrea)", + "en-FI": "English (Finland)", + "en-FJ": "English (Fiji)", + "en-FK": "English (Falkland Islands)", + "en-FM": "English (Micronesia)", + "en-GB": "English (United Kingdom)", + "en-GD": "English 
(Grenada)", + "en-GG": "English (Guernsey)", + "en-GH": "English (Ghana)", + "en-GI": "English (Gibraltar)", + "en-GM": "English (Gambia)", + "en-GU": "English (Guam)", + "en-GY": "English (Guyana)", + "en-HK": "English (Hong Kong SAR China)", + "en-IE": "English (Ireland)", + "en-IL": "English (Israel)", + "en-IM": "English (Isle of Man)", + "en-IN": "English (India)", + "en-IO": "English (British Indian Ocean Territory)", + "en-JE": "English (Jersey)", + "en-JM": "English (Jamaica)", + "en-KE": "English (Kenya)", + "en-KI": "English (Kiribati)", + "en-KN": "English (St. Kitts & Nevis)", + "en-KY": "English (Cayman Islands)", + "en-LC": "English (St. Lucia)", + "en-LR": "English (Liberia)", + "en-LS": "English (Lesotho)", + "en-MG": "English (Madagascar)", + "en-MH": "English (Marshall Islands)", + "en-MO": "English (Macao SAR China)", + "en-MP": "English (Northern Mariana Islands)", + "en-MS": "English (Montserrat)", + "en-MT": "English (Malta)", + "en-MU": "English (Mauritius)", + "en-MW": "English (Malawi)", + "en-MY": "English (Malaysia)", + "en-NA": "English (Namibia)", + "en-NF": "English (Norfolk Island)", + "en-NG": "English (Nigeria)", + "en-NL": "English (Netherlands)", + "en-NR": "English (Nauru)", + "en-NU": "English (Niue)", + "en-NZ": "English (New Zealand)", + "en-PG": "English (Papua New Guinea)", + "en-PH": "English (Philippines)", + "en-PK": "English (Pakistan)", + "en-PN": "English (Pitcairn Islands)", + "en-PR": "English (Puerto Rico)", + "en-PW": "English (Palau)", + "en-RW": "English (Rwanda)", + "en-SB": "English (Solomon Islands)", + "en-SC": "English (Seychelles)", + "en-SD": "English (Sudan)", + "en-SE": "English (Sweden)", + "en-SG": "English (Singapore)", + "en-SH": "English (St. Helena)", + "en-SI": "English (Slovenia)", + "en-SL": "English (Sierra Leone)", + "en-SS": "English (South Sudan)", + "en-SX": "English (Sint Maarten)", + "en-SZ": "English (Eswatini)", + "en-TC": "English (Turks & Caicos Islands)", + "en-TK": "English (Tokelau)", + "en-TO": "English (Tonga)", + "en-TT": "English (Trinidad & Tobago)", + "en-TV": "English (Tuvalu)", + "en-TZ": "English (Tanzania)", + "en-UG": "English (Uganda)", + "en-UM": "English (U.S. Outlying Islands)", + "en-US": "English (United States)", + "en-US-posix": "English (United States)", + "en-VC": "English (St. Vincent & Grenadines)", + "en-VG": "English (British Virgin Islands)", + "en-VI": "English (U.S. 
Virgin Islands)", + "en-VU": "English (Vanuatu)", + "en-WS": "English (Samoa)", + "en-ZA": "English (South Africa)", + "en-ZM": "English (Zambia)", + "en-ZW": "English (Zimbabwe)", "eo": "Esperanto", + "eo-001": "Esperanto (World)", "es": "Spanish", - "es-419": "Latin American Spanish", - "es-ES": "European Spanish", - "es-MX": "Mexican Spanish", - "esu": "Central Yupik", + "es-419": "Spanish (Latin America)", + "es-AR": "Spanish (Argentina)", + "es-BO": "Spanish (Bolivia)", + "es-BR": "Spanish (Brazil)", + "es-BZ": "Spanish (Belize)", + "es-CL": "Spanish (Chile)", + "es-CO": "Spanish (Colombia)", + "es-CR": "Spanish (Costa Rica)", + "es-CU": "Spanish (Cuba)", + "es-DO": "Spanish (Dominican Republic)", + "es-EA": "Spanish (Ceuta & Melilla)", + "es-EC": "Spanish (Ecuador)", + "es-ES": "Spanish (Spain)", + "es-GQ": "Spanish (Equatorial Guinea)", + "es-GT": "Spanish (Guatemala)", + "es-HN": "Spanish (Honduras)", + "es-IC": "Spanish (Canary Islands)", + "es-MX": "Spanish (Mexico)", + "es-NI": "Spanish (Nicaragua)", + "es-PA": "Spanish (Panama)", + "es-PE": "Spanish (Peru)", + "es-PH": "Spanish (Philippines)", + "es-PR": "Spanish (Puerto Rico)", + "es-PY": "Spanish (Paraguay)", + "es-SV": "Spanish (El Salvador)", + "es-US": "Spanish (United States)", + "es-UY": "Spanish (Uruguay)", + "es-VE": "Spanish (Venezuela)", "et": "Estonian", + "et-EE": "Estonian (Estonia)", "eu": "Basque", + "eu-ES": "Basque (Spain)", "ewo": "Ewondo", - "ext": "Extremaduran", + "ewo-CM": "Ewondo (Cameroon)", "fa": "Persian", - "fa-AF": "Dari", - "fan": "Fang", - "fat": "Fanti", + "fa-AF": "Persian (Afghanistan)", + "fa-IR": "Persian (Iran)", "ff": "Fulah", + "ff-Adlm": "Fulah (Adlam)", + "ff-Adlm-BF": "Fulah (Adlam, Burkina Faso)", + "ff-Adlm-CM": "Fulah (Adlam, Cameroon)", + "ff-Adlm-GH": "Fulah (Adlam, Ghana)", + "ff-Adlm-GM": "Fulah (Adlam, Gambia)", + "ff-Adlm-GN": "Fulah (Adlam, Guinea)", + "ff-Adlm-GW": "Fulah (Adlam, Guinea-Bissau)", + "ff-Adlm-LR": "Fulah (Adlam, Liberia)", + "ff-Adlm-MR": "Fulah (Adlam, Mauritania)", + "ff-Adlm-NE": "Fulah (Adlam, Niger)", + "ff-Adlm-NG": "Fulah (Adlam, Nigeria)", + "ff-Adlm-SL": "Fulah (Adlam, Sierra Leone)", + "ff-Adlm-SN": "Fulah (Adlam, Senegal)", + "ff-Latn": "Fulah (Latin)", + "ff-Latn-BF": "Fulah (Latin, Burkina Faso)", + "ff-Latn-CM": "Fulah (Latin, Cameroon)", + "ff-Latn-GH": "Fulah (Latin, Ghana)", + "ff-Latn-GM": "Fulah (Latin, Gambia)", + "ff-Latn-GN": "Fulah (Latin, Guinea)", + "ff-Latn-GW": "Fulah (Latin, Guinea-Bissau)", + "ff-Latn-LR": "Fulah (Latin, Liberia)", + "ff-Latn-MR": "Fulah (Latin, Mauritania)", + "ff-Latn-NE": "Fulah (Latin, Niger)", + "ff-Latn-NG": "Fulah (Latin, Nigeria)", + "ff-Latn-SL": "Fulah (Latin, Sierra Leone)", + "ff-Latn-SN": "Fulah (Latin, Senegal)", "fi": "Finnish", + "fi-FI": "Finnish (Finland)", "fil": "Filipino", - "fit": "Tornedalen Finnish", - "fj": "Fijian", + "fil-PH": "Filipino (Philippines)", "fo": "Faroese", - "fon": "Fon", + "fo-DK": "Faroese (Denmark)", + "fo-FO": "Faroese (Faroe Islands)", "fr": "French", - "fr-CA": "Canadian French", - "fr-CH": "Swiss French", - "frc": "Cajun French", - "frm": "Middle French", - "fro": "Old French", - "frp": "Arpitan", - "frr": "Northern Frisian", - "frs": "Eastern Frisian", + "fr-BE": "French (Belgium)", + "fr-BF": "French (Burkina Faso)", + "fr-BI": "French (Burundi)", + "fr-BJ": "French (Benin)", + "fr-BL": "French (St. 
Barth\u00e9lemy)", + "fr-CA": "French (Canada)", + "fr-CD": "French (Congo - Kinshasa)", + "fr-CF": "French (Central African Republic)", + "fr-CG": "French (Congo - Brazzaville)", + "fr-CH": "French (Switzerland)", + "fr-CI": "French (C\u00f4te d\u2019Ivoire)", + "fr-CM": "French (Cameroon)", + "fr-DJ": "French (Djibouti)", + "fr-DZ": "French (Algeria)", + "fr-FR": "French (France)", + "fr-GA": "French (Gabon)", + "fr-GF": "French (French Guiana)", + "fr-GN": "French (Guinea)", + "fr-GP": "French (Guadeloupe)", + "fr-GQ": "French (Equatorial Guinea)", + "fr-HT": "French (Haiti)", + "fr-KM": "French (Comoros)", + "fr-LU": "French (Luxembourg)", + "fr-MA": "French (Morocco)", + "fr-MC": "French (Monaco)", + "fr-MF": "French (St. Martin)", + "fr-MG": "French (Madagascar)", + "fr-ML": "French (Mali)", + "fr-MQ": "French (Martinique)", + "fr-MR": "French (Mauritania)", + "fr-MU": "French (Mauritius)", + "fr-NC": "French (New Caledonia)", + "fr-NE": "French (Niger)", + "fr-PF": "French (French Polynesia)", + "fr-PM": "French (St. Pierre & Miquelon)", + "fr-RE": "French (R\u00e9union)", + "fr-RW": "French (Rwanda)", + "fr-SC": "French (Seychelles)", + "fr-SN": "French (Senegal)", + "fr-SY": "French (Syria)", + "fr-TD": "French (Chad)", + "fr-TG": "French (Togo)", + "fr-TN": "French (Tunisia)", + "fr-VU": "French (Vanuatu)", + "fr-WF": "French (Wallis & Futuna)", + "fr-YT": "French (Mayotte)", "fur": "Friulian", + "fur-IT": "Friulian (Italy)", "fy": "Western Frisian", + "fy-NL": "Western Frisian (Netherlands)", "ga": "Irish", - "gaa": "Ga", - "gag": "Gagauz", - "gan": "Gan Chinese", - "gay": "Gayo", - "gba": "Gbaya", - "gbz": "Zoroastrian Dari", + "ga-GB": "Irish (United Kingdom)", + "ga-IE": "Irish (Ireland)", "gd": "Scottish Gaelic", - "gez": "Geez", - "gil": "Gilbertese", + "gd-GB": "Scottish Gaelic (United Kingdom)", "gl": "Galician", - "glk": "Gilaki", - "gmh": "Middle High German", - "gn": "Guarani", - "goh": "Old High German", - "gom": "Goan Konkani", - "gon": "Gondi", - "gor": "Gorontalo", - "got": "Gothic", - "grb": "Grebo", - "grc": "Ancient Greek", + "gl-ES": "Galician (Spain)", "gsw": "Swiss German", + "gsw-CH": "Swiss German (Switzerland)", + "gsw-FR": "Swiss German (France)", + "gsw-LI": "Swiss German (Liechtenstein)", "gu": "Gujarati", - "guc": "Wayuu", - "gur": "Frafra", + "gu-IN": "Gujarati (India)", "guz": "Gusii", + "guz-KE": "Gusii (Kenya)", "gv": "Manx", - "gwi": "Gwichʼin", + "gv-IM": "Manx (Isle of Man)", "ha": "Hausa", - "hai": "Haida", - "hak": "Hakka Chinese", + "ha-GH": "Hausa (Ghana)", + "ha-NE": "Hausa (Niger)", + "ha-NG": "Hausa (Nigeria)", "haw": "Hawaiian", + "haw-US": "Hawaiian (United States)", "he": "Hebrew", + "he-IL": "Hebrew (Israel)", "hi": "Hindi", - "hif": "Fiji Hindi", - "hil": "Hiligaynon", - "hit": "Hittite", - "hmn": "Hmong", - "ho": "Hiri Motu", + "hi-IN": "Hindi (India)", "hr": "Croatian", + "hr-BA": "Croatian (Bosnia & Herzegovina)", + "hr-HR": "Croatian (Croatia)", "hsb": "Upper Sorbian", - "hsn": "Xiang Chinese", - "ht": "Haitian Creole", + "hsb-DE": "Upper Sorbian (Germany)", "hu": "Hungarian", - "hup": "Hupa", + "hu-HU": "Hungarian (Hungary)", "hy": "Armenian", - "hz": "Herero", + "hy-AM": "Armenian (Armenia)", "ia": "Interlingua", - "iba": "Iban", - "ibb": "Ibibio", + "ia-001": "Interlingua (World)", "id": "Indonesian", - "ie": "Interlingue", + "id-ID": "Indonesian (Indonesia)", "ig": "Igbo", + "ig-NG": "Igbo (Nigeria)", "ii": "Sichuan Yi", - "ik": "Inupiaq", - "ilo": "Iloko", - "inh": "Ingush", - "io": "Ido", + "ii-CN": "Sichuan Yi (China)", 
"is": "Icelandic", + "is-IS": "Icelandic (Iceland)", "it": "Italian", - "iu": "Inuktitut", - "izh": "Ingrian", + "it-CH": "Italian (Switzerland)", + "it-IT": "Italian (Italy)", + "it-SM": "Italian (San Marino)", + "it-VA": "Italian (Vatican City)", "ja": "Japanese", - "jam": "Jamaican Creole English", - "jbo": "Lojban", + "ja-JP": "Japanese (Japan)", "jgo": "Ngomba", + "jgo-CM": "Ngomba (Cameroon)", "jmc": "Machame", - "jpr": "Judeo-Persian", - "jrb": "Judeo-Arabic", - "jut": "Jutish", + "jmc-TZ": "Machame (Tanzania)", "jv": "Javanese", + "jv-ID": "Javanese (Indonesia)", "ka": "Georgian", - "kaa": "Kara-Kalpak", + "ka-GE": "Georgian (Georgia)", "kab": "Kabyle", - "kac": "Kachin", - "kaj": "Jju", + "kab-DZ": "Kabyle (Algeria)", "kam": "Kamba", - "kaw": "Kawi", - "kbd": "Kabardian", - "kbl": "Kanembu", - "kcg": "Tyap", + "kam-KE": "Kamba (Kenya)", "kde": "Makonde", + "kde-TZ": "Makonde (Tanzania)", "kea": "Kabuverdianu", - "ken": "Kenyang", - "kfo": "Koro", - "kg": "Kongo", - "kgp": "Kaingang", - "kha": "Khasi", - "kho": "Khotanese", + "kea-CV": "Kabuverdianu (Cape Verde)", "khq": "Koyra Chiini", - "khw": "Khowar", + "khq-ML": "Koyra Chiini (Mali)", "ki": "Kikuyu", - "kiu": "Kirmanjki", - "kj": "Kuanyama", + "ki-KE": "Kikuyu (Kenya)", "kk": "Kazakh", + "kk-KZ": "Kazakh (Kazakhstan)", "kkj": "Kako", + "kkj-CM": "Kako (Cameroon)", "kl": "Kalaallisut", + "kl-GL": "Kalaallisut (Greenland)", "kln": "Kalenjin", + "kln-KE": "Kalenjin (Kenya)", "km": "Khmer", - "kmb": "Kimbundu", + "km-KH": "Khmer (Cambodia)", "kn": "Kannada", + "kn-IN": "Kannada (India)", "ko": "Korean", - "koi": "Komi-Permyak", + "ko-KP": "Korean (North Korea)", + "ko-KR": "Korean (South Korea)", "kok": "Konkani", - "kos": "Kosraean", - "kpe": "Kpelle", - "kr": "Kanuri", - "krc": "Karachay-Balkar", - "kri": "Krio", - "krj": "Kinaray-a", - "krl": "Karelian", - "kru": "Kurukh", + "kok-IN": "Konkani (India)", "ks": "Kashmiri", + "ks-Arab": "Kashmiri (Arabic)", + "ks-IN": "Kashmiri (India)", "ksb": "Shambala", + "ksb-TZ": "Shambala (Tanzania)", "ksf": "Bafia", + "ksf-CM": "Bafia (Cameroon)", "ksh": "Colognian", + "ksh-DE": "Colognian (Germany)", "ku": "Kurdish", - "kum": "Kumyk", - "kut": "Kutenai", - "kv": "Komi", + "ku-TR": "Kurdish (Turkey)", "kw": "Cornish", + "kw-GB": "Cornish (United Kingdom)", "ky": "Kyrgyz", - "ky-alt-variant": "Kirghiz", - "la": "Latin", - "lad": "Ladino", + "ky-KG": "Kyrgyz (Kyrgyzstan)", "lag": "Langi", - "lah": "Lahnda", - "lam": "Lamba", + "lag-TZ": "Langi (Tanzania)", "lb": "Luxembourgish", - "lez": "Lezghian", - "lfn": "Lingua Franca Nova", + "lb-LU": "Luxembourgish (Luxembourg)", "lg": "Ganda", - "li": "Limburgish", - "lij": "Ligurian", - "liv": "Livonian", + "lg-UG": "Ganda (Uganda)", "lkt": "Lakota", - "lmo": "Lombard", + "lkt-US": "Lakota (United States)", "ln": "Lingala", + "ln-AO": "Lingala (Angola)", + "ln-CD": "Lingala (Congo - Kinshasa)", + "ln-CF": "Lingala (Central African Republic)", + "ln-CG": "Lingala (Congo - Brazzaville)", "lo": "Lao", - "lol": "Mongo", - "lou": "Louisiana Creole", - "loz": "Lozi", + "lo-LA": "Lao (Laos)", "lrc": "Northern Luri", + "lrc-IQ": "Northern Luri (Iraq)", + "lrc-IR": "Northern Luri (Iran)", "lt": "Lithuanian", - "ltg": "Latgalian", + "lt-LT": "Lithuanian (Lithuania)", "lu": "Luba-Katanga", - "lua": "Luba-Lulua", - "lui": "Luiseno", - "lun": "Lunda", - "luo": "Luo", - "lus": "Mizo", + "lu-CD": "Luba-Katanga (Congo - Kinshasa)", + "luo": "Luo (Kenya and Tanzania)", + "luo-KE": "Luo (Kenya and Tanzania) (Kenya)", "luy": "Luyia", + "luy-KE": "Luyia (Kenya)", "lv": 
"Latvian", - "lzh": "Literary Chinese", - "lzz": "Laz", - "mad": "Madurese", - "maf": "Mafa", - "mag": "Magahi", + "lv-LV": "Latvian (Latvia)", "mai": "Maithili", - "mak": "Makasar", - "man": "Mandingo", + "mai-IN": "Maithili (India)", "mas": "Masai", - "mde": "Maba", - "mdf": "Moksha", - "mdr": "Mandar", - "men": "Mende", + "mas-KE": "Masai (Kenya)", + "mas-TZ": "Masai (Tanzania)", "mer": "Meru", + "mer-KE": "Meru (Kenya)", "mfe": "Morisyen", + "mfe-MU": "Morisyen (Mauritius)", "mg": "Malagasy", - "mga": "Middle Irish", + "mg-MG": "Malagasy (Madagascar)", "mgh": "Makhuwa-Meetto", - "mgo": "Metaʼ", - "mh": "Marshallese", + "mgh-MZ": "Makhuwa-Meetto (Mozambique)", + "mgo": "Meta\u02bc", + "mgo-CM": "Meta\u02bc (Cameroon)", "mi": "Maori", - "mic": "Mi'kmaq", - "min": "Minangkabau", + "mi-NZ": "Maori (New Zealand)", "mk": "Macedonian", + "mk-MK": "Macedonian (North Macedonia)", "ml": "Malayalam", + "ml-IN": "Malayalam (India)", "mn": "Mongolian", - "mnc": "Manchu", + "mn-MN": "Mongolian (Mongolia)", "mni": "Manipuri", - "moh": "Mohawk", - "mos": "Mossi", + "mni-Beng": "Manipuri (Bangla)", + "mni-Beng-IN": "Manipuri (Bangla, India)", "mr": "Marathi", - "mrj": "Western Mari", + "mr-IN": "Marathi (India)", "ms": "Malay", + "ms-BN": "Malay (Brunei)", + "ms-ID": "Malay (Indonesia)", + "ms-MY": "Malay (Malaysia)", + "ms-SG": "Malay (Singapore)", "mt": "Maltese", + "mt-MT": "Maltese (Malta)", "mua": "Mundang", - "mul": "Multiple languages", - "mus": "Muscogee", - "mwl": "Mirandese", - "mwr": "Marwari", - "mwv": "Mentawai", + "mua-CM": "Mundang (Cameroon)", "my": "Burmese", - "my-alt-variant": "Myanmar Language", - "mye": "Myene", - "myv": "Erzya", + "my-MM": "Burmese (Myanmar (Burma))", "mzn": "Mazanderani", - "na": "Nauru", - "nan": "Min Nan Chinese", - "nap": "Neapolitan", + "mzn-IR": "Mazanderani (Iran)", "naq": "Nama", - "nb": "Norwegian Bokmål", + "naq-NA": "Nama (Namibia)", + "nb": "Norwegian Bokm\u00e5l", + "nb-NO": "Norwegian Bokm\u00e5l (Norway)", + "nb-SJ": "Norwegian Bokm\u00e5l (Svalbard & Jan Mayen)", "nd": "North Ndebele", + "nd-ZW": "North Ndebele (Zimbabwe)", "nds": "Low German", - "nds-NL": "Low Saxon", + "nds-DE": "Low German (Germany)", + "nds-NL": "Low German (Netherlands)", "ne": "Nepali", - "new": "Newari", - "ng": "Ndonga", - "nia": "Nias", - "niu": "Niuean", - "njo": "Ao Naga", + "ne-IN": "Nepali (India)", + "ne-NP": "Nepali (Nepal)", "nl": "Dutch", - "nl-BE": "Flemish", + "nl-AW": "Dutch (Aruba)", + "nl-BE": "Dutch (Belgium)", + "nl-BQ": "Dutch (Caribbean Netherlands)", + "nl-CW": "Dutch (Cura\u00e7ao)", + "nl-NL": "Dutch (Netherlands)", + "nl-SR": "Dutch (Suriname)", + "nl-SX": "Dutch (Sint Maarten)", "nmg": "Kwasio", + "nmg-CM": "Kwasio (Cameroon)", "nn": "Norwegian Nynorsk", + "nn-NO": "Norwegian Nynorsk (Norway)", "nnh": "Ngiemboon", - "no": "Norwegian", - "nog": "Nogai", - "non": "Old Norse", - "nov": "Novial", - "nqo": "N’Ko", - "nr": "South Ndebele", - "nso": "Northern Sotho", + "nnh-CM": "Ngiemboon (Cameroon)", "nus": "Nuer", - "nv": "Navajo", - "nwc": "Classical Newari", - "ny": "Nyanja", - "nym": "Nyamwezi", + "nus-SS": "Nuer (South Sudan)", "nyn": "Nyankole", - "nyo": "Nyoro", - "nzi": "Nzima", - "oc": "Occitan", - "oj": "Ojibwa", + "nyn-UG": "Nyankole (Uganda)", "om": "Oromo", + "om-ET": "Oromo (Ethiopia)", + "om-KE": "Oromo (Kenya)", "or": "Odia", + "or-IN": "Odia (India)", "os": "Ossetic", - "osa": "Osage", - "ota": "Ottoman Turkish", + "os-GE": "Ossetic (Georgia)", + "os-RU": "Ossetic (Russia)", "pa": "Punjabi", - "pag": "Pangasinan", - "pal": "Pahlavi", - 
"pam": "Pampanga", - "pap": "Papiamento", - "pau": "Palauan", - "pcd": "Picard", + "pa-Arab": "Punjabi (Arabic)", + "pa-Arab-PK": "Punjabi (Arabic, Pakistan)", + "pa-Guru": "Punjabi", + "pa-Guru-IN": "Punjabi (India)", "pcm": "Nigerian Pidgin", - "pdc": "Pennsylvania German", - "pdt": "Plautdietsch", - "peo": "Old Persian", - "pfl": "Palatine German", - "phn": "Phoenician", - "pi": "Pali", + "pcm-NG": "Nigerian Pidgin (Nigeria)", "pl": "Polish", - "pms": "Piedmontese", - "pnt": "Pontic", - "pon": "Pohnpeian", + "pl-PL": "Polish (Poland)", "prg": "Prussian", - "pro": "Old Provençal", + "prg-001": "Prussian (World)", "ps": "Pashto", - "ps-alt-variant": "Pushto", + "ps-AF": "Pashto (Afghanistan)", + "ps-PK": "Pashto (Pakistan)", "pt": "Portuguese", - "pt-BR": "Brazilian Portuguese", - "pt-PT": "European Portuguese", + "pt-AO": "Portuguese (Angola)", + "pt-BR": "Portuguese (Brazil)", + "pt-CH": "Portuguese (Switzerland)", + "pt-CV": "Portuguese (Cape Verde)", + "pt-GQ": "Portuguese (Equatorial Guinea)", + "pt-GW": "Portuguese (Guinea-Bissau)", + "pt-LU": "Portuguese (Luxembourg)", + "pt-MO": "Portuguese (Macao SAR China)", + "pt-MZ": "Portuguese (Mozambique)", + "pt-PT": "Portuguese (Portugal)", + "pt-ST": "Portuguese (S\u00e3o Tom\u00e9 & Pr\u00edncipe)", + "pt-TL": "Portuguese (Timor-Leste)", "qu": "Quechua", - "quc": "Kʼicheʼ", - "qug": "Chimborazo Highland Quichua", - "raj": "Rajasthani", - "rap": "Rapanui", - "rar": "Rarotongan", - "rgn": "Romagnol", - "rif": "Riffian", + "qu-BO": "Quechua (Bolivia)", + "qu-EC": "Quechua (Ecuador)", + "qu-PE": "Quechua (Peru)", "rm": "Romansh", + "rm-CH": "Romansh (Switzerland)", "rn": "Rundi", + "rn-BI": "Rundi (Burundi)", "ro": "Romanian", - "ro-MD": "Moldavian", + "ro-MD": "Romanian (Moldova)", + "ro-RO": "Romanian (Romania)", "rof": "Rombo", - "rom": "Romany", - "root": "Root", - "rtm": "Rotuman", + "rof-TZ": "Rombo (Tanzania)", + "und": "Unknown language", "ru": "Russian", - "rue": "Rusyn", - "rug": "Roviana", - "rup": "Aromanian", + "ru-BY": "Russian (Belarus)", + "ru-KG": "Russian (Kyrgyzstan)", + "ru-KZ": "Russian (Kazakhstan)", + "ru-MD": "Russian (Moldova)", + "ru-RU": "Russian (Russia)", + "ru-UA": "Russian (Ukraine)", "rw": "Kinyarwanda", + "rw-RW": "Kinyarwanda (Rwanda)", "rwk": "Rwa", - "sa": "Sanskrit", - "sad": "Sandawe", + "rwk-TZ": "Rwa (Tanzania)", "sah": "Sakha", - "sam": "Samaritan Aramaic", + "sah-RU": "Sakha (Russia)", "saq": "Samburu", - "sas": "Sasak", + "saq-KE": "Samburu (Kenya)", "sat": "Santali", - "saz": "Saurashtra", - "sba": "Ngambay", + "sat-Olck": "Santali (Ol Chiki)", + "sat-Olck-IN": "Santali (Ol Chiki, India)", "sbp": "Sangu", - "sc": "Sardinian", - "scn": "Sicilian", - "sco": "Scots", + "sbp-TZ": "Sangu (Tanzania)", "sd": "Sindhi", - "sdc": "Sassarese Sardinian", - "sdh": "Southern Kurdish", + "sd-Arab": "Sindhi (Arabic)", + "sd-Arab-PK": "Sindhi (Arabic, Pakistan)", + "sd-Deva": "Sindhi (Devanagari)", + "sd-Deva-IN": "Sindhi (Devanagari, India)", "se": "Northern Sami", - "see": "Seneca", + "se-FI": "Northern Sami (Finland)", + "se-NO": "Northern Sami (Norway)", + "se-SE": "Northern Sami (Sweden)", "seh": "Sena", - "sei": "Seri", - "sel": "Selkup", + "seh-MZ": "Sena (Mozambique)", "ses": "Koyraboro Senni", + "ses-ML": "Koyraboro Senni (Mali)", "sg": "Sango", - "sga": "Old Irish", - "sgs": "Samogitian", - "sh": "Serbo-Croatian", + "sg-CF": "Sango (Central African Republic)", "shi": "Tachelhit", - "shn": "Shan", - "shu": "Chadian Arabic", + "shi-Latn": "Tachelhit (Latin)", + "shi-Latn-MA": "Tachelhit (Latin, Morocco)", 
+ "shi-Tfng": "Tachelhit (Tifinagh)", + "shi-Tfng-MA": "Tachelhit (Tifinagh, Morocco)", "si": "Sinhala", - "sid": "Sidamo", + "si-LK": "Sinhala (Sri Lanka)", "sk": "Slovak", + "sk-SK": "Slovak (Slovakia)", "sl": "Slovenian", - "sli": "Lower Silesian", - "sly": "Selayar", - "sm": "Samoan", - "sma": "Southern Sami", - "smj": "Lule Sami", + "sl-SI": "Slovenian (Slovenia)", "smn": "Inari Sami", - "sms": "Skolt Sami", + "smn-FI": "Inari Sami (Finland)", "sn": "Shona", - "snk": "Soninke", + "sn-ZW": "Shona (Zimbabwe)", "so": "Somali", - "sog": "Sogdien", + "so-DJ": "Somali (Djibouti)", + "so-ET": "Somali (Ethiopia)", + "so-KE": "Somali (Kenya)", + "so-SO": "Somali (Somalia)", "sq": "Albanian", + "sq-AL": "Albanian (Albania)", + "sq-MK": "Albanian (North Macedonia)", + "sq-XK": "Albanian (Kosovo)", "sr": "Serbian", - "sr-ME": "Montenegrin", - "srn": "Sranan Tongo", - "srr": "Serer", - "ss": "Swati", - "ssy": "Saho", - "st": "Southern Sotho", - "stq": "Saterland Frisian", + "sr-Cyrl": "Serbian (Cyrillic)", + "sr-Cyrl-BA": "Serbian (Cyrillic, Bosnia & Herzegovina)", + "sr-Cyrl-ME": "Serbian (Cyrillic, Montenegro)", + "sr-Cyrl-RS": "Serbian (Cyrillic, Serbia)", + "sr-Cyrl-XK": "Serbian (Cyrillic, Kosovo)", + "sr-Latn": "Serbian (Latin)", + "sr-Latn-BA": "Serbian (Latin, Bosnia & Herzegovina)", + "sr-Latn-ME": "Serbian (Latin, Montenegro)", + "sr-Latn-RS": "Serbian (Latin, Serbia)", + "sr-Latn-XK": "Serbian (Latin, Kosovo)", "su": "Sundanese", - "suk": "Sukuma", - "sus": "Susu", - "sux": "Sumerian", + "su-Latn": "Sundanese (Latin)", + "su-Latn-ID": "Sundanese (Latin, Indonesia)", "sv": "Swedish", + "sv-AX": "Swedish (\u00c5land Islands)", + "sv-FI": "Swedish (Finland)", + "sv-SE": "Swedish (Sweden)", "sw": "Swahili", - "sw-CD": "Congo Swahili", - "swb": "Comorian", - "syc": "Classical Syriac", - "syr": "Syriac", - "szl": "Silesian", + "sw-CD": "Swahili (Congo - Kinshasa)", + "sw-KE": "Swahili (Kenya)", + "sw-TZ": "Swahili (Tanzania)", + "sw-UG": "Swahili (Uganda)", "ta": "Tamil", - "tcy": "Tulu", + "ta-IN": "Tamil (India)", + "ta-LK": "Tamil (Sri Lanka)", + "ta-MY": "Tamil (Malaysia)", + "ta-SG": "Tamil (Singapore)", "te": "Telugu", - "tem": "Timne", + "te-IN": "Telugu (India)", "teo": "Teso", - "ter": "Tereno", - "tet": "Tetum", + "teo-KE": "Teso (Kenya)", + "teo-UG": "Teso (Uganda)", "tg": "Tajik", + "tg-TJ": "Tajik (Tajikistan)", "th": "Thai", + "th-TH": "Thai (Thailand)", "ti": "Tigrinya", - "tig": "Tigre", - "tiv": "Tiv", + "ti-ER": "Tigrinya (Eritrea)", + "ti-ET": "Tigrinya (Ethiopia)", "tk": "Turkmen", - "tkl": "Tokelau", - "tkr": "Tsakhur", - "tl": "Tagalog", - "tlh": "Klingon", - "tli": "Tlingit", - "tly": "Talysh", - "tmh": "Tamashek", - "tn": "Tswana", + "tk-TM": "Turkmen (Turkmenistan)", "to": "Tongan", - "tog": "Nyasa Tonga", - "tpi": "Tok Pisin", + "to-TO": "Tongan (Tonga)", "tr": "Turkish", - "tru": "Turoyo", - "trv": "Taroko", - "ts": "Tsonga", - "tsd": "Tsakonian", - "tsi": "Tsimshian", + "tr-CY": "Turkish (Cyprus)", + "tr-TR": "Turkish (Turkey)", "tt": "Tatar", - "ttt": "Muslim Tat", - "tum": "Tumbuka", - "tvl": "Tuvalu", - "tw": "Twi", + "tt-RU": "Tatar (Russia)", "twq": "Tasawaq", - "ty": "Tahitian", - "tyv": "Tuvinian", + "twq-NE": "Tasawaq (Niger)", "tzm": "Central Atlas Tamazight", - "udm": "Udmurt", + "tzm-MA": "Central Atlas Tamazight (Morocco)", "ug": "Uyghur", - "ug-alt-variant": "Uighur", - "uga": "Ugaritic", + "ug-CN": "Uyghur (China)", "uk": "Ukrainian", - "umb": "Umbundu", - "und": "Unknown language", + "uk-UA": "Ukrainian (Ukraine)", "ur": "Urdu", + "ur-IN": "Urdu 
(India)", + "ur-PK": "Urdu (Pakistan)", "uz": "Uzbek", + "uz-Arab": "Uzbek (Arabic)", + "uz-Arab-AF": "Uzbek (Arabic, Afghanistan)", + "uz-Cyrl": "Uzbek (Cyrillic)", + "uz-Cyrl-UZ": "Uzbek (Cyrillic, Uzbekistan)", + "uz-Latn": "Uzbek (Latin)", + "uz-Latn-UZ": "Uzbek (Latin, Uzbekistan)", "vai": "Vai", - "ve": "Venda", - "vec": "Venetian", - "vep": "Veps", + "vai-Latn": "Vai (Latin)", + "vai-Latn-LR": "Vai (Latin, Liberia)", + "vai-Vaii": "Vai (Vai)", + "vai-Vaii-LR": "Vai (Vai, Liberia)", "vi": "Vietnamese", - "vls": "West Flemish", - "vmf": "Main-Franconian", - "vo": "Volapük", - "vot": "Votic", - "vro": "Võro", + "vi-VN": "Vietnamese (Vietnam)", + "vo": "Volap\u00fck", + "vo-001": "Volap\u00fck (World)", "vun": "Vunjo", - "wa": "Walloon", + "vun-TZ": "Vunjo (Tanzania)", "wae": "Walser", - "wal": "Wolaytta", - "war": "Waray", - "was": "Washo", - "wbp": "Warlpiri", + "wae-CH": "Walser (Switzerland)", "wo": "Wolof", - "wuu": "Wu Chinese", - "xal": "Kalmyk", + "wo-SN": "Wolof (Senegal)", "xh": "Xhosa", - "xmf": "Mingrelian", + "xh-ZA": "Xhosa (South Africa)", "xog": "Soga", - "yao": "Yao", - "yap": "Yapese", + "xog-UG": "Soga (Uganda)", "yav": "Yangben", - "ybb": "Yemba", + "yav-CM": "Yangben (Cameroon)", "yi": "Yiddish", + "yi-001": "Yiddish (World)", "yo": "Yoruba", - "yrl": "Nheengatu", + "yo-BJ": "Yoruba (Benin)", + "yo-NG": "Yoruba (Nigeria)", "yue": "Cantonese", - "yue-alt-menu": "Chinese, Cantonese", - "za": "Zhuang", - "zap": "Zapotec", - "zbl": "Blissymbols", - "zea": "Zeelandic", - "zen": "Zenaga", + "yue-Hans": "Cantonese (Simplified)", + "yue-Hans-CN": "Cantonese (Simplified, China)", + "yue-Hant": "Cantonese (Traditional)", + "yue-Hant-HK": "Cantonese (Traditional, Hong Kong SAR China)", "zgh": "Standard Moroccan Tamazight", + "zgh-MA": "Standard Moroccan Tamazight (Morocco)", "zh": "Chinese", - "zh-alt-long": "Mandarin Chinese", - "zh-alt-menu": "Chinese, Mandarin", - "zh-Hans": "Simplified Chinese", - "zh-Hans-alt-long": "Simplified Mandarin Chinese", - "zh-Hant": "Traditional Chinese", - "zh-Hant-alt-long": "Traditional Mandarin Chinese", + "zh-Hans": "Chinese (Simplified)", + "zh-Hans-CN": "Chinese (Simplified, China)", + "zh-Hans-HK": "Chinese (Simplified, Hong Kong SAR China)", + "zh-Hans-MO": "Chinese (Simplified, Macao SAR China)", + "zh-Hans-SG": "Chinese (Simplified, Singapore)", + "zh-Hant": "Chinese (Traditional)", + "zh-Hant-HK": "Chinese (Traditional, Hong Kong SAR China)", + "zh-Hant-MO": "Chinese (Traditional, Macao SAR China)", + "zh-Hant-TW": "Chinese (Traditional, Taiwan)", "zu": "Zulu", - "zun": "Zuni", - "zxx": "No linguistic content", - "zza": "Zaza" + "zu-ZA": "Zulu (South Africa)" } \ No newline at end of file From 551ae960084f94a7aa360c19c9812b1c2c922178 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Fri, 23 Apr 2021 13:21:48 +0200 Subject: [PATCH 27/28] Refactor, add tests --- src/datasets/utils/metadata.py | 56 +++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 44e7b6be338..e94fe6abe0f 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -38,18 +38,26 @@ def load_json_resource(resource: str) -> Tuple[Any, str]: known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json") -def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]: - """"Loads a dataset's metadata from the dataset card (REAMDE.md), as a Python dict""" +def yaml_block_from_readme(path: 
Path) -> Optional[str]:
     with path.open() as readme_file:
         content = [line.strip() for line in readme_file]
 
     if content[0] == "---" and "---" in content[1:]:
         yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
-        metada_dict = yaml.safe_load(yamlblock) or dict()
-        return metada_dict
+        return yamlblock
 
+    return None
 
 
+def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:
+    """Loads a dataset's metadata from the dataset card (README.md), as a Python dict"""
+    yaml_block = yaml_block_from_readme(path=path)
+    if yaml_block is None:
+        return None
+    metadata_dict = yaml.safe_load(yaml_block) or dict()
+    return metadata_dict
+
+
 ValidatorOutput = Tuple[List[str], Optional[str]]
 
 
@@ -74,6 +82,16 @@ def escape_validation_for_predicate(
     return trues, falses
 
 
+def validate_metadata_type(metadata_dict: dict):
+    basic_typing_errors = {
+        name: value
+        for name, value in metadata_dict.items()
+        if not isinstance(value, list) or len(value) == 0 or not isinstance(value[0], str)
+    }
+    if len(basic_typing_errors) > 0:
+        raise TypeError(f"Found fields that are not non-empty lists of strings: {basic_typing_errors}")
+
+
 @dataclass
 class DatasetMetadata:
     annotations_creators: List[str]
@@ -87,21 +105,13 @@ class DatasetMetadata:
     task_ids: List[str]
 
     def __post_init__(self):
-        basic_typing_errors = {
-            name: value
-            for name, value in vars(self).items()
-            if not isinstance(value, list) or len(value) == 0 or not isinstance(value[0], str)
-        }
-        if len(basic_typing_errors) > 0:
-            raise TypeError(f"Found fields that are not non-empty list of strings: {basic_typing_errors}")
+        validate_metadata_type(metadata_dict=vars(self))
 
-        self.annotations_creators, annotations_creators_errors = self.annotations_creators_must_be_in_known_set(
+        self.annotations_creators, annotations_creators_errors = self.validate_annotations_creators(
             self.annotations_creators
         )
-        self.language_creators, language_creators_errors = self.language_creators_must_be_in_known_set(
-            self.language_creators
-        )
-        self.languages, languages_errors = self.validate_language_codes(self.language_creators)
+        self.language_creators, language_creators_errors = self.validate_language_creators(self.language_creators)
+        self.languages, languages_errors = self.validate_language_codes(self.languages)
         self.licenses, licenses_errors = self.validate_licences(self.licenses)
         self.multilinguality, multilinguality_errors = self.validate_mulitlinguality(self.multilinguality)
         self.size_categories, size_categories_errors = self.validate_size_catgeories(self.size_categories)
@@ -143,10 +153,11 @@ def from_readme(cls, path: Path) -> "DatasetMetadata":
 
         Raises:
             :obj:`TypeError`: If the dataset card has no metadata (no YAML header)
+            :obj:`TypeError`: If the dataset's metadata is invalid
         """
-        metadata_dict = metadata_dict_from_readme(path)
-        if metadata_dict is not None:
-            return cls(**metadata_dict)
+        yaml_string = yaml_block_from_readme(path)
+        if yaml_string is not None:
+            return cls.from_yaml_string(yaml_string)
         else:
             raise TypeError(f"did not find a yaml block in '{path}'")
 
@@ -159,18 +170,21 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata":
 
         Returns:
             :class:`DatasetMetadata`: The dataset's metadata
+
+        Raises:
+            :obj:`TypeError`: If the dataset's metadata is invalid
        """
         metada_dict = yaml.safe_load(string) or dict()
         return cls(**metada_dict)
 
     @staticmethod
-    def annotations_creators_must_be_in_known_set(annotations_creators: List[str]) -> ValidatorOutput:
+    def validate_annotations_creators(annotations_creators: List[str]) -> 
ValidatorOutput: return tagset_validator( annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url ) @staticmethod - def language_creators_must_be_in_known_set(language_creators: List[str]) -> ValidatorOutput: + def validate_language_creators(language_creators: List[str]) -> ValidatorOutput: return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url) @staticmethod From 8afb25a425324c4c6e4338a30d367c15ed7606d0 Mon Sep 17 00:00:00 2001 From: SBrandeis Date: Fri, 23 Apr 2021 14:58:26 +0200 Subject: [PATCH 28/28] I said, tests!! --- tests/test_metadata_util.py | 236 ++++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 tests/test_metadata_util.py diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py new file mode 100644 index 00000000000..a24e0741862 --- /dev/null +++ b/tests/test_metadata_util.py @@ -0,0 +1,236 @@ +import tempfile +import unittest +from pathlib import Path + +from datasets.utils.metadata import ( + DatasetMetadata, + escape_validation_for_predicate, + metadata_dict_from_readme, + tagset_validator, + validate_metadata_type, + yaml_block_from_readme, +) + + +def _dedent(string: str) -> str: + return "\n".join([line.lstrip() for line in string.splitlines()]) + + +README_YAML = """\ +--- +languages: +- zh +- en +task_ids: +- sentiment-classification +--- +# Begin of markdown + +Some cool dataset card +""" + +README_EMPTY_YAML = """\ +--- +--- +# Begin of markdown + +Some cool dataset card +""" + + +README_NO_YAML = """\ +# Begin of markdown + +Some cool dataset card +""" + + +class TestMetadataUtils(unittest.TestCase): + def test_validate_metadata_type(self): + metadata_dict = { + "tag": ["list", "of", "values"], + "another tag": ["Another", {"list"}, ["of"], 0x646D46736457567A], + } + validate_metadata_type(metadata_dict) + + metadata_dict = {"tag1": []} + with self.assertRaises(TypeError): + validate_metadata_type(metadata_dict) + + metadata_dict = {"tag1": None} + with self.assertRaises(TypeError): + validate_metadata_type(metadata_dict) + + def test_tagset_validator(self): + name = "test_tag" + url = "https://dummy.hf.co" + + values = ["tag1", "tag2", "tag2", "tag3"] + reference_values = ["tag1", "tag2", "tag3"] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, values) + self.assertIsNone(error) + + values = [] + reference_values = ["tag1", "tag2", "tag3"] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, values) + self.assertIsNone(error) + + values = [] + reference_values = [] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, values) + self.assertIsNone(error) + + values = ["tag1", "tag2", "tag2", "tag3", "unknown tag"] + reference_values = ["tag1", "tag2", "tag3"] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, []) + self.assertEqual(error, f"{['unknown tag']} are not registered tags for '{name}', reference at {url}") + + def test_escape_validation_for_predicate(self): + def predicate_fn(string: str) -> bool: + return "ignore" in string + + values = ["process me", "process me too", "ignore me"] + to_ignore, to_validate = 
escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) + self.assertListEqual(to_ignore, ["ignore me"]) + self.assertListEqual(to_validate, ["process me", "process me too"]) + + values = ["process me", "process me too"] + to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) + self.assertListEqual(to_ignore, []) + self.assertListEqual(to_validate, values) + + values = ["this value will be ignored", "ignore this one two"] + to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) + self.assertListEqual(to_ignore, values) + self.assertListEqual(to_validate, []) + + def test_yaml_block_from_readme(self): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + + with open(path, "w+") as readme_file: + readme_file.write(README_YAML) + yaml_block = yaml_block_from_readme(path=path) + self.assertEqual( + yaml_block, + _dedent( + """\ + languages: + - zh + - en + task_ids: + - sentiment-classification +""" + ), + ) + + with open(path, "w+") as readme_file: + readme_file.write(README_EMPTY_YAML) + yaml_block = yaml_block_from_readme(path=path) + self.assertEqual( + yaml_block, + _dedent( + """\ + """ + ), + ) + + with open(path, "w+") as readme_file: + readme_file.write(README_NO_YAML) + yaml_block = yaml_block_from_readme(path=path) + self.assertIsNone(yaml_block) + + def test_metadata_dict_from_readme(self): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(README_YAML) + metadata_dict = metadata_dict_from_readme(path) + self.assertDictEqual(metadata_dict, {"languages": ["zh", "en"], "task_ids": ["sentiment-classification"]}) + + with open(path, "w+") as readme_file: + readme_file.write(README_EMPTY_YAML) + metadata_dict = metadata_dict_from_readme(path) + self.assertDictEqual(metadata_dict, {}) + + with open(path, "w+") as readme_file: + readme_file.write(README_NO_YAML) + metadata_dict = metadata_dict_from_readme(path) + self.assertIsNone(metadata_dict) + + def test_from_yaml_string(self): + valid_yaml_string = _dedent( + """\ + annotations_creators: + - found + language_creators: + - found + languages: + - en + licenses: + - unknown + multilinguality: + - monolingual + size_categories: + - 10K