#!/usr/bin/env python

"""Validate the YAML metadata header of dataset cards (README.md files).

This script runs in CI and makes sure all new changes to datasets readme files have valid metadata yaml headers.

By default only the README.md files changed on the current branch (relative to ``origin/master``) are checked;
pass ``--check_all`` to check the README.md of every directory found under ``<repo_path>/datasets``.
The process exits with status 1 if at least one README fails validation, 0 otherwise.
"""

import logging
import sys
from argparse import ArgumentParser
from pathlib import Path
from subprocess import check_output
from typing import List

from datasets.utils.metadata import DatasetMetadata


def get_changed_files(repo_path: Path) -> List[Path]:
    """List the files changed on the current branch with respect to ``origin/master``.

    Args:
        repo_path (:obj:`Path`): Root of the git repository; ``git`` is run from this directory.

    Returns:
        :obj:`List[Path]`: One path per changed file, each prefixed with ``repo_path``. Deleted files are
        included, so callers must check for existence themselves.

    Raises:
        :obj:`subprocess.CalledProcessError`: If the ``git diff`` command exits with a non-zero status.
    """
    # The three-dot range diffs HEAD against its merge base with origin/master, so files that were only
    # modified on master after the branch was created are not reported as changes of this branch.
    diff_output = check_output(["git", "diff", "--name-only", "origin/master...HEAD"], cwd=repo_path)
    return [Path(repo_path, f) for f in diff_output.decode().splitlines()]


def main() -> int:
    """Parse the command line, validate the selected dataset cards and return the process exit code."""
    logging.basicConfig(level=logging.DEBUG)

    ap = ArgumentParser()
    ap.add_argument("--repo_path", type=Path, default=Path.cwd())
    ap.add_argument("--check_all", action="store_true")
    args = ap.parse_args()

    repo_path: Path = args.repo_path
    if args.check_all:
        # Only directories can hold a dataset card: skip stray files that may live next to them.
        readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir() if dd.is_dir()]
    else:
        # Keep existing (i.e. not deleted) files named README.md located at `.../datasets/<name>/README.md`.
        readmes = [
            f
            for f in get_changed_files(repo_path)
            if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
        ]

    failed: List[Path] = []
    for readme in sorted(readmes):
        try:
            DatasetMetadata.from_readme(readme)
            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
        except TypeError as e:
            # `DatasetMetadata` reports every validation problem as a TypeError.
            failed.append(readme)
            logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
        except Exception as e:
            # Anything else (unreadable file, malformed YAML, ...) also counts as a failure, but is
            # reported differently so that it can be told apart from a plain validation error.
            failed.append(readme)
            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

    if failed:
        logging.info(f"❌ Failed on {len(failed)} files.")
        return 1

    logging.info("All is well, keep up the good work 🤗!")
    return 0


if __name__ == "__main__":
    # `sys.exit` rather than the `exit` helper, which is only installed by the `site` module.
    sys.exit(main())
"""Loading and validation of the YAML metadata header of dataset cards (README.md files)."""

import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Collection, Dict, List, Optional, Tuple


# loading package files: https://stackoverflow.com/a/20885799
try:
    import importlib.resources as pkg_resources
except ImportError:
    # Try backported to PY<37 `importlib_resources`.
    import importlib_resources as pkg_resources

import yaml

from . import resources


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
# Only the file name is appended: `__file__` is a path on the local filesystem and would break the URL.
this_url = f"{BASE_REF_URL}/{Path(__file__).name}"
logger = logging.getLogger(__name__)


def load_json_resource(resource: str) -> Tuple[Any, str]:
    """Load a JSON file shipped in the ``resources`` package.

    Args:
        resource (:obj:`str`): File name of the resource, e.g. ``"languages.json"``.

    Returns:
        :obj:`Tuple[Any, str]`: The decoded JSON content and the reference URL built for this resource.
    """
    content = pkg_resources.read_text(resources, resource)
    return json.loads(content), f"{BASE_REF_URL}/resources/{resource}"


# Source of languages.json:
# https://datahub.io/core/language-codes/r/ietf-language-tags.csv
# Language names were obtained with langcodes: https://github.com/LuminosoInsight/langcodes
known_language_codes, known_language_codes_url = load_json_resource("languages.json")
known_licenses, known_licenses_url = load_json_resource("licenses.json")
known_task_ids, known_task_ids_url = load_json_resource("tasks.json")
known_creators, known_creators_url = load_json_resource("creators.json")
known_size_categories, known_size_categories_url = load_json_resource("size_categories.json")
known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json")


def yaml_block_from_readme(path: Path) -> Optional[str]:
    """Extract the YAML header of a dataset card.

    The header is the text between a first line equal to ``---`` and the next line equal to ``---``.

    Args:
        path (:obj:`Path`): Path to the dataset card (its README.md file)

    Returns:
        :obj:`Optional[str]`: The content of the YAML header (delimiters excluded), or :obj:`None` if the file
        is empty or does not start with a complete ``---`` delimited block.
    """
    with path.open(encoding="utf-8") as readme_file:
        # Only trailing whitespace is removed: leading indentation is significant in YAML.
        content = [line.rstrip() for line in readme_file]

    # `content` is empty for an empty file, so it must be checked before looking at the first line.
    if content and content[0] == "---" and "---" in content[1:]:
        closing_index = content.index("---", 1)
        return "\n".join(content[1:closing_index])

    return None


def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:
    """Loads a dataset's metadata from the dataset card (README.md), as a Python dict

    Args:
        path (:obj:`Path`): Path to the dataset card (its README.md file)

    Returns:
        :obj:`Optional[Dict[str, List[str]]]`: The parsed YAML header (an empty dict if the header is empty),
        or :obj:`None` if the dataset card has no YAML header. The content is not validated.
    """
    yaml_block = yaml_block_from_readme(path=path)
    if yaml_block is None:
        return None
    return yaml.safe_load(yaml_block) or dict()


# Output of a validator: the accepted values, and an error message (`None` when the validation succeeded).
ValidatorOutput = Tuple[List[str], Optional[str]]


def tagset_validator(values: List[str], reference_values: Collection[str], name: str, url: str) -> ValidatorOutput:
    """Check that all the values belong to a set of registered tags.

    Args:
        values (:obj:`List[str]`): The values to validate.
        reference_values (:obj:`Collection[str]`): The registered tags.
        name (:obj:`str`): Name of the validated field, used in the error message.
        url (:obj:`str`): Where the registered tags can be found, used in the error message.

    Returns:
        :obj:`ValidatorOutput`: ``(values, None)`` if every value is registered, otherwise ``([], message)``
        where ``message`` lists the unregistered values.
    """
    invalid_values = [v for v in values if v not in reference_values]
    if len(invalid_values) > 0:
        return [], f"{invalid_values} are not registered tags for '{name}', reference at {url}"
    return values, None


def escape_validation_for_predicate(
    values: List[Any], predicate_fn: Callable[[Any], bool]
) -> Tuple[List[Any], List[Any]]:
    """Split values between those that escape validation and those that must be validated.

    A warning is logged when at least one value escapes validation.

    Args:
        values (:obj:`List[Any]`): The values to split.
        predicate_fn (:obj:`Callable[[Any], bool]`): Returns :obj:`True` for the values escaping validation.

    Returns:
        :obj:`Tuple[List[Any], List[Any]]`: The values for which the predicate holds, then the other ones,
        each in their original order.
    """
    trues, falses = list(), list()
    for v in values:
        if predicate_fn(v):
            trues.append(v)
        else:
            falses.append(v)
    if len(trues) > 0:
        logger.warning(f"The following values will escape validation: {trues}")
    return trues, falses


def validate_metadata_type(metadata_dict: dict):
    """Check that every field of the metadata is a non-empty list of strings.

    Args:
        metadata_dict (:obj:`dict`): Mapping from field names to their values.

    Raises:
        :obj:`TypeError`: If at least one value is not a list, is empty, or contains a non-string item.
    """
    basic_typing_errors = {
        name: value
        for name, value in metadata_dict.items()
        # Every item is checked, not only the first one.
        if not isinstance(value, list) or len(value) == 0 or not all(isinstance(v, str) for v in value)
    }
    if len(basic_typing_errors) > 0:
        raise TypeError(f"Found fields that are not non-empty list of strings: {basic_typing_errors}")


@dataclass
class DatasetMetadata:
    """Metadata tags of a dataset card, validated on instantiation.

    Every field must be a non-empty list of strings; each field is then checked by its ``validate_*`` static
    method, and a single :obj:`TypeError` reporting all the invalid fields is raised if any check fails.
    """

    # Checked against the "annotations" / "language" lists of creators.json
    annotations_creators: List[str]
    language_creators: List[str]
    # Checked against the keys of languages.json
    languages: List[str]
    # Checked against the keys of licenses.json, except for values containing "-other-"
    licenses: List[str]
    # Checked against the keys of multilingualities.json, except for values starting with "other"
    multilinguality: List[str]
    # Checked against size_categories.json
    size_categories: List[str]
    # "original", "extended" or values starting with "extended|"
    source_datasets: List[str]
    # Checked against the keys of tasks.json, except for values starting with "other"
    task_categories: List[str]
    # Checked against the "options" of each entry of tasks.json, except for values containing "-other-"
    task_ids: List[str]

    def __post_init__(self):
        validate_metadata_type(metadata_dict=vars(self))

        # Each validator returns the accepted values (an empty list on failure) and an optional error message.
        self.annotations_creators, annotations_creators_errors = self.validate_annotations_creators(
            self.annotations_creators
        )
        self.language_creators, language_creators_errors = self.validate_language_creators(self.language_creators)
        self.languages, languages_errors = self.validate_language_codes(self.languages)
        self.licenses, licenses_errors = self.validate_licences(self.licenses)
        self.multilinguality, multilinguality_errors = self.validate_mulitlinguality(self.multilinguality)
        self.size_categories, size_categories_errors = self.validate_size_catgeories(self.size_categories)
        self.source_datasets, source_datasets_errors = self.validate_source_datasets(self.source_datasets)
        self.task_categories, task_categories_errors = self.validate_task_categories(self.task_categories)
        self.task_ids, task_ids_errors = self.validate_task_ids(self.task_ids)

        errors = {
            "annotations_creators": annotations_creators_errors,
            "language_creators": language_creators_errors,
            "licenses": licenses_errors,
            "multilinguality": multilinguality_errors,
            "size_categories": size_categories_errors,
            "source_datasets": source_datasets_errors,
            "task_categories": task_categories_errors,
            "task_ids": task_ids_errors,
            "languages": languages_errors,
        }

        # All the errors are gathered in a single exception so that they can be fixed in one pass.
        exception_msg_dict = {field: errs for field, errs in errors.items() if errs is not None}
        if len(exception_msg_dict) > 0:
            raise TypeError(
                "Could not validate the metada, found the following errors:\n"
                + "\n".join(f"* field '{fieldname}':\n\t{err}" for fieldname, err in exception_msg_dict.items())
            )

    @classmethod
    def from_readme(cls, path: Path) -> "DatasetMetadata":
        """Loads and validates the dataset metadata from its dataset card (README.md)

        Args:
            path (:obj:`Path`): Path to the dataset card (its README.md file)

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset card has no metadata (no YAML header)
            :obj:`TypeError`: If the dataset's metadata is invalid
        """
        yaml_string = yaml_block_from_readme(path)
        if yaml_string is not None:
            return cls.from_yaml_string(yaml_string)
        else:
            raise TypeError(f"did not find a yaml block in '{path}'")

    @classmethod
    def from_yaml_string(cls, string: str) -> "DatasetMetadata":
        """Loads and validates the dataset metadata from a YAML string

        Args:
            string (:obj:`str`): The YAML string

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        """
        metadata_dict = yaml.safe_load(string) or dict()
        # Missing or unexpected fields make the dataclass constructor raise a TypeError as well.
        return cls(**metadata_dict)

    @staticmethod
    def validate_annotations_creators(annotations_creators: List[str]) -> ValidatorOutput:
        """Validate the values against the "annotations" list of creators.json."""
        return tagset_validator(
            annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url
        )

    @staticmethod
    def validate_language_creators(language_creators: List[str]) -> ValidatorOutput:
        """Validate the values against the "language" list of creators.json."""
        return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url)

    @staticmethod
    def validate_language_codes(languages: List[str]) -> ValidatorOutput:
        """Validate the values against the language codes (keys) of languages.json."""
        return tagset_validator(
            values=languages,
            reference_values=known_language_codes.keys(),
            name="languages",
            url=known_language_codes_url,
        )

    @staticmethod
    def validate_licences(licenses: List[str]) -> ValidatorOutput:
        """Validate the values against the keys of licenses.json; values containing "-other-" are not checked."""
        others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e)
        validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url)
        return [*validated, *others], error

    @staticmethod
    def validate_task_categories(task_categories: List[str]) -> ValidatorOutput:
        """Validate the values against the keys of tasks.json; values starting with "other" are not checked."""
        # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
        #   in the near future and we don't want to waste energy in tagging against a moving taxonomy.
        known_set = list(known_task_ids.keys())
        others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other"))
        validated, error = tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url)
        return [*validated, *others], error

    @staticmethod
    def validate_task_ids(task_ids: List[str]) -> ValidatorOutput:
        """Validate the values against the "options" of tasks.json; values containing "-other-" are not checked."""
        # TODO: we're currently ignoring all values containing '-other-' as our task taxonomy is bound to change
        #   in the near future and we don't want to waste energy in tagging against a moving taxonomy.
        known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
        others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e)
        validated, error = tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url)
        return [*validated, *others], error

    @staticmethod
    def validate_mulitlinguality(multilinguality: List[str]) -> ValidatorOutput:
        """Validate the values against the keys of multilingualities.json.

        Values starting with "other" are not checked.
        """
        others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other"))
        # The error message must point to the multilingualities reference, not to the size categories one.
        validated, error = tagset_validator(
            to_validate, list(known_multilingualities.keys()), "multilinguality", known_multilingualities_url
        )
        return [*validated, *others], error

    @staticmethod
    def validate_size_catgeories(size_cats: List[str]) -> ValidatorOutput:
        """Validate the values against the content of size_categories.json."""
        return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url)

    @staticmethod
    def validate_source_datasets(sources: List[str]) -> ValidatorOutput:
        """Validate the values: each one must be "original", "extended" or start with "extended|"."""
        invalid_values = []
        for src in sources:
            is_ok = src in ["original", "extended"] or src.startswith("extended|")
            if not is_ok:
                invalid_values.append(src)
        if len(invalid_values) > 0:
            return (
                [],
                f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}",
            )

        return sources, None


if __name__ == "__main__":
    from argparse import ArgumentParser

    # `description` (not `usage`) so that the generated usage line is kept in the help message.
    ap = ArgumentParser(description="Validate the yaml metadata block of a README.md file.")
    ap.add_argument("readme_filepath")
    args = ap.parse_args()

    readme_filepath = Path(args.readme_filepath)
    DatasetMetadata.from_readme(readme_filepath)
"ar-SA": "Arabic (Saudi Arabia)", + "ar-SD": "Arabic (Sudan)", + "ar-SO": "Arabic (Somalia)", + "ar-SS": "Arabic (South Sudan)", + "ar-SY": "Arabic (Syria)", + "ar-TD": "Arabic (Chad)", + "ar-TN": "Arabic (Tunisia)", + "ar-YE": "Arabic (Yemen)", + "as": "Assamese", + "as-IN": "Assamese (India)", + "asa": "Asu", + "asa-TZ": "Asu (Tanzania)", + "ast": "Asturian", + "ast-ES": "Asturian (Spain)", + "az": "Azerbaijani", + "az-Cyrl": "Azerbaijani (Cyrillic)", + "az-Cyrl-AZ": "Azerbaijani (Cyrillic, Azerbaijan)", + "az-Latn": "Azerbaijani (Latin)", + "az-Latn-AZ": "Azerbaijani (Latin, Azerbaijan)", + "bas": "Basaa", + "bas-CM": "Basaa (Cameroon)", + "be": "Belarusian", + "be-BY": "Belarusian (Belarus)", + "bem": "Bemba", + "bem-ZM": "Bemba (Zambia)", + "bez": "Bena", + "bez-TZ": "Bena (Tanzania)", + "bg": "Bulgarian", + "bg-BG": "Bulgarian (Bulgaria)", + "bm": "Bambara", + "bm-ML": "Bambara (Mali)", + "bn": "Bangla", + "bn-BD": "Bangla (Bangladesh)", + "bn-IN": "Bangla (India)", + "bo": "Tibetan", + "bo-CN": "Tibetan (China)", + "bo-IN": "Tibetan (India)", + "br": "Breton", + "br-FR": "Breton (France)", + "brx": "Bodo", + "brx-IN": "Bodo (India)", + "bs": "Bosnian", + "bs-Cyrl": "Bosnian (Cyrillic)", + "bs-Cyrl-BA": "Bosnian (Cyrillic, Bosnia & Herzegovina)", + "bs-Latn": "Bosnian", + "bs-Latn-BA": "Bosnian (Bosnia & Herzegovina)", + "ca": "Catalan", + "ca-AD": "Catalan (Andorra)", + "ca-ES": "Catalan (Spain)", + "ca-ES-valencia": "Catalan (Spain)", + "ca-FR": "Catalan (France)", + "ca-IT": "Catalan (Italy)", + "ccp": "Chakma", + "ccp-BD": "Chakma (Bangladesh)", + "ccp-IN": "Chakma (India)", + "ce": "Chechen", + "ce-RU": "Chechen (Russia)", + "ceb": "Cebuano", + "ceb-PH": "Cebuano (Philippines)", + "cgg": "Chiga", + "cgg-UG": "Chiga (Uganda)", + "chr": "Cherokee", + "chr-US": "Cherokee (United States)", + "ckb": "Central Kurdish", + "ckb-IQ": "Central Kurdish (Iraq)", + "ckb-IR": "Central Kurdish (Iran)", + "cs": "Czech", + "cs-CZ": "Czech (Czechia)", + "cu": "Church 
Slavic", + "cu-RU": "Church Slavic (Russia)", + "cy": "Welsh", + "cy-GB": "Welsh (United Kingdom)", + "da": "Danish", + "da-DK": "Danish (Denmark)", + "da-GL": "Danish (Greenland)", + "dav": "Taita", + "dav-KE": "Taita (Kenya)", + "de": "German", + "de-AT": "German (Austria)", + "de-BE": "German (Belgium)", + "de-CH": "German (Switzerland)", + "de-DE": "German (Germany)", + "de-IT": "German (Italy)", + "de-LI": "German (Liechtenstein)", + "de-LU": "German (Luxembourg)", + "dje": "Zarma", + "dje-NE": "Zarma (Niger)", + "dsb": "Lower Sorbian", + "dsb-DE": "Lower Sorbian (Germany)", + "dua": "Duala", + "dua-CM": "Duala (Cameroon)", + "dyo": "Jola-Fonyi", + "dyo-SN": "Jola-Fonyi (Senegal)", + "dz": "Dzongkha", + "dz-BT": "Dzongkha (Bhutan)", + "ebu": "Embu", + "ebu-KE": "Embu (Kenya)", + "ee": "Ewe", + "ee-GH": "Ewe (Ghana)", + "ee-TG": "Ewe (Togo)", + "el": "Greek", + "el-CY": "Greek (Cyprus)", + "el-GR": "Greek (Greece)", + "en": "English", + "en-001": "English (World)", + "en-150": "English (Europe)", + "en-AE": "English (United Arab Emirates)", + "en-AG": "English (Antigua & Barbuda)", + "en-AI": "English (Anguilla)", + "en-AS": "English (American Samoa)", + "en-AT": "English (Austria)", + "en-AU": "English (Australia)", + "en-BB": "English (Barbados)", + "en-BE": "English (Belgium)", + "en-BI": "English (Burundi)", + "en-BM": "English (Bermuda)", + "en-BS": "English (Bahamas)", + "en-BW": "English (Botswana)", + "en-BZ": "English (Belize)", + "en-CA": "English (Canada)", + "en-CC": "English (Cocos (Keeling) Islands)", + "en-CH": "English (Switzerland)", + "en-CK": "English (Cook Islands)", + "en-CM": "English (Cameroon)", + "en-CX": "English (Christmas Island)", + "en-CY": "English (Cyprus)", + "en-DE": "English (Germany)", + "en-DG": "English (Diego Garcia)", + "en-DK": "English (Denmark)", + "en-DM": "English (Dominica)", + "en-ER": "English (Eritrea)", + "en-FI": "English (Finland)", + "en-FJ": "English (Fiji)", + "en-FK": "English (Falkland Islands)", + 
"en-FM": "English (Micronesia)", + "en-GB": "English (United Kingdom)", + "en-GD": "English (Grenada)", + "en-GG": "English (Guernsey)", + "en-GH": "English (Ghana)", + "en-GI": "English (Gibraltar)", + "en-GM": "English (Gambia)", + "en-GU": "English (Guam)", + "en-GY": "English (Guyana)", + "en-HK": "English (Hong Kong SAR China)", + "en-IE": "English (Ireland)", + "en-IL": "English (Israel)", + "en-IM": "English (Isle of Man)", + "en-IN": "English (India)", + "en-IO": "English (British Indian Ocean Territory)", + "en-JE": "English (Jersey)", + "en-JM": "English (Jamaica)", + "en-KE": "English (Kenya)", + "en-KI": "English (Kiribati)", + "en-KN": "English (St. Kitts & Nevis)", + "en-KY": "English (Cayman Islands)", + "en-LC": "English (St. Lucia)", + "en-LR": "English (Liberia)", + "en-LS": "English (Lesotho)", + "en-MG": "English (Madagascar)", + "en-MH": "English (Marshall Islands)", + "en-MO": "English (Macao SAR China)", + "en-MP": "English (Northern Mariana Islands)", + "en-MS": "English (Montserrat)", + "en-MT": "English (Malta)", + "en-MU": "English (Mauritius)", + "en-MW": "English (Malawi)", + "en-MY": "English (Malaysia)", + "en-NA": "English (Namibia)", + "en-NF": "English (Norfolk Island)", + "en-NG": "English (Nigeria)", + "en-NL": "English (Netherlands)", + "en-NR": "English (Nauru)", + "en-NU": "English (Niue)", + "en-NZ": "English (New Zealand)", + "en-PG": "English (Papua New Guinea)", + "en-PH": "English (Philippines)", + "en-PK": "English (Pakistan)", + "en-PN": "English (Pitcairn Islands)", + "en-PR": "English (Puerto Rico)", + "en-PW": "English (Palau)", + "en-RW": "English (Rwanda)", + "en-SB": "English (Solomon Islands)", + "en-SC": "English (Seychelles)", + "en-SD": "English (Sudan)", + "en-SE": "English (Sweden)", + "en-SG": "English (Singapore)", + "en-SH": "English (St. 
Helena)", + "en-SI": "English (Slovenia)", + "en-SL": "English (Sierra Leone)", + "en-SS": "English (South Sudan)", + "en-SX": "English (Sint Maarten)", + "en-SZ": "English (Eswatini)", + "en-TC": "English (Turks & Caicos Islands)", + "en-TK": "English (Tokelau)", + "en-TO": "English (Tonga)", + "en-TT": "English (Trinidad & Tobago)", + "en-TV": "English (Tuvalu)", + "en-TZ": "English (Tanzania)", + "en-UG": "English (Uganda)", + "en-UM": "English (U.S. Outlying Islands)", + "en-US": "English (United States)", + "en-US-posix": "English (United States)", + "en-VC": "English (St. Vincent & Grenadines)", + "en-VG": "English (British Virgin Islands)", + "en-VI": "English (U.S. Virgin Islands)", + "en-VU": "English (Vanuatu)", + "en-WS": "English (Samoa)", + "en-ZA": "English (South Africa)", + "en-ZM": "English (Zambia)", + "en-ZW": "English (Zimbabwe)", + "eo": "Esperanto", + "eo-001": "Esperanto (World)", + "es": "Spanish", + "es-419": "Spanish (Latin America)", + "es-AR": "Spanish (Argentina)", + "es-BO": "Spanish (Bolivia)", + "es-BR": "Spanish (Brazil)", + "es-BZ": "Spanish (Belize)", + "es-CL": "Spanish (Chile)", + "es-CO": "Spanish (Colombia)", + "es-CR": "Spanish (Costa Rica)", + "es-CU": "Spanish (Cuba)", + "es-DO": "Spanish (Dominican Republic)", + "es-EA": "Spanish (Ceuta & Melilla)", + "es-EC": "Spanish (Ecuador)", + "es-ES": "Spanish (Spain)", + "es-GQ": "Spanish (Equatorial Guinea)", + "es-GT": "Spanish (Guatemala)", + "es-HN": "Spanish (Honduras)", + "es-IC": "Spanish (Canary Islands)", + "es-MX": "Spanish (Mexico)", + "es-NI": "Spanish (Nicaragua)", + "es-PA": "Spanish (Panama)", + "es-PE": "Spanish (Peru)", + "es-PH": "Spanish (Philippines)", + "es-PR": "Spanish (Puerto Rico)", + "es-PY": "Spanish (Paraguay)", + "es-SV": "Spanish (El Salvador)", + "es-US": "Spanish (United States)", + "es-UY": "Spanish (Uruguay)", + "es-VE": "Spanish (Venezuela)", + "et": "Estonian", + "et-EE": "Estonian (Estonia)", + "eu": "Basque", + "eu-ES": "Basque (Spain)", + 
"ewo": "Ewondo", + "ewo-CM": "Ewondo (Cameroon)", + "fa": "Persian", + "fa-AF": "Persian (Afghanistan)", + "fa-IR": "Persian (Iran)", + "ff": "Fulah", + "ff-Adlm": "Fulah (Adlam)", + "ff-Adlm-BF": "Fulah (Adlam, Burkina Faso)", + "ff-Adlm-CM": "Fulah (Adlam, Cameroon)", + "ff-Adlm-GH": "Fulah (Adlam, Ghana)", + "ff-Adlm-GM": "Fulah (Adlam, Gambia)", + "ff-Adlm-GN": "Fulah (Adlam, Guinea)", + "ff-Adlm-GW": "Fulah (Adlam, Guinea-Bissau)", + "ff-Adlm-LR": "Fulah (Adlam, Liberia)", + "ff-Adlm-MR": "Fulah (Adlam, Mauritania)", + "ff-Adlm-NE": "Fulah (Adlam, Niger)", + "ff-Adlm-NG": "Fulah (Adlam, Nigeria)", + "ff-Adlm-SL": "Fulah (Adlam, Sierra Leone)", + "ff-Adlm-SN": "Fulah (Adlam, Senegal)", + "ff-Latn": "Fulah (Latin)", + "ff-Latn-BF": "Fulah (Latin, Burkina Faso)", + "ff-Latn-CM": "Fulah (Latin, Cameroon)", + "ff-Latn-GH": "Fulah (Latin, Ghana)", + "ff-Latn-GM": "Fulah (Latin, Gambia)", + "ff-Latn-GN": "Fulah (Latin, Guinea)", + "ff-Latn-GW": "Fulah (Latin, Guinea-Bissau)", + "ff-Latn-LR": "Fulah (Latin, Liberia)", + "ff-Latn-MR": "Fulah (Latin, Mauritania)", + "ff-Latn-NE": "Fulah (Latin, Niger)", + "ff-Latn-NG": "Fulah (Latin, Nigeria)", + "ff-Latn-SL": "Fulah (Latin, Sierra Leone)", + "ff-Latn-SN": "Fulah (Latin, Senegal)", + "fi": "Finnish", + "fi-FI": "Finnish (Finland)", + "fil": "Filipino", + "fil-PH": "Filipino (Philippines)", + "fo": "Faroese", + "fo-DK": "Faroese (Denmark)", + "fo-FO": "Faroese (Faroe Islands)", + "fr": "French", + "fr-BE": "French (Belgium)", + "fr-BF": "French (Burkina Faso)", + "fr-BI": "French (Burundi)", + "fr-BJ": "French (Benin)", + "fr-BL": "French (St. 
Barth\u00e9lemy)", + "fr-CA": "French (Canada)", + "fr-CD": "French (Congo - Kinshasa)", + "fr-CF": "French (Central African Republic)", + "fr-CG": "French (Congo - Brazzaville)", + "fr-CH": "French (Switzerland)", + "fr-CI": "French (C\u00f4te d\u2019Ivoire)", + "fr-CM": "French (Cameroon)", + "fr-DJ": "French (Djibouti)", + "fr-DZ": "French (Algeria)", + "fr-FR": "French (France)", + "fr-GA": "French (Gabon)", + "fr-GF": "French (French Guiana)", + "fr-GN": "French (Guinea)", + "fr-GP": "French (Guadeloupe)", + "fr-GQ": "French (Equatorial Guinea)", + "fr-HT": "French (Haiti)", + "fr-KM": "French (Comoros)", + "fr-LU": "French (Luxembourg)", + "fr-MA": "French (Morocco)", + "fr-MC": "French (Monaco)", + "fr-MF": "French (St. Martin)", + "fr-MG": "French (Madagascar)", + "fr-ML": "French (Mali)", + "fr-MQ": "French (Martinique)", + "fr-MR": "French (Mauritania)", + "fr-MU": "French (Mauritius)", + "fr-NC": "French (New Caledonia)", + "fr-NE": "French (Niger)", + "fr-PF": "French (French Polynesia)", + "fr-PM": "French (St. 
Pierre & Miquelon)", + "fr-RE": "French (R\u00e9union)", + "fr-RW": "French (Rwanda)", + "fr-SC": "French (Seychelles)", + "fr-SN": "French (Senegal)", + "fr-SY": "French (Syria)", + "fr-TD": "French (Chad)", + "fr-TG": "French (Togo)", + "fr-TN": "French (Tunisia)", + "fr-VU": "French (Vanuatu)", + "fr-WF": "French (Wallis & Futuna)", + "fr-YT": "French (Mayotte)", + "fur": "Friulian", + "fur-IT": "Friulian (Italy)", + "fy": "Western Frisian", + "fy-NL": "Western Frisian (Netherlands)", + "ga": "Irish", + "ga-GB": "Irish (United Kingdom)", + "ga-IE": "Irish (Ireland)", + "gd": "Scottish Gaelic", + "gd-GB": "Scottish Gaelic (United Kingdom)", + "gl": "Galician", + "gl-ES": "Galician (Spain)", + "gsw": "Swiss German", + "gsw-CH": "Swiss German (Switzerland)", + "gsw-FR": "Swiss German (France)", + "gsw-LI": "Swiss German (Liechtenstein)", + "gu": "Gujarati", + "gu-IN": "Gujarati (India)", + "guz": "Gusii", + "guz-KE": "Gusii (Kenya)", + "gv": "Manx", + "gv-IM": "Manx (Isle of Man)", + "ha": "Hausa", + "ha-GH": "Hausa (Ghana)", + "ha-NE": "Hausa (Niger)", + "ha-NG": "Hausa (Nigeria)", + "haw": "Hawaiian", + "haw-US": "Hawaiian (United States)", + "he": "Hebrew", + "he-IL": "Hebrew (Israel)", + "hi": "Hindi", + "hi-IN": "Hindi (India)", + "hr": "Croatian", + "hr-BA": "Croatian (Bosnia & Herzegovina)", + "hr-HR": "Croatian (Croatia)", + "hsb": "Upper Sorbian", + "hsb-DE": "Upper Sorbian (Germany)", + "hu": "Hungarian", + "hu-HU": "Hungarian (Hungary)", + "hy": "Armenian", + "hy-AM": "Armenian (Armenia)", + "ia": "Interlingua", + "ia-001": "Interlingua (World)", + "id": "Indonesian", + "id-ID": "Indonesian (Indonesia)", + "ig": "Igbo", + "ig-NG": "Igbo (Nigeria)", + "ii": "Sichuan Yi", + "ii-CN": "Sichuan Yi (China)", + "is": "Icelandic", + "is-IS": "Icelandic (Iceland)", + "it": "Italian", + "it-CH": "Italian (Switzerland)", + "it-IT": "Italian (Italy)", + "it-SM": "Italian (San Marino)", + "it-VA": "Italian (Vatican City)", + "ja": "Japanese", + "ja-JP": "Japanese 
(Japan)", + "jgo": "Ngomba", + "jgo-CM": "Ngomba (Cameroon)", + "jmc": "Machame", + "jmc-TZ": "Machame (Tanzania)", + "jv": "Javanese", + "jv-ID": "Javanese (Indonesia)", + "ka": "Georgian", + "ka-GE": "Georgian (Georgia)", + "kab": "Kabyle", + "kab-DZ": "Kabyle (Algeria)", + "kam": "Kamba", + "kam-KE": "Kamba (Kenya)", + "kde": "Makonde", + "kde-TZ": "Makonde (Tanzania)", + "kea": "Kabuverdianu", + "kea-CV": "Kabuverdianu (Cape Verde)", + "khq": "Koyra Chiini", + "khq-ML": "Koyra Chiini (Mali)", + "ki": "Kikuyu", + "ki-KE": "Kikuyu (Kenya)", + "kk": "Kazakh", + "kk-KZ": "Kazakh (Kazakhstan)", + "kkj": "Kako", + "kkj-CM": "Kako (Cameroon)", + "kl": "Kalaallisut", + "kl-GL": "Kalaallisut (Greenland)", + "kln": "Kalenjin", + "kln-KE": "Kalenjin (Kenya)", + "km": "Khmer", + "km-KH": "Khmer (Cambodia)", + "kn": "Kannada", + "kn-IN": "Kannada (India)", + "ko": "Korean", + "ko-KP": "Korean (North Korea)", + "ko-KR": "Korean (South Korea)", + "kok": "Konkani", + "kok-IN": "Konkani (India)", + "ks": "Kashmiri", + "ks-Arab": "Kashmiri (Arabic)", + "ks-IN": "Kashmiri (India)", + "ksb": "Shambala", + "ksb-TZ": "Shambala (Tanzania)", + "ksf": "Bafia", + "ksf-CM": "Bafia (Cameroon)", + "ksh": "Colognian", + "ksh-DE": "Colognian (Germany)", + "ku": "Kurdish", + "ku-TR": "Kurdish (Turkey)", + "kw": "Cornish", + "kw-GB": "Cornish (United Kingdom)", + "ky": "Kyrgyz", + "ky-KG": "Kyrgyz (Kyrgyzstan)", + "lag": "Langi", + "lag-TZ": "Langi (Tanzania)", + "lb": "Luxembourgish", + "lb-LU": "Luxembourgish (Luxembourg)", + "lg": "Ganda", + "lg-UG": "Ganda (Uganda)", + "lkt": "Lakota", + "lkt-US": "Lakota (United States)", + "ln": "Lingala", + "ln-AO": "Lingala (Angola)", + "ln-CD": "Lingala (Congo - Kinshasa)", + "ln-CF": "Lingala (Central African Republic)", + "ln-CG": "Lingala (Congo - Brazzaville)", + "lo": "Lao", + "lo-LA": "Lao (Laos)", + "lrc": "Northern Luri", + "lrc-IQ": "Northern Luri (Iraq)", + "lrc-IR": "Northern Luri (Iran)", + "lt": "Lithuanian", + "lt-LT": "Lithuanian 
(Lithuania)", + "lu": "Luba-Katanga", + "lu-CD": "Luba-Katanga (Congo - Kinshasa)", + "luo": "Luo (Kenya and Tanzania)", + "luo-KE": "Luo (Kenya and Tanzania) (Kenya)", + "luy": "Luyia", + "luy-KE": "Luyia (Kenya)", + "lv": "Latvian", + "lv-LV": "Latvian (Latvia)", + "mai": "Maithili", + "mai-IN": "Maithili (India)", + "mas": "Masai", + "mas-KE": "Masai (Kenya)", + "mas-TZ": "Masai (Tanzania)", + "mer": "Meru", + "mer-KE": "Meru (Kenya)", + "mfe": "Morisyen", + "mfe-MU": "Morisyen (Mauritius)", + "mg": "Malagasy", + "mg-MG": "Malagasy (Madagascar)", + "mgh": "Makhuwa-Meetto", + "mgh-MZ": "Makhuwa-Meetto (Mozambique)", + "mgo": "Meta\u02bc", + "mgo-CM": "Meta\u02bc (Cameroon)", + "mi": "Maori", + "mi-NZ": "Maori (New Zealand)", + "mk": "Macedonian", + "mk-MK": "Macedonian (North Macedonia)", + "ml": "Malayalam", + "ml-IN": "Malayalam (India)", + "mn": "Mongolian", + "mn-MN": "Mongolian (Mongolia)", + "mni": "Manipuri", + "mni-Beng": "Manipuri (Bangla)", + "mni-Beng-IN": "Manipuri (Bangla, India)", + "mr": "Marathi", + "mr-IN": "Marathi (India)", + "ms": "Malay", + "ms-BN": "Malay (Brunei)", + "ms-ID": "Malay (Indonesia)", + "ms-MY": "Malay (Malaysia)", + "ms-SG": "Malay (Singapore)", + "mt": "Maltese", + "mt-MT": "Maltese (Malta)", + "mua": "Mundang", + "mua-CM": "Mundang (Cameroon)", + "my": "Burmese", + "my-MM": "Burmese (Myanmar (Burma))", + "mzn": "Mazanderani", + "mzn-IR": "Mazanderani (Iran)", + "naq": "Nama", + "naq-NA": "Nama (Namibia)", + "nb": "Norwegian Bokm\u00e5l", + "nb-NO": "Norwegian Bokm\u00e5l (Norway)", + "nb-SJ": "Norwegian Bokm\u00e5l (Svalbard & Jan Mayen)", + "nd": "North Ndebele", + "nd-ZW": "North Ndebele (Zimbabwe)", + "nds": "Low German", + "nds-DE": "Low German (Germany)", + "nds-NL": "Low German (Netherlands)", + "ne": "Nepali", + "ne-IN": "Nepali (India)", + "ne-NP": "Nepali (Nepal)", + "nl": "Dutch", + "nl-AW": "Dutch (Aruba)", + "nl-BE": "Dutch (Belgium)", + "nl-BQ": "Dutch (Caribbean Netherlands)", + "nl-CW": "Dutch (Cura\u00e7ao)", 
+ "nl-NL": "Dutch (Netherlands)", + "nl-SR": "Dutch (Suriname)", + "nl-SX": "Dutch (Sint Maarten)", + "nmg": "Kwasio", + "nmg-CM": "Kwasio (Cameroon)", + "nn": "Norwegian Nynorsk", + "nn-NO": "Norwegian Nynorsk (Norway)", + "nnh": "Ngiemboon", + "nnh-CM": "Ngiemboon (Cameroon)", + "nus": "Nuer", + "nus-SS": "Nuer (South Sudan)", + "nyn": "Nyankole", + "nyn-UG": "Nyankole (Uganda)", + "om": "Oromo", + "om-ET": "Oromo (Ethiopia)", + "om-KE": "Oromo (Kenya)", + "or": "Odia", + "or-IN": "Odia (India)", + "os": "Ossetic", + "os-GE": "Ossetic (Georgia)", + "os-RU": "Ossetic (Russia)", + "pa": "Punjabi", + "pa-Arab": "Punjabi (Arabic)", + "pa-Arab-PK": "Punjabi (Arabic, Pakistan)", + "pa-Guru": "Punjabi", + "pa-Guru-IN": "Punjabi (India)", + "pcm": "Nigerian Pidgin", + "pcm-NG": "Nigerian Pidgin (Nigeria)", + "pl": "Polish", + "pl-PL": "Polish (Poland)", + "prg": "Prussian", + "prg-001": "Prussian (World)", + "ps": "Pashto", + "ps-AF": "Pashto (Afghanistan)", + "ps-PK": "Pashto (Pakistan)", + "pt": "Portuguese", + "pt-AO": "Portuguese (Angola)", + "pt-BR": "Portuguese (Brazil)", + "pt-CH": "Portuguese (Switzerland)", + "pt-CV": "Portuguese (Cape Verde)", + "pt-GQ": "Portuguese (Equatorial Guinea)", + "pt-GW": "Portuguese (Guinea-Bissau)", + "pt-LU": "Portuguese (Luxembourg)", + "pt-MO": "Portuguese (Macao SAR China)", + "pt-MZ": "Portuguese (Mozambique)", + "pt-PT": "Portuguese (Portugal)", + "pt-ST": "Portuguese (S\u00e3o Tom\u00e9 & Pr\u00edncipe)", + "pt-TL": "Portuguese (Timor-Leste)", + "qu": "Quechua", + "qu-BO": "Quechua (Bolivia)", + "qu-EC": "Quechua (Ecuador)", + "qu-PE": "Quechua (Peru)", + "rm": "Romansh", + "rm-CH": "Romansh (Switzerland)", + "rn": "Rundi", + "rn-BI": "Rundi (Burundi)", + "ro": "Romanian", + "ro-MD": "Romanian (Moldova)", + "ro-RO": "Romanian (Romania)", + "rof": "Rombo", + "rof-TZ": "Rombo (Tanzania)", + "und": "Unknown language", + "ru": "Russian", + "ru-BY": "Russian (Belarus)", + "ru-KG": "Russian (Kyrgyzstan)", + "ru-KZ": "Russian 
(Kazakhstan)", + "ru-MD": "Russian (Moldova)", + "ru-RU": "Russian (Russia)", + "ru-UA": "Russian (Ukraine)", + "rw": "Kinyarwanda", + "rw-RW": "Kinyarwanda (Rwanda)", + "rwk": "Rwa", + "rwk-TZ": "Rwa (Tanzania)", + "sah": "Sakha", + "sah-RU": "Sakha (Russia)", + "saq": "Samburu", + "saq-KE": "Samburu (Kenya)", + "sat": "Santali", + "sat-Olck": "Santali (Ol Chiki)", + "sat-Olck-IN": "Santali (Ol Chiki, India)", + "sbp": "Sangu", + "sbp-TZ": "Sangu (Tanzania)", + "sd": "Sindhi", + "sd-Arab": "Sindhi (Arabic)", + "sd-Arab-PK": "Sindhi (Arabic, Pakistan)", + "sd-Deva": "Sindhi (Devanagari)", + "sd-Deva-IN": "Sindhi (Devanagari, India)", + "se": "Northern Sami", + "se-FI": "Northern Sami (Finland)", + "se-NO": "Northern Sami (Norway)", + "se-SE": "Northern Sami (Sweden)", + "seh": "Sena", + "seh-MZ": "Sena (Mozambique)", + "ses": "Koyraboro Senni", + "ses-ML": "Koyraboro Senni (Mali)", + "sg": "Sango", + "sg-CF": "Sango (Central African Republic)", + "shi": "Tachelhit", + "shi-Latn": "Tachelhit (Latin)", + "shi-Latn-MA": "Tachelhit (Latin, Morocco)", + "shi-Tfng": "Tachelhit (Tifinagh)", + "shi-Tfng-MA": "Tachelhit (Tifinagh, Morocco)", + "si": "Sinhala", + "si-LK": "Sinhala (Sri Lanka)", + "sk": "Slovak", + "sk-SK": "Slovak (Slovakia)", + "sl": "Slovenian", + "sl-SI": "Slovenian (Slovenia)", + "smn": "Inari Sami", + "smn-FI": "Inari Sami (Finland)", + "sn": "Shona", + "sn-ZW": "Shona (Zimbabwe)", + "so": "Somali", + "so-DJ": "Somali (Djibouti)", + "so-ET": "Somali (Ethiopia)", + "so-KE": "Somali (Kenya)", + "so-SO": "Somali (Somalia)", + "sq": "Albanian", + "sq-AL": "Albanian (Albania)", + "sq-MK": "Albanian (North Macedonia)", + "sq-XK": "Albanian (Kosovo)", + "sr": "Serbian", + "sr-Cyrl": "Serbian (Cyrillic)", + "sr-Cyrl-BA": "Serbian (Cyrillic, Bosnia & Herzegovina)", + "sr-Cyrl-ME": "Serbian (Cyrillic, Montenegro)", + "sr-Cyrl-RS": "Serbian (Cyrillic, Serbia)", + "sr-Cyrl-XK": "Serbian (Cyrillic, Kosovo)", + "sr-Latn": "Serbian (Latin)", + "sr-Latn-BA": "Serbian 
(Latin, Bosnia & Herzegovina)", + "sr-Latn-ME": "Serbian (Latin, Montenegro)", + "sr-Latn-RS": "Serbian (Latin, Serbia)", + "sr-Latn-XK": "Serbian (Latin, Kosovo)", + "su": "Sundanese", + "su-Latn": "Sundanese (Latin)", + "su-Latn-ID": "Sundanese (Latin, Indonesia)", + "sv": "Swedish", + "sv-AX": "Swedish (\u00c5land Islands)", + "sv-FI": "Swedish (Finland)", + "sv-SE": "Swedish (Sweden)", + "sw": "Swahili", + "sw-CD": "Swahili (Congo - Kinshasa)", + "sw-KE": "Swahili (Kenya)", + "sw-TZ": "Swahili (Tanzania)", + "sw-UG": "Swahili (Uganda)", + "ta": "Tamil", + "ta-IN": "Tamil (India)", + "ta-LK": "Tamil (Sri Lanka)", + "ta-MY": "Tamil (Malaysia)", + "ta-SG": "Tamil (Singapore)", + "te": "Telugu", + "te-IN": "Telugu (India)", + "teo": "Teso", + "teo-KE": "Teso (Kenya)", + "teo-UG": "Teso (Uganda)", + "tg": "Tajik", + "tg-TJ": "Tajik (Tajikistan)", + "th": "Thai", + "th-TH": "Thai (Thailand)", + "ti": "Tigrinya", + "ti-ER": "Tigrinya (Eritrea)", + "ti-ET": "Tigrinya (Ethiopia)", + "tk": "Turkmen", + "tk-TM": "Turkmen (Turkmenistan)", + "to": "Tongan", + "to-TO": "Tongan (Tonga)", + "tr": "Turkish", + "tr-CY": "Turkish (Cyprus)", + "tr-TR": "Turkish (Turkey)", + "tt": "Tatar", + "tt-RU": "Tatar (Russia)", + "twq": "Tasawaq", + "twq-NE": "Tasawaq (Niger)", + "tzm": "Central Atlas Tamazight", + "tzm-MA": "Central Atlas Tamazight (Morocco)", + "ug": "Uyghur", + "ug-CN": "Uyghur (China)", + "uk": "Ukrainian", + "uk-UA": "Ukrainian (Ukraine)", + "ur": "Urdu", + "ur-IN": "Urdu (India)", + "ur-PK": "Urdu (Pakistan)", + "uz": "Uzbek", + "uz-Arab": "Uzbek (Arabic)", + "uz-Arab-AF": "Uzbek (Arabic, Afghanistan)", + "uz-Cyrl": "Uzbek (Cyrillic)", + "uz-Cyrl-UZ": "Uzbek (Cyrillic, Uzbekistan)", + "uz-Latn": "Uzbek (Latin)", + "uz-Latn-UZ": "Uzbek (Latin, Uzbekistan)", + "vai": "Vai", + "vai-Latn": "Vai (Latin)", + "vai-Latn-LR": "Vai (Latin, Liberia)", + "vai-Vaii": "Vai (Vai)", + "vai-Vaii-LR": "Vai (Vai, Liberia)", + "vi": "Vietnamese", + "vi-VN": "Vietnamese (Vietnam)", + "vo": 
"Volap\u00fck", + "vo-001": "Volap\u00fck (World)", + "vun": "Vunjo", + "vun-TZ": "Vunjo (Tanzania)", + "wae": "Walser", + "wae-CH": "Walser (Switzerland)", + "wo": "Wolof", + "wo-SN": "Wolof (Senegal)", + "xh": "Xhosa", + "xh-ZA": "Xhosa (South Africa)", + "xog": "Soga", + "xog-UG": "Soga (Uganda)", + "yav": "Yangben", + "yav-CM": "Yangben (Cameroon)", + "yi": "Yiddish", + "yi-001": "Yiddish (World)", + "yo": "Yoruba", + "yo-BJ": "Yoruba (Benin)", + "yo-NG": "Yoruba (Nigeria)", + "yue": "Cantonese", + "yue-Hans": "Cantonese (Simplified)", + "yue-Hans-CN": "Cantonese (Simplified, China)", + "yue-Hant": "Cantonese (Traditional)", + "yue-Hant-HK": "Cantonese (Traditional, Hong Kong SAR China)", + "zgh": "Standard Moroccan Tamazight", + "zgh-MA": "Standard Moroccan Tamazight (Morocco)", + "zh": "Chinese", + "zh-Hans": "Chinese (Simplified)", + "zh-Hans-CN": "Chinese (Simplified, China)", + "zh-Hans-HK": "Chinese (Simplified, Hong Kong SAR China)", + "zh-Hans-MO": "Chinese (Simplified, Macao SAR China)", + "zh-Hans-SG": "Chinese (Simplified, Singapore)", + "zh-Hant": "Chinese (Traditional)", + "zh-Hant-HK": "Chinese (Traditional, Hong Kong SAR China)", + "zh-Hant-MO": "Chinese (Traditional, Macao SAR China)", + "zh-Hant-TW": "Chinese (Traditional, Taiwan)", + "zu": "Zulu", + "zu-ZA": "Zulu (South Africa)" +} \ No newline at end of file diff --git a/src/datasets/utils/resources/licenses.json b/src/datasets/utils/resources/licenses.json new file mode 100644 index 00000000000..31e76c43d48 --- /dev/null +++ b/src/datasets/utils/resources/licenses.json @@ -0,0 +1,452 @@ +{ + "other": "Other license", + "unknown": "License information unavailable", + "0bsd": "BSD Zero Clause License", + "aal": "Attribution Assurance License", + "abstyles": "Abstyles License", + "adobe-2006": "Adobe Systems Incorporated Source Code License Agreement", + "adobe-glyph": "Adobe Glyph List License", + "adsl": "Amazon Digital Services License", + "afl-1.1": "Academic Free License v1.1", + 
"afl-1.2": "Academic Free License v1.2", + "afl-2.0": "Academic Free License v2.0", + "afl-2.1": "Academic Free License v2.1", + "afl-3.0": "Academic Free License v3.0", + "afmparse": "Afmparse License", + "agpl-1.0": "Affero General Public License v1.0", + "agpl-1.0-only": "Affero General Public License v1.0 only", + "agpl-1.0-or-later": "Affero General Public License v1.0 or later", + "agpl-3.0": "GNU Affero General Public License v3.0", + "agpl-3.0-only": "GNU Affero General Public License v3.0 only", + "agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later", + "aladdin": "Aladdin Free Public License", + "amdplpa": "AMD's plpa_map.c License", + "aml": "Apple MIT License", + "ampas": "Academy of Motion Picture Arts and Sciences BSD", + "antlr-pd": "ANTLR Software Rights Notice", + "antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback", + "apache-1.0": "Apache License 1.0", + "apache-1.1": "Apache License 1.1", + "apache-2.0": "Apache License 2.0", + "apafml": "Adobe Postscript AFM License", + "apl-1.0": "Adaptive Public License 1.0", + "apsl-1.0": "Apple Public Source License 1.0", + "apsl-1.1": "Apple Public Source License 1.1", + "apsl-1.2": "Apple Public Source License 1.2", + "apsl-2.0": "Apple Public Source License 2.0", + "artistic-1.0": "Artistic License 1.0", + "artistic-1.0-cl8": "Artistic License 1.0 w/clause 8", + "artistic-1.0-perl": "Artistic License 1.0 (Perl)", + "artistic-2.0": "Artistic License 2.0", + "bahyph": "Bahyph License", + "barr": "Barr License", + "beerware": "Beerware License", + "bittorrent-1.0": "BitTorrent Open Source License v1.0", + "bittorrent-1.1": "BitTorrent Open Source License v1.1", + "blessing": "SQLite Blessing", + "blueoak-1.0.0": "Blue Oak Model License 1.0.0", + "borceux": "Borceux license", + "bsd-1-clause": "BSD 1-Clause License", + "bsd-2-clause": "BSD 2-Clause \"Simplified\" License", + "bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License", + "bsd-2-clause-netbsd": "BSD 2-Clause 
NetBSD License", + "bsd-2-clause-patent": "BSD-2-Clause Plus Patent License", + "bsd-2-clause-views": "BSD 2-Clause with views sentence", + "bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License", + "bsd-3-clause-attribution": "BSD with attribution", + "bsd-3-clause-clear": "BSD 3-Clause Clear License", + "bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license", + "bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License", + "bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014", + "bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty", + "bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant", + "bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License", + "bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)", + "bsd-protection": "BSD Protection License", + "bsd-source-code": "BSD Source Code Attribution", + "bsl-1.0": "Boost Software License 1.0", + "busl-1.1": "Business Source License 1.1", + "bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5", + "bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6", + "cal-1.0": "Cryptographic Autonomy License 1.0", + "cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)", + "caldera": "Caldera License", + "catosl-1.1": "Computer Associates Trusted Open Source License 1.1", + "cc-by-1.0": "Creative Commons Attribution 1.0 Generic", + "cc-by-2.0": "Creative Commons Attribution 2.0 Generic", + "cc-by-2.5": "Creative Commons Attribution 2.5 Generic", + "cc-by-3.0": "Creative Commons Attribution 3.0 Unported", + "cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria", + "cc-by-3.0-us": "Creative Commons Attribution 3.0 United States", + "cc-by-4.0": "Creative Commons Attribution 4.0 International", + "cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic", + "cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic", + "cc-by-nc-2.5": "Creative Commons Attribution 
Non Commercial 2.5 Generic", + "cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported", + "cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International", + "cc-by-nc-nd-1.0": "Creative Commons Attribution Non Commercial No Derivatives 1.0 Generic", + "cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic", + "cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic", + "cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported", + "cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO", + "cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International", + "cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic", + "cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic", + "cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic", + "cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported", + "cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International", + "cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic", + "cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic", + "cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic", + "cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported", + "cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International", + "cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic", + "cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic", + "cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales", + "cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic", + "cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 
Unported", + "cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria", + "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International", + "cc-pddc": "Creative Commons Public Domain Dedication and Certification", + "cc0-1.0": "Creative Commons Zero v1.0 Universal", + "cddl-1.0": "Common Development and Distribution License 1.0", + "cddl-1.1": "Common Development and Distribution License 1.1", + "cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0", + "cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0", + "cecill-1.0": "CeCILL Free Software License Agreement v1.0", + "cecill-1.1": "CeCILL Free Software License Agreement v1.1", + "cecill-2.0": "CeCILL Free Software License Agreement v2.0", + "cecill-2.1": "CeCILL Free Software License Agreement v2.1", + "cecill-b": "CeCILL-B Free Software License Agreement", + "cecill-c": "CeCILL-C Free Software License Agreement", + "cern-ohl-1.1": "CERN Open Hardware Licence v1.1", + "cern-ohl-1.2": "CERN Open Hardware Licence v1.2", + "cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive", + "cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal", + "cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal", + "clartistic": "Clarified Artistic License", + "cnri-jython": "CNRI Jython License", + "cnri-python": "CNRI Python License", + "cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement", + "condor-1.1": "Condor Public License v1.1", + "copyleft-next-0.3.0": "copyleft-next 0.3.0", + "copyleft-next-0.3.1": "copyleft-next 0.3.1", + "cpal-1.0": "Common Public Attribution License 1.0", + "cpl-1.0": "Common Public License 1.0", + "cpol-1.02": "Code Project Open License 1.02", + "crossword": "Crossword License", + "crystalstacker": "CrystalStacker License", + "cua-opl-1.0": "CUA Office Public License v1.0", + "cube": "Cube License", + "curl": "curl License", + "d-fsl-1.0": "Deutsche 
Freie Software Lizenz", + "diffmark": "diffmark license", + "doc": "DOC License", + "dotseqn": "Dotseqn License", + "dsdp": "DSDP License", + "dvipdfm": "dvipdfm License", + "ecl-1.0": "Educational Community License v1.0", + "ecl-2.0": "Educational Community License v2.0", + "ecos-2.0": "eCos license version 2.0", + "efl-1.0": "Eiffel Forum License v1.0", + "efl-2.0": "Eiffel Forum License v2.0", + "egenix": "eGenix.com Public License 1.1.0", + "entessa": "Entessa Public License v1.0", + "epics": "EPICS Open License", + "epl-1.0": "Eclipse Public License 1.0", + "epl-2.0": "Eclipse Public License 2.0", + "erlpl-1.1": "Erlang Public License v1.1", + "etalab-2.0": "Etalab Open License 2.0", + "eudatagrid": "EU DataGrid Software License", + "eupl-1.0": "European Union Public License 1.0", + "eupl-1.1": "European Union Public License 1.1", + "eupl-1.2": "European Union Public License 1.2", + "eurosym": "Eurosym License", + "fair": "Fair License", + "frameworx-1.0": "Frameworx Open License 1.0", + "freeimage": "FreeImage Public License v1.0", + "fsfap": "FSF All Permissive License", + "fsful": "FSF Unlimited License", + "fsfullr": "FSF Unlimited License (with License Retention)", + "ftl": "Freetype Project License", + "gfdl-1.1": "GNU Free Documentation License v1.1", + "gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants", + "gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants", + "gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants", + "gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants", + "gfdl-1.1-only": "GNU Free Documentation License v1.1 only", + "gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later", + "gfdl-1.2": "GNU Free Documentation License v1.2", + "gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants", + "gfdl-1.2-invariants-or-later": "GNU Free Documentation License 
v1.2 or later - invariants", + "gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants", + "gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants", + "gfdl-1.2-only": "GNU Free Documentation License v1.2 only", + "gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later", + "gfdl-1.3": "GNU Free Documentation License v1.3", + "gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants", + "gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants", + "gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants", + "gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants", + "gfdl-1.3-only": "GNU Free Documentation License v1.3 only", + "gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later", + "giftware": "Giftware License", + "gl2ps": "GL2PS License", + "glide": "3dfx Glide License", + "glulxe": "Glulxe License", + "glwtpl": "Good Luck With That Public License", + "gnuplot": "gnuplot License", + "gpl-1.0": "GNU General Public License v1.0 only", + "gpl-1.0+": "GNU General Public License v1.0 or later", + "gpl-1.0-only": "GNU General Public License v1.0 only", + "gpl-1.0-or-later": "GNU General Public License v1.0 or later", + "gpl-2.0": "GNU General Public License v2.0 only", + "gpl-2.0+": "GNU General Public License v2.0 or later", + "gpl-2.0-only": "GNU General Public License v2.0 only", + "gpl-2.0-or-later": "GNU General Public License v2.0 or later", + "gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception", + "gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception", + "gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception", + "gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception", + "gpl-2.0-with-gcc-exception": "GNU 
General Public License v2.0 w/GCC Runtime Library exception", + "gpl-3.0": "GNU General Public License v3.0 only", + "gpl-3.0+": "GNU General Public License v3.0 or later", + "gpl-3.0-only": "GNU General Public License v3.0 only", + "gpl-3.0-or-later": "GNU General Public License v3.0 or later", + "gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception", + "gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception", + "gsoap-1.3b": "gSOAP Public License v1.3b", + "haskellreport": "Haskell Language Report License", + "hippocratic-2.1": "Hippocratic License 2.1", + "hpnd": "Historical Permission Notice and Disclaimer", + "hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant", + "htmltidy": "HTML Tidy License", + "ibm-pibs": "IBM PowerPC Initialization and Boot Software", + "icu": "ICU License", + "ijg": "Independent JPEG Group License", + "imagemagick": "ImageMagick License", + "imatix": "iMatix Standard Function Library Agreement", + "imlib2": "Imlib2 License", + "info-zip": "Info-ZIP License", + "intel": "Intel Open Source License", + "intel-acpi": "Intel ACPI Software License Agreement", + "interbase-1.0": "Interbase Public License v1.0", + "ipa": "IPA Font License", + "ipl-1.0": "IBM Public License v1.0", + "isc": "ISC License", + "jasper-2.0": "JasPer License", + "jpnic": "Japan Network Information Center License", + "json": "JSON License", + "lal-1.2": "Licence Art Libre 1.2", + "lal-1.3": "Licence Art Libre 1.3", + "latex2e": "Latex2e License", + "leptonica": "Leptonica License", + "lgpl-2.0": "GNU Library General Public License v2 only", + "lgpl-2.0+": "GNU Library General Public License v2 or later", + "lgpl-2.0-only": "GNU Library General Public License v2 only", + "lgpl-2.0-or-later": "GNU Library General Public License v2 or later", + "lgpl-2.1": "GNU Lesser General Public License v2.1 only", + "lgpl-2.1+": "GNU Library General Public License v2.1 or later", + 
"lgpl-2.1-only": "GNU Lesser General Public License v2.1 only", + "lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later", + "lgpl-3.0": "GNU Lesser General Public License v3.0 only", + "lgpl-3.0+": "GNU Lesser General Public License v3.0 or later", + "lgpl-3.0-only": "GNU Lesser General Public License v3.0 only", + "lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later", + "lgpllr": "Lesser General Public License For Linguistic Resources", + "libpng": "libpng License", + "libpng-2.0": "PNG Reference Library version 2", + "libselinux-1.0": "libselinux public domain notice", + "libtiff": "libtiff License", + "liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1", + "liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1", + "liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 1.1", + "linux-openib": "Linux Kernel Variant of OpenIB.org license", + "lpl-1.0": "Lucent Public License Version 1.0", + "lpl-1.02": "Lucent Public License v1.02", + "lppl-1.0": "LaTeX Project Public License v1.0", + "lppl-1.1": "LaTeX Project Public License v1.1", + "lppl-1.2": "LaTeX Project Public License v1.2", + "lppl-1.3a": "LaTeX Project Public License v1.3a", + "lppl-1.3c": "LaTeX Project Public License v1.3c", + "makeindex": "MakeIndex License", + "miros": "The MirOS Licence", + "mit": "MIT License", + "mit-0": "MIT No Attribution", + "mit-advertising": "Enlightenment License (e16)", + "mit-cmu": "CMU License", + "mit-enna": "enna License", + "mit-feh": "feh License", + "mit-open-group": "MIT Open Group variant", + "mitnfa": "MIT +no-false-attribs license", + "motosoto": "Motosoto License", + "mpich2": "mpich2 License", + "mpl-1.0": "Mozilla Public License 1.0", + "mpl-1.1": "Mozilla Public License 1.1", + "mpl-2.0": "Mozilla Public License 2.0", + "mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)", + "ms-pl": "Microsoft Public 
License", + "ms-rl": "Microsoft Reciprocal License", + "mtll": "Matrix Template Library License", + "mulanpsl-1.0": "Mulan Permissive Software License, Version 1", + "mulanpsl-2.0": "Mulan Permissive Software License, Version 2", + "multics": "Multics License", + "mup": "Mup License", + "nasa-1.3": "NASA Open Source Agreement 1.3", + "naumen": "Naumen Public License", + "nbpl-1.0": "Net Boolean Public License v1", + "ncgl-uk-2.0": "Non-Commercial Government Licence", + "ncsa": "University of Illinois/NCSA Open Source License", + "net-snmp": "Net-SNMP License", + "netcdf": "NetCDF license", + "newsletr": "Newsletr License", + "ngpl": "Nethack General Public License", + "nist-pd": "NIST Public Domain Notice", + "nist-pd-fallback": "NIST Public Domain Notice with license fallback", + "nlod-1.0": "Norwegian Licence for Open Government Data", + "nlpl": "No Limit Public License", + "nokia": "Nokia Open Source License", + "nosl": "Netizen Open Source License", + "noweb": "Noweb License", + "npl-1.0": "Netscape Public License v1.0", + "npl-1.1": "Netscape Public License v1.1", + "nposl-3.0": "Non-Profit Open Software License 3.0", + "nrl": "NRL License", + "ntp": "NTP License", + "ntp-0": "NTP No Attribution", + "nunit": "Nunit License", + "o-uda-1.0": "Open Use of Data Agreement v1.0", + "occt-pl": "Open CASCADE Technology Public License", + "oclc-2.0": "OCLC Research Public License 2.0", + "odbl-1.0": "ODC Open Database License v1.0", + "odc-by-1.0": "Open Data Commons Attribution License v1.0", + "ofl-1.0": "SIL Open Font License 1.0", + "ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name", + "ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name", + "ofl-1.1": "SIL Open Font License 1.1", + "ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name", + "ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name", + "ogc-1.0": "OGC Software License, Version 1.0", + "ogl-canada-2.0": "Open Government Licence - Canada", + 
"ogl-uk-1.0": "Open Government Licence v1.0", + "ogl-uk-2.0": "Open Government Licence v2.0", + "ogl-uk-3.0": "Open Government Licence v3.0", + "ogtsl": "Open Group Test Suite License", + "oldap-1.1": "Open LDAP Public License v1.1", + "oldap-1.2": "Open LDAP Public License v1.2", + "oldap-1.3": "Open LDAP Public License v1.3", + "oldap-1.4": "Open LDAP Public License v1.4", + "oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)", + "oldap-2.0.1": "Open LDAP Public License v2.0.1", + "oldap-2.1": "Open LDAP Public License v2.1", + "oldap-2.2": "Open LDAP Public License v2.2", + "oldap-2.2.1": "Open LDAP Public License v2.2.1", + "oldap-2.2.2": "Open LDAP Public License 2.2.2", + "oldap-2.3": "Open LDAP Public License v2.3", + "oldap-2.4": "Open LDAP Public License v2.4", + "oldap-2.5": "Open LDAP Public License v2.5", + "oldap-2.6": "Open LDAP Public License v2.6", + "oldap-2.7": "Open LDAP Public License v2.7", + "oldap-2.8": "Open LDAP Public License v2.8", + "oml": "Open Market License", + "openssl": "OpenSSL License", + "opl-1.0": "Open Public License v1.0", + "oset-pl-2.1": "OSET Public License version 2.1", + "osl-1.0": "Open Software License 1.0", + "osl-1.1": "Open Software License 1.1", + "osl-2.0": "Open Software License 2.0", + "osl-2.1": "Open Software License 2.1", + "osl-3.0": "Open Software License 3.0", + "parity-6.0.0": "The Parity Public License 6.0.0", + "parity-7.0.0": "The Parity Public License 7.0.0", + "pddl-1.0": "ODC Public Domain Dedication & License 1.0", + "php-3.0": "PHP License v3.0", + "php-3.01": "PHP License v3.01", + "plexus": "Plexus Classworlds License", + "polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0", + "polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0", + "postgresql": "PostgreSQL License", + "psf-2.0": "Python Software Foundation License 2.0", + "psfrag": "psfrag License", + "psutils": "psutils License", + "python-2.0": "Python License 2.0", + "qhull": "Qhull 
License", + "qpl-1.0": "Q Public License 1.0", + "rdisc": "Rdisc License", + "rhecos-1.1": "Red Hat eCos Public License v1.1", + "rpl-1.1": "Reciprocal Public License 1.1", + "rpl-1.5": "Reciprocal Public License 1.5", + "rpsl-1.0": "RealNetworks Public Source License v1.0", + "rsa-md": "RSA Message-Digest License", + "rscpl": "Ricoh Source Code Public License", + "ruby": "Ruby License", + "sax-pd": "Sax Public Domain Notice", + "saxpath": "Saxpath License", + "scea": "SCEA Shared Source License", + "sendmail": "Sendmail License", + "sendmail-8.23": "Sendmail License 8.23", + "sgi-b-1.0": "SGI Free Software License B v1.0", + "sgi-b-1.1": "SGI Free Software License B v1.1", + "sgi-b-2.0": "SGI Free Software License B v2.0", + "shl-0.5": "Solderpad Hardware License v0.5", + "shl-0.51": "Solderpad Hardware License, Version 0.51", + "simpl-2.0": "Simple Public License 2.0", + "sissl": "Sun Industry Standards Source License v1.1", + "sissl-1.2": "Sun Industry Standards Source License v1.2", + "sleepycat": "Sleepycat License", + "smlnj": "Standard ML of New Jersey License", + "smppl": "Secure Messaging Protocol Public License", + "snia": "SNIA Public License 1.1", + "spencer-86": "Spencer License 86", + "spencer-94": "Spencer License 94", + "spencer-99": "Spencer License 99", + "spl-1.0": "Sun Public License v1.0", + "ssh-openssh": "SSH OpenSSH license", + "ssh-short": "SSH short notice", + "sspl-1.0": "Server Side Public License, v 1", + "standardml-nj": "Standard ML of New Jersey License", + "sugarcrm-1.1.3": "SugarCRM Public License v1.1.3", + "swl": "Scheme Widget Library (SWL) Software License Agreement", + "tapr-ohl-1.0": "TAPR Open Hardware License v1.0", + "tcl": "TCL/TK License", + "tcp-wrappers": "TCP Wrappers License", + "tmate": "TMate Open Source License", + "torque-1.1": "TORQUE v2.5+ Software License v1.1", + "tosl": "Trusster Open Source License", + "tu-berlin-1.0": "Technische Universitaet Berlin License 1.0", + "tu-berlin-2.0": "Technische Universitaet 
Berlin License 2.0", + "ucl-1.0": "Upstream Compatibility License v1.0", + "unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)", + "unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)", + "unicode-tou": "Unicode Terms of Use", + "unlicense": "The Unlicense", + "upl-1.0": "Universal Permissive License v1.0", + "vim": "Vim License", + "vostrom": "VOSTROM Public License for Open Source", + "vsl-1.0": "Vovida Software License v1.0", + "w3c": "W3C Software Notice and License (2002-12-31)", + "w3c-19980720": "W3C Software Notice and License (1998-07-20)", + "w3c-20150513": "W3C Software Notice and Document License (2015-05-13)", + "watcom-1.0": "Sybase Open Watcom Public License 1.0", + "wsuipa": "Wsuipa License", + "wtfpl": "Do What The F*ck You Want To Public License", + "wxwindows": "wxWindows Library License", + "x11": "X11 License", + "xerox": "Xerox License", + "xfree86-1.1": "XFree86 License 1.1", + "xinetd": "xinetd License", + "xnet": "X.Net License", + "xpp": "XPP License", + "xskat": "XSkat License", + "ypl-1.0": "Yahoo! Public License v1.0", + "ypl-1.1": "Yahoo! 
Public License v1.1", + "zed": "Zed License", + "zend-2.0": "Zend License v2.0", + "zimbra-1.3": "Zimbra Public License v1.3", + "zimbra-1.4": "Zimbra Public License v1.4", + "zlib": "zlib License", + "zlib-acknowledgement": "zlib/libpng License with Acknowledgement", + "zpl-1.1": "Zope Public License 1.1", + "zpl-2.0": "Zope Public License 2.0", + "zpl-2.1": "Zope Public License 2.1" +} \ No newline at end of file diff --git a/src/datasets/utils/resources/multilingualities.json b/src/datasets/utils/resources/multilingualities.json new file mode 100644 index 00000000000..a35c79f03df --- /dev/null +++ b/src/datasets/utils/resources/multilingualities.json @@ -0,0 +1,6 @@ +{ + "monolingual": "contains a single language", + "multilingual": "contains multiple languages", + "translation": "contains translated or aligned text", + "other": "other type of language distribution" +} diff --git a/src/datasets/utils/resources/size_categories.json b/src/datasets/utils/resources/size_categories.json new file mode 100644 index 00000000000..983ce0c10db --- /dev/null +++ b/src/datasets/utils/resources/size_categories.json @@ -0,0 +1,14 @@ +[ + "unknown", + "n<1K", + "1K<n<10K", + "10K<n<100K", + "100K<n<1M", + "1M<n<10M", + "10M<n<100M", + "100M<n<1B", + "1B<n<10B", + "10B<n<100B", + "100B<n<1T", + "n>1T" +] diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json new file mode 100644 index 00000000000..966ba106c9d --- /dev/null +++ b/src/datasets/utils/resources/tasks.json @@ -0,0 +1,86 @@ +{ + "conditional-text-generation": { + "description": "data-to-text and text transduction tasks such as translation or summarization", + "options": [ + "machine-translation", + "sentence-splitting-fusion", + "summarization", + "table-to-text", + "text-simplification", + "explanation-generation", + "other-stuctured-to-text", + "other" + ] + }, + "question-answering": { + "description": "question answering tasks", + "options": [ + "open-domain-qa", + "closed-domain-qa", + "multiple-choice-qa", + "extractive-qa", + "abstractive-qa", + "other" + ] + }, + "sequence-modeling": { + "description":
"such as language modeling or dialogue", + "options": [ + "dialogue-modeling", + "language-modeling", + "other-multi-turn", + "slot-filling", + "other" + ] + }, + "structure-prediction": { + "description": "predicting structural properties of the text, such as syntax", + "options": [ + "coreference-resolution", + "named-entity-recognition", + "part-of-speech-tagging", + "parsing", + "other" + ] + }, + "text-classification": { + "description": "predicting a class index or boolean value", + "options": [ + "acceptability-classification", + "entity-linking-classification", + "fact-checking", + "intent-classification", + "multi-class-classification", + "multi-label-classification", + "natural-language-inference", + "semantic-similarity-classification", + "sentiment-classification", + "topic-classification", + "other" + ] + }, + "text-retrieval": { + "description": "information or text retrieval tasks", + "options": [ + "document-retrieval", + "utterance-retrieval", + "entity-linking-retrieval", + "fact-checking-retrieval", + "other" + ] + }, + "text-scoring": { + "description": "text scoring tasks, predicting a real valued score for some text", + "options": [ + "semantic-similarity-scoring", + "sentiment-scoring", + "other" + ] + }, + "other": { + "description": "other task family not mentioned here", + "options": [ + "other" + ] + } +} diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py new file mode 100644 index 00000000000..a24e0741862 --- /dev/null +++ b/tests/test_metadata_util.py @@ -0,0 +1,236 @@ +import tempfile +import unittest +from pathlib import Path + +from datasets.utils.metadata import ( + DatasetMetadata, + escape_validation_for_predicate, + metadata_dict_from_readme, + tagset_validator, + validate_metadata_type, + yaml_block_from_readme, +) + + +def _dedent(string: str) -> str: + return "\n".join([line.lstrip() for line in string.splitlines()]) + + +README_YAML = """\ +--- +languages: +- zh +- en +task_ids: +- sentiment-classification 
+--- +# Begin of markdown + +Some cool dataset card +""" + +README_EMPTY_YAML = """\ +--- +--- +# Begin of markdown + +Some cool dataset card +""" + + +README_NO_YAML = """\ +# Begin of markdown + +Some cool dataset card +""" + + +class TestMetadataUtils(unittest.TestCase): + def test_validate_metadata_type(self): + metadata_dict = { + "tag": ["list", "of", "values"], + "another tag": ["Another", {"list"}, ["of"], 0x646D46736457567A], + } + validate_metadata_type(metadata_dict) + + metadata_dict = {"tag1": []} + with self.assertRaises(TypeError): + validate_metadata_type(metadata_dict) + + metadata_dict = {"tag1": None} + with self.assertRaises(TypeError): + validate_metadata_type(metadata_dict) + + def test_tagset_validator(self): + name = "test_tag" + url = "https://dummy.hf.co" + + values = ["tag1", "tag2", "tag2", "tag3"] + reference_values = ["tag1", "tag2", "tag3"] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, values) + self.assertIsNone(error) + + values = [] + reference_values = ["tag1", "tag2", "tag3"] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, values) + self.assertIsNone(error) + + values = [] + reference_values = [] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, values) + self.assertIsNone(error) + + values = ["tag1", "tag2", "tag2", "tag3", "unknown tag"] + reference_values = ["tag1", "tag2", "tag3"] + returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url) + self.assertListEqual(returned_values, []) + self.assertEqual(error, f"{['unknown tag']} are not registered tags for '{name}', reference at {url}") + + def test_escape_validation_for_predicate(self): + def 
predicate_fn(string: str) -> bool: + return "ignore" in string + + values = ["process me", "process me too", "ignore me"] + to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) + self.assertListEqual(to_ignore, ["ignore me"]) + self.assertListEqual(to_validate, ["process me", "process me too"]) + + values = ["process me", "process me too"] + to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) + self.assertListEqual(to_ignore, []) + self.assertListEqual(to_validate, values) + + values = ["this value will be ignored", "ignore this one two"] + to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn) + self.assertListEqual(to_ignore, values) + self.assertListEqual(to_validate, []) + + def test_yaml_block_from_readme(self): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + + with open(path, "w+") as readme_file: + readme_file.write(README_YAML) + yaml_block = yaml_block_from_readme(path=path) + self.assertEqual( + yaml_block, + _dedent( + """\ + languages: + - zh + - en + task_ids: + - sentiment-classification +""" + ), + ) + + with open(path, "w+") as readme_file: + readme_file.write(README_EMPTY_YAML) + yaml_block = yaml_block_from_readme(path=path) + self.assertEqual( + yaml_block, + _dedent( + """\ + """ + ), + ) + + with open(path, "w+") as readme_file: + readme_file.write(README_NO_YAML) + yaml_block = yaml_block_from_readme(path=path) + self.assertIsNone(yaml_block) + + def test_metadata_dict_from_readme(self): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(README_YAML) + metadata_dict = metadata_dict_from_readme(path) + self.assertDictEqual(metadata_dict, {"languages": ["zh", "en"], "task_ids": ["sentiment-classification"]}) + + with open(path, "w+") as readme_file: + 
readme_file.write(README_EMPTY_YAML) + metadata_dict = metadata_dict_from_readme(path) + self.assertDictEqual(metadata_dict, {}) + + with open(path, "w+") as readme_file: + readme_file.write(README_NO_YAML) + metadata_dict = metadata_dict_from_readme(path) + self.assertIsNone(metadata_dict) + + def test_from_yaml_string(self): + valid_yaml_string = _dedent( + """\ + annotations_creators: + - found + language_creators: + - found + languages: + - en + licenses: + - unknown + multilinguality: + - monolingual + size_categories: + - 10K