diff --git a/.circleci/config.yml b/.circleci/config.yml
index cbd925431b7..42cbc406ba2 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -81,6 +81,7 @@ jobs:
             - run: black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
             - run: isort --check-only tests src benchmarks datasets metrics
             - run: flake8 tests src benchmarks datasets metrics
+            - run: ./scripts/datasets_metadata_validator.py
 
     build_doc:
         working_directory: ~/datasets
diff --git a/scripts/datasets_metadata_validator.py b/scripts/datasets_metadata_validator.py
new file mode 100755
index 00000000000..857d11c116e
--- /dev/null
+++ b/scripts/datasets_metadata_validator.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+""" This script will run in CI and make sure all new changes to datasets readme files have valid metadata yaml headers.
+
+"""
+
+from pathlib import Path
+from subprocess import check_output
+from typing import List
+
+from datasets.utils.metadata import DatasetMetadata
+
+
+def get_changed_files(repo_path: Path) -> List[Path]:
+    """Return the paths (under `repo_path`) of the files changed on HEAD since it forked from origin/master."""
+    diff_output = check_output(["git", "diff", "--name-only", "origin/master...HEAD"], cwd=repo_path)
+    return [Path(repo_path, f) for f in diff_output.decode().splitlines()]
+
+
+if __name__ == "__main__":
+    import logging
+    from argparse import ArgumentParser
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    ap = ArgumentParser()
+    ap.add_argument("--repo_path", type=Path, default=Path.cwd())
+    ap.add_argument("--check_all", action="store_true")
+    args = ap.parse_args()
+
+    repo_path: Path = args.repo_path
+    if args.check_all:
+        readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir() if dd.is_dir()]
+    else:
+        changed_files = get_changed_files(repo_path)
+        readmes = [
+            f
+            for f in changed_files
+            if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
+        ]
+    # Validate each README; keep going after a failure so that every problem gets reported.
+    failed: List[Path] = []
+    for readme in sorted(readmes):
+        try:
+            DatasetMetadata.from_readme(readme)
+            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
+        except TypeError as e:
+            failed.append(readme)
+            logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
+        except Exception as e:
+            failed.append(readme)
+            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")
+
+    if failed:
+        logging.info(f"❌ Failed on {len(failed)} files.")
+        raise SystemExit(1)
+    else:
+        logging.info("All is well, keep up the good work 🤗!")
+        raise SystemExit(0)
diff --git a/setup.py b/setup.py
index 52563ad0cdf..817c06189a9 100644
--- a/setup.py
+++ b/setup.py
@@ -52,16 +52,20 @@
 import os
 import sys
 
-from setuptools import find_packages
-from setuptools import setup
+from setuptools import find_packages, setup
+
 
 DOCLINES = __doc__.split("\n")
 
 
 # Pin some dependencies for old python versions
 _deps = {
-    "fsspec": "fsspec" if sys.version_info >= (3, 7) else "fsspec<0.8.1",  # fsspec>=0.8.1 requires py>=3.7 for async stuff
-    "s3fs": "s3fs" if sys.version_info >= (3, 7) else "s3fs==0.4.2",  # later versions of s3fs have issues downloading directories recursively for py36
+    "fsspec": "fsspec"
+    if sys.version_info >= (3, 7)
+    else "fsspec<0.8.1",  # fsspec>=0.8.1 requires py>=3.7 for async stuff
+    "s3fs": "s3fs"
+    if sys.version_info >= (3, 7)
+    else "s3fs==0.4.2",  # later versions of s3fs have issues downloading directories recursively for py36
 }
 
 
@@ -149,6 +153,8 @@
     "tldextract>=3.1.0",
     "texttable>=1.6.3",
     "Werkzeug>=1.0.1",
+    # metadata validation
+    "importlib_resources;python_version<'3.7'",
 ]
 
 if os.name == "nt":  # windows
@@ -167,11 +173,7 @@
     )
 
 
-QUALITY_REQUIRE = [
-    "black",
-    "isort",
-    "flake8==3.7.9",
-]
+QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "pyyaml>=5.3.1"]
 
 
 EXTRAS_REQUIRE = {
@@ -214,11 +216,7 @@
     license="Apache 2.0",
     package_dir={"": "src"},
     packages=find_packages("src"),
-    package_data={
-        "datasets": [
-            "scripts/templates/*",
-        ],
-    },
+    package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]},
     entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,
diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py
new file mode 100644
index 00000000000..e94fe6abe0f
--- /dev/null
+++ b/src/datasets/utils/metadata.py
@@ -0,0 +1,259 @@
+import json
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Tuple
+
+
+# loading package files: https://stackoverflow.com/a/20885799
+try:
+    import importlib.resources as pkg_resources
+except ImportError:
+    # Try backported to PY<37 `importlib_resources`.
+    import importlib_resources as pkg_resources
+
+import yaml
+
+from . import resources
+
+
+BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
+this_url = f"{BASE_REF_URL}/{Path(__file__).name}"  # file name only: `__file__` is a local filesystem path
+logger = logging.getLogger(__name__)
+
+
+def load_json_resource(resource: str) -> Tuple[Any, str]:
+    """Parse a JSON file of the `resources` package; return its content and its reference URL."""
+    return json.loads(pkg_resources.read_text(resources, resource)), f"{BASE_REF_URL}/resources/{resource}"
+
+
+# Reference tag sets, each loaded with the URL of its JSON file. languages.json comes from
+# https://datahub.io/core/language-codes/r/ietf-language-tags.csv
+# and its language names were obtained with langcodes: https://github.com/LuminosoInsight/langcodes
+known_language_codes, known_language_codes_url = load_json_resource("languages.json")
+known_licenses, known_licenses_url = load_json_resource("licenses.json")
+known_task_ids, known_task_ids_url = load_json_resource("tasks.json")
+known_creators, known_creators_url = load_json_resource("creators.json")
+known_size_categories, known_size_categories_url = load_json_resource("size_categories.json")
+known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json")
+
+
+def yaml_block_from_readme(path: Path) -> Optional[str]:
+    """Return the lines between the leading "---" line and the next one, or None without such a header."""
+    with path.open(encoding="utf-8") as readme_file:
+        # Only drop trailing whitespace: leading indentation is significant in YAML.
+        content = [line.rstrip() for line in readme_file]
+
+    if content and content[0] == "---" and "---" in content[1:]:
+        return "\n".join(content[1 : content[1:].index("---") + 1])
+    return None
+
+
+def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:
+    """Load a dataset's metadata from the dataset card (README.md), as a Python dict.
+
+    Returns None if the card has no YAML header, and an empty dict if the header is empty.
+    """
+    yaml_block = yaml_block_from_readme(path=path)
+    return None if yaml_block is None else (yaml.safe_load(yaml_block) or dict())
+
+
+ValidatorOutput = Tuple[List[str], Optional[str]]
+
+
+def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> ValidatorOutput:
+    """Return `(values, None)` when all `values` are in `reference_values`, else `([], error message)`."""
+    invalid_values = [tag for tag in values if tag not in reference_values]
+    error = f"{invalid_values} are not registered tags for '{name}', reference at {url}"
+    return ([], error) if invalid_values else (values, None)
+
+
+def escape_validation_for_predicate(
+    values: List[Any], predicate_fn: Callable[[Any], bool]
+) -> Tuple[List[Any], List[Any]]:
+    """Split `values`, in order, into those satisfying `predicate_fn` and the others; return both lists."""
+    # Evaluate the predicate exactly once per value.
+    verdicts = [(value, predicate_fn(value)) for value in values]
+    trues = [value for value, satisfied in verdicts if satisfied]
+    falses = [value for value, satisfied in verdicts if not satisfied]
+    if trues:
+        # The callers in this file return these values without running them through a tag set check.
+        logger.warning(f"The following values will escape validation: {trues}")
+    return trues, falses
+
+
+def validate_metadata_type(metadata_dict: dict):
+    """Raise a `TypeError` listing the fields whose value is not a non-empty list made only of strings."""
+    basic_typing_errors = {}
+    for name, value in metadata_dict.items():
+        if not (isinstance(value, list) and value and all(isinstance(item, str) for item in value)):
+            basic_typing_errors[name] = value
+    if basic_typing_errors:
+        raise TypeError(f"Found fields that are not non-empty list of strings: {basic_typing_errors}")
+
+
+@dataclass
+class DatasetMetadata:
+    """Tags of a dataset card's YAML header; every field is validated against its reference when instantiated."""
+    annotations_creators: List[str]
+    language_creators: List[str]
+    languages: List[str]
+    licenses: List[str]
+    multilinguality: List[str]
+    size_categories: List[str]
+    source_datasets: List[str]
+    task_categories: List[str]
+    task_ids: List[str]
+
+    def __post_init__(self):
+        """Type-check then validate every field, raising a `TypeError` that lists the validation errors found."""
+        validate_metadata_type(metadata_dict=vars(self))
+
+        self.annotations_creators, annotations_creators_errors = self.validate_annotations_creators(
+            self.annotations_creators
+        )
+        self.language_creators, language_creators_errors = self.validate_language_creators(self.language_creators)
+        self.languages, languages_errors = self.validate_language_codes(self.languages)
+        self.licenses, licenses_errors = self.validate_licences(self.licenses)
+        self.multilinguality, multilinguality_errors = self.validate_mulitlinguality(self.multilinguality)
+        self.size_categories, size_categories_errors = self.validate_size_catgeories(self.size_categories)
+        self.source_datasets, source_datasets_errors = self.validate_source_datasets(self.source_datasets)
+        self.task_categories, task_categories_errors = self.validate_task_categories(self.task_categories)
+        self.task_ids, task_ids_errors = self.validate_task_ids(self.task_ids)
+
+        errors = {
+            "annotations_creators": annotations_creators_errors,
+            "language_creators": language_creators_errors,
+            "licenses": licenses_errors,
+            "multilinguality": multilinguality_errors,
+            "size_categories": size_categories_errors,
+            "source_datasets": source_datasets_errors,
+            "task_categories": task_categories_errors,
+            "task_ids": task_ids_errors,
+            "languages": languages_errors,
+        }
+
+        exception_msg_dict = {field: errs for field, errs in errors.items() if errs is not None}
+        if exception_msg_dict:
+            raise TypeError(
+                "Could not validate the metada, found the following errors:\n"
+                + "\n".join(f"* field '{fieldname}':\n\t{err}" for fieldname, err in exception_msg_dict.items())
+            )
+
+    @classmethod
+    def from_readme(cls, path: Path) -> "DatasetMetadata":
+        """Loads and validates the dataset metadata from its dataset card (README.md)
+
+        Args:
+            path (:obj:`Path`): Path to the dataset card (its README.md file)
+
+        Returns:
+            :class:`DatasetMetadata`: The dataset's metadata
+
+        Raises:
+            :obj:`TypeError`: If the dataset card has no metadata (no YAML header)
+            :obj:`TypeError`: If the dataset's metadata is invalid
+        """
+        yaml_string = yaml_block_from_readme(path)
+        if yaml_string is not None:
+            return cls.from_yaml_string(yaml_string)
+        else:
+            raise TypeError(f"did not find a yaml block in '{path}'")
+
+    @classmethod
+    def from_yaml_string(cls, string: str) -> "DatasetMetadata":
+        """Loads and validates the dataset metadata from a YAML string
+
+        Args:
+            string (:obj:`str`): The YAML string
+
+        Returns:
+            :class:`DatasetMetadata`: The dataset's metadata
+
+        Raises:
+            :obj:`TypeError`: If the dataset's metadata is invalid
+        """
+        metada_dict = yaml.safe_load(string) or dict()
+        return cls(**metada_dict)
+
+    @staticmethod
+    def validate_annotations_creators(annotations_creators: List[str]) -> ValidatorOutput:
+        return tagset_validator(
+            annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url
+        )
+
+    @staticmethod
+    def validate_language_creators(language_creators: List[str]) -> ValidatorOutput:
+        return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url)
+
+    @staticmethod
+    def validate_language_codes(languages: List[str]) -> ValidatorOutput:
+        return tagset_validator(
+            values=languages,
+            reference_values=known_language_codes.keys(),
+            name="languages",
+            url=known_language_codes_url,
+        )
+
+    @staticmethod
+    def validate_licences(licenses: List[str]) -> ValidatorOutput:
+        others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e)
+        validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url)
+        return [*validated, *others], error
+
+    @staticmethod
+    def validate_task_categories(task_categories: List[str]) -> ValidatorOutput:
+        # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
+        #   in the near future and we don't want to waste energy in tagging against a moving taxonomy.
+        known_set = list(known_task_ids.keys())
+        others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other"))
+        validated, error = tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url)
+        return [*validated, *others], error
+
+    @staticmethod
+    def validate_task_ids(task_ids: List[str]) -> ValidatorOutput:
+        # TODO: we're currently ignoring all values containing '-other-' as our task taxonomy is bound to change
+        #   in the near future and we don't want to waste energy in tagging against a moving taxonomy.
+        known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
+        others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e)
+        validated, error = tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url)
+        return [*validated, *others], error
+
+    @staticmethod
+    def validate_mulitlinguality(multilinguality: List[str]) -> ValidatorOutput:
+        others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other"))
+        validated, error = tagset_validator(
+            to_validate, list(known_multilingualities.keys()), "multilinguality", known_multilingualities_url
+        )
+        return [*validated, *others], error
+
+    @staticmethod
+    def validate_size_catgeories(size_cats: List[str]) -> ValidatorOutput:
+        return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url)
+
+    @staticmethod
+    def validate_source_datasets(sources: List[str]) -> ValidatorOutput:
+        """Accept only "original", "extended", or a value starting with "extended|"."""
+        invalid_values = []
+        for src in sources:
+            is_ok = src in ["original", "extended"] or src.startswith("extended|")
+            if not is_ok:
+                invalid_values.append(src)
+        if len(invalid_values) > 0:
+            return (
+                [],
+                f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}",
+            )
+
+        return sources, None
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+
+    # `description` (not `usage`) keeps argparse's generated usage line, which lists `readme_filepath`.
+    ap = ArgumentParser(description="Validate the yaml metadata block of a README.md file.")
+    ap.add_argument("readme_filepath")
+    args = ap.parse_args()
+
+    DatasetMetadata.from_readme(Path(args.readme_filepath))
diff --git a/src/datasets/utils/resources/__init__.py b/src/datasets/utils/resources/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/datasets/utils/resources/creators.json b/src/datasets/utils/resources/creators.json
new file mode 100644
index 00000000000..d9e15f0039c
--- /dev/null
+++ b/src/datasets/utils/resources/creators.json
@@ -0,0 +1,17 @@
+{
+  "language": [
+    "found",
+    "crowdsourced",
+    "expert-generated",
+    "machine-generated",
+    "other"
+  ],
+  "annotations": [
+    "found",
+    "crowdsourced",
+    "expert-generated",
+    "machine-generated",
+    "no-annotation",
+    "other"
+  ]
+}
diff --git a/src/datasets/utils/resources/languages.json b/src/datasets/utils/resources/languages.json
new file mode 100644
index 00000000000..d0c77f66f11
--- /dev/null
+++ b/src/datasets/utils/resources/languages.json
@@ -0,0 +1,792 @@
+{
+    "af": "Afrikaans",
+    "af-NA": "Afrikaans (Namibia)",
+    "af-ZA": "Afrikaans (South Africa)",
+    "agq": "Aghem",
+    "agq-CM": "Aghem (Cameroon)",
+    "ak": "Akan",
+    "ak-GH": "Akan (Ghana)",
+    "am": "Amharic",
+    "am-ET": "Amharic (Ethiopia)",
+    "ar": "Arabic",
+    "ar-001": "Arabic (World)",
+    "ar-AE": "Arabic (United Arab Emirates)",
+    "ar-BH": "Arabic (Bahrain)",
+    "ar-DJ": "Arabic (Djibouti)",
+    "ar-DZ": "Arabic (Algeria)",
+    "ar-EG": "Arabic (Egypt)",
+    "ar-EH": "Arabic (Western Sahara)",
+    "ar-ER": "Arabic (Eritrea)",
+    "ar-IL": "Arabic (Israel)",
+    "ar-IQ": "Arabic (Iraq)",
+    "ar-JO": "Arabic (Jordan)",
+    "ar-KM": "Arabic (Comoros)",
+    "ar-KW": "Arabic (Kuwait)",
+    "ar-LB": "Arabic (Lebanon)",
+    "ar-LY": "Arabic (Libya)",
+    "ar-MA": "Arabic (Morocco)",
+    "ar-MR": "Arabic (Mauritania)",
+    "ar-OM": "Arabic (Oman)",
+    "ar-PS": "Arabic (Palestinian Territories)",
+    "ar-QA": "Arabic (Qatar)",
+    "ar-SA": "Arabic (Saudi Arabia)",
+    "ar-SD": "Arabic (Sudan)",
+    "ar-SO": "Arabic (Somalia)",
+    "ar-SS": "Arabic (South Sudan)",
+    "ar-SY": "Arabic (Syria)",
+    "ar-TD": "Arabic (Chad)",
+    "ar-TN": "Arabic (Tunisia)",
+    "ar-YE": "Arabic (Yemen)",
+    "as": "Assamese",
+    "as-IN": "Assamese (India)",
+    "asa": "Asu",
+    "asa-TZ": "Asu (Tanzania)",
+    "ast": "Asturian",
+    "ast-ES": "Asturian (Spain)",
+    "az": "Azerbaijani",
+    "az-Cyrl": "Azerbaijani (Cyrillic)",
+    "az-Cyrl-AZ": "Azerbaijani (Cyrillic, Azerbaijan)",
+    "az-Latn": "Azerbaijani (Latin)",
+    "az-Latn-AZ": "Azerbaijani (Latin, Azerbaijan)",
+    "bas": "Basaa",
+    "bas-CM": "Basaa (Cameroon)",
+    "be": "Belarusian",
+    "be-BY": "Belarusian (Belarus)",
+    "bem": "Bemba",
+    "bem-ZM": "Bemba (Zambia)",
+    "bez": "Bena",
+    "bez-TZ": "Bena (Tanzania)",
+    "bg": "Bulgarian",
+    "bg-BG": "Bulgarian (Bulgaria)",
+    "bm": "Bambara",
+    "bm-ML": "Bambara (Mali)",
+    "bn": "Bangla",
+    "bn-BD": "Bangla (Bangladesh)",
+    "bn-IN": "Bangla (India)",
+    "bo": "Tibetan",
+    "bo-CN": "Tibetan (China)",
+    "bo-IN": "Tibetan (India)",
+    "br": "Breton",
+    "br-FR": "Breton (France)",
+    "brx": "Bodo",
+    "brx-IN": "Bodo (India)",
+    "bs": "Bosnian",
+    "bs-Cyrl": "Bosnian (Cyrillic)",
+    "bs-Cyrl-BA": "Bosnian (Cyrillic, Bosnia & Herzegovina)",
+    "bs-Latn": "Bosnian",
+    "bs-Latn-BA": "Bosnian (Bosnia & Herzegovina)",
+    "ca": "Catalan",
+    "ca-AD": "Catalan (Andorra)",
+    "ca-ES": "Catalan (Spain)",
+    "ca-ES-valencia": "Catalan (Spain)",
+    "ca-FR": "Catalan (France)",
+    "ca-IT": "Catalan (Italy)",
+    "ccp": "Chakma",
+    "ccp-BD": "Chakma (Bangladesh)",
+    "ccp-IN": "Chakma (India)",
+    "ce": "Chechen",
+    "ce-RU": "Chechen (Russia)",
+    "ceb": "Cebuano",
+    "ceb-PH": "Cebuano (Philippines)",
+    "cgg": "Chiga",
+    "cgg-UG": "Chiga (Uganda)",
+    "chr": "Cherokee",
+    "chr-US": "Cherokee (United States)",
+    "ckb": "Central Kurdish",
+    "ckb-IQ": "Central Kurdish (Iraq)",
+    "ckb-IR": "Central Kurdish (Iran)",
+    "cs": "Czech",
+    "cs-CZ": "Czech (Czechia)",
+    "cu": "Church Slavic",
+    "cu-RU": "Church Slavic (Russia)",
+    "cy": "Welsh",
+    "cy-GB": "Welsh (United Kingdom)",
+    "da": "Danish",
+    "da-DK": "Danish (Denmark)",
+    "da-GL": "Danish (Greenland)",
+    "dav": "Taita",
+    "dav-KE": "Taita (Kenya)",
+    "de": "German",
+    "de-AT": "German (Austria)",
+    "de-BE": "German (Belgium)",
+    "de-CH": "German (Switzerland)",
+    "de-DE": "German (Germany)",
+    "de-IT": "German (Italy)",
+    "de-LI": "German (Liechtenstein)",
+    "de-LU": "German (Luxembourg)",
+    "dje": "Zarma",
+    "dje-NE": "Zarma (Niger)",
+    "dsb": "Lower Sorbian",
+    "dsb-DE": "Lower Sorbian (Germany)",
+    "dua": "Duala",
+    "dua-CM": "Duala (Cameroon)",
+    "dyo": "Jola-Fonyi",
+    "dyo-SN": "Jola-Fonyi (Senegal)",
+    "dz": "Dzongkha",
+    "dz-BT": "Dzongkha (Bhutan)",
+    "ebu": "Embu",
+    "ebu-KE": "Embu (Kenya)",
+    "ee": "Ewe",
+    "ee-GH": "Ewe (Ghana)",
+    "ee-TG": "Ewe (Togo)",
+    "el": "Greek",
+    "el-CY": "Greek (Cyprus)",
+    "el-GR": "Greek (Greece)",
+    "en": "English",
+    "en-001": "English (World)",
+    "en-150": "English (Europe)",
+    "en-AE": "English (United Arab Emirates)",
+    "en-AG": "English (Antigua & Barbuda)",
+    "en-AI": "English (Anguilla)",
+    "en-AS": "English (American Samoa)",
+    "en-AT": "English (Austria)",
+    "en-AU": "English (Australia)",
+    "en-BB": "English (Barbados)",
+    "en-BE": "English (Belgium)",
+    "en-BI": "English (Burundi)",
+    "en-BM": "English (Bermuda)",
+    "en-BS": "English (Bahamas)",
+    "en-BW": "English (Botswana)",
+    "en-BZ": "English (Belize)",
+    "en-CA": "English (Canada)",
+    "en-CC": "English (Cocos (Keeling) Islands)",
+    "en-CH": "English (Switzerland)",
+    "en-CK": "English (Cook Islands)",
+    "en-CM": "English (Cameroon)",
+    "en-CX": "English (Christmas Island)",
+    "en-CY": "English (Cyprus)",
+    "en-DE": "English (Germany)",
+    "en-DG": "English (Diego Garcia)",
+    "en-DK": "English (Denmark)",
+    "en-DM": "English (Dominica)",
+    "en-ER": "English (Eritrea)",
+    "en-FI": "English (Finland)",
+    "en-FJ": "English (Fiji)",
+    "en-FK": "English (Falkland Islands)",
+    "en-FM": "English (Micronesia)",
+    "en-GB": "English (United Kingdom)",
+    "en-GD": "English (Grenada)",
+    "en-GG": "English (Guernsey)",
+    "en-GH": "English (Ghana)",
+    "en-GI": "English (Gibraltar)",
+    "en-GM": "English (Gambia)",
+    "en-GU": "English (Guam)",
+    "en-GY": "English (Guyana)",
+    "en-HK": "English (Hong Kong SAR China)",
+    "en-IE": "English (Ireland)",
+    "en-IL": "English (Israel)",
+    "en-IM": "English (Isle of Man)",
+    "en-IN": "English (India)",
+    "en-IO": "English (British Indian Ocean Territory)",
+    "en-JE": "English (Jersey)",
+    "en-JM": "English (Jamaica)",
+    "en-KE": "English (Kenya)",
+    "en-KI": "English (Kiribati)",
+    "en-KN": "English (St. Kitts & Nevis)",
+    "en-KY": "English (Cayman Islands)",
+    "en-LC": "English (St. Lucia)",
+    "en-LR": "English (Liberia)",
+    "en-LS": "English (Lesotho)",
+    "en-MG": "English (Madagascar)",
+    "en-MH": "English (Marshall Islands)",
+    "en-MO": "English (Macao SAR China)",
+    "en-MP": "English (Northern Mariana Islands)",
+    "en-MS": "English (Montserrat)",
+    "en-MT": "English (Malta)",
+    "en-MU": "English (Mauritius)",
+    "en-MW": "English (Malawi)",
+    "en-MY": "English (Malaysia)",
+    "en-NA": "English (Namibia)",
+    "en-NF": "English (Norfolk Island)",
+    "en-NG": "English (Nigeria)",
+    "en-NL": "English (Netherlands)",
+    "en-NR": "English (Nauru)",
+    "en-NU": "English (Niue)",
+    "en-NZ": "English (New Zealand)",
+    "en-PG": "English (Papua New Guinea)",
+    "en-PH": "English (Philippines)",
+    "en-PK": "English (Pakistan)",
+    "en-PN": "English (Pitcairn Islands)",
+    "en-PR": "English (Puerto Rico)",
+    "en-PW": "English (Palau)",
+    "en-RW": "English (Rwanda)",
+    "en-SB": "English (Solomon Islands)",
+    "en-SC": "English (Seychelles)",
+    "en-SD": "English (Sudan)",
+    "en-SE": "English (Sweden)",
+    "en-SG": "English (Singapore)",
+    "en-SH": "English (St. Helena)",
+    "en-SI": "English (Slovenia)",
+    "en-SL": "English (Sierra Leone)",
+    "en-SS": "English (South Sudan)",
+    "en-SX": "English (Sint Maarten)",
+    "en-SZ": "English (Eswatini)",
+    "en-TC": "English (Turks & Caicos Islands)",
+    "en-TK": "English (Tokelau)",
+    "en-TO": "English (Tonga)",
+    "en-TT": "English (Trinidad & Tobago)",
+    "en-TV": "English (Tuvalu)",
+    "en-TZ": "English (Tanzania)",
+    "en-UG": "English (Uganda)",
+    "en-UM": "English (U.S. Outlying Islands)",
+    "en-US": "English (United States)",
+    "en-US-posix": "English (United States)",
+    "en-VC": "English (St. Vincent & Grenadines)",
+    "en-VG": "English (British Virgin Islands)",
+    "en-VI": "English (U.S. Virgin Islands)",
+    "en-VU": "English (Vanuatu)",
+    "en-WS": "English (Samoa)",
+    "en-ZA": "English (South Africa)",
+    "en-ZM": "English (Zambia)",
+    "en-ZW": "English (Zimbabwe)",
+    "eo": "Esperanto",
+    "eo-001": "Esperanto (World)",
+    "es": "Spanish",
+    "es-419": "Spanish (Latin America)",
+    "es-AR": "Spanish (Argentina)",
+    "es-BO": "Spanish (Bolivia)",
+    "es-BR": "Spanish (Brazil)",
+    "es-BZ": "Spanish (Belize)",
+    "es-CL": "Spanish (Chile)",
+    "es-CO": "Spanish (Colombia)",
+    "es-CR": "Spanish (Costa Rica)",
+    "es-CU": "Spanish (Cuba)",
+    "es-DO": "Spanish (Dominican Republic)",
+    "es-EA": "Spanish (Ceuta & Melilla)",
+    "es-EC": "Spanish (Ecuador)",
+    "es-ES": "Spanish (Spain)",
+    "es-GQ": "Spanish (Equatorial Guinea)",
+    "es-GT": "Spanish (Guatemala)",
+    "es-HN": "Spanish (Honduras)",
+    "es-IC": "Spanish (Canary Islands)",
+    "es-MX": "Spanish (Mexico)",
+    "es-NI": "Spanish (Nicaragua)",
+    "es-PA": "Spanish (Panama)",
+    "es-PE": "Spanish (Peru)",
+    "es-PH": "Spanish (Philippines)",
+    "es-PR": "Spanish (Puerto Rico)",
+    "es-PY": "Spanish (Paraguay)",
+    "es-SV": "Spanish (El Salvador)",
+    "es-US": "Spanish (United States)",
+    "es-UY": "Spanish (Uruguay)",
+    "es-VE": "Spanish (Venezuela)",
+    "et": "Estonian",
+    "et-EE": "Estonian (Estonia)",
+    "eu": "Basque",
+    "eu-ES": "Basque (Spain)",
+    "ewo": "Ewondo",
+    "ewo-CM": "Ewondo (Cameroon)",
+    "fa": "Persian",
+    "fa-AF": "Persian (Afghanistan)",
+    "fa-IR": "Persian (Iran)",
+    "ff": "Fulah",
+    "ff-Adlm": "Fulah (Adlam)",
+    "ff-Adlm-BF": "Fulah (Adlam, Burkina Faso)",
+    "ff-Adlm-CM": "Fulah (Adlam, Cameroon)",
+    "ff-Adlm-GH": "Fulah (Adlam, Ghana)",
+    "ff-Adlm-GM": "Fulah (Adlam, Gambia)",
+    "ff-Adlm-GN": "Fulah (Adlam, Guinea)",
+    "ff-Adlm-GW": "Fulah (Adlam, Guinea-Bissau)",
+    "ff-Adlm-LR": "Fulah (Adlam, Liberia)",
+    "ff-Adlm-MR": "Fulah (Adlam, Mauritania)",
+    "ff-Adlm-NE": "Fulah (Adlam, Niger)",
+    "ff-Adlm-NG": "Fulah (Adlam, Nigeria)",
+    "ff-Adlm-SL": "Fulah (Adlam, Sierra Leone)",
+    "ff-Adlm-SN": "Fulah (Adlam, Senegal)",
+    "ff-Latn": "Fulah (Latin)",
+    "ff-Latn-BF": "Fulah (Latin, Burkina Faso)",
+    "ff-Latn-CM": "Fulah (Latin, Cameroon)",
+    "ff-Latn-GH": "Fulah (Latin, Ghana)",
+    "ff-Latn-GM": "Fulah (Latin, Gambia)",
+    "ff-Latn-GN": "Fulah (Latin, Guinea)",
+    "ff-Latn-GW": "Fulah (Latin, Guinea-Bissau)",
+    "ff-Latn-LR": "Fulah (Latin, Liberia)",
+    "ff-Latn-MR": "Fulah (Latin, Mauritania)",
+    "ff-Latn-NE": "Fulah (Latin, Niger)",
+    "ff-Latn-NG": "Fulah (Latin, Nigeria)",
+    "ff-Latn-SL": "Fulah (Latin, Sierra Leone)",
+    "ff-Latn-SN": "Fulah (Latin, Senegal)",
+    "fi": "Finnish",
+    "fi-FI": "Finnish (Finland)",
+    "fil": "Filipino",
+    "fil-PH": "Filipino (Philippines)",
+    "fo": "Faroese",
+    "fo-DK": "Faroese (Denmark)",
+    "fo-FO": "Faroese (Faroe Islands)",
+    "fr": "French",
+    "fr-BE": "French (Belgium)",
+    "fr-BF": "French (Burkina Faso)",
+    "fr-BI": "French (Burundi)",
+    "fr-BJ": "French (Benin)",
+    "fr-BL": "French (St. Barth\u00e9lemy)",
+    "fr-CA": "French (Canada)",
+    "fr-CD": "French (Congo - Kinshasa)",
+    "fr-CF": "French (Central African Republic)",
+    "fr-CG": "French (Congo - Brazzaville)",
+    "fr-CH": "French (Switzerland)",
+    "fr-CI": "French (C\u00f4te d\u2019Ivoire)",
+    "fr-CM": "French (Cameroon)",
+    "fr-DJ": "French (Djibouti)",
+    "fr-DZ": "French (Algeria)",
+    "fr-FR": "French (France)",
+    "fr-GA": "French (Gabon)",
+    "fr-GF": "French (French Guiana)",
+    "fr-GN": "French (Guinea)",
+    "fr-GP": "French (Guadeloupe)",
+    "fr-GQ": "French (Equatorial Guinea)",
+    "fr-HT": "French (Haiti)",
+    "fr-KM": "French (Comoros)",
+    "fr-LU": "French (Luxembourg)",
+    "fr-MA": "French (Morocco)",
+    "fr-MC": "French (Monaco)",
+    "fr-MF": "French (St. Martin)",
+    "fr-MG": "French (Madagascar)",
+    "fr-ML": "French (Mali)",
+    "fr-MQ": "French (Martinique)",
+    "fr-MR": "French (Mauritania)",
+    "fr-MU": "French (Mauritius)",
+    "fr-NC": "French (New Caledonia)",
+    "fr-NE": "French (Niger)",
+    "fr-PF": "French (French Polynesia)",
+    "fr-PM": "French (St. Pierre & Miquelon)",
+    "fr-RE": "French (R\u00e9union)",
+    "fr-RW": "French (Rwanda)",
+    "fr-SC": "French (Seychelles)",
+    "fr-SN": "French (Senegal)",
+    "fr-SY": "French (Syria)",
+    "fr-TD": "French (Chad)",
+    "fr-TG": "French (Togo)",
+    "fr-TN": "French (Tunisia)",
+    "fr-VU": "French (Vanuatu)",
+    "fr-WF": "French (Wallis & Futuna)",
+    "fr-YT": "French (Mayotte)",
+    "fur": "Friulian",
+    "fur-IT": "Friulian (Italy)",
+    "fy": "Western Frisian",
+    "fy-NL": "Western Frisian (Netherlands)",
+    "ga": "Irish",
+    "ga-GB": "Irish (United Kingdom)",
+    "ga-IE": "Irish (Ireland)",
+    "gd": "Scottish Gaelic",
+    "gd-GB": "Scottish Gaelic (United Kingdom)",
+    "gl": "Galician",
+    "gl-ES": "Galician (Spain)",
+    "gsw": "Swiss German",
+    "gsw-CH": "Swiss German (Switzerland)",
+    "gsw-FR": "Swiss German (France)",
+    "gsw-LI": "Swiss German (Liechtenstein)",
+    "gu": "Gujarati",
+    "gu-IN": "Gujarati (India)",
+    "guz": "Gusii",
+    "guz-KE": "Gusii (Kenya)",
+    "gv": "Manx",
+    "gv-IM": "Manx (Isle of Man)",
+    "ha": "Hausa",
+    "ha-GH": "Hausa (Ghana)",
+    "ha-NE": "Hausa (Niger)",
+    "ha-NG": "Hausa (Nigeria)",
+    "haw": "Hawaiian",
+    "haw-US": "Hawaiian (United States)",
+    "he": "Hebrew",
+    "he-IL": "Hebrew (Israel)",
+    "hi": "Hindi",
+    "hi-IN": "Hindi (India)",
+    "hr": "Croatian",
+    "hr-BA": "Croatian (Bosnia & Herzegovina)",
+    "hr-HR": "Croatian (Croatia)",
+    "hsb": "Upper Sorbian",
+    "hsb-DE": "Upper Sorbian (Germany)",
+    "hu": "Hungarian",
+    "hu-HU": "Hungarian (Hungary)",
+    "hy": "Armenian",
+    "hy-AM": "Armenian (Armenia)",
+    "ia": "Interlingua",
+    "ia-001": "Interlingua (World)",
+    "id": "Indonesian",
+    "id-ID": "Indonesian (Indonesia)",
+    "ig": "Igbo",
+    "ig-NG": "Igbo (Nigeria)",
+    "ii": "Sichuan Yi",
+    "ii-CN": "Sichuan Yi (China)",
+    "is": "Icelandic",
+    "is-IS": "Icelandic (Iceland)",
+    "it": "Italian",
+    "it-CH": "Italian (Switzerland)",
+    "it-IT": "Italian (Italy)",
+    "it-SM": "Italian (San Marino)",
+    "it-VA": "Italian (Vatican City)",
+    "ja": "Japanese",
+    "ja-JP": "Japanese (Japan)",
+    "jgo": "Ngomba",
+    "jgo-CM": "Ngomba (Cameroon)",
+    "jmc": "Machame",
+    "jmc-TZ": "Machame (Tanzania)",
+    "jv": "Javanese",
+    "jv-ID": "Javanese (Indonesia)",
+    "ka": "Georgian",
+    "ka-GE": "Georgian (Georgia)",
+    "kab": "Kabyle",
+    "kab-DZ": "Kabyle (Algeria)",
+    "kam": "Kamba",
+    "kam-KE": "Kamba (Kenya)",
+    "kde": "Makonde",
+    "kde-TZ": "Makonde (Tanzania)",
+    "kea": "Kabuverdianu",
+    "kea-CV": "Kabuverdianu (Cape Verde)",
+    "khq": "Koyra Chiini",
+    "khq-ML": "Koyra Chiini (Mali)",
+    "ki": "Kikuyu",
+    "ki-KE": "Kikuyu (Kenya)",
+    "kk": "Kazakh",
+    "kk-KZ": "Kazakh (Kazakhstan)",
+    "kkj": "Kako",
+    "kkj-CM": "Kako (Cameroon)",
+    "kl": "Kalaallisut",
+    "kl-GL": "Kalaallisut (Greenland)",
+    "kln": "Kalenjin",
+    "kln-KE": "Kalenjin (Kenya)",
+    "km": "Khmer",
+    "km-KH": "Khmer (Cambodia)",
+    "kn": "Kannada",
+    "kn-IN": "Kannada (India)",
+    "ko": "Korean",
+    "ko-KP": "Korean (North Korea)",
+    "ko-KR": "Korean (South Korea)",
+    "kok": "Konkani",
+    "kok-IN": "Konkani (India)",
+    "ks": "Kashmiri",
+    "ks-Arab": "Kashmiri (Arabic)",
+    "ks-IN": "Kashmiri (India)",
+    "ksb": "Shambala",
+    "ksb-TZ": "Shambala (Tanzania)",
+    "ksf": "Bafia",
+    "ksf-CM": "Bafia (Cameroon)",
+    "ksh": "Colognian",
+    "ksh-DE": "Colognian (Germany)",
+    "ku": "Kurdish",
+    "ku-TR": "Kurdish (Turkey)",
+    "kw": "Cornish",
+    "kw-GB": "Cornish (United Kingdom)",
+    "ky": "Kyrgyz",
+    "ky-KG": "Kyrgyz (Kyrgyzstan)",
+    "lag": "Langi",
+    "lag-TZ": "Langi (Tanzania)",
+    "lb": "Luxembourgish",
+    "lb-LU": "Luxembourgish (Luxembourg)",
+    "lg": "Ganda",
+    "lg-UG": "Ganda (Uganda)",
+    "lkt": "Lakota",
+    "lkt-US": "Lakota (United States)",
+    "ln": "Lingala",
+    "ln-AO": "Lingala (Angola)",
+    "ln-CD": "Lingala (Congo - Kinshasa)",
+    "ln-CF": "Lingala (Central African Republic)",
+    "ln-CG": "Lingala (Congo - Brazzaville)",
+    "lo": "Lao",
+    "lo-LA": "Lao (Laos)",
+    "lrc": "Northern Luri",
+    "lrc-IQ": "Northern Luri (Iraq)",
+    "lrc-IR": "Northern Luri (Iran)",
+    "lt": "Lithuanian",
+    "lt-LT": "Lithuanian (Lithuania)",
+    "lu": "Luba-Katanga",
+    "lu-CD": "Luba-Katanga (Congo - Kinshasa)",
+    "luo": "Luo (Kenya and Tanzania)",
+    "luo-KE": "Luo (Kenya and Tanzania) (Kenya)",
+    "luy": "Luyia",
+    "luy-KE": "Luyia (Kenya)",
+    "lv": "Latvian",
+    "lv-LV": "Latvian (Latvia)",
+    "mai": "Maithili",
+    "mai-IN": "Maithili (India)",
+    "mas": "Masai",
+    "mas-KE": "Masai (Kenya)",
+    "mas-TZ": "Masai (Tanzania)",
+    "mer": "Meru",
+    "mer-KE": "Meru (Kenya)",
+    "mfe": "Morisyen",
+    "mfe-MU": "Morisyen (Mauritius)",
+    "mg": "Malagasy",
+    "mg-MG": "Malagasy (Madagascar)",
+    "mgh": "Makhuwa-Meetto",
+    "mgh-MZ": "Makhuwa-Meetto (Mozambique)",
+    "mgo": "Meta\u02bc",
+    "mgo-CM": "Meta\u02bc (Cameroon)",
+    "mi": "Maori",
+    "mi-NZ": "Maori (New Zealand)",
+    "mk": "Macedonian",
+    "mk-MK": "Macedonian (North Macedonia)",
+    "ml": "Malayalam",
+    "ml-IN": "Malayalam (India)",
+    "mn": "Mongolian",
+    "mn-MN": "Mongolian (Mongolia)",
+    "mni": "Manipuri",
+    "mni-Beng": "Manipuri (Bangla)",
+    "mni-Beng-IN": "Manipuri (Bangla, India)",
+    "mr": "Marathi",
+    "mr-IN": "Marathi (India)",
+    "ms": "Malay",
+    "ms-BN": "Malay (Brunei)",
+    "ms-ID": "Malay (Indonesia)",
+    "ms-MY": "Malay (Malaysia)",
+    "ms-SG": "Malay (Singapore)",
+    "mt": "Maltese",
+    "mt-MT": "Maltese (Malta)",
+    "mua": "Mundang",
+    "mua-CM": "Mundang (Cameroon)",
+    "my": "Burmese",
+    "my-MM": "Burmese (Myanmar (Burma))",
+    "mzn": "Mazanderani",
+    "mzn-IR": "Mazanderani (Iran)",
+    "naq": "Nama",
+    "naq-NA": "Nama (Namibia)",
+    "nb": "Norwegian Bokm\u00e5l",
+    "nb-NO": "Norwegian Bokm\u00e5l (Norway)",
+    "nb-SJ": "Norwegian Bokm\u00e5l (Svalbard & Jan Mayen)",
+    "nd": "North Ndebele",
+    "nd-ZW": "North Ndebele (Zimbabwe)",
+    "nds": "Low German",
+    "nds-DE": "Low German (Germany)",
+    "nds-NL": "Low German (Netherlands)",
+    "ne": "Nepali",
+    "ne-IN": "Nepali (India)",
+    "ne-NP": "Nepali (Nepal)",
+    "nl": "Dutch",
+    "nl-AW": "Dutch (Aruba)",
+    "nl-BE": "Dutch (Belgium)",
+    "nl-BQ": "Dutch (Caribbean Netherlands)",
+    "nl-CW": "Dutch (Cura\u00e7ao)",
+    "nl-NL": "Dutch (Netherlands)",
+    "nl-SR": "Dutch (Suriname)",
+    "nl-SX": "Dutch (Sint Maarten)",
+    "nmg": "Kwasio",
+    "nmg-CM": "Kwasio (Cameroon)",
+    "nn": "Norwegian Nynorsk",
+    "nn-NO": "Norwegian Nynorsk (Norway)",
+    "nnh": "Ngiemboon",
+    "nnh-CM": "Ngiemboon (Cameroon)",
+    "nus": "Nuer",
+    "nus-SS": "Nuer (South Sudan)",
+    "nyn": "Nyankole",
+    "nyn-UG": "Nyankole (Uganda)",
+    "om": "Oromo",
+    "om-ET": "Oromo (Ethiopia)",
+    "om-KE": "Oromo (Kenya)",
+    "or": "Odia",
+    "or-IN": "Odia (India)",
+    "os": "Ossetic",
+    "os-GE": "Ossetic (Georgia)",
+    "os-RU": "Ossetic (Russia)",
+    "pa": "Punjabi",
+    "pa-Arab": "Punjabi (Arabic)",
+    "pa-Arab-PK": "Punjabi (Arabic, Pakistan)",
+    "pa-Guru": "Punjabi (Gurmukhi)",
+    "pa-Guru-IN": "Punjabi (Gurmukhi, India)",
+    "pcm": "Nigerian Pidgin",
+    "pcm-NG": "Nigerian Pidgin (Nigeria)",
+    "pl": "Polish",
+    "pl-PL": "Polish (Poland)",
+    "prg": "Prussian",
+    "prg-001": "Prussian (World)",
+    "ps": "Pashto",
+    "ps-AF": "Pashto (Afghanistan)",
+    "ps-PK": "Pashto (Pakistan)",
+    "pt": "Portuguese",
+    "pt-AO": "Portuguese (Angola)",
+    "pt-BR": "Portuguese (Brazil)",
+    "pt-CH": "Portuguese (Switzerland)",
+    "pt-CV": "Portuguese (Cape Verde)",
+    "pt-GQ": "Portuguese (Equatorial Guinea)",
+    "pt-GW": "Portuguese (Guinea-Bissau)",
+    "pt-LU": "Portuguese (Luxembourg)",
+    "pt-MO": "Portuguese (Macao SAR China)",
+    "pt-MZ": "Portuguese (Mozambique)",
+    "pt-PT": "Portuguese (Portugal)",
+    "pt-ST": "Portuguese (S\u00e3o Tom\u00e9 & Pr\u00edncipe)",
+    "pt-TL": "Portuguese (Timor-Leste)",
+    "qu": "Quechua",
+    "qu-BO": "Quechua (Bolivia)",
+    "qu-EC": "Quechua (Ecuador)",
+    "qu-PE": "Quechua (Peru)",
+    "rm": "Romansh",
+    "rm-CH": "Romansh (Switzerland)",
+    "rn": "Rundi",
+    "rn-BI": "Rundi (Burundi)",
+    "ro": "Romanian",
+    "ro-MD": "Romanian (Moldova)",
+    "ro-RO": "Romanian (Romania)",
+    "rof": "Rombo",
+    "rof-TZ": "Rombo (Tanzania)",
+    "und": "Unknown language",
+    "ru": "Russian",
+    "ru-BY": "Russian (Belarus)",
+    "ru-KG": "Russian (Kyrgyzstan)",
+    "ru-KZ": "Russian (Kazakhstan)",
+    "ru-MD": "Russian (Moldova)",
+    "ru-RU": "Russian (Russia)",
+    "ru-UA": "Russian (Ukraine)",
+    "rw": "Kinyarwanda",
+    "rw-RW": "Kinyarwanda (Rwanda)",
+    "rwk": "Rwa",
+    "rwk-TZ": "Rwa (Tanzania)",
+    "sah": "Sakha",
+    "sah-RU": "Sakha (Russia)",
+    "saq": "Samburu",
+    "saq-KE": "Samburu (Kenya)",
+    "sat": "Santali",
+    "sat-Olck": "Santali (Ol Chiki)",
+    "sat-Olck-IN": "Santali (Ol Chiki, India)",
+    "sbp": "Sangu",
+    "sbp-TZ": "Sangu (Tanzania)",
+    "sd": "Sindhi",
+    "sd-Arab": "Sindhi (Arabic)",
+    "sd-Arab-PK": "Sindhi (Arabic, Pakistan)",
+    "sd-Deva": "Sindhi (Devanagari)",
+    "sd-Deva-IN": "Sindhi (Devanagari, India)",
+    "se": "Northern Sami",
+    "se-FI": "Northern Sami (Finland)",
+    "se-NO": "Northern Sami (Norway)",
+    "se-SE": "Northern Sami (Sweden)",
+    "seh": "Sena",
+    "seh-MZ": "Sena (Mozambique)",
+    "ses": "Koyraboro Senni",
+    "ses-ML": "Koyraboro Senni (Mali)",
+    "sg": "Sango",
+    "sg-CF": "Sango (Central African Republic)",
+    "shi": "Tachelhit",
+    "shi-Latn": "Tachelhit (Latin)",
+    "shi-Latn-MA": "Tachelhit (Latin, Morocco)",
+    "shi-Tfng": "Tachelhit (Tifinagh)",
+    "shi-Tfng-MA": "Tachelhit (Tifinagh, Morocco)",
+    "si": "Sinhala",
+    "si-LK": "Sinhala (Sri Lanka)",
+    "sk": "Slovak",
+    "sk-SK": "Slovak (Slovakia)",
+    "sl": "Slovenian",
+    "sl-SI": "Slovenian (Slovenia)",
+    "smn": "Inari Sami",
+    "smn-FI": "Inari Sami (Finland)",
+    "sn": "Shona",
+    "sn-ZW": "Shona (Zimbabwe)",
+    "so": "Somali",
+    "so-DJ": "Somali (Djibouti)",
+    "so-ET": "Somali (Ethiopia)",
+    "so-KE": "Somali (Kenya)",
+    "so-SO": "Somali (Somalia)",
+    "sq": "Albanian",
+    "sq-AL": "Albanian (Albania)",
+    "sq-MK": "Albanian (North Macedonia)",
+    "sq-XK": "Albanian (Kosovo)",
+    "sr": "Serbian",
+    "sr-Cyrl": "Serbian (Cyrillic)",
+    "sr-Cyrl-BA": "Serbian (Cyrillic, Bosnia & Herzegovina)",
+    "sr-Cyrl-ME": "Serbian (Cyrillic, Montenegro)",
+    "sr-Cyrl-RS": "Serbian (Cyrillic, Serbia)",
+    "sr-Cyrl-XK": "Serbian (Cyrillic, Kosovo)",
+    "sr-Latn": "Serbian (Latin)",
+    "sr-Latn-BA": "Serbian (Latin, Bosnia & Herzegovina)",
+    "sr-Latn-ME": "Serbian (Latin, Montenegro)",
+    "sr-Latn-RS": "Serbian (Latin, Serbia)",
+    "sr-Latn-XK": "Serbian (Latin, Kosovo)",
+    "su": "Sundanese",
+    "su-Latn": "Sundanese (Latin)",
+    "su-Latn-ID": "Sundanese (Latin, Indonesia)",
+    "sv": "Swedish",
+    "sv-AX": "Swedish (\u00c5land Islands)",
+    "sv-FI": "Swedish (Finland)",
+    "sv-SE": "Swedish (Sweden)",
+    "sw": "Swahili",
+    "sw-CD": "Swahili (Congo - Kinshasa)",
+    "sw-KE": "Swahili (Kenya)",
+    "sw-TZ": "Swahili (Tanzania)",
+    "sw-UG": "Swahili (Uganda)",
+    "ta": "Tamil",
+    "ta-IN": "Tamil (India)",
+    "ta-LK": "Tamil (Sri Lanka)",
+    "ta-MY": "Tamil (Malaysia)",
+    "ta-SG": "Tamil (Singapore)",
+    "te": "Telugu",
+    "te-IN": "Telugu (India)",
+    "teo": "Teso",
+    "teo-KE": "Teso (Kenya)",
+    "teo-UG": "Teso (Uganda)",
+    "tg": "Tajik",
+    "tg-TJ": "Tajik (Tajikistan)",
+    "th": "Thai",
+    "th-TH": "Thai (Thailand)",
+    "ti": "Tigrinya",
+    "ti-ER": "Tigrinya (Eritrea)",
+    "ti-ET": "Tigrinya (Ethiopia)",
+    "tk": "Turkmen",
+    "tk-TM": "Turkmen (Turkmenistan)",
+    "to": "Tongan",
+    "to-TO": "Tongan (Tonga)",
+    "tr": "Turkish",
+    "tr-CY": "Turkish (Cyprus)",
+    "tr-TR": "Turkish (Turkey)",
+    "tt": "Tatar",
+    "tt-RU": "Tatar (Russia)",
+    "twq": "Tasawaq",
+    "twq-NE": "Tasawaq (Niger)",
+    "tzm": "Central Atlas Tamazight",
+    "tzm-MA": "Central Atlas Tamazight (Morocco)",
+    "ug": "Uyghur",
+    "ug-CN": "Uyghur (China)",
+    "uk": "Ukrainian",
+    "uk-UA": "Ukrainian (Ukraine)",
+    "ur": "Urdu",
+    "ur-IN": "Urdu (India)",
+    "ur-PK": "Urdu (Pakistan)",
+    "uz": "Uzbek",
+    "uz-Arab": "Uzbek (Arabic)",
+    "uz-Arab-AF": "Uzbek (Arabic, Afghanistan)",
+    "uz-Cyrl": "Uzbek (Cyrillic)",
+    "uz-Cyrl-UZ": "Uzbek (Cyrillic, Uzbekistan)",
+    "uz-Latn": "Uzbek (Latin)",
+    "uz-Latn-UZ": "Uzbek (Latin, Uzbekistan)",
+    "vai": "Vai",
+    "vai-Latn": "Vai (Latin)",
+    "vai-Latn-LR": "Vai (Latin, Liberia)",
+    "vai-Vaii": "Vai (Vai)",
+    "vai-Vaii-LR": "Vai (Vai, Liberia)",
+    "vi": "Vietnamese",
+    "vi-VN": "Vietnamese (Vietnam)",
+    "vo": "Volap\u00fck",
+    "vo-001": "Volap\u00fck (World)",
+    "vun": "Vunjo",
+    "vun-TZ": "Vunjo (Tanzania)",
+    "wae": "Walser",
+    "wae-CH": "Walser (Switzerland)",
+    "wo": "Wolof",
+    "wo-SN": "Wolof (Senegal)",
+    "xh": "Xhosa",
+    "xh-ZA": "Xhosa (South Africa)",
+    "xog": "Soga",
+    "xog-UG": "Soga (Uganda)",
+    "yav": "Yangben",
+    "yav-CM": "Yangben (Cameroon)",
+    "yi": "Yiddish",
+    "yi-001": "Yiddish (World)",
+    "yo": "Yoruba",
+    "yo-BJ": "Yoruba (Benin)",
+    "yo-NG": "Yoruba (Nigeria)",
+    "yue": "Cantonese",
+    "yue-Hans": "Cantonese (Simplified)",
+    "yue-Hans-CN": "Cantonese (Simplified, China)",
+    "yue-Hant": "Cantonese (Traditional)",
+    "yue-Hant-HK": "Cantonese (Traditional, Hong Kong SAR China)",
+    "zgh": "Standard Moroccan Tamazight",
+    "zgh-MA": "Standard Moroccan Tamazight (Morocco)",
+    "zh": "Chinese",
+    "zh-Hans": "Chinese (Simplified)",
+    "zh-Hans-CN": "Chinese (Simplified, China)",
+    "zh-Hans-HK": "Chinese (Simplified, Hong Kong SAR China)",
+    "zh-Hans-MO": "Chinese (Simplified, Macao SAR China)",
+    "zh-Hans-SG": "Chinese (Simplified, Singapore)",
+    "zh-Hant": "Chinese (Traditional)",
+    "zh-Hant-HK": "Chinese (Traditional, Hong Kong SAR China)",
+    "zh-Hant-MO": "Chinese (Traditional, Macao SAR China)",
+    "zh-Hant-TW": "Chinese (Traditional, Taiwan)",
+    "zu": "Zulu",
+    "zu-ZA": "Zulu (South Africa)"
+}
\ No newline at end of file
diff --git a/src/datasets/utils/resources/licenses.json b/src/datasets/utils/resources/licenses.json
new file mode 100644
index 00000000000..31e76c43d48
--- /dev/null
+++ b/src/datasets/utils/resources/licenses.json
@@ -0,0 +1,452 @@
+{
+    "other": "Other license",
+    "unknown": "License information unavailable",
+    "0bsd": "BSD Zero Clause License",
+    "aal": "Attribution Assurance License",
+    "abstyles": "Abstyles License",
+    "adobe-2006": "Adobe Systems Incorporated Source Code License Agreement",
+    "adobe-glyph": "Adobe Glyph List License",
+    "adsl": "Amazon Digital Services License",
+    "afl-1.1": "Academic Free License v1.1",
+    "afl-1.2": "Academic Free License v1.2",
+    "afl-2.0": "Academic Free License v2.0",
+    "afl-2.1": "Academic Free License v2.1",
+    "afl-3.0": "Academic Free License v3.0",
+    "afmparse": "Afmparse License",
+    "agpl-1.0": "Affero General Public License v1.0",
+    "agpl-1.0-only": "Affero General Public License v1.0 only",
+    "agpl-1.0-or-later": "Affero General Public License v1.0 or later",
+    "agpl-3.0": "GNU Affero General Public License v3.0",
+    "agpl-3.0-only": "GNU Affero General Public License v3.0 only",
+    "agpl-3.0-or-later": "GNU Affero General Public License v3.0 or later",
+    "aladdin": "Aladdin Free Public License",
+    "amdplpa": "AMD's plpa_map.c License",
+    "aml": "Apple MIT License",
+    "ampas": "Academy of Motion Picture Arts and Sciences BSD",
+    "antlr-pd": "ANTLR Software Rights Notice",
+    "antlr-pd-fallback": "ANTLR Software Rights Notice with license fallback",
+    "apache-1.0": "Apache License 1.0",
+    "apache-1.1": "Apache License 1.1",
+    "apache-2.0": "Apache License 2.0",
+    "apafml": "Adobe Postscript AFM License",
+    "apl-1.0": "Adaptive Public License 1.0",
+    "apsl-1.0": "Apple Public Source License 1.0",
+    "apsl-1.1": "Apple Public Source License 1.1",
+    "apsl-1.2": "Apple Public Source License 1.2",
+    "apsl-2.0": "Apple Public Source License 2.0",
+    "artistic-1.0": "Artistic License 1.0",
+    "artistic-1.0-cl8": "Artistic License 1.0 w/clause 8",
+    "artistic-1.0-perl": "Artistic License 1.0 (Perl)",
+    "artistic-2.0": "Artistic License 2.0",
+    "bahyph": "Bahyph License",
+    "barr": "Barr License",
+    "beerware": "Beerware License",
+    "bittorrent-1.0": "BitTorrent Open Source License v1.0",
+    "bittorrent-1.1": "BitTorrent Open Source License v1.1",
+    "blessing": "SQLite Blessing",
+    "blueoak-1.0.0": "Blue Oak Model License 1.0.0",
+    "borceux": "Borceux license",
+    "bsd-1-clause": "BSD 1-Clause License",
+    "bsd-2-clause": "BSD 2-Clause \"Simplified\" License",
+    "bsd-2-clause-freebsd": "BSD 2-Clause FreeBSD License",
+    "bsd-2-clause-netbsd": "BSD 2-Clause NetBSD License",
+    "bsd-2-clause-patent": "BSD-2-Clause Plus Patent License",
+    "bsd-2-clause-views": "BSD 2-Clause with views sentence",
+    "bsd-3-clause": "BSD 3-Clause \"New\" or \"Revised\" License",
+    "bsd-3-clause-attribution": "BSD with attribution",
+    "bsd-3-clause-clear": "BSD 3-Clause Clear License",
+    "bsd-3-clause-lbnl": "Lawrence Berkeley National Labs BSD variant license",
+    "bsd-3-clause-no-nuclear-license": "BSD 3-Clause No Nuclear License",
+    "bsd-3-clause-no-nuclear-license-2014": "BSD 3-Clause No Nuclear License 2014",
+    "bsd-3-clause-no-nuclear-warranty": "BSD 3-Clause No Nuclear Warranty",
+    "bsd-3-clause-open-mpi": "BSD 3-Clause Open MPI variant",
+    "bsd-4-clause": "BSD 4-Clause \"Original\" or \"Old\" License",
+    "bsd-4-clause-uc": "BSD-4-Clause (University of California-Specific)",
+    "bsd-protection": "BSD Protection License",
+    "bsd-source-code": "BSD Source Code Attribution",
+    "bsl-1.0": "Boost Software License 1.0",
+    "busl-1.1": "Business Source License 1.1",
+    "bzip2-1.0.5": "bzip2 and libbzip2 License v1.0.5",
+    "bzip2-1.0.6": "bzip2 and libbzip2 License v1.0.6",
+    "cal-1.0": "Cryptographic Autonomy License 1.0",
+    "cal-1.0-combined-work-exception": "Cryptographic Autonomy License 1.0 (Combined Work Exception)",
+    "caldera": "Caldera License",
+    "catosl-1.1": "Computer Associates Trusted Open Source License 1.1",
+    "cc-by-1.0": "Creative Commons Attribution 1.0 Generic",
+    "cc-by-2.0": "Creative Commons Attribution 2.0 Generic",
+    "cc-by-2.5": "Creative Commons Attribution 2.5 Generic",
+    "cc-by-3.0": "Creative Commons Attribution 3.0 Unported",
+    "cc-by-3.0-at": "Creative Commons Attribution 3.0 Austria",
+    "cc-by-3.0-us": "Creative Commons Attribution 3.0 United States",
+    "cc-by-4.0": "Creative Commons Attribution 4.0 International",
+    "cc-by-nc-1.0": "Creative Commons Attribution Non Commercial 1.0 Generic",
+    "cc-by-nc-2.0": "Creative Commons Attribution Non Commercial 2.0 Generic",
+    "cc-by-nc-2.5": "Creative Commons Attribution Non Commercial 2.5 Generic",
+    "cc-by-nc-3.0": "Creative Commons Attribution Non Commercial 3.0 Unported",
+    "cc-by-nc-4.0": "Creative Commons Attribution Non Commercial 4.0 International",
+    "cc-by-nc-nd-1.0": "Creative Commons Attribution Non Commercial No Derivatives 1.0 Generic",
+    "cc-by-nc-nd-2.0": "Creative Commons Attribution Non Commercial No Derivatives 2.0 Generic",
+    "cc-by-nc-nd-2.5": "Creative Commons Attribution Non Commercial No Derivatives 2.5 Generic",
+    "cc-by-nc-nd-3.0": "Creative Commons Attribution Non Commercial No Derivatives 3.0 Unported",
+    "cc-by-nc-nd-3.0-igo": "Creative Commons Attribution Non Commercial No Derivatives 3.0 IGO",
+    "cc-by-nc-nd-4.0": "Creative Commons Attribution Non Commercial No Derivatives 4.0 International",
+    "cc-by-nc-sa-1.0": "Creative Commons Attribution Non Commercial Share Alike 1.0 Generic",
+    "cc-by-nc-sa-2.0": "Creative Commons Attribution Non Commercial Share Alike 2.0 Generic",
+    "cc-by-nc-sa-2.5": "Creative Commons Attribution Non Commercial Share Alike 2.5 Generic",
+    "cc-by-nc-sa-3.0": "Creative Commons Attribution Non Commercial Share Alike 3.0 Unported",
+    "cc-by-nc-sa-4.0": "Creative Commons Attribution Non Commercial Share Alike 4.0 International",
+    "cc-by-nd-1.0": "Creative Commons Attribution No Derivatives 1.0 Generic",
+    "cc-by-nd-2.0": "Creative Commons Attribution No Derivatives 2.0 Generic",
+    "cc-by-nd-2.5": "Creative Commons Attribution No Derivatives 2.5 Generic",
+    "cc-by-nd-3.0": "Creative Commons Attribution No Derivatives 3.0 Unported",
+    "cc-by-nd-4.0": "Creative Commons Attribution No Derivatives 4.0 International",
+    "cc-by-sa-1.0": "Creative Commons Attribution Share Alike 1.0 Generic",
+    "cc-by-sa-2.0": "Creative Commons Attribution Share Alike 2.0 Generic",
+    "cc-by-sa-2.0-uk": "Creative Commons Attribution Share Alike 2.0 England and Wales",
+    "cc-by-sa-2.5": "Creative Commons Attribution Share Alike 2.5 Generic",
+    "cc-by-sa-3.0": "Creative Commons Attribution Share Alike 3.0 Unported",
+    "cc-by-sa-3.0-at": "Creative Commons Attribution-Share Alike 3.0 Austria",
+    "cc-by-sa-4.0": "Creative Commons Attribution Share Alike 4.0 International",
+    "cc-pddc": "Creative Commons Public Domain Dedication and Certification",
+    "cc0-1.0": "Creative Commons Zero v1.0 Universal",
+    "cddl-1.0": "Common Development and Distribution License 1.0",
+    "cddl-1.1": "Common Development and Distribution License 1.1",
+    "cdla-permissive-1.0": "Community Data License Agreement Permissive 1.0",
+    "cdla-sharing-1.0": "Community Data License Agreement Sharing 1.0",
+    "cecill-1.0": "CeCILL Free Software License Agreement v1.0",
+    "cecill-1.1": "CeCILL Free Software License Agreement v1.1",
+    "cecill-2.0": "CeCILL Free Software License Agreement v2.0",
+    "cecill-2.1": "CeCILL Free Software License Agreement v2.1",
+    "cecill-b": "CeCILL-B Free Software License Agreement",
+    "cecill-c": "CeCILL-C Free Software License Agreement",
+    "cern-ohl-1.1": "CERN Open Hardware Licence v1.1",
+    "cern-ohl-1.2": "CERN Open Hardware Licence v1.2",
+    "cern-ohl-p-2.0": "CERN Open Hardware Licence Version 2 - Permissive",
+    "cern-ohl-s-2.0": "CERN Open Hardware Licence Version 2 - Strongly Reciprocal",
+    "cern-ohl-w-2.0": "CERN Open Hardware Licence Version 2 - Weakly Reciprocal",
+    "clartistic": "Clarified Artistic License",
+    "cnri-jython": "CNRI Jython License",
+    "cnri-python": "CNRI Python License",
+    "cnri-python-gpl-compatible": "CNRI Python Open Source GPL Compatible License Agreement",
+    "condor-1.1": "Condor Public License v1.1",
+    "copyleft-next-0.3.0": "copyleft-next 0.3.0",
+    "copyleft-next-0.3.1": "copyleft-next 0.3.1",
+    "cpal-1.0": "Common Public Attribution License 1.0",
+    "cpl-1.0": "Common Public License 1.0",
+    "cpol-1.02": "Code Project Open License 1.02",
+    "crossword": "Crossword License",
+    "crystalstacker": "CrystalStacker License",
+    "cua-opl-1.0": "CUA Office Public License v1.0",
+    "cube": "Cube License",
+    "curl": "curl License",
+    "d-fsl-1.0": "Deutsche Freie Software Lizenz",
+    "diffmark": "diffmark license",
+    "doc": "DOC License",
+    "dotseqn": "Dotseqn License",
+    "dsdp": "DSDP License",
+    "dvipdfm": "dvipdfm License",
+    "ecl-1.0": "Educational Community License v1.0",
+    "ecl-2.0": "Educational Community License v2.0",
+    "ecos-2.0": "eCos license version 2.0",
+    "efl-1.0": "Eiffel Forum License v1.0",
+    "efl-2.0": "Eiffel Forum License v2.0",
+    "egenix": "eGenix.com Public License 1.1.0",
+    "entessa": "Entessa Public License v1.0",
+    "epics": "EPICS Open License",
+    "epl-1.0": "Eclipse Public License 1.0",
+    "epl-2.0": "Eclipse Public License 2.0",
+    "erlpl-1.1": "Erlang Public License v1.1",
+    "etalab-2.0": "Etalab Open License 2.0",
+    "eudatagrid": "EU DataGrid Software License",
+    "eupl-1.0": "European Union Public License 1.0",
+    "eupl-1.1": "European Union Public License 1.1",
+    "eupl-1.2": "European Union Public License 1.2",
+    "eurosym": "Eurosym License",
+    "fair": "Fair License",
+    "frameworx-1.0": "Frameworx Open License 1.0",
+    "freeimage": "FreeImage Public License v1.0",
+    "fsfap": "FSF All Permissive License",
+    "fsful": "FSF Unlimited License",
+    "fsfullr": "FSF Unlimited License (with License Retention)",
+    "ftl": "Freetype Project License",
+    "gfdl-1.1": "GNU Free Documentation License v1.1",
+    "gfdl-1.1-invariants-only": "GNU Free Documentation License v1.1 only - invariants",
+    "gfdl-1.1-invariants-or-later": "GNU Free Documentation License v1.1 or later - invariants",
+    "gfdl-1.1-no-invariants-only": "GNU Free Documentation License v1.1 only - no invariants",
+    "gfdl-1.1-no-invariants-or-later": "GNU Free Documentation License v1.1 or later - no invariants",
+    "gfdl-1.1-only": "GNU Free Documentation License v1.1 only",
+    "gfdl-1.1-or-later": "GNU Free Documentation License v1.1 or later",
+    "gfdl-1.2": "GNU Free Documentation License v1.2",
+    "gfdl-1.2-invariants-only": "GNU Free Documentation License v1.2 only - invariants",
+    "gfdl-1.2-invariants-or-later": "GNU Free Documentation License v1.2 or later - invariants",
+    "gfdl-1.2-no-invariants-only": "GNU Free Documentation License v1.2 only - no invariants",
+    "gfdl-1.2-no-invariants-or-later": "GNU Free Documentation License v1.2 or later - no invariants",
+    "gfdl-1.2-only": "GNU Free Documentation License v1.2 only",
+    "gfdl-1.2-or-later": "GNU Free Documentation License v1.2 or later",
+    "gfdl-1.3": "GNU Free Documentation License v1.3",
+    "gfdl-1.3-invariants-only": "GNU Free Documentation License v1.3 only - invariants",
+    "gfdl-1.3-invariants-or-later": "GNU Free Documentation License v1.3 or later - invariants",
+    "gfdl-1.3-no-invariants-only": "GNU Free Documentation License v1.3 only - no invariants",
+    "gfdl-1.3-no-invariants-or-later": "GNU Free Documentation License v1.3 or later - no invariants",
+    "gfdl-1.3-only": "GNU Free Documentation License v1.3 only",
+    "gfdl-1.3-or-later": "GNU Free Documentation License v1.3 or later",
+    "giftware": "Giftware License",
+    "gl2ps": "GL2PS License",
+    "glide": "3dfx Glide License",
+    "glulxe": "Glulxe License",
+    "glwtpl": "Good Luck With That Public License",
+    "gnuplot": "gnuplot License",
+    "gpl-1.0": "GNU General Public License v1.0 only",
+    "gpl-1.0+": "GNU General Public License v1.0 or later",
+    "gpl-1.0-only": "GNU General Public License v1.0 only",
+    "gpl-1.0-or-later": "GNU General Public License v1.0 or later",
+    "gpl-2.0": "GNU General Public License v2.0 only",
+    "gpl-2.0+": "GNU General Public License v2.0 or later",
+    "gpl-2.0-only": "GNU General Public License v2.0 only",
+    "gpl-2.0-or-later": "GNU General Public License v2.0 or later",
+    "gpl-2.0-with-autoconf-exception": "GNU General Public License v2.0 w/Autoconf exception",
+    "gpl-2.0-with-bison-exception": "GNU General Public License v2.0 w/Bison exception",
+    "gpl-2.0-with-classpath-exception": "GNU General Public License v2.0 w/Classpath exception",
+    "gpl-2.0-with-font-exception": "GNU General Public License v2.0 w/Font exception",
+    "gpl-2.0-with-gcc-exception": "GNU General Public License v2.0 w/GCC Runtime Library exception",
+    "gpl-3.0": "GNU General Public License v3.0 only",
+    "gpl-3.0+": "GNU General Public License v3.0 or later",
+    "gpl-3.0-only": "GNU General Public License v3.0 only",
+    "gpl-3.0-or-later": "GNU General Public License v3.0 or later",
+    "gpl-3.0-with-autoconf-exception": "GNU General Public License v3.0 w/Autoconf exception",
+    "gpl-3.0-with-gcc-exception": "GNU General Public License v3.0 w/GCC Runtime Library exception",
+    "gsoap-1.3b": "gSOAP Public License v1.3b",
+    "haskellreport": "Haskell Language Report License",
+    "hippocratic-2.1": "Hippocratic License 2.1",
+    "hpnd": "Historical Permission Notice and Disclaimer",
+    "hpnd-sell-variant": "Historical Permission Notice and Disclaimer - sell variant",
+    "htmltidy": "HTML Tidy License",
+    "ibm-pibs": "IBM PowerPC Initialization and Boot Software",
+    "icu": "ICU License",
+    "ijg": "Independent JPEG Group License",
+    "imagemagick": "ImageMagick License",
+    "imatix": "iMatix Standard Function Library Agreement",
+    "imlib2": "Imlib2 License",
+    "info-zip": "Info-ZIP License",
+    "intel": "Intel Open Source License",
+    "intel-acpi": "Intel ACPI Software License Agreement",
+    "interbase-1.0": "Interbase Public License v1.0",
+    "ipa": "IPA Font License",
+    "ipl-1.0": "IBM Public License v1.0",
+    "isc": "ISC License",
+    "jasper-2.0": "JasPer License",
+    "jpnic": "Japan Network Information Center License",
+    "json": "JSON License",
+    "lal-1.2": "Licence Art Libre 1.2",
+    "lal-1.3": "Licence Art Libre 1.3",
+    "latex2e": "Latex2e License",
+    "leptonica": "Leptonica License",
+    "lgpl-2.0": "GNU Library General Public License v2 only",
+    "lgpl-2.0+": "GNU Library General Public License v2 or later",
+    "lgpl-2.0-only": "GNU Library General Public License v2 only",
+    "lgpl-2.0-or-later": "GNU Library General Public License v2 or later",
+    "lgpl-2.1": "GNU Lesser General Public License v2.1 only",
+    "lgpl-2.1+": "GNU Lesser General Public License v2.1 or later",
+    "lgpl-2.1-only": "GNU Lesser General Public License v2.1 only",
+    "lgpl-2.1-or-later": "GNU Lesser General Public License v2.1 or later",
+    "lgpl-3.0": "GNU Lesser General Public License v3.0 only",
+    "lgpl-3.0+": "GNU Lesser General Public License v3.0 or later",
+    "lgpl-3.0-only": "GNU Lesser General Public License v3.0 only",
+    "lgpl-3.0-or-later": "GNU Lesser General Public License v3.0 or later",
+    "lgpllr": "Lesser General Public License For Linguistic Resources",
+    "libpng": "libpng License",
+    "libpng-2.0": "PNG Reference Library version 2",
+    "libselinux-1.0": "libselinux public domain notice",
+    "libtiff": "libtiff License",
+    "liliq-p-1.1": "Licence Libre du Qu\u00e9bec \u2013 Permissive version 1.1",
+    "liliq-r-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 version 1.1",
+    "liliq-rplus-1.1": "Licence Libre du Qu\u00e9bec \u2013 R\u00e9ciprocit\u00e9 forte version 1.1",
+    "linux-openib": "Linux Kernel Variant of OpenIB.org license",
+    "lpl-1.0": "Lucent Public License Version 1.0",
+    "lpl-1.02": "Lucent Public License v1.02",
+    "lppl-1.0": "LaTeX Project Public License v1.0",
+    "lppl-1.1": "LaTeX Project Public License v1.1",
+    "lppl-1.2": "LaTeX Project Public License v1.2",
+    "lppl-1.3a": "LaTeX Project Public License v1.3a",
+    "lppl-1.3c": "LaTeX Project Public License v1.3c",
+    "makeindex": "MakeIndex License",
+    "miros": "The MirOS Licence",
+    "mit": "MIT License",
+    "mit-0": "MIT No Attribution",
+    "mit-advertising": "Enlightenment License (e16)",
+    "mit-cmu": "CMU License",
+    "mit-enna": "enna License",
+    "mit-feh": "feh License",
+    "mit-open-group": "MIT Open Group variant",
+    "mitnfa": "MIT +no-false-attribs license",
+    "motosoto": "Motosoto License",
+    "mpich2": "mpich2 License",
+    "mpl-1.0": "Mozilla Public License 1.0",
+    "mpl-1.1": "Mozilla Public License 1.1",
+    "mpl-2.0": "Mozilla Public License 2.0",
+    "mpl-2.0-no-copyleft-exception": "Mozilla Public License 2.0 (no copyleft exception)",
+    "ms-pl": "Microsoft Public License",
+    "ms-rl": "Microsoft Reciprocal License",
+    "mtll": "Matrix Template Library License",
+    "mulanpsl-1.0": "Mulan Permissive Software License, Version 1",
+    "mulanpsl-2.0": "Mulan Permissive Software License, Version 2",
+    "multics": "Multics License",
+    "mup": "Mup License",
+    "nasa-1.3": "NASA Open Source Agreement 1.3",
+    "naumen": "Naumen Public License",
+    "nbpl-1.0": "Net Boolean Public License v1",
+    "ncgl-uk-2.0": "Non-Commercial Government Licence",
+    "ncsa": "University of Illinois/NCSA Open Source License",
+    "net-snmp": "Net-SNMP License",
+    "netcdf": "NetCDF license",
+    "newsletr": "Newsletr License",
+    "ngpl": "Nethack General Public License",
+    "nist-pd": "NIST Public Domain Notice",
+    "nist-pd-fallback": "NIST Public Domain Notice with license fallback",
+    "nlod-1.0": "Norwegian Licence for Open Government Data",
+    "nlpl": "No Limit Public License",
+    "nokia": "Nokia Open Source License",
+    "nosl": "Netizen Open Source License",
+    "noweb": "Noweb License",
+    "npl-1.0": "Netscape Public License v1.0",
+    "npl-1.1": "Netscape Public License v1.1",
+    "nposl-3.0": "Non-Profit Open Software License 3.0",
+    "nrl": "NRL License",
+    "ntp": "NTP License",
+    "ntp-0": "NTP No Attribution",
+    "nunit": "Nunit License",
+    "o-uda-1.0": "Open Use of Data Agreement v1.0",
+    "occt-pl": "Open CASCADE Technology Public License",
+    "oclc-2.0": "OCLC Research Public License 2.0",
+    "odbl-1.0": "ODC Open Database License v1.0",
+    "odc-by-1.0": "Open Data Commons Attribution License v1.0",
+    "ofl-1.0": "SIL Open Font License 1.0",
+    "ofl-1.0-no-rfn": "SIL Open Font License 1.0 with no Reserved Font Name",
+    "ofl-1.0-rfn": "SIL Open Font License 1.0 with Reserved Font Name",
+    "ofl-1.1": "SIL Open Font License 1.1",
+    "ofl-1.1-no-rfn": "SIL Open Font License 1.1 with no Reserved Font Name",
+    "ofl-1.1-rfn": "SIL Open Font License 1.1 with Reserved Font Name",
+    "ogc-1.0": "OGC Software License, Version 1.0",
+    "ogl-canada-2.0": "Open Government Licence - Canada",
+    "ogl-uk-1.0": "Open Government Licence v1.0",
+    "ogl-uk-2.0": "Open Government Licence v2.0",
+    "ogl-uk-3.0": "Open Government Licence v3.0",
+    "ogtsl": "Open Group Test Suite License",
+    "oldap-1.1": "Open LDAP Public License v1.1",
+    "oldap-1.2": "Open LDAP Public License v1.2",
+    "oldap-1.3": "Open LDAP Public License v1.3",
+    "oldap-1.4": "Open LDAP Public License v1.4",
+    "oldap-2.0": "Open LDAP Public License v2.0 (or possibly 2.0A and 2.0B)",
+    "oldap-2.0.1": "Open LDAP Public License v2.0.1",
+    "oldap-2.1": "Open LDAP Public License v2.1",
+    "oldap-2.2": "Open LDAP Public License v2.2",
+    "oldap-2.2.1": "Open LDAP Public License v2.2.1",
+    "oldap-2.2.2": "Open LDAP Public License 2.2.2",
+    "oldap-2.3": "Open LDAP Public License v2.3",
+    "oldap-2.4": "Open LDAP Public License v2.4",
+    "oldap-2.5": "Open LDAP Public License v2.5",
+    "oldap-2.6": "Open LDAP Public License v2.6",
+    "oldap-2.7": "Open LDAP Public License v2.7",
+    "oldap-2.8": "Open LDAP Public License v2.8",
+    "oml": "Open Market License",
+    "openssl": "OpenSSL License",
+    "opl-1.0": "Open Public License v1.0",
+    "oset-pl-2.1": "OSET Public License version 2.1",
+    "osl-1.0": "Open Software License 1.0",
+    "osl-1.1": "Open Software License 1.1",
+    "osl-2.0": "Open Software License 2.0",
+    "osl-2.1": "Open Software License 2.1",
+    "osl-3.0": "Open Software License 3.0",
+    "parity-6.0.0": "The Parity Public License 6.0.0",
+    "parity-7.0.0": "The Parity Public License 7.0.0",
+    "pddl-1.0": "ODC Public Domain Dedication & License 1.0",
+    "php-3.0": "PHP License v3.0",
+    "php-3.01": "PHP License v3.01",
+    "plexus": "Plexus Classworlds License",
+    "polyform-noncommercial-1.0.0": "PolyForm Noncommercial License 1.0.0",
+    "polyform-small-business-1.0.0": "PolyForm Small Business License 1.0.0",
+    "postgresql": "PostgreSQL License",
+    "psf-2.0": "Python Software Foundation License 2.0",
+    "psfrag": "psfrag License",
+    "psutils": "psutils License",
+    "python-2.0": "Python License 2.0",
+    "qhull": "Qhull License",
+    "qpl-1.0": "Q Public License 1.0",
+    "rdisc": "Rdisc License",
+    "rhecos-1.1": "Red Hat eCos Public License v1.1",
+    "rpl-1.1": "Reciprocal Public License 1.1",
+    "rpl-1.5": "Reciprocal Public License 1.5",
+    "rpsl-1.0": "RealNetworks Public Source License v1.0",
+    "rsa-md": "RSA Message-Digest License",
+    "rscpl": "Ricoh Source Code Public License",
+    "ruby": "Ruby License",
+    "sax-pd": "Sax Public Domain Notice",
+    "saxpath": "Saxpath License",
+    "scea": "SCEA Shared Source License",
+    "sendmail": "Sendmail License",
+    "sendmail-8.23": "Sendmail License 8.23",
+    "sgi-b-1.0": "SGI Free Software License B v1.0",
+    "sgi-b-1.1": "SGI Free Software License B v1.1",
+    "sgi-b-2.0": "SGI Free Software License B v2.0",
+    "shl-0.5": "Solderpad Hardware License v0.5",
+    "shl-0.51": "Solderpad Hardware License, Version 0.51",
+    "simpl-2.0": "Simple Public License 2.0",
+    "sissl": "Sun Industry Standards Source License v1.1",
+    "sissl-1.2": "Sun Industry Standards Source License v1.2",
+    "sleepycat": "Sleepycat License",
+    "smlnj": "Standard ML of New Jersey License",
+    "smppl": "Secure Messaging Protocol Public License",
+    "snia": "SNIA Public License 1.1",
+    "spencer-86": "Spencer License 86",
+    "spencer-94": "Spencer License 94",
+    "spencer-99": "Spencer License 99",
+    "spl-1.0": "Sun Public License v1.0",
+    "ssh-openssh": "SSH OpenSSH license",
+    "ssh-short": "SSH short notice",
+    "sspl-1.0": "Server Side Public License, v 1",
+    "standardml-nj": "Standard ML of New Jersey License",
+    "sugarcrm-1.1.3": "SugarCRM Public License v1.1.3",
+    "swl": "Scheme Widget Library (SWL) Software License Agreement",
+    "tapr-ohl-1.0": "TAPR Open Hardware License v1.0",
+    "tcl": "TCL/TK License",
+    "tcp-wrappers": "TCP Wrappers License",
+    "tmate": "TMate Open Source License",
+    "torque-1.1": "TORQUE v2.5+ Software License v1.1",
+    "tosl": "Trusster Open Source License",
+    "tu-berlin-1.0": "Technische Universitaet Berlin License 1.0",
+    "tu-berlin-2.0": "Technische Universitaet Berlin License 2.0",
+    "ucl-1.0": "Upstream Compatibility License v1.0",
+    "unicode-dfs-2015": "Unicode License Agreement - Data Files and Software (2015)",
+    "unicode-dfs-2016": "Unicode License Agreement - Data Files and Software (2016)",
+    "unicode-tou": "Unicode Terms of Use",
+    "unlicense": "The Unlicense",
+    "upl-1.0": "Universal Permissive License v1.0",
+    "vim": "Vim License",
+    "vostrom": "VOSTROM Public License for Open Source",
+    "vsl-1.0": "Vovida Software License v1.0",
+    "w3c": "W3C Software Notice and License (2002-12-31)",
+    "w3c-19980720": "W3C Software Notice and License (1998-07-20)",
+    "w3c-20150513": "W3C Software Notice and Document License (2015-05-13)",
+    "watcom-1.0": "Sybase Open Watcom Public License 1.0",
+    "wsuipa": "Wsuipa License",
+    "wtfpl": "Do What The F*ck You Want To Public License",
+    "wxwindows": "wxWindows Library License",
+    "x11": "X11 License",
+    "xerox": "Xerox License",
+    "xfree86-1.1": "XFree86 License 1.1",
+    "xinetd": "xinetd License",
+    "xnet": "X.Net License",
+    "xpp": "XPP License",
+    "xskat": "XSkat License",
+    "ypl-1.0": "Yahoo! Public License v1.0",
+    "ypl-1.1": "Yahoo! Public License v1.1",
+    "zed": "Zed License",
+    "zend-2.0": "Zend License v2.0",
+    "zimbra-1.3": "Zimbra Public License v1.3",
+    "zimbra-1.4": "Zimbra Public License v1.4",
+    "zlib": "zlib License",
+    "zlib-acknowledgement": "zlib/libpng License with Acknowledgement",
+    "zpl-1.1": "Zope Public License 1.1",
+    "zpl-2.0": "Zope Public License 2.0",
+    "zpl-2.1": "Zope Public License 2.1"
+}
\ No newline at end of file
diff --git a/src/datasets/utils/resources/multilingualities.json b/src/datasets/utils/resources/multilingualities.json
new file mode 100644
index 00000000000..a35c79f03df
--- /dev/null
+++ b/src/datasets/utils/resources/multilingualities.json
@@ -0,0 +1,6 @@
+{
+  "monolingual": "contains a single language",
+  "multilingual": "contains multiple languages",
+  "translation": "contains translated or aligned text",
+  "other": "other type of language distribution"
+}
diff --git a/src/datasets/utils/resources/size_categories.json b/src/datasets/utils/resources/size_categories.json
new file mode 100644
index 00000000000..983ce0c10db
--- /dev/null
+++ b/src/datasets/utils/resources/size_categories.json
@@ -0,0 +1,14 @@
+[
+  "unknown",
+  "n<1K",
+  "1K<n<10K",
+  "10K<n<100K",
+  "100K<n<1M",
+  "1M<n<10M",
+  "10M<n<100M",
+  "100M<n<1B",
+  "1B<n<10B",
+  "10B<n<100B",
+  "100B<n<1T",
+  "n>1T"
+]
diff --git a/src/datasets/utils/resources/tasks.json b/src/datasets/utils/resources/tasks.json
new file mode 100644
index 00000000000..966ba106c9d
--- /dev/null
+++ b/src/datasets/utils/resources/tasks.json
@@ -0,0 +1,86 @@
+{
+    "conditional-text-generation": {
+        "description": "data-to-text and text transduction tasks such as translation or summarization",
+        "options": [
+            "machine-translation",
+            "sentence-splitting-fusion",
+            "summarization",
+            "table-to-text",
+            "text-simplification",
+            "explanation-generation",
+            "other-stuctured-to-text",
+            "other"
+        ]
+    },
+    "question-answering": {
+        "description": "question answering tasks",
+        "options": [
+            "open-domain-qa",
+            "closed-domain-qa",
+            "multiple-choice-qa",
+            "extractive-qa",
+            "abstractive-qa",
+            "other"
+        ]
+    },
+    "sequence-modeling": {
+        "description": "such as language modeling or dialogue",
+        "options": [
+            "dialogue-modeling",
+            "language-modeling",
+            "other-multi-turn",
+            "slot-filling",
+            "other"
+        ]
+    },
+    "structure-prediction": {
+        "description": "predicting structural properties of the text, such as syntax",
+        "options": [
+            "coreference-resolution",
+            "named-entity-recognition",
+            "part-of-speech-tagging",
+            "parsing",
+            "other"
+        ]
+    },
+    "text-classification": {
+        "description": "predicting a class index or boolean value",
+        "options": [
+            "acceptability-classification",
+            "entity-linking-classification",
+            "fact-checking",
+            "intent-classification",
+            "multi-class-classification",
+            "multi-label-classification",
+            "natural-language-inference",
+            "semantic-similarity-classification",
+            "sentiment-classification",
+            "topic-classification",
+            "other"
+        ]
+    },
+    "text-retrieval": {
+        "description": "information or text retrieval tasks",
+        "options": [
+            "document-retrieval",
+            "utterance-retrieval",
+            "entity-linking-retrieval",
+            "fact-checking-retrieval",
+            "other"
+        ]
+    },
+    "text-scoring": {
+        "description": "text scoring tasks, predicting a real valued score for some text",
+        "options": [
+            "semantic-similarity-scoring",
+            "sentiment-scoring",
+            "other"
+        ]
+    },
+    "other": {
+        "description": "other task family not mentioned here",
+        "options": [
+            "other"
+        ]
+    }
+}
diff --git a/tests/test_metadata_util.py b/tests/test_metadata_util.py
new file mode 100644
index 00000000000..a24e0741862
--- /dev/null
+++ b/tests/test_metadata_util.py
@@ -0,0 +1,236 @@
+import tempfile
+import unittest
+from pathlib import Path
+
+from datasets.utils.metadata import (
+    DatasetMetadata,
+    escape_validation_for_predicate,
+    metadata_dict_from_readme,
+    tagset_validator,
+    validate_metadata_type,
+    yaml_block_from_readme,
+)
+
+
+def _dedent(string: str) -> str:
+    return "\n".join([line.lstrip() for line in string.splitlines()])
+
+
+README_YAML = """\
+---
+languages:
+- zh
+- en
+task_ids:
+- sentiment-classification
+---
+# Begin of markdown
+
+Some cool dataset card
+"""
+
+README_EMPTY_YAML = """\
+---
+---
+# Begin of markdown
+
+Some cool dataset card
+"""
+
+
+README_NO_YAML = """\
+# Begin of markdown
+
+Some cool dataset card
+"""
+
+
+class TestMetadataUtils(unittest.TestCase):  # unit tests for the helpers imported from datasets.utils.metadata
+    def test_validate_metadata_type(self):  # non-empty lists pass; [] and None must raise TypeError
+        metadata_dict = {
+            "tag": ["list", "of", "values"],
+            "another tag": ["Another", {"list"}, ["of"], 0x646D46736457567A],
+        }
+        validate_metadata_type(metadata_dict)  # expected not to raise
+
+        metadata_dict = {"tag1": []}
+        with self.assertRaises(TypeError):
+            validate_metadata_type(metadata_dict)
+
+        metadata_dict = {"tag1": None}
+        with self.assertRaises(TypeError):
+            validate_metadata_type(metadata_dict)
+
+    def test_tagset_validator(self):  # known tags are returned as-is; an unknown tag yields ([], error message)
+        name = "test_tag"
+        url = "https://dummy.hf.co"
+
+        values = ["tag1", "tag2", "tag2", "tag3"]  # a duplicated tag is expected to be accepted
+        reference_values = ["tag1", "tag2", "tag3"]
+        returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url)
+        self.assertListEqual(returned_values, values)
+        self.assertIsNone(error)
+
+        values = []  # nothing to validate
+        reference_values = ["tag1", "tag2", "tag3"]
+        returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url)
+        self.assertListEqual(returned_values, values)
+        self.assertIsNone(error)
+
+        values = []
+        reference_values = []  # empty values against an empty reference set
+        returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url)
+        self.assertListEqual(returned_values, values)
+        self.assertIsNone(error)
+
+        values = ["tag1", "tag2", "tag2", "tag3", "unknown tag"]  # ends with a tag missing from reference_values
+        reference_values = ["tag1", "tag2", "tag3"]
+        returned_values, error = tagset_validator(values=values, reference_values=reference_values, name=name, url=url)
+        self.assertListEqual(returned_values, [])
+        self.assertEqual(error, f"{['unknown tag']} are not registered tags for '{name}', reference at {url}")
+
+    def test_escape_validation_for_predicate(self):  # values are split into (predicate matches, the rest)
+        def predicate_fn(string: str) -> bool:
+            return "ignore" in string
+
+        values = ["process me", "process me too", "ignore me"]  # mixed: one match, two non-matches
+        to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn)
+        self.assertListEqual(to_ignore, ["ignore me"])
+        self.assertListEqual(to_validate, ["process me", "process me too"])
+
+        values = ["process me", "process me too"]  # no value matches the predicate
+        to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn)
+        self.assertListEqual(to_ignore, [])
+        self.assertListEqual(to_validate, values)
+
+        values = ["this value will be ignored", "ignore this one two"]  # every value matches the predicate
+        to_ignore, to_validate = escape_validation_for_predicate(values=values, predicate_fn=predicate_fn)
+        self.assertListEqual(to_ignore, values)
+        self.assertListEqual(to_validate, [])
+
+    def test_yaml_block_from_readme(self):  # YAML header is returned as text: empty string if empty, None if absent
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            path = Path(tmp_dir) / "README.md"  # the same file is rewritten for each scenario below
+
+            with open(path, "w+") as readme_file:
+                readme_file.write(README_YAML)
+            yaml_block = yaml_block_from_readme(path=path)
+            self.assertEqual(
+                yaml_block,
+                _dedent(
+                    """\
+                    languages:
+                    - zh
+                    - en
+                    task_ids:
+                    - sentiment-classification
+"""
+                ),
+            )
+
+            with open(path, "w+") as readme_file:
+                readme_file.write(README_EMPTY_YAML)
+            yaml_block = yaml_block_from_readme(path=path)
+            self.assertEqual(
+                yaml_block,
+                _dedent(
+                    """\
+                    """
+                ),
+            )
+
+            with open(path, "w+") as readme_file:
+                readme_file.write(README_NO_YAML)
+            yaml_block = yaml_block_from_readme(path=path)
+            self.assertIsNone(yaml_block)
+
+    def test_metadata_dict_from_readme(self):  # YAML header parsed to a dict: {} if empty, None if absent
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            path = Path(tmp_dir) / "README.md"  # the same file is rewritten for each scenario below
+            with open(path, "w+") as readme_file:
+                readme_file.write(README_YAML)
+            metadata_dict = metadata_dict_from_readme(path)
+            self.assertDictEqual(metadata_dict, {"languages": ["zh", "en"], "task_ids": ["sentiment-classification"]})
+
+            with open(path, "w+") as readme_file:
+                readme_file.write(README_EMPTY_YAML)
+            metadata_dict = metadata_dict_from_readme(path)
+            self.assertDictEqual(metadata_dict, {})
+
+            with open(path, "w+") as readme_file:
+                readme_file.write(README_NO_YAML)
+            metadata_dict = metadata_dict_from_readme(path)
+            self.assertIsNone(metadata_dict)
+
+    def test_from_yaml_string(self):  # valid header parses; bad or missing language_creators raises TypeError
+        valid_yaml_string = _dedent(
+            """\
+            annotations_creators:
+            - found
+            language_creators:
+            - found
+            languages:
+            - en
+            licenses:
+            - unknown
+            multilinguality:
+            - monolingual
+            size_categories:
+            - 10K<n<100K
+            source_datasets:
+            - extended|other-yahoo-webscope-l6
+            task_categories:
+            - question-answering
+            task_ids:
+            - open-domain-qa
+            """
+        )
+        DatasetMetadata.from_yaml_string(valid_yaml_string)  # expected not to raise
+
+        invalid_tag_yaml = _dedent(
+            """\
+            annotations_creators:
+            - found
+            language_creators:
+            - some guys in Panama
+            languages:
+            - en
+            licenses:
+            - unknown
+            multilinguality:
+            - monolingual
+            size_categories:
+            - 10K<n<100K
+            source_datasets:
+            - extended|other-yahoo-webscope-l6
+            task_categories:
+            - question-answering
+            task_ids:
+            - open-domain-qa
+            """
+        )
+        with self.assertRaises(TypeError):  # language_creators differs from the valid header above
+            DatasetMetadata.from_yaml_string(invalid_tag_yaml)
+
+        missing_tag_yaml = _dedent(
+            """\
+            annotations_creators:
+            - found
+            languages:
+            - en
+            licenses:
+            - unknown
+            multilinguality:
+            - monolingual
+            size_categories:
+            - 10K<n<100K
+            source_datasets:
+            - extended|other-yahoo-webscope-l6
+            task_categories:
+            - question-answering
+            task_ids:
+            - open-domain-qa
+            """
+        )
+        with self.assertRaises(TypeError):  # this header has no language_creators entry
+            DatasetMetadata.from_yaml_string(missing_tag_yaml)