Commit

remove "gated datasets unlock" logic (#189)
* refactor: 💡 move gated datasets "unlock" code to models/

also: add two tests to ensure that gated datasets can be accessed

* test: 💍 adapt to new version of dummy_gated dataset

I changed severo/dummy_gated to a simpler dataset, without a Python script,
to avoid unrelated errors
(https://huggingface.co/datasets/severo/dummy_gated/commit/99194748bed3625a941aaf785740df02ca5762c9).
Also in this commit: load the HF_TOKEN from a secret in
https://github.com/huggingface/datasets-preview-backend/settings/secrets/actions
so that the unit tests can run.

* test: 💍 fix wrong hardcoded value

* chore: 🤖 ignore safety warning on ujson package

it's a dependency of lm-dataformat, whose latest version still depends on a
vulnerable ujson version

* feat: 🎸 remove the "ask_access" logic for gated datasets

the new "app" tokens on moonlanding can read gated datasets without having
to accept the conditions first, unlike user tokens, which must accept them.

BREAKING CHANGE: 🧨 HF_TOKEN must be an app token
severo committed Apr 1, 2022
1 parent de2ff07 commit 1a6eb0c
Showing 12 changed files with 65 additions and 58 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/quality.yml
@@ -35,5 +35,5 @@ jobs:
       - name: Run bandit
         run: poetry run bandit -r src
       - name: Run safety
-        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
         # ^^ safety exceptions: pillow, numpy
1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
@@ -32,6 +32,7 @@ jobs:
           ROWS_MAX_NUMBER: 5
           MONGO_CACHE_DATABASE: datasets_preview_cache_test
           MONGO_QUEUE_DATABASE: datasets_preview_queue_test
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
       - uses: codecov/codecov-action@v2
         with:
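
The `HF_TOKEN` secret above is what lets the two new gated-dataset tests run in CI. A minimal sketch of what they could look like, assuming the token is read from the environment; the test names, assertions, and the `train` split of `severo/dummy_gated` are illustrative assumptions, not the commit's actual test code:

```python
# Hypothetical sketch of the gated-dataset tests mentioned in the commit
# message; names and assertions are assumptions, not the actual test code.
import os
from itertools import islice

from datasets import load_dataset

from datasets_preview_backend.models.dataset import get_dataset_split_full_names

HF_TOKEN = os.environ.get("HF_TOKEN")  # the app token injected by the CI secret


def test_gated_dataset_split_full_names() -> None:
    # an app token can list the splits of a gated dataset directly
    split_full_names = get_dataset_split_full_names("severo/dummy_gated", HF_TOKEN)
    assert len(split_full_names) > 0


def test_gated_dataset_rows() -> None:
    # streaming rows also works, with no prior "ask-access" step
    dataset = load_dataset(
        "severo/dummy_gated", split="train", streaming=True, use_auth_token=HF_TOKEN
    )
    assert list(islice(iter(dataset), 1))
```
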
2 changes: 1 addition & 1 deletion Makefile
@@ -28,7 +28,7 @@ quality:
 	poetry run flake8 tests src
 	poetry run mypy tests src
 	poetry run bandit -r src
-	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
 	# ^^ safety exceptions: pillow, numpy

 # Format source code automatically
2 changes: 1 addition & 1 deletion README.md
@@ -67,7 +67,7 @@ Every `WORKER_SLEEP_SECONDS` (defaults to 5 seconds) when idle, the worker will
 - the memory (RAM + SWAP) on the machine is below `MAX_MEMORY_PCT` (defaults to 60%)
 - the number of started jobs for the same dataset is under `MAX_JOBS_PER_DATASET`

-Also specify `HF_TOKEN` with a User Access Token (see https://huggingface.co/settings/token, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.
+Also specify `HF_TOKEN` with an App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.

 Also specify `MAX_SIZE_FALLBACK` with the maximum size in bytes of the dataset to fallback in normal mode if streaming fails. Note that it requires to have the size in the info metadata. Set to `0` to disable the fallback. Defaults to `100_000_000`.
68 changes: 35 additions & 33 deletions poetry.lock

(generated file; diff not rendered)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ function-parser = "^0.0.3"
 gdown = "^4.2.0"
 kenlm = { url = "https://github.com/kpu/kenlm/archive/master.zip" }
 kss = "^2.6.0"
-lm-dataformat = "^0.0.19"
+lm-dataformat = "^0.0.20"
 lxml = "^4.6.3"
 mongo-types = "0.15.1"
 mongoengine = "^0.23.1"
2 changes: 2 additions & 0 deletions src/datasets_preview_backend/config.py
@@ -8,6 +8,7 @@
     DEFAULT_ASSETS_DIRECTORY,
     DEFAULT_DATASETS_ENABLE_PRIVATE,
     DEFAULT_DATASETS_REVISION,
+    DEFAULT_HF_TOKEN,
     DEFAULT_LOG_LEVEL,
     DEFAULT_MAX_AGE_LONG_SECONDS,
     DEFAULT_MAX_AGE_SHORT_SECONDS,
@@ -36,6 +37,7 @@
     d=os.environ, key="DATASETS_ENABLE_PRIVATE", default=DEFAULT_DATASETS_ENABLE_PRIVATE
 )
 DATASETS_REVISION = get_str_value(d=os.environ, key="DATASETS_REVISION", default=DEFAULT_DATASETS_REVISION)
+HF_TOKEN = get_str_or_none_value(d=os.environ, key="HF_TOKEN", default=DEFAULT_HF_TOKEN)
 LOG_LEVEL = get_str_value(d=os.environ, key="LOG_LEVEL", default=DEFAULT_LOG_LEVEL)
 MAX_AGE_LONG_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_LONG_SECONDS", default=DEFAULT_MAX_AGE_LONG_SECONDS)
 MAX_AGE_SHORT_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_SHORT_SECONDS", default=DEFAULT_MAX_AGE_SHORT_SECONDS)
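
For context, `get_str_or_none_value` is the pre-existing helper used for optional settings; a sketch of its presumed contract is below (an assumption about behavior, not the project's actual implementation): return the environment value when set, otherwise the default, which is `None` for `HF_TOKEN` so the token stays optional.

```python
# Presumed contract of the helper used above (a sketch, not the actual
# implementation): fall back to the default when the key is unset or empty.
from typing import Mapping, Optional


def get_str_or_none_value(d: Mapping[str, str], key: str, default: Optional[str]) -> Optional[str]:
    value = d.get(key)
    return value if value else default  # an empty string counts as unset
```
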
9 changes: 0 additions & 9 deletions src/datasets_preview_backend/io/cache.py
@@ -43,7 +43,6 @@
     SplitFullName,
     get_dataset_split_full_names,
 )
-from datasets_preview_backend.models.hf_dataset import ask_access
 from datasets_preview_backend.models.split import Split, get_split
 from datasets_preview_backend.utils import orjson_dumps

@@ -359,10 +358,6 @@ def clean_database() -> None:


 def refresh_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split_full_names = get_dataset_split_full_names(dataset_name, hf_token)
         upsert_dataset(dataset_name, split_full_names)
@@ -420,10 +415,6 @@ def refresh_split(
     hf_token: Optional[str] = None,
     max_size_fallback: Optional[int] = None,
 ):
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split = get_split(
             dataset_name, config_name, split_name, hf_token=hf_token, max_size_fallback=max_size_fallback
1 change: 1 addition & 0 deletions src/datasets_preview_backend/models/dataset.py
@@ -18,6 +18,7 @@ class SplitFullName(TypedDict):

 def get_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
     logger.info(f"get dataset '{dataset_name}' split full names")
+
     try:
         guard_blocked_datasets(dataset_name)
         return [
11 changes: 0 additions & 11 deletions src/datasets_preview_backend/models/hf_dataset.py
@@ -1,7 +1,6 @@
 import logging
 from typing import List, TypedDict, Union

-import requests
 from datasets import list_datasets

 logger = logging.getLogger(__name__)
@@ -32,15 +31,5 @@ def get_hf_datasets() -> List[HFDataset]:
     ]


-def ask_access(dataset_name: str, hf_token: str) -> None:
-    url = f"https://huggingface.co/datasets/{dataset_name}/ask-access"
-    headers = {"Authorization": f"Bearer {hf_token}"}
-    try:
-        requests.get(url, headers=headers)
-    except Exception as err:
-        logger.warning(f"error while asking access to dataset {dataset_name}: {err}")
-    # TODO: check if the access was granted: check if we were redirected to the dataset page, or to the login page
-
-
 def get_hf_dataset_names() -> List[str]:
     return [d["id"] for d in get_hf_datasets()]
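
The removed `ask_access` sent a tokened GET to the `ask-access` endpoint to lift the gate for a user token, and never verified the outcome (see the TODO). With an app token there is no gate to lift; if a sanity check were still wanted, something like the sketch below could serve, assuming the Hub's `/api/datasets/<name>` endpoint returns 200 for a token that can read the repository (an assumption about the endpoint, not code from this commit).

```python
# Illustrative access check (an assumption, not part of this commit): an app
# token can read a gated dataset's metadata directly, so a 200 response is
# enough to confirm access; no ask-access round-trip is required.
import requests


def can_access_gated_dataset(dataset_name: str, hf_token: str) -> bool:
    response = requests.get(
        f"https://huggingface.co/api/datasets/{dataset_name}",
        headers={"Authorization": f"Bearer {hf_token}"},
        timeout=10,
    )
    return response.status_code == 200
```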
