Commit

remove "gated datasets unlock" logic (#189)
* refactor: 💡 move gated datasets "unlock" code to models/

also: add two tests to ensure that gated datasets can be accessed

* test: 💍 adapt to new version of dummy_gated dataset

I changed severo/dummy_gated to a simpler dataset, without a Python script,
to avoid unrelated errors
(https://huggingface.co/datasets/severo/dummy_gated/commit/99194748bed3625a941aaf785740df02ca5762c9).
Also in this commit: load the HF_TOKEN from a secret in
https://github.com/huggingface/datasets-preview-backend/settings/secrets/actions
so that the unit tests can run.

* test: 💍 fix wrong hardcoded value

* chore: 🤖 ignore safety warning on ujson package

it's a dependency of lm-dataformat, whose latest version still depends on a
vulnerable ujson version

* feat: 🎸 remove the "ask_access" logic for gated datasets

the new "app" tokens on moonlanding can read gated datasets without having
to accept the conditions first, unlike user tokens, which must accept them.

BREAKING CHANGE: 🧨 HF_TOKEN must be an app token
severo committed Apr 1, 2022
1 parent de2ff07 commit 1a6eb0c
Showing 12 changed files with 65 additions and 58 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/quality.yml
@@ -35,5 +35,5 @@ jobs:
       - name: Run bandit
         run: poetry run bandit -r src
       - name: Run safety
-        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+        run: poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
         # ^^ safety exceptions: pillow, numpy
1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
@@ -32,6 +32,7 @@ jobs:
           ROWS_MAX_NUMBER: 5
           MONGO_CACHE_DATABASE: datasets_preview_cache_test
           MONGO_QUEUE_DATABASE: datasets_preview_queue_test
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: poetry run python -m pytest -s --cov --cov-report xml:coverage.xml --cov-report=term tests
       - uses: codecov/codecov-action@v2
         with:
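
The `HF_TOKEN` secret above is what lets the two new gated-dataset tests run in CI. A minimal sketch of what they could look like, assuming the token is read from the environment; the test names, assertions, and the `train` split of `severo/dummy_gated` are illustrative assumptions, not the commit's actual test code:

```python
# Hypothetical sketch of the gated-dataset tests mentioned in the commit
# message; names and assertions are assumptions, not the actual test code.
import os
from itertools import islice

from datasets import load_dataset

from datasets_preview_backend.models.dataset import get_dataset_split_full_names

HF_TOKEN = os.environ.get("HF_TOKEN")  # the app token injected by the CI secret


def test_gated_dataset_split_full_names() -> None:
    # an app token can list the splits of a gated dataset directly
    split_full_names = get_dataset_split_full_names("severo/dummy_gated", HF_TOKEN)
    assert len(split_full_names) > 0


def test_gated_dataset_rows() -> None:
    # streaming rows also works, with no prior "ask-access" step
    dataset = load_dataset(
        "severo/dummy_gated", split="train", streaming=True, use_auth_token=HF_TOKEN
    )
    assert list(islice(iter(dataset), 1))
```
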
2 changes: 1 addition & 1 deletion Makefile
@@ -28,7 +28,7 @@ quality:
 	poetry run flake8 tests src
 	poetry run mypy tests src
 	poetry run bandit -r src
-	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356
+	poetry run safety check -i 44487 -i 44485 -i 44524 -i 44525 -i 44486 -i 44716 -i 44717 -i 44715 -i 45356 -i 46499
 	# ^^ safety exceptions: pillow, numpy

 # Format source code automatically
2 changes: 1 addition & 1 deletion README.md
@@ -67,7 +67,7 @@ Every `WORKER_SLEEP_SECONDS` (defaults to 5 seconds) when idle, the worker will
 - the memory (RAM + SWAP) on the machine is below `MAX_MEMORY_PCT` (defaults to 60%)
 - the number of started jobs for the same dataset is under `MAX_JOBS_PER_DATASET`

-Also specify `HF_TOKEN` with a User Access Token (see https://huggingface.co/settings/token, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.
+Also specify `HF_TOKEN` with an App Access Token (ask moonlanding administrators to get one, only the `read` role is required) to allow the worker to download gated models from the hub. Defaults to empty.

 Also specify `MAX_SIZE_FALLBACK` with the maximum size in bytes of the dataset to fallback in normal mode if streaming fails. Note that it requires to have the size in the info metadata. Set to `0` to disable the fallback. Defaults to `100_000_000`.
68 changes: 35 additions & 33 deletions poetry.lock

(generated file; diff not rendered)

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -18,7 +18,7 @@ function-parser = "^0.0.3"
 gdown = "^4.2.0"
 kenlm = { url = "https://github.com/kpu/kenlm/archive/master.zip" }
 kss = "^2.6.0"
-lm-dataformat = "^0.0.19"
+lm-dataformat = "^0.0.20"
 lxml = "^4.6.3"
 mongo-types = "0.15.1"
 mongoengine = "^0.23.1"
2 changes: 2 additions & 0 deletions src/datasets_preview_backend/config.py
@@ -8,6 +8,7 @@
     DEFAULT_ASSETS_DIRECTORY,
     DEFAULT_DATASETS_ENABLE_PRIVATE,
     DEFAULT_DATASETS_REVISION,
+    DEFAULT_HF_TOKEN,
     DEFAULT_LOG_LEVEL,
     DEFAULT_MAX_AGE_LONG_SECONDS,
     DEFAULT_MAX_AGE_SHORT_SECONDS,
@@ -36,6 +37,7 @@
     d=os.environ, key="DATASETS_ENABLE_PRIVATE", default=DEFAULT_DATASETS_ENABLE_PRIVATE
 )
 DATASETS_REVISION = get_str_value(d=os.environ, key="DATASETS_REVISION", default=DEFAULT_DATASETS_REVISION)
+HF_TOKEN = get_str_or_none_value(d=os.environ, key="HF_TOKEN", default=DEFAULT_HF_TOKEN)
 LOG_LEVEL = get_str_value(d=os.environ, key="LOG_LEVEL", default=DEFAULT_LOG_LEVEL)
 MAX_AGE_LONG_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_LONG_SECONDS", default=DEFAULT_MAX_AGE_LONG_SECONDS)
 MAX_AGE_SHORT_SECONDS = get_int_value(d=os.environ, key="MAX_AGE_SHORT_SECONDS", default=DEFAULT_MAX_AGE_SHORT_SECONDS)
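
For context, `get_str_or_none_value` is the pre-existing helper used for optional settings; a sketch of its presumed contract is below (an assumption about behavior, not the project's actual implementation): return the environment value when set, otherwise the default, which is `None` for `HF_TOKEN` so the token stays optional.

```python
# Presumed contract of the helper used above (a sketch, not the actual
# implementation): fall back to the default when the key is unset or empty.
from typing import Mapping, Optional


def get_str_or_none_value(d: Mapping[str, str], key: str, default: Optional[str]) -> Optional[str]:
    value = d.get(key)
    return value if value else default  # an empty string counts as unset
```
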
9 changes: 0 additions & 9 deletions src/datasets_preview_backend/io/cache.py
@@ -43,7 +43,6 @@
     SplitFullName,
     get_dataset_split_full_names,
 )
-from datasets_preview_backend.models.hf_dataset import ask_access
 from datasets_preview_backend.models.split import Split, get_split
 from datasets_preview_backend.utils import orjson_dumps

@@ -359,10 +358,6 @@ def clean_database() -> None:


 def refresh_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split_full_names = get_dataset_split_full_names(dataset_name, hf_token)
         upsert_dataset(dataset_name, split_full_names)
@@ -420,10 +415,6 @@ def refresh_split(
     hf_token: Optional[str] = None,
     max_size_fallback: Optional[int] = None,
 ):
-    if hf_token:
-        # remove the gate (for gated datasets) if a token is passed
-        ask_access(dataset_name, hf_token)
-
     try:
         split = get_split(
             dataset_name, config_name, split_name, hf_token=hf_token, max_size_fallback=max_size_fallback
1 change: 1 addition & 0 deletions src/datasets_preview_backend/models/dataset.py
@@ -18,6 +18,7 @@ class SplitFullName(TypedDict):

 def get_dataset_split_full_names(dataset_name: str, hf_token: Optional[str] = None) -> List[SplitFullName]:
     logger.info(f"get dataset '{dataset_name}' split full names")
+
     try:
         guard_blocked_datasets(dataset_name)
         return [
11 changes: 0 additions & 11 deletions src/datasets_preview_backend/models/hf_dataset.py
@@ -1,7 +1,6 @@
 import logging
 from typing import List, TypedDict, Union

-import requests
 from datasets import list_datasets

 logger = logging.getLogger(__name__)
@@ -32,15 +31,5 @@ def get_hf_datasets() -> List[HFDataset]:
     ]


-def ask_access(dataset_name: str, hf_token: str) -> None:
-    url = f"https://huggingface.co/datasets/{dataset_name}/ask-access"
-    headers = {"Authorization": f"Bearer {hf_token}"}
-    try:
-        requests.get(url, headers=headers)
-    except Exception as err:
-        logger.warning(f"error while asking access to dataset {dataset_name}: {err}")
-    # TODO: check if the access was granted: check if we were redirected to the dataset page, or to the login page
-
-
 def get_hf_dataset_names() -> List[str]:
     return [d["id"] for d in get_hf_datasets()]
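
The removed `ask_access` sent a tokened GET to the `ask-access` endpoint to lift the gate for a user token, and never verified the outcome (see the TODO). With an app token there is no gate to lift; if a sanity check were still wanted, something like the sketch below could serve, assuming the Hub's `/api/datasets/<name>` endpoint returns 200 for a token that can read the repository (an assumption about the endpoint, not code from this commit).

```python
# Illustrative access check (an assumption, not part of this commit): an app
# token can read a gated dataset's metadata directly, so a 200 response is
# enough to confirm access; no ask-access round-trip is required.
import requests


def can_access_gated_dataset(dataset_name: str, hf_token: str) -> bool:
    response = requests.get(
        f"https://huggingface.co/api/datasets/{dataset_name}",
        headers={"Authorization": f"Bearer {hf_token}"},
        timeout=10,
    )
    return response.status_code == 200
```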
