/
_validators.py
231 lines (181 loc) · 9.14 KB
/
_validators.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities to validate argument values in `huggingface_hub`."""
import inspect
import re
import warnings
from functools import wraps
from itertools import chain
from typing import Any, Dict
from ._typing import CallableT
# Compiled pattern describing the accepted shape of a repo id: an optional
# "namespace/" prefix followed by a repo name of 1 to 96 characters taken from
# word characters, "-" and ".". The word boundaries prevent "-" or "." from
# starting or ending either part. The pattern is anchored with `\Z` rather than
# `$` because `$` also matches just before a trailing newline, which would let
# a value such as "foo\n" pass validation.
REPO_ID_REGEX = re.compile(
    r"""
    ^
    (\b[\w\-.]+\b/)? # optional namespace (username or organization)
    \b               # starts with a word boundary
    [\w\-.]{1,96}    # repo_name: alphanumeric + . _ -
    \b               # ends with a word boundary
    \Z               # strict end of string (no trailing newline allowed)
    """,
    flags=re.VERBOSE,
)
class HFValidationError(ValueError):
    """Error raised by `huggingface_hub` validators when an argument value is rejected.

    Subclass of [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError),
    so code that already catches `ValueError` also catches this exception.
    """
def validate_hf_hub_args(fn: CallableT) -> CallableT:
    """Validate values received as argument for any public method of `huggingface_hub`.

    The goal of this decorator is to harmonize validation of arguments reused
    everywhere. By default, all defined validators are tested.

    Validators:
        - [`~utils.validate_repo_id`]: `repo_id` must be `"repo_name"`
          or `"namespace/repo_name"`. Namespace is a username or an organization.
          The same check is applied to arguments named `from_id` and `to_id`.
        - [`~utils.smoothly_deprecate_use_auth_token`]: Use `token` instead of
          `use_auth_token` (only if `use_auth_token` is not expected by the decorated
          function - in practice, always the case in `huggingface_hub`).

    Positional values are matched to parameter names using the signature of the
    decorated function. Only parameters that can be filled positionally are
    considered: values collected by a `*args` parameter are never mistaken for a
    keyword-only argument that happens to be named `repo_id` or `token`.

    Example:
    ```py
    >>> from huggingface_hub.utils import validate_hf_hub_args

    >>> @validate_hf_hub_args
    ... def my_cool_method(repo_id: str):
    ...     print(repo_id)

    >>> my_cool_method(repo_id="valid_repo_id")
    valid_repo_id

    >>> my_cool_method("other..repo..id")
    huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.

    >>> my_cool_method(repo_id="other..repo..id")
    huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.

    >>> @validate_hf_hub_args
    ... def my_cool_auth_method(token: str):
    ...     print(token)

    >>> my_cool_auth_method(token="a token")
    a token

    >>> my_cool_auth_method(use_auth_token="a use_auth_token")
    a use_auth_token

    >>> my_cool_auth_method(token="a token", use_auth_token="a use_auth_token")
    UserWarning: Both `token` and `use_auth_token` are passed (...)
    a token
    ```

    Args:
        fn: The callable to wrap.

    Returns:
        A wrapper with the same metadata as `fn` (via `functools.wraps`) that
        validates its arguments and then calls `fn`.

    Raises:
        [`~utils.HFValidationError`]:
            If an input is not valid.
    """
    # TODO: add an argument to opt-out validation for specific argument?
    signature = inspect.signature(fn)

    # Names that a positional value can bind to. `*args`, keyword-only and
    # `**kwargs` parameters are excluded: zipping positional values against
    # them would attach a value to the wrong name (e.g. the second positional
    # value of `def f(*parts, repo_id=None)` is not a `repo_id`).
    positional_kinds = (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
    positional_names = [name for name, param in signature.parameters.items() if param.kind in positional_kinds]

    # Should the validator switch `use_auth_token` values to `token`? In practice, always
    # True in `huggingface_hub`. Might not be the case in a downstream library.
    check_use_auth_token = "use_auth_token" not in signature.parameters and "token" in signature.parameters

    @wraps(fn)
    def _inner_fn(*args, **kwargs):
        has_token = False
        for arg_name, arg_value in chain(
            zip(positional_names, args),  # Args values
            kwargs.items(),  # Kwargs values
        ):
            if arg_name in ("repo_id", "from_id", "to_id"):
                validate_repo_id(arg_value)
            elif arg_name == "token" and arg_value is not None:
                # Remember that an explicit token was given so that a
                # `use_auth_token` value does not override it.
                has_token = True

        if check_use_auth_token:
            kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)

        return fn(*args, **kwargs)

    return _inner_fn  # type: ignore
def validate_repo_id(repo_id: str) -> None:
    """Check that `repo_id` looks like a valid repo id and raise otherwise.

    This is a local sanity check, not a replacement for the validation done on
    the Hub. Its purpose is to catch obvious inconsistencies early (example:
    passing the `repo_type` as part of the `repo_id` is forbidden).

    Checks, applied in this order:
        - `repo_id` must be a `str`.
        - At most one "/" is allowed: either "repo_name" or "namespace/repo_name".
        - The value must match `REPO_ID_REGEX`: word characters, "-" and ".",
          with a repo name of 1 to 96 characters that neither starts nor ends
          with "-" or ".".
        - "--" and ".." are forbidden.
        - The value cannot end with ".git".

    Valid: `"foo"`, `"foo/bar"`, `"123"`, `"Foo-BAR_foo.bar123"`

    Not valid: `"datasets/foo/bar"`, `".repo_id"`, `"foo--bar"`, `"foo.git"`

    Example:
    ```py
    >>> from huggingface_hub.utils import validate_repo_id
    >>> validate_repo_id(repo_id="valid_repo_id")
    >>> validate_repo_id(repo_id="other..repo..id")
    huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
    ```

    Discussed in https://github.com/huggingface/huggingface_hub/issues/1008.
    In moon-landing (internal repository):
    - https://github.com/huggingface/moon-landing/blob/main/server/lib/Names.ts#L27
    - https://github.com/huggingface/moon-landing/blob/main/server/views/components/NewRepoForm/NewRepoForm.svelte#L138

    Args:
        repo_id: The value to check.

    Raises:
        [`HFValidationError`]: If any of the checks above fails.
    """
    if not isinstance(repo_id, str):
        # Typically, a Path is not a repo_id
        raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_id}'.")

    # Two or more separators usually means a repo type was prepended.
    separator_count = repo_id.count("/")
    if separator_count >= 2:
        raise HFValidationError(
            f"Repo id must be in the form 'repo_name' or 'namespace/repo_name': '{repo_id}'."
            " Use `repo_type` argument if needed."
        )

    if REPO_ID_REGEX.match(repo_id) is None:
        shape_error = (
            "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are"
            " forbidden, '-' and '.' cannot start or end the name, max length is 96:"
            f" '{repo_id}'."
        )
        raise HFValidationError(shape_error)

    # The regex accepts repeated "-" and "." characters, so reject them here.
    if any(sequence in repo_id for sequence in ("--", "..")):
        raise HFValidationError(f"Cannot have -- or .. in repo_id: '{repo_id}'.")

    if repo_id.endswith(".git"):
        raise HFValidationError(f"Repo_id cannot end by '.git': '{repo_id}'.")
def smoothly_deprecate_use_auth_token(fn_name: str, has_token: bool, kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a legacy `use_auth_token` keyword argument into `token`.

    The long-term goal is to remove every mention of `use_auth_token` from the
    `huggingface_hub` codebase in favor of a single, less verbose `token`
    argument. The planned steps are:

    0. Step 0: methods that require read-access to the Hub use the `use_auth_token`
       argument (`str`, `bool` or `None`). Methods requiring write-access have a
       `token` argument (`str`, `None`). This implicit rule exists to be able to not
       send the token when not necessary (`use_auth_token=False`) even if logged in.

    1. Step 1: harmonize everything and use `token` everywhere (supporting
       `token=False` for read-only methods). In order not to break existing code, if
       `use_auth_token` is passed to a function, its value is passed as `token`
       instead, without any warning.
         a. Corner case: if both `use_auth_token` and `token` values are passed, a
            warning is thrown and the `use_auth_token` value is ignored.

    2. Step 2: once released, push downstream libraries to switch from
       `use_auth_token` to `token` as much as possible, but without throwing a
       warning (e.g. manually create issues on the corresponding repos).

    3. Step 3: after a transitional period (6 months e.g. until April 2023?), update
       `huggingface_hub` to throw a warning on `use_auth_token`. Hopefully, very few
       users will be impacted as it would have already been fixed.
       In addition, unit tests in `huggingface_hub` must be adapted to expect
       warnings to be thrown (but still use `use_auth_token` as before).

    4. Step 4: after a normal deprecation cycle (3 releases ?), remove this
       validator. `use_auth_token` will definitely not be supported.
       In addition, unit tests in `huggingface_hub` are updated to use `token`
       everywhere.

    This has been discussed in:
    - https://github.com/huggingface/huggingface_hub/issues/1094.
    - https://github.com/huggingface/huggingface_hub/pull/928
    - (related) https://github.com/huggingface/huggingface_hub/pull/1064

    Args:
        fn_name: Name of the called function, used in the warning message.
        has_token: Whether a non-None `token` value was already provided.
        kwargs: Keyword arguments received by the function. Not modified.

    Returns:
        A new dictionary without the `use_auth_token` key. When `use_auth_token`
        is not None and `has_token` is False, its value is stored under `token`.
    """
    # Build a fresh mapping without the legacy key: the input is never mutated.
    forwarded = {key: value for key, value in kwargs.items() if key != "use_auth_token"}
    legacy_value = kwargs.get("use_auth_token")

    # Nothing to translate when the legacy argument is absent or None.
    if legacy_value is None:
        return forwarded

    if not has_token:
        # `token` argument is not passed and a non-None value is passed in
        # `use_auth_token` => use `use_auth_token` value as `token` kwarg.
        forwarded["token"] = legacy_value
        return forwarded

    # Both were given: `token` wins and the caller is told about it.
    warnings.warn(
        "Both `token` and `use_auth_token` are passed to"
        f" `{fn_name}` with non-None values. `token` is now the"
        " preferred argument to pass a User Access Token."
        " `use_auth_token` value will be ignored."
    )
    return forwarded