Skip to content

Commit

Permalink
Fix tokenizer load from one file (#19073)
Browse files Browse the repository at this point in the history
* Fix tokenizer load from one file

* Add a test

* Style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
  • Loading branch information
sgugger and LysandreJik committed Sep 16, 2022
1 parent 1504b53 commit af20bbb
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1726,6 +1726,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
for file_id, file_path in vocab_files.items():
if file_path is None:
resolved_vocab_files[file_id] = None
elif os.path.isfile(file_path):
resolved_vocab_files[file_id] = file_path
elif is_remote_url(file_path):
resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
else:
Expand Down
11 changes: 11 additions & 0 deletions tests/test_tokenization_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union

from huggingface_hub import HfFolder, delete_repo, set_access_token
from huggingface_hub.file_download import http_get
from parameterized import parameterized
from requests.exceptions import HTTPError
from transformers import (
Expand Down Expand Up @@ -3886,6 +3887,16 @@ def test_cached_files_are_used_when_internet_is_down(self):
# This checks that we did call the fake head request
mock_head.assert_called()

def test_legacy_load_from_one_file(self):
    """Check that a tokenizer can still be loaded from a single local vocab file.

    Downloads the ALBERT sentencepiece model to a local file and passes that
    file path directly to `AlbertTokenizer.from_pretrained`. The test passes
    if loading does not raise.
    """
    # A temporary directory (rather than `tempfile.mktemp`, which is deprecated
    # and race-prone) guarantees a private location and is cleaned up on exit
    # even if the download or the load fails, without masking the original
    # error with a secondary failure from a manual `os.remove`.
    with tempfile.TemporaryDirectory() as tmp_dir:
        vocab_path = os.path.join(tmp_dir, "spiece.model")
        with open(vocab_path, "wb") as f:
            http_get("https://huggingface.co/albert-base-v1/resolve/main/spiece.model", f)

        AlbertTokenizer.from_pretrained(vocab_path)


@is_staging_test
class TokenizerPushToHubTester(unittest.TestCase):
Expand Down

0 comments on commit af20bbb

Please sign in to comment.