From f89f16a51ebd08c4b9728b87d63e1054df79d641 Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 13 Sep 2022 13:11:24 -0400
Subject: [PATCH] Re-add support for single url files in objects download
 (#19014)

---
 src/transformers/configuration_utils.py      |  7 +++-
 src/transformers/feature_extraction_utils.py |  5 +++
 src/transformers/modeling_flax_utils.py      |  5 +++
 src/transformers/modeling_tf_utils.py        |  5 +++
 src/transformers/modeling_utils.py           |  5 +++
 src/transformers/tokenization_utils_base.py  |  5 +++
 src/transformers/utils/__init__.py           |  2 ++
 src/transformers/utils/hub.py                | 35 +++++++++++++++++++-
 8 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 41503255ac2ad..db8147b4dee34 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -32,7 +32,9 @@
     PushToHubMixin,
     cached_file,
     copy_func,
+    download_url,
     extract_commit_hash,
+    is_remote_url,
     is_torch_available,
     logging,
 )
@@ -592,9 +594,12 @@ def _get_config_dict(
         is_local = os.path.isdir(pretrained_model_name_or_path)
         if os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)):
-            # Soecial case when pretrained_model_name_or_path is a local file
+            # Special case when pretrained_model_name_or_path is a local file
             resolved_config_file = pretrained_model_name_or_path
             is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            configuration_file = pretrained_model_name_or_path
+            resolved_config_file = download_url(pretrained_model_name_or_path)
         else:
             configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME)
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index 394d67a8c5a1a..85c751b841073 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -31,8 +31,10 @@
     TensorType,
     cached_file,
     copy_func,
+    download_url,
     is_flax_available,
     is_offline_mode,
+    is_remote_url,
     is_tf_available,
     is_torch_available,
     logging,
@@ -386,6 +388,9 @@ def get_feature_extractor_dict(
         if os.path.isfile(pretrained_model_name_or_path):
             resolved_feature_extractor_file = pretrained_model_name_or_path
             is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            feature_extractor_file = pretrained_model_name_or_path
+            resolved_feature_extractor_file = download_url(pretrained_model_name_or_path)
         else:
             feature_extractor_file = FEATURE_EXTRACTOR_NAME
             try:
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index b19f3db77e190..92d307e8cd7e2 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -47,8 +47,10 @@
     add_start_docstrings_to_model_forward,
     cached_file,
     copy_func,
+    download_url,
     has_file,
     is_offline_mode,
+    is_remote_url,
     logging,
     replace_return_docstrings,
 )
@@ -677,6 +679,9 @@ def from_pretrained(
         elif os.path.isfile(pretrained_model_name_or_path):
             archive_file = pretrained_model_name_or_path
             is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            archive_file = pretrained_model_name_or_path
+            resolved_archive_file = download_url(pretrained_model_name_or_path)
         else:
             filename = WEIGHTS_NAME if from_pt else FLAX_WEIGHTS_NAME
             try:
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 2c1febd43c8d2..160a68c9209dd 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -54,9 +54,11 @@
     ModelOutput,
     PushToHubMixin,
     cached_file,
+    download_url,
     find_labels,
     has_file,
     is_offline_mode,
+    is_remote_url,
     logging,
     requires_backends,
     working_or_temp_dir,
@@ -2345,6 +2347,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         elif os.path.isfile(pretrained_model_name_or_path + ".index"):
             archive_file = pretrained_model_name_or_path + ".index"
             is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            archive_file = pretrained_model_name_or_path
+            resolved_archive_file = download_url(pretrained_model_name_or_path)
         else:
             # set correct filename
             filename = WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 68fe7f94d2c7f..2f305ff8dd098 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -59,10 +59,12 @@
     PushToHubMixin,
     cached_file,
     copy_func,
+    download_url,
     has_file,
     is_accelerate_available,
     is_bitsandbytes_available,
     is_offline_mode,
+    is_remote_url,
     logging,
     replace_return_docstrings,
 )
@@ -1998,6 +2000,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 )
             archive_file = os.path.join(subfolder, pretrained_model_name_or_path + ".index")
             is_local = True
+        elif is_remote_url(pretrained_model_name_or_path):
+            archive_file = pretrained_model_name_or_path
+            resolved_archive_file = download_url(pretrained_model_name_or_path)
         else:
             # set correct filename
             if from_tf:
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 0b01163c0b15e..5062a7bfb9999 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -42,9 +42,11 @@
     add_end_docstrings,
     cached_file,
     copy_func,
+    download_url,
     extract_commit_hash,
     is_flax_available,
     is_offline_mode,
+    is_remote_url,
     is_tf_available,
     is_tokenizers_available,
     is_torch_available,
@@ -1680,6 +1682,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
                 FutureWarning,
             )
             file_id = list(cls.vocab_files_names.keys())[0]
+            vocab_files[file_id] = pretrained_model_name_or_path

         else:
             # At this point pretrained_model_name_or_path is either a directory or a model identifier name
@@ -1723,6 +1726,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
         for file_id, file_path in vocab_files.items():
             if file_path is None:
                 resolved_vocab_files[file_id] = None
+            elif is_remote_url(file_path):
+                resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies)
             else:
                 resolved_vocab_files[file_id] = cached_file(
                     pretrained_model_name_or_path,
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 2a2a4c4125749..44c3e1807860f 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -63,6 +63,7 @@
     cached_file,
     default_cache_path,
     define_sagemaker_information,
+    download_url,
     extract_commit_hash,
     get_cached_models,
     get_file_from_repo,
@@ -70,6 +71,7 @@
     has_file,
     http_user_agent,
     is_offline_mode,
+    is_remote_url,
     move_cache,
     send_example_telemetry,
 )
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 3e5863e4eff54..8bdf360b029cd 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -19,10 +19,12 @@
 import re
 import shutil
 import sys
+import tempfile
 import traceback
 import warnings
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
+from urllib.parse import urlparse
 from uuid import uuid4

 import huggingface_hub
@@ -37,7 +39,7 @@
     whoami,
 )
 from huggingface_hub.constants import HUGGINGFACE_HEADER_X_LINKED_ETAG, HUGGINGFACE_HEADER_X_REPO_COMMIT
-from huggingface_hub.file_download import REGEX_COMMIT_HASH
+from huggingface_hub.file_download import REGEX_COMMIT_HASH, http_get
 from huggingface_hub.utils import (
     EntryNotFoundError,
     LocalEntryNotFoundError,
@@ -124,6 +126,11 @@ def is_offline_mode():
 _CACHED_NO_EXIST = object()


+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ("http", "https")
+
+
 def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]:
     """
     Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape `(model_url,
@@ -541,6 +548,32 @@ def get_file_from_repo(
     )


+def download_url(url, proxies=None):
+    """
+    Downloads a given url in a temporary file. This function is not safe to use in multiple processes. Its only use is
+    for deprecated behavior allowing to download config/models with a single url instead of using the Hub.
+
+    Args:
+        url (`str`): The url of the file to download.
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
+
+    Returns:
+        `str`: The location of the temporary file where the url was downloaded.
+    """
+    warnings.warn(
+        f"Using `from_pretrained` with the url of a file (here {url}) is deprecated and won't be possible anymore in"
+        " v5 of Transformers. You should host your file on the Hub (hf.co) instead and use the repository ID. Note"
+        " that this is not compatible with the caching system (your file will be downloaded at each execution) or"
+        " multiple processes (each process will download the file in a different temporary file)."
+    )
+    tmp_file = tempfile.mktemp()
+    with open(tmp_file, "wb") as f:
+        http_get(url, f, proxies=proxies)
+    return tmp_file
+
+
 def has_file(
     path_or_repo: Union[str, os.PathLike],
     filename: str,
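
Usage note (not part of the patch): a minimal sketch of the behavior this change restores, namely passing a direct http(s) URL to a single file to `from_pretrained` instead of a Hub repository ID. The URL below is a hypothetical placeholder; any reachable `config.json` would do, and the call emits the deprecation warning added above because the file is re-downloaded to a fresh temporary path on every run instead of going through the Hub cache.

    from transformers import BertConfig
    from transformers.utils import download_url, is_remote_url

    url = "https://example.com/my-model/config.json"  # hypothetical placeholder URL

    # from_pretrained now routes http(s) inputs through is_remote_url/download_url
    # (deprecated path, scheduled for removal in v5 of Transformers).
    config = BertConfig.from_pretrained(url)

    # The helpers exposed by this patch can also be called directly:
    assert is_remote_url(url)
    tmp_path = download_url(url)  # downloads to a new temporary file, no caching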