From a4e9b19d0d6de20bf052faa9bc1c411359c89580 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Fri, 8 Apr 2022 17:02:10 -0300 Subject: [PATCH 1/9] Auto track binary files --- src/huggingface_hub/repository.py | 60 ++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 3ad5593b7c..0c987a527d 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -226,6 +226,26 @@ def is_git_ignored(filename: Union[str, Path]) -> bool: return is_ignored +def is_binary_file(filename: Union[str, Path]) -> bool: + """ + Check if file is a binary file. + + Args: + filename (`str` or `Path`): + The filename to check. + + Returns: + `bool`: `True` if the file passed is a binary file, `False` + otherwise. + """ + try: + with open(filename) as f: + f.read() + return False + except UnicodeDecodeError: + return True + + def files_to_be_staged(pattern: str, folder: Union[str, Path]) -> List[str]: """ Returns a list of filenames that are to be staged. @@ -981,6 +1001,41 @@ def lfs_enable_largefiles(self): except subprocess.CalledProcessError as exc: raise EnvironmentError(exc.stderr) + def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]: + """ + Automatically track binary files with git-lfs. + + Args: + pattern (`str`, *optional*, defaults to "."): + The pattern with which to track files that are above 10MBs. + + Returns: + `List[str]`: List of filenames that are now tracked due to being binary files + """ + files_to_be_tracked_with_lfs = [] + + deleted_files = self.list_deleted_files() + + for filename in files_to_be_staged(pattern, folder=self.local_dir): + if filename in deleted_files: + continue + + path_to_file = os.path.join(os.getcwd(), self.local_dir, filename) + is_binary = is_binary_file(path_to_file) + + if ( + is_binary + and not is_tracked_with_lfs(path_to_file) + and not is_git_ignored(path_to_file) + ): + self.lfs_track(filename) + files_to_be_tracked_with_lfs.append(filename) + + # Cleanup the .gitattributes if files were deleted + self.lfs_untrack(deleted_files) + + return files_to_be_tracked_with_lfs + def auto_track_large_files(self, pattern: Optional[str] = ".") -> List[str]: """ Automatically track large files (files that weigh more than 10MBs) with @@ -1094,7 +1149,10 @@ def git_add( file over 10MB in size will be automatically tracked. """ if auto_lfs_track: - tracked_files = self.auto_track_large_files(pattern) + tracked_files = [ + *self.auto_track_large_files(pattern), + *self.auto_track_binary_files(pattern) + ] if tracked_files: logger.warning( f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large." From 35b480cec5b399b1ba6542499a80dda1393b83e9 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Fri, 8 Apr 2022 17:22:24 -0300 Subject: [PATCH 2/9] Add null character check --- src/huggingface_hub/repository.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 0c987a527d..6fc7af7d2d 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -240,8 +240,10 @@ def is_binary_file(filename: Union[str, Path]) -> bool: """ try: with open(filename) as f: - f.read() - return False + content = f.read() + + # Check for the presence of the null character in the string + return '\x00' in content except UnicodeDecodeError: return True From 57f86135ddc2f2e8addddf5cae35c5268eaa50c0 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Fri, 8 Apr 2022 17:39:51 -0300 Subject: [PATCH 3/9] Tests --- src/huggingface_hub/hf_api.py | 20 ++++--- src/huggingface_hub/repository.py | 25 ++++----- tests/test_repository.py | 89 +++++++++++++++++++++++++++++-- 3 files changed, 110 insertions(+), 24 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index f8eefb677b..f14c18acb2 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -569,19 +569,23 @@ def _validate_or_retrieve_token( function_name: Optional[str] = None, ): """ - Retrieves and validates stored token or validates passed token. Args: + Retrieves and validates stored token or validates passed token. token (``str``, `optional`): - Hugging Face token. Will default to the locally saved token if not provided. + Hugging Face token. Will default to the locally saved token if + not provided. name (``str``, `optional`): - Name of the repository. This is deprecated in favor of repo_id and will be removed in v0.7. + Name of the repository. This is deprecated in favor of repo_id + and will be removed in v0.7. function_name (``str``, `optional`): - If _validate_or_retrieve_token is called from a function, name of that function to be passed inside deprecation warning. + If _validate_or_retrieve_token is called from a function, name + of that function to be passed inside deprecation warning. Returns: Validated token and the name of the repository. Raises: - :class:`EnvironmentError`: If the token is not passed and there's no token saved locally. - :class:`ValueError`: If organization token or invalid token is passed. + :class:`EnvironmentError`: If the token is not passed and there's no + token saved locally. :class:`ValueError`: If organization token or + invalid token is passed. """ if token is None or token is True: token = HfFolder.get_token() @@ -1847,8 +1851,8 @@ def get_token(cls) -> Optional[str]: """ Get token or None if not existent. - Note that a token can be also provided using the `HUGGING_FACE_HUB_TOKEN` - environment variable. + Note that a token can be also provided using the + `HUGGING_FACE_HUB_TOKEN` environment variable. Returns: `str` or `None`: The token, `None` if it doesn't exist. diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 6fc7af7d2d..e26c413e5e 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -235,15 +235,14 @@ def is_binary_file(filename: Union[str, Path]) -> bool: The filename to check. Returns: - `bool`: `True` if the file passed is a binary file, `False` - otherwise. + `bool`: `True` if the file passed is a binary file, `False` otherwise. """ try: with open(filename) as f: content = f.read() # Check for the presence of the null character in the string - return '\x00' in content + return "\x00" in content except UnicodeDecodeError: return True @@ -507,8 +506,8 @@ def __init__( skip_lfs_files (`bool`, *optional*, defaults to `False`): whether to skip git-LFS files or not. client (`HfApi`, *optional*): - Instance of HfApi to use when calling the HF Hub API. - A new instance will be created if this is left to `None`. + Instance of HfApi to use when calling the HF Hub API. A new + instance will be created if this is left to `None`. """ os.makedirs(local_dir, exist_ok=True) @@ -1012,7 +1011,8 @@ def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]: The pattern with which to track files that are above 10MBs. Returns: - `List[str]`: List of filenames that are now tracked due to being binary files + `List[str]`: List of filenames that are now tracked due to being + binary files """ files_to_be_tracked_with_lfs = [] @@ -1026,9 +1026,9 @@ def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]: is_binary = is_binary_file(path_to_file) if ( - is_binary - and not is_tracked_with_lfs(path_to_file) - and not is_git_ignored(path_to_file) + is_binary + and not is_tracked_with_lfs(path_to_file) + and not is_git_ignored(path_to_file) ): self.lfs_track(filename) files_to_be_tracked_with_lfs.append(filename) @@ -1147,13 +1147,14 @@ def git_add( pattern (`str`, *optional*, defaults to "."): The pattern with which to add files to staging. auto_lfs_track (`bool`, *optional*, defaults to `False`): - Whether to automatically track large files with git-lfs. Any - file over 10MB in size will be automatically tracked. + Whether to automatically track large and binaryfiles with + git-lfs. Any file over 10MB in size, or in binary format, will + be automatically tracked. """ if auto_lfs_track: tracked_files = [ *self.auto_track_large_files(pattern), - *self.auto_track_binary_files(pattern) + *self.auto_track_binary_files(pattern), ] if tracked_files: logger.warning( diff --git a/tests/test_repository.py b/tests/test_repository.py index 895dd8fc86..56a666d2b7 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -1075,12 +1075,18 @@ def test_is_tracked_with_lfs_with_pattern(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + os.makedirs(f"{WORKING_REPO_DIR}/dir", exist_ok=True) with open(f"{WORKING_REPO_DIR}/dir/large_file.txt", "w+") as f: @@ -1089,7 +1095,11 @@ def test_is_tracked_with_lfs_with_pattern(self): with open(f"{WORKING_REPO_DIR}/dir/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) + with open(f"{WORKING_REPO_DIR}/dir/binary_file.txt", "w+") as f: + f.write(binary_file) + repo.auto_track_large_files("dir") + repo.auto_track_binary_files("dir") self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt")) @@ -1097,12 +1107,18 @@ def test_is_tracked_with_lfs_with_pattern(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + self.assertFalse( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) + ) self.assertTrue( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/large_file.txt")) ) self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/small_file.txt")) ) + self.assertTrue( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/binary_file.txt")) + ) def test_auto_track_large_files(self): repo = Repository(WORKING_REPO_DIR) @@ -1113,13 +1129,20 @@ def test_auto_track_large_files(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + repo.auto_track_large_files() + repo.auto_track_binary_files() self.assertTrue( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt")) @@ -1127,6 +1150,9 @@ def test_auto_track_large_files(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + self.assertTrue( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) + ) def test_auto_track_large_files_ignored_with_gitignore(self): repo = Repository(WORKING_REPO_DIR) @@ -1134,14 +1160,17 @@ def test_auto_track_large_files_ignored_with_gitignore(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + # Test nested gitignores os.makedirs(f"{WORKING_REPO_DIR}/directory") with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f: - f.write("large_file.txt") + f.write("large_file.txt\nbinary_file.txt") with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f: - f.write("large_file_3.txt") + f.write("large_file_3.txt\nbinary_file_3.txt") with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) @@ -1157,6 +1186,21 @@ def test_auto_track_large_files_ignored_with_gitignore(self): repo.auto_track_large_files() + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + + with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f: + f.write(binary_file) + + with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f: + f.write(binary_file) + + with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f: + f.write(binary_file) + + repo.auto_track_binary_files() + + # Large files self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt")) ) @@ -1175,7 +1219,26 @@ def test_auto_track_large_files_ignored_with_gitignore(self): ) ) - def test_auto_track_large_files_through_git_add(self): + # Binary files + self.assertFalse( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) + ) + self.assertTrue( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file_2.txt")) + ) + + self.assertFalse( + is_tracked_with_lfs( + os.path.join(WORKING_REPO_DIR, "directory/binary_file_3.txt") + ) + ) + self.assertTrue( + is_tracked_with_lfs( + os.path.join(WORKING_REPO_DIR, "directory/binary_file_4.txt") + ) + ) + + def test_auto_track_files_through_git_add(self): repo = Repository(WORKING_REPO_DIR) # This content is 5MB (under 10MB) @@ -1184,12 +1247,18 @@ def test_auto_track_large_files_through_git_add(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + repo.git_add(auto_lfs_track=True) self.assertTrue( @@ -1198,8 +1267,11 @@ def test_auto_track_large_files_through_git_add(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + self.assertTrue( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) + ) - def test_auto_no_track_large_files_through_git_add(self): + def test_auto_no_track_files_through_git_add(self): repo = Repository(WORKING_REPO_DIR) # This content is 5MB (under 10MB) @@ -1208,12 +1280,18 @@ def test_auto_no_track_large_files_through_git_add(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + repo.git_add(auto_lfs_track=False) self.assertFalse( @@ -1222,6 +1300,9 @@ def test_auto_no_track_large_files_through_git_add(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + self.assertFalse( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) + ) def test_auto_track_updates_removed_gitattributes(self): repo = Repository(WORKING_REPO_DIR) From b8bbe4f60a340cf10e12ca48e56337cd836c04c2 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 12 Apr 2022 13:32:55 -0400 Subject: [PATCH 4/9] Apply suggestions from code review Co-authored-by: Omar Sanseviero --- src/huggingface_hub/repository.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index e26c413e5e..4aaacc3d7a 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -1008,7 +1008,7 @@ def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]: Args: pattern (`str`, *optional*, defaults to "."): - The pattern with which to track files that are above 10MBs. + The pattern with which to track files that are binary. Returns: `List[str]`: List of filenames that are now tracked due to being @@ -1147,7 +1147,7 @@ def git_add( pattern (`str`, *optional*, defaults to "."): The pattern with which to add files to staging. auto_lfs_track (`bool`, *optional*, defaults to `False`): - Whether to automatically track large and binaryfiles with + Whether to automatically track large and binary files with git-lfs. Any file over 10MB in size, or in binary format, will be automatically tracked. """ From 17a1331454925be14156a87fcf95c85ea1f17b35 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 12 Apr 2022 14:32:41 -0300 Subject: [PATCH 5/9] Only read 512 bytes --- src/huggingface_hub/repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 4aaacc3d7a..6e95fdb283 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -239,7 +239,7 @@ def is_binary_file(filename: Union[str, Path]) -> bool: """ try: with open(filename) as f: - content = f.read() + content = f.read(512) # Check for the presence of the null character in the string return "\x00" in content From 5f48d6c15c882488ca74a0224fd603f3d6275cf0 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 12 Apr 2022 19:09:10 -0400 Subject: [PATCH 6/9] Address Eliott's comments --- src/huggingface_hub/repository.py | 35 +++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 6e95fdb283..28254dfcf1 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -239,7 +239,7 @@ def is_binary_file(filename: Union[str, Path]) -> bool: """ try: with open(filename) as f: - content = f.read(512) + content = f.read() # Check for the presence of the null character in the string return "\x00" in content @@ -1023,15 +1023,22 @@ def auto_track_binary_files(self, pattern: Optional[str] = ".") -> List[str]: continue path_to_file = os.path.join(os.getcwd(), self.local_dir, filename) - is_binary = is_binary_file(path_to_file) - if ( - is_binary - and not is_tracked_with_lfs(path_to_file) - and not is_git_ignored(path_to_file) - ): - self.lfs_track(filename) - files_to_be_tracked_with_lfs.append(filename) + if not (is_tracked_with_lfs(path_to_file) or is_git_ignored(path_to_file)): + size_in_mb = os.path.getsize(path_to_file) / (1024 * 1024) + + if size_in_mb >= 10: + logger.warning( + "Parsing a large file to check if binary or not. Tracking large " + "files using `repository.auto_track_large_files` is recommended " + "so as to not load the full file in memory." + ) + + is_binary = is_binary_file(path_to_file) + + if is_binary: + self.lfs_track(filename) + files_to_be_tracked_with_lfs.append(filename) # Cleanup the .gitattributes if files were deleted self.lfs_untrack(deleted_files) @@ -1152,10 +1159,12 @@ def git_add( be automatically tracked. """ if auto_lfs_track: - tracked_files = [ - *self.auto_track_large_files(pattern), - *self.auto_track_binary_files(pattern), - ] + # Track files according to their size (>=10MB) + tracked_files = self.auto_track_large_files(pattern) + + # Read the remaining files and track them if they're binary + tracked_files.extend(self.auto_track_binary_files(pattern)) + if tracked_files: logger.warning( f"Adding files tracked by Git LFS: {tracked_files}. This may take a bit of time if the files are large." From 6d27a15cb3b8f9d6b6b87075701192569055136f Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Tue, 12 Apr 2022 19:13:25 -0400 Subject: [PATCH 7/9] Split test --- src/huggingface_hub/hf_api.py | 20 ++--- tests/test_repository.py | 152 +++++++++++++++++++++------------- 2 files changed, 104 insertions(+), 68 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index f14c18acb2..f8eefb677b 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -569,23 +569,19 @@ def _validate_or_retrieve_token( function_name: Optional[str] = None, ): """ - Args: Retrieves and validates stored token or validates passed token. + Args: token (``str``, `optional`): - Hugging Face token. Will default to the locally saved token if - not provided. + Hugging Face token. Will default to the locally saved token if not provided. name (``str``, `optional`): - Name of the repository. This is deprecated in favor of repo_id - and will be removed in v0.7. + Name of the repository. This is deprecated in favor of repo_id and will be removed in v0.7. function_name (``str``, `optional`): - If _validate_or_retrieve_token is called from a function, name - of that function to be passed inside deprecation warning. + If _validate_or_retrieve_token is called from a function, name of that function to be passed inside deprecation warning. Returns: Validated token and the name of the repository. Raises: - :class:`EnvironmentError`: If the token is not passed and there's no - token saved locally. :class:`ValueError`: If organization token or - invalid token is passed. + :class:`EnvironmentError`: If the token is not passed and there's no token saved locally. + :class:`ValueError`: If organization token or invalid token is passed. """ if token is None or token is True: token = HfFolder.get_token() @@ -1851,8 +1847,8 @@ def get_token(cls) -> Optional[str]: """ Get token or None if not existent. - Note that a token can be also provided using the - `HUGGING_FACE_HUB_TOKEN` environment variable. + Note that a token can be also provided using the `HUGGING_FACE_HUB_TOKEN` + environment variable. Returns: `str` or `None`: The token, `None` if it doesn't exist. diff --git a/tests/test_repository.py b/tests/test_repository.py index 56a666d2b7..de5d00eee1 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -1075,18 +1075,12 @@ def test_is_tracked_with_lfs_with_pattern(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) - # This content is binary (contains the null character) - binary_file = "\x00\x00\x00\x00" - with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) - with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: - f.write(binary_file) - os.makedirs(f"{WORKING_REPO_DIR}/dir", exist_ok=True) with open(f"{WORKING_REPO_DIR}/dir/large_file.txt", "w+") as f: @@ -1095,11 +1089,7 @@ def test_is_tracked_with_lfs_with_pattern(self): with open(f"{WORKING_REPO_DIR}/dir/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) - with open(f"{WORKING_REPO_DIR}/dir/binary_file.txt", "w+") as f: - f.write(binary_file) - repo.auto_track_large_files("dir") - repo.auto_track_binary_files("dir") self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt")) @@ -1107,18 +1097,12 @@ def test_is_tracked_with_lfs_with_pattern(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) - self.assertFalse( - is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) - ) self.assertTrue( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/large_file.txt")) ) self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/small_file.txt")) ) - self.assertTrue( - is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "dir/binary_file.txt")) - ) def test_auto_track_large_files(self): repo = Repository(WORKING_REPO_DIR) @@ -1129,20 +1113,13 @@ def test_auto_track_large_files(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) - # This content is binary (contains the null character) - binary_file = "\x00\x00\x00\x00" - with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) - with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: - f.write(binary_file) - repo.auto_track_large_files() - repo.auto_track_binary_files() self.assertTrue( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt")) @@ -1150,6 +1127,27 @@ def test_auto_track_large_files(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + + def test_auto_track_binary_files(self): + repo = Repository(WORKING_REPO_DIR) + + # This content is non-binary + non_binary_file = [100] * int(1e6) + + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + + with open(f"{WORKING_REPO_DIR}/non_binary_file.txt", "w+") as f: + f.write(json.dumps(non_binary_file)) + + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + + repo.auto_track_binary_files() + + self.assertFalse( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary)file.txt")) + ) self.assertTrue( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) ) @@ -1160,17 +1158,14 @@ def test_auto_track_large_files_ignored_with_gitignore(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) - # This content is binary (contains the null character) - binary_file = "\x00\x00\x00\x00" - # Test nested gitignores os.makedirs(f"{WORKING_REPO_DIR}/directory") with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f: - f.write("large_file.txt\nbinary_file.txt") + f.write("large_file.txt") with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f: - f.write("large_file_3.txt\nbinary_file_3.txt") + f.write("large_file_3.txt") with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) @@ -1186,20 +1181,6 @@ def test_auto_track_large_files_ignored_with_gitignore(self): repo.auto_track_large_files() - with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: - f.write(binary_file) - - with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f: - f.write(binary_file) - - with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f: - f.write(binary_file) - - with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f: - f.write(binary_file) - - repo.auto_track_binary_files() - # Large files self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "large_file.txt")) @@ -1219,6 +1200,35 @@ def test_auto_track_large_files_ignored_with_gitignore(self): ) ) + def test_auto_track_binary_files_ignored_with_gitignore(self): + repo = Repository(WORKING_REPO_DIR) + + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + + # Test nested gitignores + os.makedirs(f"{WORKING_REPO_DIR}/directory") + + with open(f"{WORKING_REPO_DIR}/.gitignore", "w+") as f: + f.write("binary_file.txt") + + with open(f"{WORKING_REPO_DIR}/directory/.gitignore", "w+") as f: + f.write("binary_file_3.txt") + + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + + with open(f"{WORKING_REPO_DIR}/binary_file_2.txt", "w+") as f: + f.write(binary_file) + + with open(f"{WORKING_REPO_DIR}/directory/binary_file_3.txt", "w+") as f: + f.write(binary_file) + + with open(f"{WORKING_REPO_DIR}/directory/binary_file_4.txt", "w+") as f: + f.write(binary_file) + + repo.auto_track_binary_files() + # Binary files self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) @@ -1238,7 +1248,7 @@ def test_auto_track_large_files_ignored_with_gitignore(self): ) ) - def test_auto_track_files_through_git_add(self): + def test_auto_track_large_files_through_git_add(self): repo = Repository(WORKING_REPO_DIR) # This content is 5MB (under 10MB) @@ -1247,18 +1257,12 @@ def test_auto_track_files_through_git_add(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) - # This content is binary (contains the null character) - binary_file = "\x00\x00\x00\x00" - with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) - with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: - f.write(binary_file) - repo.git_add(auto_lfs_track=True) self.assertTrue( @@ -1267,11 +1271,32 @@ def test_auto_track_files_through_git_add(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + + def test_auto_track_binary_files_through_git_add(self): + repo = Repository(WORKING_REPO_DIR) + + # This content is non binary + non_binary_file = [100] * int(1e6) + + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + + with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: + f.write(json.dumps(non_binary_file)) + + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + + repo.git_add(auto_lfs_track=True) + + self.assertFalse( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt")) + ) self.assertTrue( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) ) - def test_auto_no_track_files_through_git_add(self): + def test_auto_no_track_large_files_through_git_add(self): repo = Repository(WORKING_REPO_DIR) # This content is 5MB (under 10MB) @@ -1280,18 +1305,12 @@ def test_auto_no_track_files_through_git_add(self): # This content is 20MB (over 10MB) large_file = [100] * int(4e6) - # This content is binary (contains the null character) - binary_file = "\x00\x00\x00\x00" - with open(f"{WORKING_REPO_DIR}/large_file.txt", "w+") as f: f.write(json.dumps(large_file)) with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: f.write(json.dumps(small_file)) - with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: - f.write(binary_file) - repo.git_add(auto_lfs_track=False) self.assertFalse( @@ -1300,6 +1319,27 @@ def test_auto_no_track_files_through_git_add(self): self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "small_file.txt")) ) + + def test_auto_no_track_binary_files_through_git_add(self): + repo = Repository(WORKING_REPO_DIR) + + # This content is non-binary + non_binary_file = [100] * int(1e6) + + # This content is binary (contains the null character) + binary_file = "\x00\x00\x00\x00" + + with open(f"{WORKING_REPO_DIR}/small_file.txt", "w+") as f: + f.write(json.dumps(non_binary_file)) + + with open(f"{WORKING_REPO_DIR}/binary_file.txt", "w+") as f: + f.write(binary_file) + + repo.git_add(auto_lfs_track=False) + + self.assertFalse( + is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "non_binary_file.txt")) + ) self.assertFalse( is_tracked_with_lfs(os.path.join(WORKING_REPO_DIR, "binary_file.txt")) ) From 9fe6bb6f303d0b642bfed753866bf5aab1378376 Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 18 Apr 2022 15:59:58 -0400 Subject: [PATCH 8/9] Address review comments --- src/huggingface_hub/repository.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 28254dfcf1..95988fa375 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -238,11 +238,15 @@ def is_binary_file(filename: Union[str, Path]) -> bool: `bool`: `True` if the file passed is a binary file, `False` otherwise. """ try: - with open(filename) as f: + with open(filename, "rb") as f: content = f.read() - # Check for the presence of the null character in the string - return "\x00" in content + # Code sample taken from the following stack overflow thread + # https://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python/7392391#7392391 + text_chars = bytearray( + {7, 8, 9, 10, 12, 13, 27} | set(range(0x20, 0x100)) - {0x7F} + ) + return bool(content.translate(None, text_chars)) except UnicodeDecodeError: return True From cbfdce5d3c5af3cfefc493e2111c63e382cb2f3d Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Thu, 21 Apr 2022 09:27:08 -0400 Subject: [PATCH 9/9] 10MB maximum --- src/huggingface_hub/repository.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_hub/repository.py b/src/huggingface_hub/repository.py index 95988fa375..44432d791e 100644 --- a/src/huggingface_hub/repository.py +++ b/src/huggingface_hub/repository.py @@ -239,7 +239,7 @@ def is_binary_file(filename: Union[str, Path]) -> bool: """ try: with open(filename, "rb") as f: - content = f.read() + content = f.read(10 * (1024**2)) # Read a maximum of 10MB # Code sample taken from the following stack overflow thread # https://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python/7392391#7392391