In [None]:
#| default_exp helper.files_and_folders

In [None]:
#| export
import bz2
import errno
import glob
import gzip
import lzma
import os
from os import PathLike
from pathlib import Path
import platform
import shutil
import tarfile
from typing import Optional
import zipfile

from deprecated import deprecated
from natsort import natsorted

if os.name == 'nt':
    # `winreg` only exists on Windows; importing it unconditionally makes
    # this module unimportable on Linux/macOS.
    import winreg


In [None]:
from fastcore.test import *
from unittest import mock

## Files and folders

#### File existence

In [None]:
#| export
def existing_path(
        path: PathLike,  # A file or directory path. Either absolute or relative to `relative_to`.
        relative_to: Optional[PathLike] = None  # Path to the directory that `path` is relative to.  If `None`, then `path` must be an absolute path.
        ) -> Path: # The existing path formed by `relative_to` adjoined with `path` (or `path` itself if `relative_to` is `None`).
    """Return `path`, optionally joined onto `relative_to`, after verifying
    that it exists.

    **Raises**

    - `FileNotFoundError`
        - If `relative_to` is not `None` but does not exist, or if
        the resulting path does not exist.
    - `ValueError`
        - If `relative_to` is not `None` and yet not an absolute path, or
        if `relative_to` is `None` and yet `path` is not an absolute path.
    
    **Notes**
    - On Windows, if the path is not found as-is, the string `'\\\\?\\'`
    (which identifies very long paths) is put in front and the existence
    check is repeated on the prefixed path.
    """
    if relative_to is None:
        # Without an anchor directory, `path` itself must be absolute.
        if not os.path.isabs(path):
            raise ValueError(
                f'The parameter `path` is expected to be an absolute path,'
                f' but it is not: {path}')
        candidate = path
    else:
        if not os.path.isabs(relative_to):
            raise ValueError(
                f'The parameter `relative_to` is expected to be an'
                f' absolute path, but it is not: {relative_to}')
        if not os.path.exists(relative_to):
            raise FileNotFoundError(
                errno.ENOENT, os.strerror(errno.ENOENT), relative_to)
        candidate = Path(relative_to) / path

    # The platform is only consulted when the plain path was not found.
    retry_with_long_prefix = (
        not os.path.exists(candidate) and platform.system() == 'Windows')
    if retry_with_long_prefix:
        candidate = f'\\\\?\\{str(candidate)}'  # For long file names

    if os.path.exists(candidate):
        return Path(candidate)
    raise FileNotFoundError(
        errno.ENOENT, os.strerror(errno.ENOENT), candidate)


@deprecated(reason="The function has been renamed to `existing_path`")
def file_existence_test(
        path: PathLike,  # A file or directory path. Either absolute or relative to `relative_to`.
        relative_to: Optional[PathLike] = None  # Path to the directory that `path` is relative to.  If `None`, then `path` must be an absolute path.
        ) -> Path: # The existing path formed by `relative_to` adjoined with `path` (or `path` itself if `relative_to` is `None`).
    """
    **Deprecated. Use `existing_path` instead.**
    
    Returns a path relative to a specified path as an absolute path
    that exists.

    **Raises**
    - `FileNotFoundError`
        - If `relative_to` is not `None` but does not exist, or if
        the resulting path does not exist.
    - `ValueError`
        - If `relative_to` is not `None` and yet not an absolute path, or
        if `relative_to` is `None` and yet `path` is not an absolute path.
    
    **Notes**
    - This function may add the string `'\\\\?\\'` in front, which identifies
    very long paths.
    """
    # Delegate to the renamed function so that the deprecated alias and its
    # replacement cannot drift apart.
    return existing_path(path, relative_to)

In the following example, the `existing_path` method returns an existing absolute path $p_2 \backslash p_1$ which is equivalent to a specified path $p_1$ relative to an existing absolute path $p_2$. Note that all paths and `os` methods are mocked:

In [None]:
# Every path "exists" and "is absolute" while the mocks are active.
with (mock.patch('os.path.exists', return_value=True),
      mock.patch('os.path.isabs', return_value=True)):
    expected_path = Path('mock_existing_absolute_path') / 'mock_existing_relative_path'
    path_1 = existing_path('mock_existing_relative_path', 'mock_existing_absolute_path')
    test_eq(expected_path, path_1)


If the desired path is very long in Windows, then the prefix `\\?\` may be prepended to the absolute path so that Python can actually find the path, cf. https://stackoverflow.com/questions/36219317/pathname-too-long-to-open:

In [None]:
# Simulate Windows with a path that is only found once the long-path prefix
# `\\?\` has been put in front of it.
long_path_prefix = '\\\\?\\'
with (mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs,
      mock.patch('platform.system') as mock_system):
    mock_is_abs.return_value = True
    mock_system.return_value = 'Windows'
    # Only prefixed paths "exist", so the un-prefixed lookup fails first.
    mock_path_exists.side_effect = (
        lambda path_to_check: str(path_to_check).startswith(long_path_prefix))
    long_path = existing_path('mock_very_long_absolute_path')
    test_eq(long_path, Path(long_path_prefix + 'mock_very_long_absolute_path'))


If the parameter `relative_to`, which is supposed to be an absolute path, is not `None` and not absolute, then a `ValueError` is raised:

In [None]:
# Case 1: `path` is an ordinary relative path and `relative_to` is not absolute.
with (ExceptionExpected(ex=ValueError, regex='`relative_to` is expected'),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    mock_is_abs.return_value = False
    path = 'mock_relative_path_that_is_not_None'
    relative_to = 'mock_non_absolute_path'
    # Pass the variables defined above so that this case is what is actually tested.
    existing_path(path, relative_to)


# Case 2: it does not matter what `path` is - as long as `relative_to` is not
# `None` and not absolute, the ValueError is raised before `path` is used.
with (ExceptionExpected(ex=ValueError, regex='`relative_to` is expected'),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    mock_is_abs.return_value = False
    path = None
    relative_to = 'mock_non_absolute_path'
    existing_path(path, relative_to)

In [None]:
# |hide 
# Sanity check: a mock keeps returning the same `return_value` on every call
# until that attribute is reassigned.
with mock.patch('os.path.isabs') as mock_is_abs:
    mock_is_abs.return_value = False
    for _ in range(3):
        print(mock_is_abs())

False
False
False


If the parameter `relative_to` is `None` and the parameter `path` is not absolute, then a `ValueError` is raised:

In [None]:
# No path is considered absolute while the mock is active.
with (ExceptionExpected(ex=ValueError, regex='absolute path'),
      mock.patch('os.path.isabs', return_value=False)):
    path = 'mock_non_absolute_path'
    relative_to = None
    existing_path(path, relative_to)

If `relative_to` does not exist or if `path` does not exist, then a `FileNotFoundError` is raised:

In [None]:
# Case 1: both `relative_to` and `path` are specified, and `relative_to`
# is an absolute path that does not exist.
with (ExceptionExpected(ex=FileNotFoundError),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    relative_to = 'mock_non_existent_absolute_path'
    path = 'mock_some_relative_path'

    # Everything except `relative_to` exists; only `relative_to` is absolute.
    mock_path_exists.side_effect = lambda path_to_check: path_to_check is not relative_to
    mock_is_abs.side_effect = lambda path_to_check: path_to_check is relative_to
    existing_path(path, relative_to)


# Case 2: both `relative_to` and `path` are specified; `relative_to` exists
# but the path obtained by joining `path` onto it does not.
with (ExceptionExpected(ex=FileNotFoundError),
      mock.patch('os.path.exists') as mock_path_exists,
      mock.patch('os.path.isabs') as mock_is_abs):
    relative_to = 'mock_existent_absolute_path'
    path = 'mock_non_existent_relative_path'

    # Only `relative_to` exists and only `relative_to` is absolute.
    mock_path_exists.side_effect = lambda path_to_check: path_to_check is relative_to
    mock_is_abs.side_effect = lambda path_to_check: path_to_check is relative_to
    existing_path(path, relative_to)


#### Paths without extensions

In [None]:
#| export
def path_name_no_ext(
        path: PathLike # The path of the file or directory. This may be absolute or relative to any directory.
        ) -> str: # The name of the file or directory without the extension.
    """Return the final component of `path` with its last extension removed.

    Only the last extension is stripped, and the file or directory does
    not have to exist.
    """
    _parent, final_component = os.path.split(path)
    stem, _extension = os.path.splitext(final_component)
    return stem

Basic usage:

In [None]:
# `path` is reused by the next example cell.
path = Path('hypothetical_directory')
subdirectory_path = path / 'hypothetical_subdirectory'
nested_file_path = path / 'hypotehtical_subdirectory' / 'hypothetical_file.md'
test_eq(path_name_no_ext(subdirectory_path), 'hypothetical_subdirectory')
test_eq(path_name_no_ext(nested_file_path), 'hypothetical_file')

The path does not have to exist.

In [None]:
# Nothing is created on disk; the function only manipulates the path string.
nonexistent_folder = path / 'this_folder_does_not_exist'
test_eq(path_name_no_ext(nonexistent_folder), 'this_folder_does_not_exist')

On paths to files with "multiple extensions", the function returns the file name without the last extension only.

In [None]:
# Only the trailing `.tar` is removed.
multi_extension_name = 'archived_file_somewhere.7z.zip.tar'
test_eq(path_name_no_ext(multi_extension_name),  'archived_file_somewhere.7z.zip')

In [None]:
#| export
def path_no_ext(
    path: PathLike # The path of the file or directory. This may be absolute or relative to any directory.
    ) -> str: # The path of the file or directory without the extension. If `path` is a path to a directory, then the output should be essentially the same as `path`.
    """Return `path` as a string with its last extension removed.

    Leading directories are kept as they are, and the file or directory
    does not have to exist.
    """
    path_as_text = str(path)
    root, _extension = os.path.splitext(path_as_text)
    return root

Basic usage - the path does not have to exist:

In [None]:
# (input path, expected path without extension)
path_no_ext_cases = [
    ('C:\\hi', 'C:\\hi'),
    ('greetings\\file.txt', 'greetings\\file'),
]
for given_path, expected_path in path_no_ext_cases:
    assert path_no_ext(given_path) == expected_path

#### Read text from file

In [None]:
#| export
def text_from_file(
        path: PathLike, # The absolute path of the file.
        encoding: str = 'utf8' # The encoding of the file to be read. Defaults to `'utf8'`.
        ) -> str: # The entire text from a file
    """Return the entire text from a file.

    Assumes that the file can be decoded with the specified `encoding`.
    Errors raised by `open` or by decoding (for example
    `FileNotFoundError` or `UnicodeDecodeError`) are not caught.
    """
    # The `with` statement closes the file, so no explicit `close()` is needed.
    with open(path, 'r', encoding=encoding) as file:
        return file.read()

The `text_from_file` method is a quick method to extract the text from a file.

In [None]:
# TODO: examples/tests

#### File extension

In [None]:
#| export
def files_of_format_sorted(
        directory: PathLike, # The directory in which to find the files
        extension: str = 'txt' # Extension of the files to find, without the leading dot. Defaults to 'txt'.
        ) -> list[str]: # The paths matching `<directory>/*.<extension>`, in natural sort order.
    """Return a list of path str of files in the directory (but not subdirectories)
    sorted via `natsort`.

    The directory is matched literally: characters that `glob` would
    otherwise treat as wildcards (`*`, `?`, `[`) are escaped, so directories
    such as `data[1]` or Windows long paths starting with `\\\\?\\` work.
    """
    # Escape only the directory part; the `*` before the extension must stay
    # a wildcard.
    literal_directory = glob.escape(str(Path(directory)))
    pattern = os.path.join(literal_directory, f'*.{extension}')
    return natsorted(glob.glob(pattern))

In the following example, we mock a folder with numbered files. `files_of_format_sorted` returns them in the "natural" order.

In [None]:
with mock.patch('glob.glob') as mock_glob:
    mock_directory = Path('some_directory')
    # The mocked glob results are deliberately in descending order,
    # i.e. not in "natural" order.
    glob_results = [mock_directory / f'{i}.txt' for i in reversed(range(1, 11))]
    mock_glob.return_value = glob_results

    # First confirm that the glob results are handed to `natsorted`.
    with mock.patch(__name__ + '.natsorted') as mock_natsorted:
        files_of_format_sorted(mock_directory)
        mock_natsorted.assert_called_with(glob_results)

    # Then, with the real `natsorted`, show the files in "natural" order.
    mock_files = files_of_format_sorted(mock_directory)
    print(mock_files)
    test_shuffled(glob_results, mock_files)

[WindowsPath('some_directory/1.txt'), WindowsPath('some_directory/2.txt'), WindowsPath('some_directory/3.txt'), WindowsPath('some_directory/4.txt'), WindowsPath('some_directory/5.txt'), WindowsPath('some_directory/6.txt'), WindowsPath('some_directory/7.txt'), WindowsPath('some_directory/8.txt'), WindowsPath('some_directory/9.txt'), WindowsPath('some_directory/10.txt')]


## Compressed files

In [None]:
#| export
def file_is_compressed(
        filename: str  # The name or path of the file. The file does not have to exist.
        ) -> bool:  # `True` if the last extension is a known compressed/archive extension.
    """Return whether a file name has a common compressed-file extension.

    Only the last extension is inspected and the comparison is
    case-insensitive. The content of the file is never read.
    """
    # `os.path.splitext` only yields the last suffix, so compound suffixes
    # such as `.tar.gz` are covered by their final part (`.gz`).
    compressed_extensions = {
        '.zip',
        '.gz',
        '.tar',
        '.tgz',
        '.tbz',
        '.tbz2',
        '.txz',
        '.bz2',
        '.xz',
        '.7z',
        '.rar',
        '.z',
    }

    _, file_extension = os.path.splitext(filename)
    return file_extension.lower() in compressed_extensions

In [None]:
# Names whose last extension marks a compressed file or archive.
for compressed_name in ['asdf.tar', 'asdf.tar.gz', './hi/asdf.tar.gz']:
    assert file_is_compressed(compressed_name)

# Names without an extension or with an unrelated one.
for uncompressed_name in ['./hi/bye', './hi/bye.pdf']:
    assert not file_is_compressed(uncompressed_name)

In [None]:
#| export
def uncompress_file(
        file_path: PathLike,  # Path to the compressed file or archive.
        verbose: bool = False  # If `True`, print what was done, or why nothing was done.
        ) -> list[str]:  # Paths of the uncompressed files. Empty if the format is unsupported or an error occurred.
    """Uncompress a file or archive into the directory that contains it.

    The format is chosen from the (case-insensitive) file name:

    - `.zip` archives are extracted with `zipfile`.
    - tar archives (`.tar`, `.tar.gz`, `.tgz`, `.tar.bz2`, `.tbz`, `.tbz2`,
      `.tar.xz`, `.txz`) are extracted with `tarfile`.
    - single files compressed as `.gz`, `.bz2` or `.xz` are decompressed to
      the same path without that extension.

    This function is best-effort: any exception raised while uncompressing
    is caught, reported only if `verbose` is `True`, and an empty list (or
    the list built so far) is returned.
    """
    file_path = str(file_path)
    output_dir = os.path.dirname(file_path)
    lowered_path = file_path.lower()
    _, raw_extension = os.path.splitext(file_path)
    file_extension = raw_extension.lower()

    # `os.path.splitext` only yields the last suffix, so compound suffixes
    # such as `.tar.gz` have to be matched against the whole file name;
    # otherwise a `.tar.gz` archive would merely be gunzipped to a `.tar`.
    is_tar_archive = (
        file_extension in ('.tar', '.tgz', '.tbz', '.tbz2', '.txz')
        or lowered_path.endswith(('.tar.gz', '.tar.bz2', '.tar.xz')))
    single_file_openers = {
        '.gz': gzip.open,
        '.bz2': bz2.open,
        '.xz': lzma.open,
    }
    uncompressed_files = []

    try:
        # NOTE(review): `extractall` trusts the member names stored in the
        # archive; do not use this function on archives from untrusted sources.
        if file_extension == '.zip':
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(output_dir)
                uncompressed_files = [
                    os.path.join(output_dir, name) for name in zip_ref.namelist()]
            if verbose:
                print(f"Uncompressed {file_path} into {output_dir}")

        elif is_tar_archive:
            # Mode 'r:*' lets `tarfile` detect the compression transparently.
            with tarfile.open(file_path, 'r:*') as tar_ref:
                tar_ref.extractall(output_dir)
                uncompressed_files = [
                    os.path.join(output_dir, name) for name in tar_ref.getnames()]
            if verbose:
                print(f"Uncompressed {file_path} into {output_dir}")

        elif file_extension in single_file_openers:
            # Drop the compression extension to obtain the output path.
            output_file_path = file_path[:-len(file_extension)]
            open_compressed = single_file_openers[file_extension]
            with open_compressed(file_path, 'rb') as compressed_file, \
                    open(output_file_path, 'wb') as out_file:
                # Stream in chunks instead of loading the whole payload into memory.
                shutil.copyfileobj(compressed_file, out_file)
            uncompressed_files.append(output_file_path)
            if verbose:
                print(f"Uncompressed {file_path} into {output_file_path}")

        else:
            if verbose:
                print(f"Unsupported file format: {raw_extension}")

    except Exception as e:
        # Deliberately best-effort: report the failure instead of raising.
        if verbose:
            print(f"An error occurred while uncompressing {file_path}: {e}")

    return uncompressed_files

# import os
# import zipfile
# import tarfile
# import gzip
# import bz2
# import lzma
# from pathlib import Path

# def uncompress_file(
#         file_path: PathLike,
#         verbose: bool = False
#         ):
#     file_path = Path(file_path)
#     output_dir = file_path.parent
#     uncompressed_files = []

#     try:
#         if file_path.suffix == '.zip':
#             with zipfile.ZipFile(file_path, 'r') as zip_ref:
#                 zip_ref.extractall(output_dir)
#                 uncompressed_files = [output_dir / name for name in zip_ref.namelist()]

#         elif file_path.suffix in ['.tar', '.gz', '.tgz', '.bz2', '.tbz']:
#             if file_path.suffix == '.gz' and not tarfile.is_tarfile(file_path):
#                 # Handle single gzipped file
#                 with gzip.open(file_path, 'rb') as gz_file:
#                     content = gz_file.read()
#                     # Try to get the original filename from the gzip header
#                     original_name = gz_file.name
#                     if original_name:
#                         output_file = output_dir / Path(original_name).name
#                     else:
#                         output_file = output_dir / file_path.stem
#                     output_file.write_bytes(content)
#                 uncompressed_files.append(output_file)
#             else:
#                 # Handle tar archives (including .tar.gz, .tgz, .tar.bz2, .tbz)
#                 with tarfile.open(file_path, 'r:*') as tar_ref:
#                     tar_ref.extractall(output_dir)
#                     uncompressed_files = [output_dir / name for name in tar_ref.getnames()]

#         elif file_path.suffix == '.bz2':
#             with bz2.open(file_path, 'rb') as bz2_file:
#                 content = bz2_file.read()
#                 output_file = output_dir / file_path.stem
#                 output_file.write_bytes(content)
#             uncompressed_files.append(output_file)

#         elif file_path.suffix == '.xz':
#             with lzma.open(file_path, 'rb') as xz_file:
#                 content = xz_file.read()
#                 output_file = output_dir / file_path.stem
#                 output_file.write_bytes(content)
#             uncompressed_files.append(output_file)

#         else:
#             if verbose:
#                 print(f"Unsupported file format: {file_path.suffix}")

#         if verbose:
#             print(f"Uncompressed {file_path} into {output_dir}")

#     except Exception as e:
#         if verbose:
#             print(f"An error occurred while uncompressing {file_path}: {e}")

#     return uncompressed_files


In [None]:


# TODO : test

# Example usage
# file_to_uncompress = './1605.08386v1.Heat_bath_random_walks_with_Markov_bases.tar'  # Replace with your file name
# hi = uncompress_file(file_to_uncompress)

In [None]:
# hi

## Downloads folder

In [None]:
#| export
def get_download_path() -> str:
    """
    Return the user's download folder.

    On Windows (`os.name == 'nt'`) the location is read from the current
    user's "Shell Folders" registry key. On every other system it is the
    `Downloads` directory inside the user's home directory; that directory
    is not checked for existence.
    """
    if os.name != 'nt':  # Unix-based systems (Linux, macOS)
        home_directory = os.path.expanduser('~')
        return os.path.join(home_directory, 'Downloads')

    # Windows: look up the Downloads folder by its GUID in the registry.
    shell_folders_key = r'SOFTWARE\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders'
    downloads_guid = '{374DE290-123F-4565-9164-39C4925E467B}'
    with winreg.OpenKey(winreg.HKEY_CURRENT_USER, shell_folders_key) as key:
        location, _value_type = winreg.QueryValueEx(key, downloads_guid)
    return location

## HuggingFace cache

In [None]:
#| export
def get_huggingface_cache_dir() -> Path:
    """Return the directory of the HuggingFace hub cache.

    The location is resolved in the following order; the directory is not
    checked for existence:

    1. `$HF_HUB_CACHE`, if set.
    2. `$HF_HOME/hub`, if `HF_HOME` is set. `HF_HOME` already is the
       HuggingFace directory, so no extra `huggingface` component is added.
    3. `$XDG_CACHE_HOME/huggingface/hub`, if `XDG_CACHE_HOME` is set.
    4. `~/.cache/huggingface/hub` otherwise.

    Environment variables that are set to an empty string are treated as
    unset.
    """
    hub_cache = os.environ.get("HF_HUB_CACHE")
    if hub_cache:
        return Path(hub_cache).expanduser()

    hf_home = os.environ.get("HF_HOME")
    if hf_home:
        return Path(hf_home).expanduser() / "hub"

    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
    if xdg_cache_home:
        cache_root = Path(xdg_cache_home).expanduser()
    else:
        cache_root = Path.home() / ".cache"
    return cache_root / "huggingface" / "hub"

In [None]:
# get_huggingface_cache_dir()