Skip to content

Commit

Permalink
add docs and release script; remove validate parameter; add aliases p…
Browse files Browse the repository at this point in the history
…roperty to CompressionFileFormat
  • Loading branch information
jdidion committed Nov 11, 2016
1 parent b214ba4 commit 7c0ce90
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 22 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,3 @@ dist/
*.pyc
xphyle.egg-info/
docs/_build/
release.sh
22 changes: 20 additions & 2 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
xphyle: extraordinarily simple file handling
===================================
============================================

.. image:: logo.png
:height: 200px
Expand Down Expand Up @@ -68,13 +68,31 @@ The following are functionally equivalent ways to open a gzip file::
f = gzip.open('input.gz', 'rt')
from xphyle import xopen
f = xopen('input.gz', 'tr')
f = xopen('input.gz', 'rt')

So then why use xphyle? Two reasons:

1. The ``gzip.open`` method of opening a gzip file above requires you to know that you are expecting a gzip file and only a gzip file. If your program optionally accepts either a compressed or an uncompressed file, then you'll need several extra lines of code to either detect the file format or to make the user specify the format of the file they are providing. This becomes increasingly cumbersome with each additional format you want to support. On the other hand, ``xopen`` has the same interface regardless of the compression format. Furthermore, if xphyle doesn't currently support a file format that you would like to use, it enables you to add it via a simple API.
2. The ``gzip.open`` method of opening a gzip file uses Python code to uncompress the file. It's well-written, highly optimized Python code, but unfortunately it's still slower than natively compiled system-level applications (e.g. pigz or gzip). The ``xopen`` method of opening a gzip file first tries to use pigz or gzip to uncompress the file and provides access to the resulting stream of uncompressed data (as a file-like object), and only falls back to ``gzip.open`` if neither program is available.

If you want to be explicit about whether to expect a compressed file, what type of compression to expect, or whether to try and use system programs, you can::
# Expect the file to not be compressed
f = xopen('input', 'rb', compression=False)
# Open a remote file. Expect the file to be compressed, and throw an error
# if it's not, or if the compression format cannot be determined.
f = xopen('http://foo.com/input.gz', 'rt', compression=True)
# Open stdin. Expect the input to be gzip compressed, and throw an error if
# it's not
f = xopen(STDIN, 'rt', compression='gzip')
# Do not try to use the system-level gzip program for decompression
f = xopen('input.gz', 'rt', compression='gzip', use_system=False)



Other useful tools
------------------

Expand Down
10 changes: 10 additions & 0 deletions release.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Release script: tag the repository, install, run the test suite, build the
# distributions, then register and upload the source tarball.
# Usage: release.sh <version>
#
# Abort on the first failing command or unset variable, so that a failed
# install/test/build step can never be followed by an upload.
set -eu
# Fail early with a usage message when no version argument is given.
version=${1:?usage: release.sh <version>}
# tag
git tag "$version"
# build
python setup.py install
nose2 -C tests --coverage-report term-missing --coverage-config .coveragerc
python setup.py sdist bdist_wheel
# release
twine register "dist/xphyle-${version}.tar.gz"
twine upload "dist/xphyle-${version}.tar.gz"
8 changes: 8 additions & 0 deletions tests/testformats.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ def read_file(fmt, path, ext, use_system, mode='rt'):
xz_path = get_format('xz').executable_path

class CompressionTests(TestCase):
def test_list_formats(self):
    """Check the listed compression formats and the aliases of gzip."""
    expected_formats = {'gzip', 'bz2', 'lzma'}
    listed_formats = set(list_compression_formats())
    self.assertSetEqual(expected_formats, listed_formats)

    expected_gzip_aliases = {'gzip', 'gz', 'pigz'}
    gzip_aliases = set(get_format('gzip').aliases)
    self.assertSetEqual(expected_gzip_aliases, gzip_aliases)

def test_guess_format(self):
self.assertEqual('gzip', guess_compression_format('gz'))
self.assertEqual('gzip', guess_compression_format('.gz'))
Expand Down
6 changes: 2 additions & 4 deletions xphyle/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def open_(f, mode : 'str' = 'r', errors : 'bool' = True, **kwargs):

def xopen(path : 'str', mode : 'str' = 'r', compression : 'bool|str' = None,
use_system : 'bool' = True, context_wrapper : 'bool' = True,
validate : 'bool' = True, **kwargs) -> 'file':
**kwargs) -> 'file':
"""
Replacement for the `open` function that automatically handles
compressed files. If `use_system==True` and the file is compressed,
Expand Down Expand Up @@ -133,8 +133,6 @@ def xopen(path : 'str', mode : 'str' = 'r', compression : 'bool|str' = None,
context_wrapper: If True and ``path`` == '-' or '_', returns
a ContextManager (i.e. usable with ``with``) that wraps the
system stream and is no-op on close.
validate: Whether to validate that a file is actually of the format
specified by ``compression``.
kwargs: Additional keyword arguments to pass to ``open``.
Returns:
Expand Down Expand Up @@ -168,7 +166,7 @@ def xopen(path : 'str', mode : 'str' = 'r', compression : 'bool|str' = None,
# Whether to try and guess file format
guess_format = compression in (None, True)
# Whether to validate that the actual compression format matches the expected one
validate = validate and compression and not guess_format
validate = compression and not guess_format
# Guessed compression type, if compression in (None, True)
guess = None
# Whether the file object is a stream (e.g. stdout or URL)
Expand Down
46 changes: 31 additions & 15 deletions xphyle/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ def __exit__(self, *exc_info):
compression_formats = {}
"""Dict of registered compression formats"""

compression_format_aliases = {}
"""Dict mapping aliases to compression format names."""

magic_bytes = {}
"""Dict mapping the first byte in a 'magic' sequence to a tuple of
(format, rest_of_sequence)
Expand All @@ -204,40 +207,39 @@ def register_compression_format(format_class : 'class'):
``format_class`` -- a subclass of CompressionFormat
"""
fmt = format_class()
aliases = set(fmt.exts)
#if isinstance(fmt.system_commands, dict):
# aliases = aliases | set(fmt.system_commands.values())
#else:
aliases.update(fmt.system_commands)
aliases.add(fmt.name)
for alias in aliases:
compression_formats[fmt.name] = fmt
for alias in fmt.aliases:
# TODO: warn about overriding existing format?
compression_formats[alias] = fmt
compression_format_aliases[alias] = fmt.name
for magic in fmt.magic_bytes:
global max_magic_bytes
max_magic_bytes = max(max_magic_bytes, len(magic))
magic_bytes[magic[0]] = (fmt.name, magic[1:])
for mime in fmt.mime_types:
mime_types[mime] = fmt.name

def list_compression_formats() -> 'tuple':
    """Returns the keys of the ``compression_formats`` registry as a tuple.
    """
    registered = tuple(compression_formats)
    return registered

def get_compression_format(name : 'str') -> 'CompressionFormat':
"""Returns the CompressionFormat associated with the given name, or raises
ValueError if that format is not supported.
"""
if name in compression_formats:
if name in compression_format_aliases:
name = compression_format_aliases[name]
return compression_formats[name]
raise ValueError("Unsupported compression format: {}".format(name))

def guess_compression_format(name : 'str') -> 'str':
"""Guess the compression format by name or file extension.
"""
if name in compression_formats:
return compression_formats[name].name
if name in compression_format_aliases:
return compression_format_aliases[name]
i = name.rfind(os.extsep)
if i >= 0:
ext = name[(i+1):]
if ext in compression_formats:
return compression_formats[ext].name
if ext in compression_format_aliases:
return compression_format_aliases[ext]
return None

def guess_format_from_file_header(path : 'str') -> 'str':
Expand Down Expand Up @@ -295,8 +297,22 @@ class CompressionFormat(FileFormat):
"""Base class for classes that provide access to system-level and
python-level implementations of compression formats.
"""
@property
def aliases(self) -> 'tuple':
    """The distinct names by which this format is known: its file
    extensions, its system commands, and its own name. The order of the
    returned tuple is not defined.
    """
    names = set()
    # NOTE: if ``system_commands`` is a dict, ``update`` adds its keys only.
    for group in (self.exts, self.system_commands, (self.name,)):
        names.update(group)
    return tuple(names)

@property
def default_ext(self) -> 'str':
    """The first entry of ``self.exts``, used as the default file
    extension for this format.
    """
    extensions = self.exts
    return extensions[0]

def _get_compresslevel(self, level=None):
Expand All @@ -310,14 +326,14 @@ def _get_compresslevel(self, level=None):

@property
def can_use_system_compression(self) -> 'bool':
"""Returns True if at least one command in ``self.system_commands``
"""Whether at least one command in ``self.system_commands``
resolves to an existing, executable file.
"""
return self.compress_path is not None

@property
def can_use_system_uncompression(self) -> 'bool':
"""Returns True if at least one command in ``self.system_commands``
"""Whether at least one command in ``self.system_commands``
resolves to an existing, executable file.
"""
return self.uncompress_path is not None
Expand Down

0 comments on commit 7c0ce90

Please sign in to comment.