From 04e0b93669a0c40e8c623edf851e5c37662b224f Mon Sep 17 00:00:00 2001
From: GaelVaroquaux
Date: Tue, 3 Jan 2012 09:58:57 +0100
Subject: [PATCH] DOC: improve doc/docstrings

---
 joblib/__init__.py     |  15 +--
 joblib/numpy_pickle.py | 203 +++++++++++++++++++++++------------------
 2 files changed, 125 insertions(+), 93 deletions(-)

diff --git a/joblib/__init__.py b/joblib/__init__.py
index 3f8dbfc85..0fcc4c70d 100644
--- a/joblib/__init__.py
+++ b/joblib/__init__.py
@@ -38,14 +38,15 @@
   issue is error-prone and often leads to unreproducible results

 * **Persist to disk transparently**: persisting in an efficient way
-  arbitrary objects containing large data is hard. In addition,
-  hand-written persistence does not link easily the file on disk to the
-  execution context of the original Python object. As a result, it is
-  challenging to resume a application status or computational job, eg
+  arbitrary objects containing large data is hard. Using
+  joblib's caching mechanism avoids hand-written persistence and
+  implicitly links the file on disk to the execution context of
+  the original Python object. As a result, joblib's persistence is
+  well suited to resuming an application's status or a computational job, e.g.
   after a crash.

-It strives to address these problems while **leaving your code and your
-flow control as unmodified as possible** (no framework, no new
+Joblib strives to address these problems while **leaving your code and
+your flow control as unmodified as possible** (no framework, no new
 paradigms).

 Main features
@@ -91,6 +92,8 @@
    display streams, and provide a way of compiling a report.
    We want to be able to quickly inspect what has been run.

+4) **Fast compressed persistence**: a replacement for pickle to work
+   efficiently on Python objects containing large data.

 ..
     >>> import shutil ; shutil.rmtree('/tmp/joblib/')

diff --git a/joblib/numpy_pickle.py b/joblib/numpy_pickle.py
index ef17cdbfb..d245f0abd 100644
--- a/joblib/numpy_pickle.py
+++ b/joblib/numpy_pickle.py
@@ -1,5 +1,5 @@
 """
-A pickler to save numpy arrays in separate .npy files.
+Utilities for fast persistence of big data, with optional compression.
 """

 # Author: Gael Varoquaux
@@ -39,6 +39,9 @@ def asbytes(s):
 # Compressed file with Zlib

 def _read_magic(file_handle):
+    """Utility to check the magic signature of a file, identifying it
+    as a Zfile.
+    """
     magic = file_handle.read(len(_ZFILE_PREFIX))
     # Pickling needs file-handles at the beginning of the file
     file_handle.seek(0)
@@ -46,14 +49,20 @@


 def read_zfile(file_handle):
-    "Read the z-file and return the content as a string"
-    # Uncompress the file in a buffer
+    """Read the z-file and return the content as a string.
+
+    Z-files are raw data compressed with zlib, used internally by joblib
+    for persistence. Backward compatibility is not guaranteed. Do not
+    use for external purposes.
+    """
     file_handle.seek(0)
     assert _read_magic(file_handle) == _ZFILE_PREFIX, \
         "File does not have the right magic"
     length = file_handle.read(len(_ZFILE_PREFIX) + _MAX_LEN)
     length = length[len(_ZFILE_PREFIX):]
     length = int(length, 16)
+    # We use the known length of the data to tell Zlib the size of the
+    # buffer to allocate.
     data = zlib.decompress(file_handle.read(), 15, length)
     assert len(data) == length, (
         "Incorrect data length while decompressing %s."
@@ -62,13 +71,17 @@


 def write_zfile(file_handle, data, compress=1):
+    """Write the data in the given file as a Z-file.
+
+    Z-files are raw data compressed with zlib, used internally by joblib
+    for persistence. Backward compatibility is not guaranteed. Do not
+    use for external purposes.
+    """
     file_handle.write(_ZFILE_PREFIX)
-    length = len(data)
-    if sys.version_info[0] < 3 and type(length) is long:
-        # We need to remove the trailing 'L'
-        length = hex(length)[:-1]
-    else:
-        length = hex(length)
+    length = hex(len(data))
+    if sys.version_info[0] < 3 and length.endswith('L'):
+        # We need to remove the trailing 'L' in the hex representation
+        length = length[:-1]
     # Store the length of the data
     file_handle.write(length.ljust(_MAX_LEN))
     file_handle.write(zlib.compress(data, compress))
@@ -80,8 +93,8 @@
 class NDArrayWrapper(object):
     """ An object to be persisted instead of numpy arrays.

-    The only thing this object does, is store the filename in wich
-    the array has been persisted.
+    The only thing this object does is to carry the filename in which
+    the array has been persisted, and the array subclass.
     """
     def __init__(self, filename, subclass):
         "Store the useful information for later"
@@ -110,17 +123,19 @@


 class ZNDArrayWrapper(NDArrayWrapper):
-    """ An object to be persisted instead of numpy arrays.
-
-    This object store the Zfile filename in wich
-    the data array has been persisted, and the meta information to
-    retrieve it.
-
-    The reason that we use this, rather than standard writing or
-    representation routine (tostring) is that it is uses completely
-    the strided model to avoid memory copies (a and a.T store as
-    fast), and saves the heavy information separately. This may be
-    important when unpickling data with large arrays.
+    """An object to be persisted instead of numpy arrays.
+
+    This object stores the Zfile filename in which
+    the data array has been persisted, and the meta information to
+    retrieve it.
+
+    The reason that we store the raw buffer data of the array and
+    the meta information, rather than an array representation routine
+    (tostring), is that it enables us to make full use of the strided
+    model to avoid memory copies (a and a.T store as fast). In
+    addition, saving the heavy information separately can avoid
+    creating large temporary buffers when unpickling data with
+    large arrays.
     """
     def __init__(self, filename, init_args, state):
         "Store the useful information for later"
@@ -130,6 +145,8 @@ def __init__(self, filename, init_args, state):
         "Store the useful information for later"
         self.filename = filename

     def read(self, unpickler):
         "Reconstruct the array from the meta-information and the z-file"
+        # Here we are simply reproducing the unpickling mechanism for numpy
+        # arrays
         filename = os.path.join(unpickler._dirname, self.filename)
         array = unpickler.np.core.multiarray._reconstruct(*self.init_args)
         data = read_zfile(open(filename, 'rb'))
@@ -142,8 +159,15 @@

 # Pickler classes

 class NumpyPickler(pickle.Pickler):
-    """ A pickler subclass that extracts ndarrays and saves them in .npy
-    files outside of the pickle.
+    """A pickler to persist big data efficiently.
+
+    The main features of this object are:
+
+    * persistence of numpy arrays in separate .npy files, for which
+      I/O is fast.
+
+    * optional compression using Zlib, with special care taken to avoid
+      temporaries.
     """

     def __init__(self, filename, compress=0, cache_size=100):
@@ -223,7 +247,7 @@ def close(self):


 class NumpyUnpickler(Unpickler):
-    """ A subclass of the Unpickler to unpickle our numpy pickles.
+    """A subclass of the Unpickler to unpickle our numpy pickles.
""" dispatch = Unpickler.dispatch.copy() @@ -250,7 +274,7 @@ def load_build(self): We capture it to replace our place-holder objects, NDArrayWrapper, by the array we are interested in. We - replace directly in the stack of pickler. + replace them directly in the stack of pickler. """ Unpickler.load_build(self) if isinstance(self.stack[-1], NDArrayWrapper): @@ -266,8 +290,8 @@ def load_build(self): class ZipNumpyUnpickler(NumpyUnpickler): - """ A subclass of our Unpickler to unpickle on the fly from zips. - """ + """A subclass of our Unpickler to unpickle on the fly from + compressed storage.""" def __init__(self, filename): NumpyUnpickler.__init__(self, filename, @@ -282,36 +306,42 @@ def _open_pickle(self): # Utility functions def dump(value, filename, compress=0, cache_size=100): - """ Persist an arbitrary Python object into a filename, with numpy arrays - saved as separate .npy files. - - Parameters - ----------- - value: any Python object - The object to store to disk - filename: string - The name of the file in which it is to be stored - compress: boolean, optional - Whether to compress the data on the disk or not - cache_size: positive number, optional - Fixes the order of magnitude (in megabytes) of the cache used - for in-memory compression. Note that this is just an order of - magnitude estimate and that for big arrays, the code will go - over this value at dump and at load time. - - Returns - ------- - filenames: list of strings - The list of file names in which the data is stored. If - compress is false, each array is stored in a different file. - - See Also - -------- - joblib.load : corresponding loader - - Notes - ----- - compressed files take extra extra memory during dump and load. + """Fast persistence of an arbitrary Python object into a files, with + dedicated storage for numpy arrays. + + Parameters + ----------- + value: any Python object + The object to store to disk + filename: string + The name of the file in which it is to be stored + compress: integer for 0 to 9, optional + Optional compression level for the data. 0 is no compression. + Higher means more compression, but also slower read and + write times. Using a value of 3 is often a good compromise. + See the notes for more details. + cache_size: positive number, optional + Fixes the order of magnitude (in megabytes) of the cache used + for in-memory compression. Note that this is just an order of + magnitude estimate and that for big arrays, the code will go + over this value at dump and at load time. + + Returns + ------- + filenames: list of strings + The list of file names in which the data is stored. If + compress is false, each array is stored in a different file. + + See Also + -------- + joblib.load : corresponding loader + + Notes + ----- + Memmapping on load cannot be used for compressed files. Thus + using compression can significantly slow down loading. In + addition, compressed files take extra extra memory during + dump and load. """ try: pickler = NumpyPickler(filename, compress=compress, @@ -326,36 +356,35 @@ def dump(value, filename, compress=0, cache_size=100): def load(filename, mmap_mode=None): - """ Reconstruct a Python object and the numpy arrays it contains from - a persisted file. - - Parameters - ----------- - filename: string - The name of the file from which to load the object - mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional - If not None, the arrays are memory-mapped from the disk. This - mode has not effect for compressed files. 
-        case the reconstructed object might not longer match exactly
-        the originally pickled object.
-
-    Returns
-    -------
-    result: any Python object
-        The object stored in the file.
-
-    See Also
-    --------
-    joblib.dump : function to save an object
-
-    Notes
-    -----
-
-    This function loads the numpy array files saved separately. If
-    the mmap_mode argument is given, it is passed to np.save and
-    arrays are loaded as memmaps. As a consequence, the reconstructed
-    object might not match the original pickled object.
-
+    """Reconstruct a Python object from a file persisted with joblib.dump.
+
+    Parameters
+    -----------
+    filename: string
+        The name of the file from which to load the object
+    mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
+        If not None, the arrays are memory-mapped from the disk. This
+        mode has no effect for compressed files. Note that in this
+        case the reconstructed object might no longer match exactly
+        the originally pickled object.
+
+    Returns
+    -------
+    result: any Python object
+        The object stored in the file.
+
+    See Also
+    --------
+    joblib.dump : function to save an object
+
+    Notes
+    -----
+
+    This function can load numpy array files saved separately during the
+    dump. If the mmap_mode argument is given, it is passed to np.load and
+    arrays are loaded as memmaps. As a consequence, the reconstructed
+    object might not match the original pickled object. Note that if the
+    file was saved with compression, the arrays cannot be memmapped.
     """
     file_handle = open(filename, 'rb')
     if _read_magic(file_handle) == _ZFILE_PREFIX:
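
A note on the Z-file format touched by this patch: read_zfile and
write_zfile implement a simple layout, a magic prefix, the payload length
hex-encoded into a fixed-width field, then the zlib-compressed payload.
The following standalone sketch mirrors that layout; the constant values
and the *_sketch function names are illustrative assumptions, not
joblib's exact internals::

    import zlib

    # Illustrative stand-ins: joblib keeps its own module-level constants
    # (_ZFILE_PREFIX, _MAX_LEN); the exact values here are assumptions.
    MAGIC = b'ZF0x'
    MAX_LEN = len(hex(2 ** 64))

    def write_zfile_sketch(file_handle, data, compress=1):
        # Magic prefix identifying the file as a Z-file
        file_handle.write(MAGIC)
        # Payload length, hex-encoded and left-justified in a
        # fixed-width field
        file_handle.write(hex(len(data)).ljust(MAX_LEN).encode('ascii'))
        # The zlib-compressed payload itself
        file_handle.write(zlib.compress(data, compress))

    def read_zfile_sketch(file_handle):
        file_handle.seek(0)
        assert file_handle.read(len(MAGIC)) == MAGIC, "not a Z-file"
        # The stored length lets zlib size its output buffer up front,
        # avoiding repeated reallocation during decompression.
        length = int(file_handle.read(MAX_LEN), 16)
        data = zlib.decompress(file_handle.read(), 15, length)
        assert len(data) == length, "incorrect data length"
        return data

    with open('/tmp/example.zf', 'wb') as f:
        write_zfile_sketch(f, b'x' * 10000, compress=3)
    with open('/tmp/example.zf', 'rb') as f:
        assert read_zfile_sketch(f) == b'x' * 10000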
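
The comment added in ZNDArrayWrapper.read refers to numpy's standard
pickling protocol: an array reduces to a reconstruction function, its
arguments, and a state tuple whose last element is the raw data buffer.
A minimal sketch of the two steps the wrapper replays at load time,
with the buffer coming back from the Z-file instead of the pickle::

    import numpy as np

    a = np.arange(12).reshape(3, 4)

    # numpy pickles an array as (reconstruct_function, init_args, state);
    # the last element of `state` is the raw data buffer.
    reconstruct, init_args, state = a.__reduce__()

    # ZNDArrayWrapper stores the meta information in the pickle and the
    # buffer in a Z-file; at load time `read` replays these two steps:
    b = reconstruct(*init_args)
    b.__setstate__(state)
    assert (a == b).all()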
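
Finally, the updated dump/load docstrings imply the following usage
pattern (file paths here are illustrative)::

    import numpy as np
    import joblib

    obj = dict(data=np.random.randn(1000, 100), label='example')

    # Uncompressed dump: large arrays are stored in separate .npy files;
    # the list of files written is returned.
    filenames = joblib.dump(obj, '/tmp/example.pkl')

    # compress=3 is the compromise suggested in the docstring above.
    joblib.dump(obj, '/tmp/example_compressed.pkl', compress=3)

    # Memory-map the arrays on load (uncompressed dumps only); per the
    # Notes above, the reconstructed object may then differ slightly.
    obj2 = joblib.load('/tmp/example.pkl', mmap_mode='r')

    # Compressed files cannot be memmapped, so load them plainly.
    obj3 = joblib.load('/tmp/example_compressed.pkl')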