Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 309 lines (259 sloc) 10.577 kb
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
1 """
2 A pickler to save numpy arrays in separate .npy files.
3 """
4
764d761 @rgommers STY: PEP8 cleanup.
rgommers authored
5 # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org>
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
6 # Copyright (c) 2009 Gael Varoquaux
7 # License: BSD Style, 3 clauses.
8
9 import pickle
10 import traceback
764d761 @rgommers STY: PEP8 cleanup.
rgommers authored
11 import sys
12 import os
4b718d1 @GaelVaroquaux ENH: add zip-based pickling
GaelVaroquaux authored
13 import shutil
14 import tempfile
15 import zipfile
dbcf1e6 @GaelVaroquaux ENH: same function to dump/load zipped + unzipped
GaelVaroquaux authored
16 import warnings
78289c9 @fabianp Py3K compatibility.
fabianp authored
17
# Pick the Unpickler base class and the in-memory bytes buffer for the
# running interpreter. NumpyUnpickler copies ``Unpickler.dispatch``, so on
# Python 3 the pure-Python ``_Unpickler`` is imported under that name.
# ``cStringIO`` only exists on Python 2, so it must never be imported from
# the Python 3 branch.
if sys.version_info[0] >= 3:
    from pickle import _Unpickler as Unpickler
    from io import BytesIO
else:
    from pickle import Unpickler
    try:
        from io import BytesIO
    except ImportError:
        # Very old Python 2 without the ``io`` module.
        from cStringIO import StringIO as BytesIO
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
24
4b718d1 @GaelVaroquaux ENH: add zip-based pickling
GaelVaroquaux authored
25
764d761 @rgommers STY: PEP8 cleanup.
rgommers authored
26 ###############################################################################
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
27 # Utility objects for persistence.
28
764d761 @rgommers STY: PEP8 cleanup.
rgommers authored
29
2071c60 @GaelVaroquaux ENH: Add memmap persistence to the numpy pickler.
GaelVaroquaux authored
class NDArrayWrapper(object):
    """ Placeholder pickled in place of a numpy array.

        The object holds no array data. It only records the name of the
        file in which the array has been persisted and, optionally, the
        array class that the placeholder stands for.

        Parameters
        ----------
        filename: string
            Name of the file in which the array has been persisted.
        subclass: type, optional
            Class of the array that was replaced by this placeholder.
    """

    def __init__(self, filename, subclass=None):
        self.filename, self.subclass = filename, subclass
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
39
40
764d761 @rgommers STY: PEP8 cleanup.
rgommers authored
41 ###############################################################################
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
42 # Pickler classes
43
# Overriding ``save`` only has an effect on the pure-Python pickler. On
# Python 3 that class is exposed as ``pickle._Pickler`` (``pickle.Pickler``
# is the C implementation there); on Python 2 ``pickle.Pickler`` is already
# the pure-Python class and ``_Pickler`` does not exist.
_PicklerBase = getattr(pickle, '_Pickler', pickle.Pickler)


class NumpyPickler(_PicklerBase):
    """ A pickler subclass that extracts ndarrays and saves them in .npy
        files outside of the pickle.

        Parameters
        ----------
        filename: string
            Path of the main pickle file. It is opened for writing by the
            constructor and exposed as ``self.file``; closing it is left
            to the caller.

        Attributes
        ----------
        file: file object
            The open handle on the main pickle file.
        np: module or None
            The numpy module, or None if it could not be imported, in
            which case everything is pickled the standard way.
    """

    def __init__(self, filename):
        self._filename = filename
        # Every file written by this pickler, the main pickle first.
        self._filenames = [filename, ]
        self.file = open(filename, 'wb')
        # Count the number of npy files that we have created:
        self._npy_counter = 0
        _PicklerBase.__init__(self, self.file,
                              protocol=pickle.HIGHEST_PROTOCOL)
        # delayed import of numpy, to avoid tight coupling
        try:
            import numpy as np
        except ImportError:
            np = None
        self.np = np

    def save(self, obj):
        """ Subclass the save method, to save ndarray subclasses in npy
            files, rather than pickling them. Of course, this is a
            total abuse of the Pickler class.

            Only exact instances of ndarray, matrix and memmap are
            diverted; any other object, including other ndarray
            subclasses, is pickled normally.
        """
        if self.np is not None and type(obj) in (self.np.ndarray,
                                                 self.np.matrix,
                                                 self.np.memmap):
            self._npy_counter += 1
            try:
                filename = '%s_%02i.npy' % (self._filename,
                                            self._npy_counter)
                self.np.save(filename, obj)
                self._filenames.append(filename)
                # Only the base name is stored, so that the pickle and
                # its .npy files can be moved together.
                obj = NDArrayWrapper(os.path.basename(filename),
                                     type(obj))
            except Exception:
                # Best effort: fall back to pickling the array in-line.
                # Catching Exception rather than everything lets
                # KeyboardInterrupt and SystemExit propagate.
                self._npy_counter -= 1
                # XXX: We should have a logging mechanism
                # A single parenthesised argument prints identically on
                # Python 2 and Python 3.
                print('Failed to save %s to .npy file:\n%s' % (
                            type(obj),
                            traceback.format_exc()))
        _PicklerBase.save(self, obj)
86
87
78289c9 @fabianp Py3K compatibility.
fabianp authored
try:
    _STRING_TYPES = basestring  # Python 2: covers str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3 has no basestring


class NumpyUnpickler(Unpickler):
    """ A subclass of the Unpickler to unpickle our numpy pickles.

        Parameters
        ----------
        filename: string
            Path of the main pickle file. Its directory is where the
            companion array files are looked up.
        file_handle: file object or string, optional
            Source of the pickle stream. When None, the result of
            ``self._open_file`` on the base name of `filename` is used.
            A string is treated as a path and opened in binary mode.
        mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
            Passed to numpy.load when loading the array files.
    """
    dispatch = Unpickler.dispatch.copy()

    def __init__(self, filename, file_handle=None, mmap_mode=None):
        self._filename = os.path.basename(filename)
        self.mmap_mode = mmap_mode
        self._dirname = os.path.dirname(filename)
        if file_handle is None:
            file_handle = self._open_file(self._filename)
        if isinstance(file_handle, _STRING_TYPES):
            # To handle memmap, we need to have file names
            file_handle = open(file_handle, 'rb')
        self.file_handle = file_handle
        Unpickler.__init__(self, self.file_handle)
        # delayed import of numpy: plain pickles load without it
        try:
            import numpy as np
        except ImportError:
            np = None
        self.np = np

    def _open_file(self, name):
        "Return the path of the given file in our store"
        return os.path.join(self._dirname, name)

    def _numpy_has_mmap_mode(self):
        "Tell whether numpy is at least 1.3, i.e. load accepts mmap_mode"
        try:
            # Compare numbers, not strings: '1.10' sorts before '1.3'.
            major, minor = [int(part) for part in
                            self.np.__version__.split('.')[:2]]
        except ValueError:
            # Unparsable version string: assume a recent numpy.
            return True
        return (major, minor) >= (1, 3)

    def load_build(self):
        """ This method is called to set the state of a newly created
            object.

            We capture it to replace our place-holder objects,
            NDArrayWrapper, by the array we are interested in. We
            replace directly in the stack of pickler.

            Raises
            ------
            ImportError
                If a place-holder is found but numpy is not available.
        """
        Unpickler.load_build(self)
        if isinstance(self.stack[-1], NDArrayWrapper):
            if self.np is None:
                raise ImportError('Trying to unpickle an ndarray, '
                        "but numpy didn't import correctly")
            nd_array_wrapper = self.stack.pop()
            source = self._open_file(nd_array_wrapper.filename)
            if self._numpy_has_mmap_mode():
                array = self.np.load(source, mmap_mode=self.mmap_mode)
            else:
                # Numpy does not have mmap_mode before 1.3
                array = self.np.load(source)
            subclass = nd_array_wrapper.subclass
            if subclass is not None and not isinstance(array, subclass):
                # We need to reconstruct another subclass: view the
                # loaded data as that class. The result has to be kept,
                # it is not an in-place operation.
                array = array.view(subclass)
            self.stack.append(array)

    # Be careful to register our new method. Python 3 looks opcodes up by
    # integer value while Python 2 uses the one-character string; indexing
    # the opcode yields the right key on both.
    dispatch[pickle.BUILD[0]] = load_build
147
148
3a42862 @GaelVaroquaux ENH: fast unziper
GaelVaroquaux authored
class ZipNumpyUnpickler(NumpyUnpickler):
    """ A subclass of our Unpickler to unpickle on the fly from zips.

        Parameters
        ----------
        file_handle: file object or string
            The zip archive to read from, handed to zipfile.ZipFile.
            The archive stays open as ``self._zip_file``; closing it is
            left to the caller.
    """

    def __init__(self, file_handle):
        kwargs = dict(compression=zipfile.ZIP_DEFLATED)
        if sys.version_info >= (2, 5):
            kwargs['allowZip64'] = True
        self._zip_file = zipfile.ZipFile(file_handle, **kwargs)
        # The main pickle is looked up under a fixed member name, and
        # memory mapping is disabled for archives.
        NumpyUnpickler.__init__(self, 'joblib_dump.pkl',
                                mmap_mode=None)

    def _open_file(self, name):
        "Return an in-memory buffer with the decompressed archive member"
        # Zip member names always use forward slashes, whatever the
        # platform: os.path.join would build a name that is not found in
        # the archive on Windows.
        return BytesIO(self._zip_file.read('dump_file/' + name))
166
167
764d761 @rgommers STY: PEP8 cleanup.
rgommers authored
168 ###############################################################################
074b5f9 @GaelVaroquaux ENH: The numpy pickler is now working.
GaelVaroquaux authored
169 # Utility functions
170
dbcf1e6 @GaelVaroquaux ENH: same function to dump/load zipped + unzipped
GaelVaroquaux authored
def dump(value, filename, zipped=False):
    """ Persist an arbitrary Python object into a filename, with numpy
        arrays saved as separate .npy files.

        Parameters
        -----------
        value: any Python object
            The object to store to disk
        filename: string
            The name of the file in which it is to be stored
        zipped: boolean, optional
            Whether to compress the data on the disk or not

        Returns
        -------
        filenames: list of strings
            The list of file names in which the data is stored. If zipped
            is false, each array is stored in a different file.

        See Also
        --------
        joblib.load : corresponding loader

        Notes
        -----
        Zipped files take extra disk space during the dump, and extra
        memory during the loading.
    """
    # Select the writer matching the requested storage layout.
    writer = _dump_zipped if zipped else _dump
    return writer(value, filename)
203
204
def _dump(value, filename):
    """ Pickle `value` into `filename` with a NumpyPickler.

        The pickler's file is flushed and closed whether or not the dump
        succeeds. Returns the list of file names recorded by the pickler.
    """
    pickler = None
    try:
        pickler = NumpyPickler(filename)
        pickler.dump(value)
    finally:
        # Nothing to release if the pickler could not be created.
        if pickler is not None and hasattr(pickler, 'file'):
            out = pickler.file
            out.flush()
            out.close()
    return pickler._filenames
214
215
dbcf1e6 @GaelVaroquaux ENH: same function to dump/load zipped + unzipped
GaelVaroquaux authored
def _dump_zipped(value, filename):
    """ Persist an arbitrary Python object into a compressed zip
        filename.

        The object is first dumped, uncompressed, into a temporary
        directory created next to `filename`; the resulting files are
        then added to the archive under the 'dump_file' folder and the
        temporary directory is removed.

        Returns
        -------
        filenames: list of strings
            A one-element list holding `filename`.
    """
    # Stage file in a temporary dir on disk, before writing to zip.
    tmp_dir = tempfile.mkdtemp(prefix='joblib-',
                               dir=os.path.dirname(filename))
    try:
        _dump(value, os.path.join(tmp_dir, 'joblib_dump.pkl'))

        # The archive is only opened once staging succeeded, so a failed
        # dump does not leave a truncated target behind.
        kwargs = dict(compression=zipfile.ZIP_DEFLATED, mode='w')
        if sys.version_info >= (2, 5):
            kwargs['allowZip64'] = True
        dump_file = zipfile.ZipFile(filename, **kwargs)
        try:
            for sub_file in os.listdir(tmp_dir):
                # We use a different arcname (archive name) to avoid
                # having the name of our tmp_dir in the archive. Zip
                # member names use forward slashes on every platform.
                dump_file.write(os.path.join(tmp_dir, sub_file),
                                arcname='dump_file/' + sub_file)
        finally:
            # Always release the archive handle, even if a write failed.
            dump_file.close()
    finally:
        shutil.rmtree(tmp_dir)

    return [filename]
240
241
dbcf1e6 @GaelVaroquaux ENH: same function to dump/load zipped + unzipped
GaelVaroquaux authored
def load(filename, mmap_mode=None):
    """ Reconstruct a Python object and the numpy arrays it contains from
        a persisted file.

        Parameters
        -----------
        filename: string
            The name of the file from which to load the object
        mmap_mode: {None, 'r+', 'r', 'w+', 'c'}, optional
            If not None, the arrays are memory-mapped from the disk. This
            mode has no effect for zipped files. Note that in this
            case the reconstructed object might no longer match exactly
            the originally pickled object.

        Returns
        -------
        result: any Python object
            The object stored in the file.

        See Also
        --------
        joblib.dump : function to save an object

        Notes
        -----

        This function loads the numpy array files saved separately. If
        the mmap_mode argument is given, it is passed to np.load and
        arrays are loaded as memmaps. As a consequence, the reconstructed
        object might not match the original pickled object.

        SECURITY: this unpickles the content of `filename`; only load
        files coming from a trusted source.
    """
    # Code to detect zip files. A bytes literal compares correctly with
    # the binary read below on both Python 2 and Python 3, and does not
    # rely on numpy being importable.
    _ZIP_PREFIX = b'PK\x03\x04'

    file_handle = open(filename, 'rb')
    unpickler = None
    try:
        if file_handle.read(len(_ZIP_PREFIX)) == _ZIP_PREFIX:
            if mmap_mode is not None:
                warnings.warn('file "%(filename)s" appears to be a zip, '
                              'ignoring mmap_mode "%(mmap_mode)s" flag passed'
                              % dict(filename=filename, mmap_mode=mmap_mode),
                              Warning, stacklevel=2)
            unpickler = ZipNumpyUnpickler(file_handle=file_handle)
        else:
            # Pickling needs file-handles at the beginning of the file
            file_handle.seek(0)
            unpickler = NumpyUnpickler(filename,
                                       file_handle=file_handle,
                                       mmap_mode=mmap_mode)
        obj = unpickler.load()
    finally:
        # Release the archive first (if any), then the handle opened
        # here; this also runs when building the unpickler failed.
        if hasattr(unpickler, '_zip_file'):
            unpickler._zip_file.close()
        file_handle.close()
    return obj
4b718d1 @GaelVaroquaux ENH: add zip-based pickling
GaelVaroquaux authored
307
308
Something went wrong with that request. Please try again.