pickler: handle arbitrary bytestreams using base64

The quoted-printable encoding seems to fall down under certain scenarios, e.g. HDF5 data, so make our bytes transport robust by using base64 to encode bytes for json.

Use a new "B64" tag in the data stream to allow for backwards-compatibility. Support for decoding the old "BYTES" tag remains intact. New versions of jsonpickle now emit a new "B64" tag when bytes are encountered.

Add a new unit test with the example HDF5 data to ensure that we will forever support it.

Closes #143
Reported-by: Jason Liang <jasonzliang@berkeley.edu>
Signed-off-by: David Aguilar <davvid@gmail.com>
jsonpickle · Mar 19, 2016 · 16e7945 · 16e7945
1 parent 4f05e76
commit 16e7945
Show file tree

Hide file tree

Showing 6 changed files with 57 additions and 10 deletions.
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -1,6 +1,11 @@
 Change Log
 ==========
 
+Version 0.9.4 - Unreleased
+--------------------------
+    * Arbitrary byte streams are now better supported.
+      (`#143 <https://github.com/jsonpickle/jsonpickle/issues/143>`_).
+
 Version 0.9.3 - March 9, 2016
 -----------------------------
     * UUID objects can now be serialized

diff --git a/jsonpickle/pickler.py b/jsonpickle/pickler.py
@@ -7,9 +7,9 @@
 # This software is licensed as described in the file COPYING, which
 # you should have received as part of this distribution.
 
+import base64
 import warnings
 import sys
-import quopri
 from itertools import chain, islice
 
 import jsonpickle.util as util
@@ -251,7 +251,7 @@ def _flatten_bytestring(self, obj):
                 return obj.decode('utf-8')
             except:
                 pass
-        return {tags.BYTES: quopri.encodestring(obj).decode('utf-8')}
+        return {tags.B64: base64.encodestring(obj).decode('utf-8')}
 
     def _flatten_obj_instance(self, obj):
         """Recursively flatten an instance and return a json-friendly dict

diff --git a/jsonpickle/tags.py b/jsonpickle/tags.py
@@ -12,6 +12,7 @@
 
 
 BYTES = 'py/bytes'
+B64 = 'py/b64'
 FUNCTION = 'py/function'
 ID = 'py/id'
 INITARGS = 'py/initargs'

diff --git a/jsonpickle/unpickler.py b/jsonpickle/unpickler.py
@@ -7,8 +7,9 @@
 # This software is licensed as described in the file COPYING, which
 # you should have received as part of this distribution.
 
-import sys
+import base64
 import quopri
+import sys
 
 import jsonpickle.util as util
 import jsonpickle.tags as tags
@@ -128,8 +129,10 @@ def _swap_proxies(self):
         self._proxies = []
 
     def _restore(self, obj):
-        if has_tag(obj, tags.BYTES):
-            restore = self._restore_bytestring
+        if has_tag(obj, tags.B64):
+            restore = self._restore_base64
+        elif has_tag(obj, tags.BYTES):  # Backwards compatibility
+            restore = self._restore_quopri
         elif has_tag(obj, tags.ID):
             restore = self._restore_id
         elif has_tag(obj, tags.REF):  # Backwards compatibility
@@ -158,7 +161,11 @@ def _restore(self, obj):
             restore = lambda x: x
         return restore(obj)
 
-    def _restore_bytestring(self, obj):
+    def _restore_base64(self, obj):
+        return base64.decodestring(obj[tags.B64].encode('utf-8'))
+
+    #: For backwards compatibility with bytes data produced by older versions
+    def _restore_quopri(self, obj):
         return quopri.decodestring(obj[tags.BYTES].encode('utf-8'))
 
     def _restore_iterator(self, obj):

diff --git a/tests/object_test.py b/tests/object_test.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import base64
 import collections
 import decimal
 import re
@@ -762,9 +763,10 @@ def test_bytes_unicode(self):
             self.assertEqual(encoded, u1)
             self.assertEqual(type(encoded), unicode)
         else:
-            self.assertTrue(encoded == {tags.BYTES: 'foo'})
-            self.assertTrue(type(encoded[tags.BYTES]) is unicode)
             self.assertNotEqual(encoded, u1)
+            b64ustr= base64.encodestring(b'foo').decode('utf-8')
+            self.assertEqual({tags.B64: b64ustr}, encoded)
+            self.assertEqual(type(encoded[tags.B64]), unicode)
         decoded = self.unpickler.restore(encoded)
         self.assertTrue(decoded == b1)
         if PY2:
@@ -774,13 +776,29 @@ def test_bytes_unicode(self):
 
         # bytestrings that we can't decode to UTF-8 will always be wrapped
         encoded = self.pickler.flatten(b2)
-        self.assertTrue(encoded == {tags.BYTES: 'foo=FF'})
-        self.assertTrue(type(encoded[tags.BYTES]) is unicode)
         self.assertNotEqual(encoded, b2)
+        b64ustr= base64.encodestring(b'foo\xff').decode('utf-8')
+        self.assertEqual({tags.B64: b64ustr}, encoded)
+        self.assertEqual(type(encoded[tags.B64]), unicode)
         decoded = self.unpickler.restore(encoded)
         self.assertEqual(decoded, b2)
         self.assertTrue(type(decoded) is bytes)
 
+    def test_backcompat_bytes_quoted_printable(self):
+        """Test decoding bytes objects from older jsonpickle versions"""
+
+        b1 = b'foo'
+        b2 = b'foo\xff'
+
+        # older versions of jsonpickle used a quoted-printable encoding
+        expect = b1
+        actual = self.unpickler.restore({tags.BYTES: unicode('foo')})
+        self.assertEqual(expect, actual)
+
+        expect = b2
+        actual = self.unpickler.restore({tags.BYTES: unicode('foo=FF')})
+        self.assertEqual(expect, actual)
+
     def test_nested_objects(self):
         obj = ThingWithTimedeltaAttribute(99)
         flattened = self.pickler.flatten(obj)

diff --git a/tests/stdlib_test.py b/tests/stdlib_test.py
@@ -28,6 +28,22 @@ def test_known_uuid(self):
         self.assertEqual(expect, actual)
 
 
+class BytesTestCase(unittest.TestCase):
+
+    def test_bytestream(self):
+        expect = (b'\x89HDF\r\n\x1a\n\x00\x00\x00\x00\x00\x08\x08\x00'
+                  b'\x04\x00\x10\x00\x00\x00\x00\x00\x00\x00\x00\x00'
+                  b'\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xffh'
+                  b'\x848\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff'
+                  b'\xff\xff\x00\x00\x00\x00\x00\x00\x00\x00`\x00\x00'
+                  b'\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00'
+                  b'\x00\x88\x00\x00\x00\x00\x00\x00\x00\xa8\x02\x00'
+                  b'\x00\x00\x00\x00\x00\x01\x00\x01\x00')
+        encoded = jsonpickle.encode(expect)
+        actual = jsonpickle.decode(encoded)
+        self.assertEqual(expect, actual)
+
+
 def suite():
     suite = unittest.TestSuite()
     suite.addTest(unittest.makeSuite(UUIDTestCase))