libgit2 · erikvanzijst · Apr 27, 2018 · erikvanzijst · May 1, 2018
diff --git a/src/patch.c b/src/patch.c
@@ -180,30 +180,109 @@ Patch_create_from(PyObject *self, PyObject *args, PyObject *kwds)
   return wrap_patch(patch, oldblob, newblob);
 }
 
+PyObject *
+decode(Patch *patch, char *encoding, char *errors)
+{
+    git_buf buf = {NULL};
+    int err;
+    PyObject *ret;
+
+    assert(patch->patch);
+    err = git_patch_to_buf(&buf, patch->patch);
+    if (err < 0)
+        return Error_set(err);
+
+    ret = PyUnicode_Decode(buf.ptr, buf.size, encoding, errors);
+    git_buf_free(&buf);
+    return ret;
+}
 
 PyDoc_STRVAR(Patch_patch__doc__,
-    "Patch diff string. Can be None in some cases, such as empty commits.");
+    "Patch diff string (deprecated -- use Patch.decode() instead).\n"
+    "Can be None in some cases, such as empty commits. "
+    "Note that this decodes the content to unicode assuming UTF-8 encoding. "
+    "For non-UTF-8 content that can lead be a lossy, non-reversible process. "
+    "To access the raw, un-decoded patch, use `str(patch)` (Python 2), or "
+    "`bytes(patch)` (Python 3).");
 
 PyObject *
 Patch_patch__get__(Patch *self)
+{
+    PyErr_WarnEx(PyExc_DeprecationWarning, "`Patch.patch` assumes UTF-8 encoding and can have unexpected results on "
+                                           "other encodings. If decoded text is needed, use `Patch.decode()` "
+                                           "instead. Otherwise use `bytes(Patch)`.", 1);
+    return decode(self, "utf-8", "replace");
+}
+
+PyObject *
+Patch__str__(PyObject *self)
 {
     git_buf buf = {NULL};
     int err;
-    PyObject *py_patch;
+    PyObject *ret;
 
     assert(self->patch);
-    err = git_patch_to_buf(&buf, self->patch);
+    err = git_patch_to_buf(&buf, ((Patch*)self)->patch);
     if (err < 0)
         return Error_set(err);
 
-    py_patch = to_unicode(buf.ptr, NULL, NULL);
+#if PY_MAJOR_VERSION == 2
+    ret = Py_BuildValue("s#", buf.ptr, buf.size);
+#else
+    ret = to_unicode(buf.ptr, NULL, NULL);
+#endif
     git_buf_free(&buf);
-    return py_patch;
+    return ret;
+}
+
+PyDoc_STRVAR(Patch__bytes____doc__, "The raw bytes of the patch's contents.");
+
+PyObject *
+Patch__bytes__(PyObject *self)
+{
+#if PY_MAJOR_VERSION == 2
+    return Patch__str__(self);
+#else
+    git_buf buf = {NULL};
+    int err;
+    PyObject *bytes;
+
+    assert(self->patch);
+    err = git_patch_to_buf(&buf, ((Patch*)self)->patch);
+    if (err < 0)
+        return Error_set(err);
+
+    bytes = PyBytes_FromStringAndSize(buf.ptr, buf.size);
+    git_buf_free(&buf);
+    return bytes;
+#endif
+}
+
+PyDoc_STRVAR(Patch_decode__doc__, "decode(encoding=\"utf-8\", errors=\"strict\")\n"
+                                  "\n"
+                                  "Decodes the patch's bytes using the codec registered for encoding.\n"
+                                  "Encoding defaults to 'utf-8'. `errors` may be given to set a different error\n"
+                                  "handling scheme. Default is 'strict' meaning that decoding errors raise\n"
+                                  "a `UnicodeError`. Other possible values are 'ignore' and 'replace'\n"
+                                  "as well as any other name registered with `codecs.register_error`.");
+
+PyObject *
+Patch_decode(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    char *keywords[] = {"encoding", "errors", NULL};
+    char *encoding = NULL, *errors = NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|zz", keywords, &encoding, &errors))
+        return NULL;
+
+    return decode((Patch *)self, encoding == NULL ? "utf-8" : encoding, errors);
 }
 
 PyMethodDef Patch_methods[] = {
     {"create_from", (PyCFunction) Patch_create_from,
       METH_KEYWORDS | METH_VARARGS | METH_STATIC, Patch_create_from__doc__},
+    {"__bytes__", (PyCFunction) Patch__bytes__, METH_NOARGS, Patch__bytes____doc__},
+    {"decode", (PyCFunction) Patch_decode, METH_KEYWORDS | METH_VARARGS, Patch_decode__doc__},
     {NULL}
 };
 
@@ -237,7 +316,7 @@ PyTypeObject PatchType = {
     0,                                         /* tp_as_mapping     */
     0,                                         /* tp_hash           */
     0,                                         /* tp_call           */
-    0,                                         /* tp_str            */
+    Patch__str__,                              /* tp_str            */
     0,                                         /* tp_getattro       */
     0,                                         /* tp_setattro       */
     0,                                         /* tp_as_buffer      */

diff --git a/test/data/encoding.tar b/test/data/encoding.tar
diff --git a/test/test_patch.py b/test/test_patch.py
@@ -28,6 +28,8 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
+import six
+
 import pygit2
 from . import utils
 
@@ -84,6 +86,76 @@
 """
 
 
+class PatchEncodingTest(utils.AutoRepoTestCase):
+    repo_spec = 'tar', 'encoding'
+    expected_diff = b"""diff --git a/iso-8859-1.txt b/iso-8859-1.txt
+index e84e339..201e0c9 100644
+--- a/iso-8859-1.txt
++++ b/iso-8859-1.txt
+@@ -1 +1,2 @@
+ Kristian H\xf8gsberg
++foo
+"""
+
+    def test_patch_from_non_utf8(self):
+        # blobs encoded in ISO-8859-1
+        old_content = b'Kristian H\xf8gsberg\n'
+        new_content = old_content + b'foo\n'
+        patch = pygit2.Patch.create_from(
+            old_content,
+            new_content,
+            old_as_path='iso-8859-1.txt',
+            new_as_path='iso-8859-1.txt',
+        )
+
+        self.assertIsInstance(str(patch), str)
+
+        # `patch.patch` corrupted the ISO-8859-1 content as it forced UTF-8
+        # decoding, so assert that we cannot get the original content back:
+        self.assertNotEqual(patch.patch.encode('utf8'), self.expected_diff)
+
+        # since the patch contain non-UTF-8 sequences, .decode() should fail:
+        self.assertRaises(UnicodeDecodeError, patch.decode)
+
+        self.assertEqual(patch.decode(errors='replace'),
+                         self.expected_diff.decode('utf-8', errors='replace'))
+
+        if six.PY2:
+            self.assertEqual(str(patch), self.expected_diff)
+            self.assertIsInstance(patch.__bytes__(), str)
+            self.assertEqual(patch.__bytes__(), self.expected_diff)
+
+        else:
+            self.assertEqual(bytes(patch), self.expected_diff)
+            self.assertEqual(str(patch),
+                             str(self.expected_diff, 'utf8', errors='replace'))
+            self.assertRaises(UnicodeDecodeError, patch.decode)
+
+    def test_patch_create_from_blobs(self):
+        patch = pygit2.Patch.create_from(
+            self.repo['e84e339ac7fcc823106efa65a6972d7a20016c85'],
+            self.repo['201e0c908e3d9f526659df3e556c3d06384ef0df'],
+            old_as_path='iso-8859-1.txt',
+            new_as_path='iso-8859-1.txt',
+        )
+        # `patch.patch` corrupted the ISO-8859-1 content as it forced UTF-8
+        # decoding, so assert that we cannot get the original content back:
+        self.assertNotEqual(patch.patch.encode('utf8'), self.expected_diff)
+
+        if six.PY2:
+            self.assertIsInstance(str(patch), str)
+            self.assertEqual(str(patch), self.expected_diff)
+
+            self.assertIsInstance(patch.__bytes__(), str)
+            self.assertEqual(patch.__bytes__(), self.expected_diff)
+
+        else:
+            self.assertIsInstance(str(patch), str)
+            self.assertEqual(bytes(patch), self.expected_diff)
+            self.assertEqual(str(patch),
+                             str(self.expected_diff, 'utf8', errors='replace'))
+
+
 class PatchTest(utils.RepoTestCase):
 
     def test_patch_create_from_buffers(self):