Skip to content

Commit

Permalink
Patch: Add __str__ and __bytes__ for undecoded content.
Browse files Browse the repository at this point in the history
Patch.patch assumes all content to be encoded in UTF-8 and forcefully
replaces any non-decodable sequences. This can lead to corruption for
content that either does not conform to any specific encoding altogether, or
uses an encoding that is incompatible with, or ambiguous to UTF-8.

This change adds __str__ and __bytes__ implementations to Patch that
return the unmodified, raw bytes.
  • Loading branch information
erikvanzijst committed Apr 27, 2018
1 parent 795adc7 commit 5ff0818
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 2 deletions.
54 changes: 52 additions & 2 deletions src/patch.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,11 @@ Patch_create_from(PyObject *self, PyObject *args, PyObject *kwds)


PyDoc_STRVAR(Patch_patch__doc__,
"Patch diff string. Can be None in some cases, such as empty commits.");
"Patch diff string. Can be None in some cases, such as empty commits. "
"Note that this decodes the content to unicode assuming UTF-8 encoding. "
"For non-UTF-8 content that can lead be a lossy, non-reversible process. "
"To access the raw, un-decoded patch, use `str(patch)` (Python 2), or "
"`bytes(patch)` (Python 3).");

PyObject *
Patch_patch__get__(Patch *self)
Expand All @@ -201,9 +205,55 @@ Patch_patch__get__(Patch *self)
return py_patch;
}

/*
 * tp_str implementation for Patch.
 *
 * Renders the patch with git_patch_to_buf() and returns it as the
 * interpreter's native `str` type:
 *   - Python 2: a byte string holding the raw, un-decoded patch.
 *   - Python 3: a unicode string built by to_unicode(buf.ptr, NULL, NULL).
 *
 * Returns a new reference on success. When libgit2 reports a failure
 * (err < 0) the result of Error_set(err) is returned instead.
 */
PyObject *
Patch__str__(PyObject *self)
{
    Patch *py_patch = (Patch *)self;
    git_buf buf = {NULL};
    int err;
    PyObject *ret;

    /* `self` is a PyObject *, which has no `patch` member; go through the
     * Patch pointer so this still compiles when assertions are enabled. */
    assert(py_patch->patch);
    err = git_patch_to_buf(&buf, py_patch->patch);
    if (err < 0) {
        /* Release anything that may have been written before the failure. */
        git_buf_free(&buf);
        return Error_set(err);
    }

#if PY_MAJOR_VERSION == 2
    /* Build the byte string with an explicit Py_ssize_t length rather than
     * "s#", whose length argument type depends on PY_SSIZE_T_CLEAN. */
    ret = PyBytes_FromStringAndSize(buf.ptr, (Py_ssize_t)buf.size);
#else
    /* NOTE(review): to_unicode() is not given buf.size; presumably it stops
     * at the first NUL byte -- confirm against its definition. */
    ret = to_unicode(buf.ptr, NULL, NULL);
#endif
    git_buf_free(&buf);
    return ret;
}

PyDoc_STRVAR(Patch__bytes____doc__, "The raw bytes of the patch's contents.");

/*
 * Implementation of Patch.__bytes__().
 *
 * Returns the patch exactly as produced by git_patch_to_buf(), without any
 * decoding. On Python 2 this simply delegates to Patch__str__(); on
 * Python 3 a `bytes` object of buf.size bytes is built.
 *
 * Returns a new reference on success. When libgit2 reports a failure
 * (err < 0) the result of Error_set(err) is returned instead.
 */
PyObject *
Patch__bytes__(PyObject *self)
{
#if PY_MAJOR_VERSION == 2
    return Patch__str__(self);

#else
    Patch *py_patch = (Patch *)self;
    git_buf buf = {NULL};
    int err;
    PyObject *bytes;

    /* `self` is a PyObject *, which has no `patch` member; go through the
     * Patch pointer so this still compiles when assertions are enabled. */
    assert(py_patch->patch);
    err = git_patch_to_buf(&buf, py_patch->patch);
    if (err < 0) {
        /* Release anything that may have been written before the failure. */
        git_buf_free(&buf);
        return Error_set(err);
    }

    bytes = PyBytes_FromStringAndSize(buf.ptr, (Py_ssize_t)buf.size);
    git_buf_free(&buf);
    return bytes;
#endif
}

/* Method table for the Patch type; the trailing {NULL} entry is the
 * end-of-table sentinel. */
PyMethodDef Patch_methods[] = {
/* Static method accepting both positional and keyword arguments. */
{"create_from", (PyCFunction) Patch_create_from,
METH_KEYWORDS | METH_VARARGS | METH_STATIC, Patch_create_from__doc__},
/* Takes no arguments; returns the raw, un-decoded patch contents. */
{"__bytes__", (PyCFunction) Patch__bytes__,
METH_NOARGS, Patch__bytes____doc__},
{NULL}
};

Expand Down Expand Up @@ -237,7 +287,7 @@ PyTypeObject PatchType = {
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
Patch__str__, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Expand Down
Binary file added test/data/encoding.tar
Binary file not shown.
66 changes: 66 additions & 0 deletions test/test_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
from __future__ import absolute_import
from __future__ import unicode_literals

import six

import pygit2
from . import utils

Expand Down Expand Up @@ -84,6 +86,70 @@
"""


class PatchEncodingTest(utils.AutoRepoTestCase):
repo_spec = 'tar', 'encoding'
expected_diff = b"""diff --git a/iso-8859-1.txt b/iso-8859-1.txt
index e84e339..201e0c9 100644
--- a/iso-8859-1.txt
+++ b/iso-8859-1.txt
@@ -1 +1,2 @@
Kristian H\xf8gsberg
+foo
"""

def test_patch_from_non_utf8(self):
    # Both buffers are encoded in ISO-8859-1, not UTF-8.
    before = b'Kristian H\xf8gsberg\n'
    after = before + b'foo\n'
    patch = pygit2.Patch.create_from(
        before,
        after,
        old_as_path='iso-8859-1.txt',
        new_as_path='iso-8859-1.txt',
    )

    # `patch.patch` force-decodes as UTF-8, which mangles the ISO-8859-1
    # byte, so re-encoding it must NOT reproduce the original diff.
    lossy_roundtrip = patch.patch.encode('utf8')
    self.assertNotEqual(lossy_roundtrip, self.expected_diff)

    if not six.PY2:
        # Python 3: str() is text, bytes() carries the raw patch.
        self.assertIsInstance(str(patch), str)
        self.assertEqual(bytes(patch), self.expected_diff)
        self.assertEqual(str(patch),
                         str(self.expected_diff, 'utf8', errors='replace'))
        return

    # Python 2: `str` is the byte-string type, so both accessors must
    # hand back the raw patch unchanged.
    for render in (lambda: str(patch), patch.__bytes__):
        self.assertIsInstance(render(), str)
        self.assertEqual(render(), self.expected_diff)

def test_patch_create_from_blobs(self):
    # Same scenario as the buffer-based test, but the two sides are looked
    # up in the fixture repository.
    old_blob = self.repo['e84e339ac7fcc823106efa65a6972d7a20016c85']
    new_blob = self.repo['201e0c908e3d9f526659df3e556c3d06384ef0df']
    patch = pygit2.Patch.create_from(
        old_blob,
        new_blob,
        old_as_path='iso-8859-1.txt',
        new_as_path='iso-8859-1.txt',
    )

    # `patch.patch` force-decodes as UTF-8, which mangles the ISO-8859-1
    # byte, so re-encoding it must NOT reproduce the original diff.
    lossy_roundtrip = patch.patch.encode('utf8')
    self.assertNotEqual(lossy_roundtrip, self.expected_diff)

    if not six.PY2:
        # Python 3: str() is text, bytes() carries the raw patch.
        self.assertIsInstance(str(patch), str)
        self.assertEqual(bytes(patch), self.expected_diff)
        self.assertEqual(str(patch),
                         str(self.expected_diff, 'utf8', errors='replace'))
        return

    # Python 2: `str` is the byte-string type, so both accessors must
    # hand back the raw patch unchanged.
    for render in (lambda: str(patch), patch.__bytes__):
        self.assertIsInstance(render(), str)
        self.assertEqual(render(), self.expected_diff)


class PatchTest(utils.RepoTestCase):

def test_patch_create_from_buffers(self):
Expand Down

0 comments on commit 5ff0818

Please sign in to comment.