Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patch: Add __str__ and __bytes__ for undecoded content. #790

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
91 changes: 85 additions & 6 deletions src/patch.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,30 +180,109 @@ Patch_create_from(PyObject *self, PyObject *args, PyObject *kwds)
return wrap_patch(patch, oldblob, newblob);
}

PyObject *
decode(Patch *patch, char *encoding, char *errors)
{
git_buf buf = {NULL};
int err;
PyObject *ret;

assert(patch->patch);
err = git_patch_to_buf(&buf, patch->patch);
if (err < 0)
return Error_set(err);

ret = PyUnicode_Decode(buf.ptr, buf.size, encoding, errors);
git_buf_free(&buf);
return ret;
}

PyDoc_STRVAR(Patch_patch__doc__,
"Patch diff string. Can be None in some cases, such as empty commits.");
"Patch diff string (deprecated -- use Patch.decode() instead).\n"
"Can be None in some cases, such as empty commits. "
"Note that this decodes the content to unicode assuming UTF-8 encoding. "
"For non-UTF-8 content that can lead be a lossy, non-reversible process. "
"To access the raw, un-decoded patch, use `str(patch)` (Python 2), or "
"`bytes(patch)` (Python 3).");

PyObject *
Patch_patch__get__(Patch *self)
{
PyErr_WarnEx(PyExc_DeprecationWarning, "`Patch.patch` assumes UTF-8 encoding and can have unexpected results on "
"other encodings. If decoded text is needed, use `Patch.decode()` "
"instead. Otherwise use `bytes(Patch)`.", 1);
return decode(self, "utf-8", "replace");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These hardwired parameters are equivalent to the original logic of to_unicode(buf.ptr, NULL, NULL).

}

PyObject *
Patch__str__(PyObject *self)
{
git_buf buf = {NULL};
int err;
PyObject *py_patch;
PyObject *ret;

assert(self->patch);
err = git_patch_to_buf(&buf, self->patch);
err = git_patch_to_buf(&buf, ((Patch*)self)->patch);
if (err < 0)
return Error_set(err);

py_patch = to_unicode(buf.ptr, NULL, NULL);
#if PY_MAJOR_VERSION == 2
ret = Py_BuildValue("s#", buf.ptr, buf.size);
#else
ret = to_unicode(buf.ptr, NULL, NULL);
#endif
git_buf_free(&buf);
return py_patch;
return ret;
}

PyDoc_STRVAR(Patch__bytes____doc__, "The raw bytes of the patch's contents.");

PyObject *
Patch__bytes__(PyObject *self)
{
#if PY_MAJOR_VERSION == 2
return Patch__str__(self);
#else
git_buf buf = {NULL};
int err;
PyObject *bytes;

assert(self->patch);
err = git_patch_to_buf(&buf, ((Patch*)self)->patch);
if (err < 0)
return Error_set(err);

bytes = PyBytes_FromStringAndSize(buf.ptr, buf.size);
git_buf_free(&buf);
return bytes;
#endif
}

PyDoc_STRVAR(Patch_decode__doc__, "decode(encoding=\"utf-8\", errors=\"strict\")\n"
"\n"
"Decodes the patch's bytes using the codec registered for encoding.\n"
"Encoding defaults to 'utf-8'. `errors` may be given to set a different error\n"
"handling scheme. Default is 'strict' meaning that decoding errors raise\n"
"a `UnicodeError`. Other possible values are 'ignore' and 'replace'\n"
"as well as any other name registered with `codecs.register_error`.");

PyObject *
Patch_decode(PyObject *self, PyObject *args, PyObject *kwds)
{
char *keywords[] = {"encoding", "errors", NULL};
char *encoding = NULL, *errors = NULL;

if (!PyArg_ParseTupleAndKeywords(args, kwds, "|zz", keywords, &encoding, &errors))
return NULL;

return decode((Patch *)self, encoding == NULL ? "utf-8" : encoding, errors);
}

PyMethodDef Patch_methods[] = {
{"create_from", (PyCFunction) Patch_create_from,
METH_KEYWORDS | METH_VARARGS | METH_STATIC, Patch_create_from__doc__},
{"__bytes__", (PyCFunction) Patch__bytes__, METH_NOARGS, Patch__bytes____doc__},
{"decode", (PyCFunction) Patch_decode, METH_KEYWORDS | METH_VARARGS, Patch_decode__doc__},
{NULL}
};

Expand Down Expand Up @@ -237,7 +316,7 @@ PyTypeObject PatchType = {
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
Patch__str__, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Expand Down
Binary file added test/data/encoding.tar
Binary file not shown.
72 changes: 72 additions & 0 deletions test/test_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
from __future__ import absolute_import
from __future__ import unicode_literals

import six

import pygit2
from . import utils

Expand Down Expand Up @@ -84,6 +86,76 @@
"""


class PatchEncodingTest(utils.AutoRepoTestCase):
repo_spec = 'tar', 'encoding'
expected_diff = b"""diff --git a/iso-8859-1.txt b/iso-8859-1.txt
index e84e339..201e0c9 100644
--- a/iso-8859-1.txt
+++ b/iso-8859-1.txt
@@ -1 +1,2 @@
Kristian H\xf8gsberg
+foo
"""

def test_patch_from_non_utf8(self):
# blobs encoded in ISO-8859-1
old_content = b'Kristian H\xf8gsberg\n'
new_content = old_content + b'foo\n'
patch = pygit2.Patch.create_from(
old_content,
new_content,
old_as_path='iso-8859-1.txt',
new_as_path='iso-8859-1.txt',
)

self.assertIsInstance(str(patch), str)

# `patch.patch` corrupted the ISO-8859-1 content as it forced UTF-8
# decoding, so assert that we cannot get the original content back:
self.assertNotEqual(patch.patch.encode('utf8'), self.expected_diff)

# since the patch contain non-UTF-8 sequences, .decode() should fail:
self.assertRaises(UnicodeDecodeError, patch.decode)

self.assertEqual(patch.decode(errors='replace'),
self.expected_diff.decode('utf-8', errors='replace'))

if six.PY2:
self.assertEqual(str(patch), self.expected_diff)
self.assertIsInstance(patch.__bytes__(), str)
self.assertEqual(patch.__bytes__(), self.expected_diff)

else:
self.assertEqual(bytes(patch), self.expected_diff)
self.assertEqual(str(patch),
str(self.expected_diff, 'utf8', errors='replace'))
self.assertRaises(UnicodeDecodeError, patch.decode)

def test_patch_create_from_blobs(self):
patch = pygit2.Patch.create_from(
self.repo['e84e339ac7fcc823106efa65a6972d7a20016c85'],
self.repo['201e0c908e3d9f526659df3e556c3d06384ef0df'],
old_as_path='iso-8859-1.txt',
new_as_path='iso-8859-1.txt',
)
# `patch.patch` corrupted the ISO-8859-1 content as it forced UTF-8
# decoding, so assert that we cannot get the original content back:
self.assertNotEqual(patch.patch.encode('utf8'), self.expected_diff)

if six.PY2:
self.assertIsInstance(str(patch), str)
self.assertEqual(str(patch), self.expected_diff)

self.assertIsInstance(patch.__bytes__(), str)
self.assertEqual(patch.__bytes__(), self.expected_diff)

else:
self.assertIsInstance(str(patch), str)
self.assertEqual(bytes(patch), self.expected_diff)
self.assertEqual(str(patch),
str(self.expected_diff, 'utf8', errors='replace'))


class PatchTest(utils.RepoTestCase):

def test_patch_create_from_buffers(self):
Expand Down