Avoid rendering, then parsing diff.

Fixes jonashaag#78
jelmer · Mar 22, 2017 · c8300db · c8300db
1 parent be69325
commit c8300db
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 192 deletions.
diff --git a/klaus/diff.py b/klaus/diff.py
@@ -8,168 +8,78 @@
     :copyright: 2007 by Armin Ronacher.
     :license: BSD
 """
-import re
-from cgi import escape
 
-
-def prepare_udiff(udiff, **kwargs):
-    """Prepare an udiff for a template."""
-    return DiffRenderer(udiff).prepare(**kwargs)
-
-
-class DiffRenderer(object):
-    """Give it a unified diff and it renders you a beautiful
-    html diff :-)
-    """
-    _chunk_re = re.compile(r'@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@')
-
-    def __init__(self, udiff):
-        """:param udiff:   a text in udiff format"""
-        self.lines = [escape(line) for line in udiff.splitlines()]
-
-    def _extract_filename(self, line):
-        """
-        Extract file name from unified diff line:
-            --- a/foo/bar   ==>   foo/bar
-            +++ b/foo/bar   ==>   foo/bar
-        """
-        if line.startswith(("--- /dev/null", "+++ /dev/null")):
-            return line[len("--- "):]
-        else:
-            return line[len("--- a/"):]
-
-    def _highlight_line(self, line, next):
-        """Highlight inline changes in both lines."""
-        start = 0
-        limit = min(len(line['line']), len(next['line']))
-        while start < limit and line['line'][start] == next['line'][start]:
-            start += 1
-        end = -1
-        limit -= start
-        while -end <= limit and line['line'][end] == next['line'][end]:
-            end -= 1
-        end += 1
-        if start or end:
-            def do(l):
-                last = end + len(l['line'])
-                if l['action'] == 'add':
-                    tag = 'ins'
-                else:
-                    tag = 'del'
-                l['line'] = u'%s<%s>%s</%s>%s' % (
-                    l['line'][:start],
-                    tag,
-                    l['line'][start:last],
-                    tag,
-                    l['line'][last:]
-                )
-            do(line)
-            do(next)
-
-    def prepare(self, want_header=True):
-        """Parse the diff an return data for the template."""
-        in_header = True
-        header = []
-        lineiter = iter(self.lines)
-        files = []
-        try:
-            line = next(lineiter)
-            while 1:
-                # continue until we found the old file
-                if not line.startswith('--- '):
-                    if in_header:
-                        header.append(line)
-                    line = next(lineiter)
-                    continue
-
-                if header and all(x.strip() for x in header):
-                    if want_header:
-                        files.append({'is_header': True, 'lines': header})
-                    header = []
-
-                in_header = False
-                chunks = []
-                files.append({
-                    'is_header':        False,
-                    'old_filename':     self._extract_filename(line),
-                    'new_filename':     self._extract_filename(next(lineiter)),
-                    'additions':        0,
-                    'deletions':        0,
-                    'chunks':           chunks
-                })
-
-                line = next(lineiter)
-                while line:
-                    match = self._chunk_re.match(line)
-                    if not match:
-                        in_header = True
-                        break
-
-                    lines = []
-                    chunks.append(lines)
-
-                    old_line, old_end, new_line, new_end = \
-                        [int(x or 1) for x in match.groups()]
-                    old_line -= 1
-                    new_line -= 1
-                    old_end += old_line
-                    new_end += new_line
-                    line = next(lineiter)
-
-                    while old_line < old_end or new_line < new_end:
-                        if line:
-                            command, line = line[0], line[1:]
-                        else:
-                            command = ' '
-                        affects_old = affects_new = False
-
-                        if command == '+':
-                            affects_new = True
-                            action = 'add'
-                            files[-1]['additions'] += 1
-                        elif command == '-':
-                            affects_old = True
-                            action = 'del'
-                            files[-1]['deletions'] += 1
-                        else:
-                            affects_old = affects_new = True
-                            action = 'unmod'
-
-                        old_line += affects_old
-                        new_line += affects_new
-                        lines.append({
-                            'old_lineno':   affects_old and old_line or u'',
-                            'new_lineno':   affects_new and new_line or u'',
-                            'action':       action,
-                            'line':         line,
-                            'no_newline':   False,
-                        })
-
-                        # Skip "no newline at end of file" markers
-                        line = next(lineiter)
-                        if line == r"\ No newline at end of file":
-                            lines[-1]['no_newline'] = True
-                            line = next(lineiter)
-
-        except StopIteration:
-            pass
-
-        # highlight inline changes
-        for file in files:
-            if file['is_header']:
-                continue
-            for chunk in file['chunks']:
-                lineiter = iter(chunk)
-                try:
-                    while True:
-                        line = next(lineiter)
-                        if line['action'] != 'unmod':
-                            nextline = next(lineiter)
-                            if nextline['action'] == 'unmod' or \
-                               nextline['action'] == line['action']:
-                                continue
-                            self._highlight_line(line, nextline)
-                except StopIteration:
-                    pass
-
-        return files
+from difflib import SequenceMatcher
+
+def highlight_line(old_line, new_line):
+    """Mark the differing middle of two lines with ``<del>``/``<ins>`` tags.
+
+    The longest common prefix and the longest common suffix of the two
+    lines are left untouched; whatever lies between them is wrapped in
+    ``<del>...</del>`` (old line) or ``<ins>...</ins>`` (new line).
+
+    :param old_line: the removed line, as a byte string
+    :param new_line: the added line, as a byte string
+    :return: ``(old_line, new_line)`` with the markup inserted, or the
+        inputs unchanged when they share neither a prefix nor a suffix
+    """
+    # NOTE(review): the line content is inserted verbatim, without HTML
+    # escaping -- confirm where escaping is meant to happen.
+    start = 0
+    limit = min(len(old_line), len(new_line))
+    # Length of the common prefix.
+    while start < limit and old_line[start] == new_line[start]:
+        start += 1
+    # Walk back from the end to find the common suffix.  Shrinking ``limit``
+    # by ``start`` keeps the suffix from overlapping the prefix.
+    end = -1
+    limit -= start
+    while -end <= limit and old_line[end] == new_line[end]:
+        end -= 1
+    end += 1  # now 0 (no common suffix) or the negative suffix offset
+    if start or end:
+        def do(l, tag):
+            last = end + len(l)
+            return b'%s<%s>%s</%s>%s' % (
+                l[:start],
+                tag,
+                l[start:last],
+                tag,
+                l[last:]
+            )
+        # The tags must be bytes: bytes %-formatting rejects ``str``
+        # arguments on Python 3.
+        old_line = do(old_line, b'del')
+        new_line = do(new_line, b'ins')
+    return old_line, new_line
+
+
+def render_diff(a, b, n=3):
+    """Diff two sequences of lines and return data for the template.
+
+    :param a: lines of the old version (byte strings, line endings kept)
+    :param b: lines of the new version (byte strings, line endings kept)
+    :param n: number of context lines kept around each change
+    :return: ``(additions, deletions, chunks)``.  ``chunks`` holds one list
+        per group of opcodes; each entry is a dict with the keys
+        ``old_lineno``, ``new_lineno``, ``action`` (``'unmod'``, ``'add'``
+        or ``'del'``), ``line`` and ``no_newline``.
+    :raises AssertionError: if difflib reports an unknown opcode tag
+    """
+    # NOTE(review): line numbers are the 0-based indices reported by
+    # SequenceMatcher, and ``None`` on the side a line does not exist on --
+    # confirm that this is what the template expects.
+    #
+    # The counters live in this scope on purpose: incrementing them from a
+    # nested function would rebind them there and raise UnboundLocalError.
+    additions = 0
+    deletions = 0
+    chunks = []
+    for group in SequenceMatcher(None, a, b).get_grouped_opcodes(n):
+        lines = []
+        chunks.append(lines)
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'equal':
+                for c, line in enumerate(a[i1:i2]):
+                    lines.append(_diff_line(i1 + c, j1 + c, 'unmod', line))
+            elif tag == 'insert':
+                for c, line in enumerate(b[j1:j2]):
+                    lines.append(_diff_line(None, j1 + c, 'add', line))
+                additions += j2 - j1
+            elif tag == 'delete':
+                for c, line in enumerate(a[i1:i2]):
+                    lines.append(_diff_line(i1 + c, None, 'del', line))
+                deletions += i2 - i1
+            elif tag == 'replace':
+                # TODO: not sure if this is the best way to deal with replace
+                # blocks, but it's consistent with the previous version.
+                # Only the last removed line and the first added line are
+                # compared for inline highlighting.
+                for c, line in enumerate(a[i1:i2 - 1]):
+                    lines.append(_diff_line(i1 + c, None, 'del', line))
+                old_marked, new_marked = highlight_line(a[i2 - 1], b[j1])
+                lines.append(
+                    _diff_line(i2 - 1, None, 'del', old_marked, a[i2 - 1]))
+                lines.append(
+                    _diff_line(None, j1, 'add', new_marked, b[j1]))
+                for c, line in enumerate(b[j1 + 1:j2]):
+                    lines.append(_diff_line(None, j1 + c + 1, 'add', line))
+                deletions += i2 - i1
+                additions += j2 - j1
+            else:
+                raise AssertionError('unknown tag %s' % tag)
+
+    return additions, deletions, chunks
+
+
+def _diff_line(old_lineno, new_lineno, action, line, raw_line=None):
+    """Build the template dict for a single diff line.
+
+    ``raw_line`` is the line before inline markup was added; it is used for
+    the trailing-newline check so that a closing ``</del>``/``</ins>`` tag
+    is not mistaken for a missing newline.  Defaults to ``line``.
+    """
+    if raw_line is None:
+        raw_line = line
+    return {
+        'old_lineno': old_lineno,
+        'new_lineno': new_lineno,
+        'action': action,
+        'line': line,
+        'no_newline': raw_line[-1:] != b'\n',
+    }
diff --git a/klaus/repo.py b/klaus/repo.py
@@ -3,11 +3,12 @@
 import stat
 
 from dulwich.object_store import tree_lookup_path
+from dulwich.objects import Blob
 from dulwich.errors import NotTreeError
 import dulwich, dulwich.patch
 
 from klaus.utils import check_output, force_unicode, parent_directory, encode_for_git, decode_from_git
-from klaus.diff import prepare_udiff
+from klaus.diff import render_diff
 
 
 class FancyRepo(dulwich.repo.Repo):
@@ -192,43 +193,38 @@ def commit_diff(self, commit):
         dulwich_changes = self.object_store.tree_changes(parent_tree, commit.tree)
         for (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) in dulwich_changes:
             summary['nfiles'] += 1
-
             try:
-                # Check for binary files -- can't show diffs for these
-                if newsha and guess_is_binary(self[newsha]) or \
-                   oldsha and guess_is_binary(self[oldsha]):
-                    file_changes.append({
-                        'is_binary': True,
-                        'old_filename': oldpath or '/dev/null',
-                        'new_filename': newpath or '/dev/null',
-                        'chunks': None
-                    })
-                    continue
+                oldblob = self.object_store[oldsha] if oldsha else Blob.from_string(b'')
+                newblob = self.object_store[newsha] if newsha else Blob.from_string(b'')
             except KeyError:
                 # newsha/oldsha are probably related to submodules.
                 # Dulwich will handle that.
                 pass
 
-            bytesio = io.BytesIO()
-            dulwich.patch.write_object_diff(bytesio, self.object_store,
-                                            (oldpath, oldmode, oldsha),
-                                            (newpath, newmode, newsha))
-            files = prepare_udiff(decode_from_git(bytesio.getvalue()), want_header=False)
-            if not files:
-                # the diff module doesn't handle deletions/additions
-                # of empty files correctly.
+            # Check for binary files -- can't show diffs for these
+            if guess_is_binary(newblob) or \
+               guess_is_binary(oldblob):
                 file_changes.append({
+                    'is_binary': True,
                     'old_filename': oldpath or '/dev/null',
                     'new_filename': newpath or '/dev/null',
-                    'chunks': [],
-                    'additions': 0,
-                    'deletions': 0,
+                    'chunks': None
                 })
-            else:
-                change = files[0]
-                summary['nadditions'] += change['additions']
-                summary['ndeletions'] += change['deletions']
-                file_changes.append(change)
+                continue
+
+            additions, deletions, chunks = render_diff(
+                oldblob.splitlines(), newblob.splitlines())
+            change = {
+                'is_binary': False,
+                'old_filename': oldpath or '/dev/null',
+                'new_filename': newpath or '/dev/null',
+                'chunks': chunks,
+                'additions': additions,
+                'deletions': deletions,
+            }
+            summary['nadditions'] += additions
+            summary['ndeletions'] += deletions
+            file_changes.append(change)
 
         return summary, file_changes