Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Use git cat-file --batch and git log --name-status to avoid repeated fork+exec #12

Merged
merged 3 commits over 2 years ago

2 participants

Anders Kaseorg Herbert Valerio Riedel
Anders Kaseorg

This significantly improves performance, especially for a Git repository stored on a network filesystem with many loose objects, by launching fewer copies of git and hence reading the repository fewer times.

added some commits
Anders Kaseorg GitCore: Abstract out __pipe
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
aa80f45
Anders Kaseorg Use git cat-file --batch, to avoid repeated fork+exec
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
0283ffb
Anders Kaseorg last_change: Use git log --name-status to avoid repeated fork+exec
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
9541c2a
Herbert Valerio Riedel
Owner

Good work, thanks!

Herbert Valerio Riedel (hvr) merged commit 722342e
Herbert Valerio Riedel hvr closed this
Herbert Valerio Riedel
Owner

@andersk Could you maybe take a look at http://trac-hacks.org/ticket/9560? This commit seems to have broken Python 2.5 support.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Showing 3 unique commits by 1 author.

Nov 18, 2011
Anders Kaseorg GitCore: Abstract out __pipe
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
aa80f45
Anders Kaseorg Use git cat-file --batch, to avoid repeated fork+exec
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
0283ffb
Anders Kaseorg last_change: Use git log --name-status to avoid repeated fork+exec
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
9541c2a
This page is out of date. Refresh to see the latest.
97  tracext/git/PyGIT.py
@@ -14,6 +14,7 @@
14 14
 from threading import Lock
15 15
 from subprocess import Popen, PIPE
16 16
 from operator import itemgetter
  17
+from contextlib import contextmanager
17 18
 import cStringIO
18 19
 import codecs
19 20
 
@@ -48,24 +49,34 @@ def __build_git_cmd(self, gitcmd, *args):
48 49
 
49 50
         return cmd
50 51
 
  52
+    def __pipe(self, git_cmd, *cmd_args, **kw):
  53
+        if sys.platform == "win32":
  54
+            return Popen(self.__build_git_cmd(git_cmd, *cmd_args), **kw)
  55
+        else:
  56
+            return Popen(self.__build_git_cmd(git_cmd, *cmd_args),
  57
+                         close_fds=True, **kw)
  58
+
51 59
     def __execute(self, git_cmd, *cmd_args):
52 60
         "execute git command and return file-like object of stdout"
53 61
 
54 62
         #print >>sys.stderr, "DEBUG:", git_cmd, cmd_args
55 63
 
56  
-        if sys.platform == "win32":
57  
-            p = Popen(self.__build_git_cmd(git_cmd, *cmd_args),
58  
-                      stdin=None, stdout=PIPE, stderr=PIPE)
59  
-        else:
60  
-            p = Popen(self.__build_git_cmd(git_cmd, *cmd_args),
61  
-                      stdin=None, stdout=PIPE, stderr=PIPE, close_fds=True)
  64
+        p = self.__pipe(git_cmd, *cmd_args, stdout=PIPE, stderr=PIPE)
62 65
 
63 66
         stdout_data, stderr_data = p.communicate()
64 67
         #TODO, do something with p.returncode, e.g. raise exception
65 68
 
66 69
         return stdout_data
67 70
 
  71
+    def cat_file_batch(self):
  72
+        return self.__pipe('cat-file', '--batch', stdin=PIPE, stdout=PIPE)
  73
+
  74
+    def log_pipe(self, *cmd_args):
  75
+        return self.__pipe('log', *cmd_args, stdout=PIPE)
  76
+
68 77
     def __getattr__(self, name):
  78
+        if name[0] == '_' or name in ['cat_file_batch', 'log_pipe']:
  79
+            raise AttributeError, name
69 80
         return partial(self.__execute, name.replace('_','-'))
70 81
 
71 82
     __is_sha_pat = re.compile(r'[0-9A-Fa-f]*$')
@@ -254,6 +265,13 @@ def __init__(self, git_dir, log, git_bin='git', git_fs_encoding=None):
254 265
         self.__commit_msg_cache = SizedDict(200)
255 266
         self.__commit_msg_lock = Lock()
256 267
 
  268
+        self.__cat_file_pipe = None
  269
+
  270
+    def __del__(self):
  271
+        if self.__cat_file_pipe is not None:
  272
+            self.__cat_file_pipe.stdin.close()
  273
+            self.__cat_file_pipe.wait()
  274
+
257 275
     #
258 276
     # cache handling
259 277
     #
@@ -473,6 +491,20 @@ def head(self):
473 491
         "get current HEAD commit id"
474 492
         return self.verifyrev("HEAD")
475 493
 
  494
+    def cat_file(self, kind, sha):
  495
+        if self.__cat_file_pipe is None:
  496
+            self.__cat_file_pipe = self.repo.cat_file_batch()
  497
+
  498
+        self.__cat_file_pipe.stdin.write(sha + '\n')
  499
+        self.__cat_file_pipe.stdin.flush()
  500
+        _sha, _type, _size = self.__cat_file_pipe.stdout.readline().split()
  501
+
  502
+        if _type != kind:
  503
+            raise TracError("internal error (got unexpected object kind '%s')" % k)
  504
+
  505
+        size = int(_size)
  506
+        return self.__cat_file_pipe.stdout.read(size + 1)[:size]
  507
+
476 508
     def verifyrev(self, rev):
477 509
         "verify/lookup given revision object and return a sha id or None if lookup failed"
478 510
         rev = str(rev)
@@ -494,7 +526,7 @@ def verifyrev(self, rev):
494 526
             return rc
495 527
 
496 528
         if rc in _rev_cache.tag_set:
497  
-            sha = self.repo.cat_file("tag", rc).split(None, 2)[:2]
  529
+            sha = self.cat_file("tag", rc).split(None, 2)[:2]
498 530
             if sha[0] != 'object':
499 531
                 self.logger.debug("unexpected result from 'git-cat-file tag %s'" % rc)
500 532
                 return None
@@ -604,7 +636,7 @@ def read_commit(self, commit_id):
604 636
                 return result[0], dict(result[1])
605 637
 
606 638
             # cache miss
607  
-            raw = self.repo.cat_file("commit", commit_id)
  639
+            raw = self.cat_file("commit", commit_id)
608 640
             raw = unicode(raw, self.get_commit_encoding(), 'replace')
609 641
             lines = raw.splitlines()
610 642
 
@@ -625,7 +657,7 @@ def read_commit(self, commit_id):
625 657
             return result[0], dict(result[1])
626 658
 
627 659
     def get_file(self, sha):
628  
-        return cStringIO.StringIO(self.repo.cat_file("blob", str(sha)))
  660
+        return cStringIO.StringIO(self.cat_file("blob", str(sha)))
629 661
 
630 662
     def get_obj_size(self, sha):
631 663
         sha = str(sha)
@@ -685,7 +717,52 @@ def sync(self):
685 717
         rev = self.repo.rev_list("--max-count=1", "--topo-order", "--all").strip()
686 718
         return self.__rev_cache_sync(rev)
687 719
 
688  
-    def last_change(self, sha, path):
  720
+    @contextmanager
  721
+    def get_historian(self, sha, base_path):
  722
+        p = []
  723
+        change = {}
  724
+        next_path = []
  725
+
  726
+        def name_status_gen():
  727
+            p[:] = [self.repo.log_pipe('--pretty=format:%n%H', '--name-status',
  728
+                                       sha, '--', base_path)]
  729
+            f = p[0].stdout
  730
+            for l in f:
  731
+                if l == '\n': continue
  732
+                old_sha = l.rstrip('\n')
  733
+                for l in f:
  734
+                    if l == '\n': break
  735
+                    _, path = l.rstrip('\n').split('\t', 1)
  736
+                    while path not in change:
  737
+                        change[path] = old_sha
  738
+                        if next_path == [path]: yield old_sha
  739
+                        try:
  740
+                            path, _ = path.rsplit('/', 1)
  741
+                        except ValueError:
  742
+                            break
  743
+            f.close()
  744
+            p[0].terminate()
  745
+            p[0].wait()
  746
+            p[:] = []
  747
+            while True: yield None
  748
+        gen = name_status_gen()
  749
+
  750
+        def historian(path):
  751
+            try:
  752
+                return change[path]
  753
+            except KeyError:
  754
+                next_path[:] = [path]
  755
+                return gen.next()
  756
+        yield historian
  757
+
  758
+        if p:
  759
+            p[0].stdout.close()
  760
+            p[0].terminate()
  761
+            p[0].wait()
  762
+
  763
+    def last_change(self, sha, path, historian=None):
  764
+        if historian is not None:
  765
+            return historian(path)
689 766
         return self.repo.rev_list("--max-count=1",
690 767
                                   sha, "--",
691 768
                                   self._fs_from_unicode(path)).strip() or None
41  tracext/git/git_fs.py
@@ -387,8 +387,8 @@ def display_rev(self, rev):
387 387
     def short_rev(self, rev):
388 388
         return self.git.shortrev(self.normalize_rev(rev), min_len=self._shortrev_len)
389 389
 
390  
-    def get_node(self, path, rev=None):
391  
-        return GitNode(self, path, rev, self.log)
  390
+    def get_node(self, path, rev=None, historian=None):
  391
+        return GitNode(self, path, rev, self.log, None, historian)
392 392
 
393 393
     def get_quickjump_entries(self, rev):
394 394
         for bname, bsha in self.git.get_branches():
@@ -412,24 +412,26 @@ def get_changes(self, old_path, old_rev, new_path, new_rev, ignore_ancestry=0):
412 412
         if old_path != new_path:
413 413
             raise TracError("not supported in git_fs")
414 414
 
415  
-        for chg in self.git.diff_tree(old_rev, new_rev, self.normalize_path(new_path)):
416  
-            mode1, mode2, obj1, obj2, action, path, path2 = chg
  415
+        with self.git.get_historian(old_rev, old_path.strip('/')) as old_historian:
  416
+            with self.git.get_historian(new_rev, new_path.strip('/')) as new_historian:
  417
+                for chg in self.git.diff_tree(old_rev, new_rev, self.normalize_path(new_path)):
  418
+                    mode1, mode2, obj1, obj2, action, path, path2 = chg
417 419
 
418  
-            kind = Node.FILE
419  
-            if mode2.startswith('04') or mode1.startswith('04'):
420  
-                kind = Node.DIRECTORY
  420
+                    kind = Node.FILE
  421
+                    if mode2.startswith('04') or mode1.startswith('04'):
  422
+                        kind = Node.DIRECTORY
421 423
 
422  
-            change = GitChangeset.action_map[action]
  424
+                    change = GitChangeset.action_map[action]
423 425
 
424  
-            old_node = None
425  
-            new_node = None
  426
+                    old_node = None
  427
+                    new_node = None
426 428
 
427  
-            if change != Changeset.ADD:
428  
-                old_node = self.get_node(path, old_rev)
429  
-            if change != Changeset.DELETE:
430  
-                new_node = self.get_node(path, new_rev)
  429
+                    if change != Changeset.ADD:
  430
+                        old_node = self.get_node(path, old_rev, old_historian)
  431
+                    if change != Changeset.DELETE:
  432
+                        new_node = self.get_node(path, new_rev, new_historian)
431 433
 
432  
-            yield old_node, new_node, kind, change
  434
+                    yield old_node, new_node, kind, change
433 435
 
434 436
     def next_rev(self, rev, path=''):
435 437
         return self.git.hist_next_revision(rev)
@@ -469,7 +471,7 @@ def sync(self, rev_callback=None, clean=None):
469 471
                 rev_callback(rev)
470 472
 
471 473
 class GitNode(Node):
472  
-    def __init__(self, repos, path, rev, log, ls_tree_info=None):
  474
+    def __init__(self, repos, path, rev, log, ls_tree_info=None, historian=None):
473 475
         self.log = log
474 476
         self.repos = repos
475 477
         self.fs_sha = None # points to either tree or blobs
@@ -491,7 +493,7 @@ def __init__(self, repos, path, rev, log, ls_tree_info=None):
491 493
             self.fs_perm, k, self.fs_sha, self.fs_size, _ = ls_tree_info
492 494
 
493 495
             # fix-up to the last commit-rev that touched this node
494  
-            rev = repos.git.last_change(rev, p)
  496
+            rev = repos.git.last_change(rev, p, historian)
495 497
 
496 498
             if k == 'tree':
497 499
                 pass
@@ -537,8 +539,9 @@ def get_entries(self):
537 539
         if not self.isdir:
538 540
             return
539 541
 
540  
-        for ent in self.repos.git.ls_tree(self.rev, self.__git_path()):
541  
-            yield GitNode(self.repos, ent[-1], self.rev, self.log, ent)
  542
+        with self.repos.git.get_historian(self.rev, self.path.strip('/')) as historian:
  543
+            for ent in self.repos.git.ls_tree(self.rev, self.__git_path()):
  544
+                yield GitNode(self.repos, ent[-1], self.rev, self.log, ent, historian)
542 545
 
543 546
     def get_content_type(self):
544 547
         if self.isdir:
Commit_comment_tip

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.