Skip to content

Commit

Permalink
inrepoconfig: optimize secondary clones
Browse files Browse the repository at this point in the history
The only data we need from a clone inrepoconfig repo is the .prow
contents (aside: we clone it in the first place so that we can merge to
the base branch).

The underlying Git client actually uses 2 clones --- first a mirror
clone (git clone --mirror ...), and then a secondary local clone into a
temporary folder (git clone ...). This secondary clone is the one we are
optimizing here.

This adds 2 optimizations:

(1) Avoid populating object files with "--shared" flag. This way, the
.git folder of the secondary clone can just have a pointer to the mirror
clone. This saves a lot of disk space for large repos (not to mention
that both the CPU time and disk usage for creating this clone is vastly
reduced, especially when combined with the next optimization).

(2) Avoid a full checkout with sparse-checkout (supported since Git
2.25). This way, we can avoid deflating Git blob objects into file paths
on disk for everything except (a) files at the toplevel and (b) files
specified by the "git sparse-checkout set <PATH> [...<PATH>]" command.
The toplevel files are included as part of "cone mode" which is the
recommended mode for sparse checkouts. In our case we specify the
".prow" folder as the only path to make sure we get those inrepoconfig
files checked out.
  • Loading branch information
Linus Arver committed Aug 17, 2023
1 parent b30a519 commit 070db4b
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 3 deletions.
8 changes: 7 additions & 1 deletion prow/config/inrepoconfig.go
Expand Up @@ -379,7 +379,13 @@ func (c *InRepoConfigGitCache) ClientFor(org, repo string) (git.RepoClient, erro
if cached != nil || err != nil {
return cached, err
}
coreClient, err := c.ClientFactory.ClientFor(org, repo)
repoOpts := c.ClientFactory.RepoOpts{
// We only need ".prow" because the default "cone mode" of sparse
// checkouts already include files at the toplevel.
sparseCheckoutDirs: []string{".prow"},
shareObjectsWithSourceRepo: true,
}
coreClient, err := c.ClientFactory.ClientForWithRepoOpts(org, repo, repoOpts)
if err != nil {
return nil, err
}
Expand Down
6 changes: 6 additions & 0 deletions prow/git/v2/adapter.go
Expand Up @@ -59,6 +59,12 @@ func (a *clientFactoryAdapter) ClientFor(org, repo string) (RepoClient, error) {
return &repoClientAdapter{Repo: r}, err
}

// v1 clients do not support customizing pre-repo options.
func (a *clientFactoryAdapter) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) {
r, err := a.Client.Clone(org, repo)
return &repoClientAdapter{Repo: r}, err
}

type repoClientAdapter struct {
*git.Repo
}
Expand Down
32 changes: 31 additions & 1 deletion prow/git/v2/client_factory.go
Expand Up @@ -35,6 +35,10 @@ type ClientFactory interface {
ClientFromDir(org, repo, dir string) (RepoClient, error)
// ClientFor creates a client that operates on a new clone of the repo.
ClientFor(org, repo string) (RepoClient, error)
// ClientForWithRepoOpts is like ClientFor, but allows you to customize the
// setup of the cloned repo (such as sparse checkouts instead of using the
// default full clone).
ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error)

// Clean removes the caches used to generate clients
Clean() error
Expand Down Expand Up @@ -75,6 +79,21 @@ type ClientFactoryOpts struct {
Persist *bool
}

// These options are scoped to the repo, not the ClientFactory level. The reason
// for the separation is to allow a single process to have for example repos
// that are both sparsely checked out and non-sparsely checked out.
type RepoOpts struct {
// sparseCheckoutDirs is the list of directories that the working tree
// should have. If non-nil and empty, then the working tree only has files
// reachable from the root. If non-nil and non-empty, then those additional
// directories from the root are also checked out (populated) in the working
// tree, recursively.
sparseCheckoutDirs *[]string
// This is the `--share` flag to `git clone`. For cloning from a local
// source, it allows bypassing the copying of all objects.
shareObjectsWithSourceRepo bool
}

// Apply allows to use a ClientFactoryOpts as Opt
func (cfo *ClientFactoryOpts) Apply(target *ClientFactoryOpts) {
if cfo.Host != "" {
Expand Down Expand Up @@ -287,6 +306,17 @@ func (c *clientFactory) ClientFromDir(org, repo, dir string) (RepoClient, error)
// org and repo are used for determining where the repo is cloned, cloneURI
// overrides org/repo for cloning.
func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) {
// These RepoOpts are the default, and is here to preserve behavior of
// existing callsites that did not previously know about
// ClientForWithRepoOpts.
repoOpts := RepoOpts{
sparseCheckoutDirs: nil,
shareObjectsWithSourceRepo: false,
}
return c.ClientForWithRepoOpts(org, repo, repoOpts)
}

func (c *clientFactory) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) {
cacheDir := path.Join(c.cacheDir, org, repo)
c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Debug("Creating a client from the cache.")
cacheClientCacher, _, _, err := c.bootstrapClients(org, repo, cacheDir)
Expand Down Expand Up @@ -324,7 +354,7 @@ func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) {
}

// initialize the new derivative repo from the cache
if err := repoClientCloner.Clone(cacheDir); err != nil {
if err := repoClientCloner.CloneWithRepoOpts(cacheDir, repoOpts); err != nil {
return nil, err
}

Expand Down
40 changes: 39 additions & 1 deletion prow/git/v2/interactor.go
Expand Up @@ -88,6 +88,7 @@ type cacher interface {
type cloner interface {
// Clone clones the repository from a local path.
Clone(from string) error
CloneWithRepoOpts(from string, repoOpts RepoOpts) error
}

// MergeOpt holds options for git merge operations.
Expand Down Expand Up @@ -139,10 +140,47 @@ func (i *interactor) IsDirty() (bool, error) {

// Clone clones the repository from a local path.
func (i *interactor) Clone(from string) error {
repoOpts := RepoOpts{
sparseCheckoutDirs: nil,
shareObjectsWithSourceRepo: false,
}
return i.CloneWithRepoOpts(from, repoOpts)
}

// Clone clones the repository from a local path, but additionally use any
// repository options to customize the clone behavior.
func (i *interactor) CloneWithRepoOpts(from string, repoOpts RepoOpts) error {
i.logger.Infof("Creating a clone of the repo at %s from %s", i.dir, from)
if out, err := i.executor.Run("clone", from, i.dir); err != nil {
cloneArgs := []string{"clone"}

if repoOpts.shareObjectsWithSourceRepo {
cloneArgs = append(cloneArgs, "--shared")
}

// Handle sparse checkouts.
if repoOpts.sparseCheckoutDirs != nil {
cloneArgs = append(cloneArgs, "--sparse")
}
cloneArgs = append(cloneArgs, from)
cloneArgs = append(cloneArgs, i.dir)

if out, err := i.executor.Run(cloneArgs...); err != nil {
return fmt.Errorf("error creating a clone: %w %v", err, string(out))
}

// For sparse checkouts, we have to do some additional housekeeping after
// the clone is completed. We use Git's global "-C <directory>" flag to
// switch to that directory before running the "sparse-checkout" command,
// because otherwise the command will fail (because it will try to run the
// command in the $PWD, which is not the same as the just-created clone
// directory (i.dir)).
if repoOpts.sparseCheckoutDirs != nil {
sparseCheckoutArgs := []string{"-C", i.dir, "sparse-checkout", "set"}
sparseCheckoutArgs = append(sparseCheckoutArgs, (*repoOpts.sparseCheckoutDirs)...)
if out, err := i.executor.Run(sparseCheckoutArgs...); err != nil {
return fmt.Errorf("error setting it to a sparse checkout: %w %v", err, string(out))
}
}
return nil
}

Expand Down
94 changes: 94 additions & 0 deletions prow/git/v2/interactor_test.go
Expand Up @@ -93,6 +93,100 @@ func TestInteractor_Clone(t *testing.T) {
}
}

func TestInteractor_CloneWithRepoOpts(t *testing.T) {
var testCases = []struct {
name string
dir string
from string
remote RemoteResolver
repoOpts RepoOpts
responses map[string]execResponse
expectedCalls [][]string
expectedErr bool
}{
{
name: "blank RepoOpts",
dir: "/else",
from: "/somewhere",
repoOpts: RepoOpts{},
responses: map[string]execResponse{
"clone /somewhere /else": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "/somewhere", "/else"},
},
expectedErr: false,
},
{
name: "shared git objects",
dir: "/else",
from: "/somewhere",
repoOpts: RepoOpts{
sparseCheckoutDirs: nil,
shareObjectsWithSourceRepo: true,
},
responses: map[string]execResponse{
"clone --shared /somewhere /else": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "--shared", "/somewhere", "/else"},
},
expectedErr: false,
},
{
name: "shared git objects and sparse checkout",
dir: "/else",
from: "/somewhere",
repoOpts: RepoOpts{
sparseCheckoutDirs: &[]string{"a", "b"},
shareObjectsWithSourceRepo: true,
},
responses: map[string]execResponse{
"clone --shared --sparse /somewhere /else": {
out: []byte(`ok`),
},
"-C /else sparse-checkout set a b": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "--shared", "--sparse", "/somewhere", "/else"},
{"-C", "/else", "sparse-checkout", "set", "a", "b"},
},
expectedErr: false,
},
}

for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
e := fakeExecutor{
records: [][]string{},
responses: testCase.responses,
}
i := interactor{
executor: &e,
remote: testCase.remote,
dir: testCase.dir,
logger: logrus.WithField("test", testCase.name),
}
actualErr := i.CloneWithRepoOpts(testCase.from, testCase.repoOpts)
if testCase.expectedErr && actualErr == nil {
t.Errorf("%s: expected an error but got none", testCase.name)
}
if !testCase.expectedErr && actualErr != nil {
t.Errorf("%s: expected no error but got one: %v", testCase.name, actualErr)
}
if actual, expected := e.records, testCase.expectedCalls; !reflect.DeepEqual(actual, expected) {
t.Errorf("%s: got incorrect git calls: %v", testCase.name, diff.ObjectReflectDiff(actual, expected))
}
})
}
}

func TestInteractor_MirrorClone(t *testing.T) {
var testCases = []struct {
name string
Expand Down

0 comments on commit 070db4b

Please sign in to comment.