From d44c183e8e8cfa324fae00fd61f17a525ee4eb42 Mon Sep 17 00:00:00 2001 From: Linus Arver Date: Wed, 16 Aug 2023 17:26:32 -0700 Subject: [PATCH] inrepoconfig: optimize secondary clones The only data we need from a clone inrepoconfig repo is the .prow contents (aside: we clone it in the first place so that we can merge to the base branch). The underlying Git client actually uses 2 clones --- first a mirror clone (git clone --mirror ...), and then a secondary local clone into a temporary folder (git clone ...). This secondary clone is the one we are optimizing here. This adds 2 optimizations: (1) Avoid populating object files with "--shared" flag. This way, the .git folder of the secondary clone can just have a pointer to the mirror clone. This saves a lot of disk space for large repos (not to mention that both the CPU time and disk usage for creating this clone is vastly reduced, especially when combined with the next optimization). (2) Avoid a full checkout with sparse-checkout (supported since Git 2.25). This way, we can avoid deflating Git blob objects into file paths on disk for everything except (a) files at the toplevel and (b) files specified by the "git sparse-checkout set [...]" command. The toplevel files are included as part of "cone mode" which is the recommended mode for sparse checkouts. In our case we specify the ".prow" folder (inRepoConfigDirName) which is the only path to make sure we get those inrepoconfig files checked out. Sadly, for Git versions 2.32 (which we currently use in our components), the toplevel files are *not* included by default. So we have to specify it separately, which is why we have inRepoConfigFileName also in SparseCheckoutDirs. In Git 2.41 (probably since 2.36) specifying inRepoConfigFileName will make Git exit with an error because Git will see that the `.prow.yaml` is a file, not a folder [1]. So before we upgrade our components to use Git 2.36+ we have to delete the inRepoConfigFileName from this list. [1] https://github.com/git/git/commit/4ce504360bc3b240e570281fabe00a85027532c3 --- prow/config/inrepoconfig.go | 23 ++++++- prow/config/inrepoconfig_test.go | 7 ++ prow/git/v2/adapter.go | 6 ++ prow/git/v2/client_factory.go | 39 +++++++++-- prow/git/v2/interactor.go | 43 +++++++++++- prow/git/v2/interactor_test.go | 112 +++++++++++++++++++++++++++++++ 6 files changed, 222 insertions(+), 8 deletions(-) diff --git a/prow/config/inrepoconfig.go b/prow/config/inrepoconfig.go index f978e311e59cb..e4ecc8aa55e15 100644 --- a/prow/config/inrepoconfig.go +++ b/prow/config/inrepoconfig.go @@ -43,6 +43,25 @@ const ( inRepoConfigDirName = ".prow" ) +var inrepoconfigRepoOpts = git.RepoOpts{ + // Technically we only need inRepoConfigDirName (".prow") because the + // default "cone mode" of sparse checkouts already include files at the + // toplevel (which would include ".prow.yaml"). + // + // TODO (listx): The version of git shipped in kubekins-e2e (itself + // derived from the bootstrap image) uses git version 2.30.2, which does + // not populate files at the toplevel. So we have to also set a sparse + // checkout of ".prow.yaml". Later when that image is updated, we can + // remove the use of inRepoConfigFileName (".prow.yaml"), so that the + // unit tests in CI can pass. As for the Prow components themselves, + // they use a different version of Git based on alpine (see .ko.yaml in + // the root). + SparseCheckoutDirs: &[]string{inRepoConfigDirName, inRepoConfigFileName}, + // The sparse checkout would avoid creating another copy of Git objects + // from the mirror clone into the secondary clone. + ShareObjectsWithSourceRepo: true, +} + var inrepoconfigMetrics = struct { gitCloneDuration *prometheus.HistogramVec gitOtherDuration *prometheus.HistogramVec @@ -119,7 +138,7 @@ func prowYAMLGetter( } timeBeforeClone := time.Now() - repo, err := gc.ClientFor(orgRepo.Org, orgRepo.Repo) + repo, err := gc.ClientForWithRepoOpts(orgRepo.Org, orgRepo.Repo, inrepoconfigRepoOpts) inrepoconfigMetrics.gitCloneDuration.WithLabelValues(orgRepo.Org, orgRepo.Repo).Observe((float64(time.Since(timeBeforeClone).Seconds()))) if err != nil { return nil, fmt.Errorf("failed to clone repo for %q: %w", identifier, err) @@ -379,7 +398,7 @@ func (c *InRepoConfigGitCache) ClientFor(org, repo string) (git.RepoClient, erro if cached != nil || err != nil { return cached, err } - coreClient, err := c.ClientFactory.ClientFor(org, repo) + coreClient, err := c.ClientFactory.ClientForWithRepoOpts(org, repo, inrepoconfigRepoOpts) if err != nil { return nil, err } diff --git a/prow/config/inrepoconfig_test.go b/prow/config/inrepoconfig_test.go index 255934d77b690..54e37e4fcf076 100644 --- a/prow/config/inrepoconfig_test.go +++ b/prow/config/inrepoconfig_test.go @@ -681,6 +681,13 @@ func (cf *testClientFactory) ClientFor(org, repo string) (git.RepoClient, error) return &fetchOnlyNoCleanRepoClient{cf.rcMap[repo]}, nil } +func (cf *testClientFactory) ClientForWithRepoOpts(org, repo string, repoOpts git.RepoOpts) (git.RepoClient, error) { + cf.clientsCreated++ + // Returning this RepoClient ensures that only Fetch() is called and that Close() is not. + + return &fetchOnlyNoCleanRepoClient{cf.rcMap[repo]}, nil +} + type fetchOnlyNoCleanRepoClient struct { git.RepoClient // This will be nil during testing, we override the functions that are allowed to be used. } diff --git a/prow/git/v2/adapter.go b/prow/git/v2/adapter.go index e2715db701574..540b1afcc3c3a 100644 --- a/prow/git/v2/adapter.go +++ b/prow/git/v2/adapter.go @@ -59,6 +59,12 @@ func (a *clientFactoryAdapter) ClientFor(org, repo string) (RepoClient, error) { return &repoClientAdapter{Repo: r}, err } +// v1 clients do not support customizing pre-repo options. +func (a *clientFactoryAdapter) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) { + r, err := a.Client.Clone(org, repo) + return &repoClientAdapter{Repo: r}, err +} + type repoClientAdapter struct { *git.Repo } diff --git a/prow/git/v2/client_factory.go b/prow/git/v2/client_factory.go index d44198ed27bbe..a567cdd034756 100644 --- a/prow/git/v2/client_factory.go +++ b/prow/git/v2/client_factory.go @@ -35,6 +35,10 @@ type ClientFactory interface { ClientFromDir(org, repo, dir string) (RepoClient, error) // ClientFor creates a client that operates on a new clone of the repo. ClientFor(org, repo string) (RepoClient, error) + // ClientForWithRepoOpts is like ClientFor, but allows you to customize the + // setup of the cloned repo (such as sparse checkouts instead of using the + // default full clone). + ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) // Clean removes the caches used to generate clients Clean() error @@ -75,6 +79,21 @@ type ClientFactoryOpts struct { Persist *bool } +// These options are scoped to the repo, not the ClientFactory level. The reason +// for the separation is to allow a single process to have for example repos +// that are both sparsely checked out and non-sparsely checked out. +type RepoOpts struct { + // sparseCheckoutDirs is the list of directories that the working tree + // should have. If non-nil and empty, then the working tree only has files + // reachable from the root. If non-nil and non-empty, then those additional + // directories from the root are also checked out (populated) in the working + // tree, recursively. + SparseCheckoutDirs *[]string + // This is the `--share` flag to `git clone`. For cloning from a local + // source, it allows bypassing the copying of all objects. + ShareObjectsWithSourceRepo bool +} + // Apply allows to use a ClientFactoryOpts as Opt func (cfo *ClientFactoryOpts) Apply(target *ClientFactoryOpts) { if cfo.Host != "" { @@ -278,15 +297,25 @@ func (c *clientFactory) ClientFromDir(org, repo, dir string) (RepoClient, error) return client, err } -// ClientFor returns a repository client for the specified repository. +// ClientFor wraps around ClientForWithRepoOpts using the default RepoOpts{} +// (empty value). Originally, ClientFor was not a wrapper at all and did the +// work inside ClientForWithRepoOpts itself, but it did this without RepoOpts. +// When RepoOpts was created, we made ClientFor wrap around +// ClientForWithRepoOpts to preserve behavior of existing callers of ClientFor. +func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) { + return c.ClientForWithRepoOpts(org, repo, RepoOpts{}) +} + +// ClientForWithRepoOpts returns a repository client for the specified repository. // This function may take a long time if it is the first time cloning the repo. // In that case, it must do a full git mirror clone. For large repos, this can -// take a while. Once that is done, it will do a git fetch instead of a clone, -// which will usually take at most a few seconds. +// take a while. Once that is done, it will do a git remote update (essentially +// git fetch) for the mirror clone, which will usually take at most a few +// seconds, before creating a secondary clone from this (updated) mirror. // // org and repo are used for determining where the repo is cloned, cloneURI // overrides org/repo for cloning. -func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) { +func (c *clientFactory) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) { cacheDir := path.Join(c.cacheDir, org, repo) c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Debug("Creating a client from the cache.") cacheClientCacher, _, _, err := c.bootstrapClients(org, repo, cacheDir) @@ -324,7 +353,7 @@ func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) { } // initialize the new derivative repo from the cache - if err := repoClientCloner.Clone(cacheDir); err != nil { + if err := repoClientCloner.CloneWithRepoOpts(cacheDir, repoOpts); err != nil { return nil, err } diff --git a/prow/git/v2/interactor.go b/prow/git/v2/interactor.go index c82af6585d08c..bdee56965e815 100644 --- a/prow/git/v2/interactor.go +++ b/prow/git/v2/interactor.go @@ -88,6 +88,7 @@ type cacher interface { type cloner interface { // Clone clones the repository from a local path. Clone(from string) error + CloneWithRepoOpts(from string, repoOpts RepoOpts) error } // MergeOpt holds options for git merge operations. @@ -139,10 +140,50 @@ func (i *interactor) IsDirty() (bool, error) { // Clone clones the repository from a local path. func (i *interactor) Clone(from string) error { + repoOpts := RepoOpts{ + SparseCheckoutDirs: nil, + ShareObjectsWithSourceRepo: false, + } + return i.CloneWithRepoOpts(from, repoOpts) +} + +// CloneWithRepoOpts clones the repository from a local path, but additionally +// use any repository options (RepoOpts) to customize the clone behavior. +func (i *interactor) CloneWithRepoOpts(from string, repoOpts RepoOpts) error { i.logger.Infof("Creating a clone of the repo at %s from %s", i.dir, from) - if out, err := i.executor.Run("clone", from, i.dir); err != nil { + cloneArgs := []string{"clone"} + + if repoOpts.ShareObjectsWithSourceRepo { + cloneArgs = append(cloneArgs, "--shared") + } + + // Handle sparse checkouts. + if repoOpts.SparseCheckoutDirs != nil { + cloneArgs = append(cloneArgs, "--sparse") + } + cloneArgs = append(cloneArgs, from) + cloneArgs = append(cloneArgs, i.dir) + + if out, err := i.executor.Run(cloneArgs...); err != nil { return fmt.Errorf("error creating a clone: %w %v", err, string(out)) } + + // For sparse checkouts, we have to do some additional housekeeping after + // the clone is completed. We use Git's global "-C " flag to + // switch to that directory before running the "sparse-checkout" command, + // because otherwise the command will fail (because it will try to run the + // command in the $PWD, which is not the same as the just-created clone + // directory (i.dir)). + if repoOpts.SparseCheckoutDirs != nil { + if len(*repoOpts.SparseCheckoutDirs) == 0 { + return nil + } + sparseCheckoutArgs := []string{"-C", i.dir, "sparse-checkout", "set"} + sparseCheckoutArgs = append(sparseCheckoutArgs, (*repoOpts.SparseCheckoutDirs)...) + if out, err := i.executor.Run(sparseCheckoutArgs...); err != nil { + return fmt.Errorf("error setting it to a sparse checkout: %w %v", err, string(out)) + } + } return nil } diff --git a/prow/git/v2/interactor_test.go b/prow/git/v2/interactor_test.go index 17d4ad5ee663f..441f21e85e278 100644 --- a/prow/git/v2/interactor_test.go +++ b/prow/git/v2/interactor_test.go @@ -93,6 +93,118 @@ func TestInteractor_Clone(t *testing.T) { } } +func TestInteractor_CloneWithRepoOpts(t *testing.T) { + var testCases = []struct { + name string + dir string + from string + remote RemoteResolver + repoOpts RepoOpts + responses map[string]execResponse + expectedCalls [][]string + expectedErr bool + }{ + { + name: "blank RepoOpts", + dir: "/secondaryclone", + from: "/mirrorclone", + repoOpts: RepoOpts{}, + responses: map[string]execResponse{ + "clone /mirrorclone /secondaryclone": { + out: []byte(`ok`), + }, + }, + expectedCalls: [][]string{ + {"clone", "/mirrorclone", "/secondaryclone"}, + }, + expectedErr: false, + }, + { + name: "shared git objects", + dir: "/secondaryclone", + from: "/mirrorclone", + repoOpts: RepoOpts{ + SparseCheckoutDirs: nil, + ShareObjectsWithSourceRepo: true, + }, + responses: map[string]execResponse{ + "clone --shared /mirrorclone /secondaryclone": { + out: []byte(`ok`), + }, + }, + expectedCalls: [][]string{ + {"clone", "--shared", "/mirrorclone", "/secondaryclone"}, + }, + expectedErr: false, + }, + { + name: "shared git objects and sparse checkout (toplevel only)", + dir: "/secondaryclone", + from: "/mirrorclone", + repoOpts: RepoOpts{ + SparseCheckoutDirs: &[]string{}, + ShareObjectsWithSourceRepo: true, + }, + responses: map[string]execResponse{ + "clone --shared --sparse /mirrorclone /secondaryclone": { + out: []byte(`ok`), + }, + }, + expectedCalls: [][]string{ + {"clone", "--shared", "--sparse", "/mirrorclone", "/secondaryclone"}, + }, + expectedErr: false, + }, + { + name: "shared git objects and sparse checkout (toplevel+subdirs)", + dir: "/secondaryclone", + from: "/mirrorclone", + repoOpts: RepoOpts{ + SparseCheckoutDirs: &[]string{"a", "b"}, + ShareObjectsWithSourceRepo: true, + }, + responses: map[string]execResponse{ + "clone --shared --sparse /mirrorclone /secondaryclone": { + out: []byte(`ok`), + }, + "-C /secondaryclone sparse-checkout set a b": { + out: []byte(`ok`), + }, + }, + expectedCalls: [][]string{ + {"clone", "--shared", "--sparse", "/mirrorclone", "/secondaryclone"}, + {"-C", "/secondaryclone", "sparse-checkout", "set", "a", "b"}, + }, + expectedErr: false, + }, + } + + for _, testCase := range testCases { + t.Run(testCase.name, func(t *testing.T) { + e := fakeExecutor{ + records: [][]string{}, + responses: testCase.responses, + } + i := interactor{ + executor: &e, + remote: testCase.remote, + dir: testCase.dir, + logger: logrus.WithField("test", testCase.name), + } + actualErr := i.CloneWithRepoOpts(testCase.from, testCase.repoOpts) + if testCase.expectedErr && actualErr == nil { + t.Errorf("%s: expected an error but got none", testCase.name) + } + if !testCase.expectedErr && actualErr != nil { + t.Errorf("%s: expected no error but got one: %v", testCase.name, actualErr) + } + if actual, expected := e.records, testCase.expectedCalls; !reflect.DeepEqual(actual, expected) { + t.Errorf("%s: got incorrect git calls: %v", testCase.name, diff.ObjectReflectDiff(actual, expected)) + } + }) + } +} + func TestInteractor_MirrorClone(t *testing.T) { var testCases = []struct { name string