Skip to content

Commit

Permalink
inrepoconfig: optimize secondary clones
Browse files Browse the repository at this point in the history
The only data we need from a clone inrepoconfig repo is the .prow
contents (aside: we clone it in the first place so that we can merge to
the base branch).

The underlying Git client actually uses 2 clones --- first a mirror
clone (git clone --mirror ...), and then a secondary local clone into a
temporary folder (git clone ...). This secondary clone is the one we are
optimizing here.

This adds 2 optimizations:

(1) Avoid populating object files with "--shared" flag. This way, the
.git folder of the secondary clone can just have a pointer to the mirror
clone. This saves a lot of disk space for large repos (not to mention
that both the CPU time and disk usage for creating this clone is vastly
reduced, especially when combined with the next optimization).

(2) Avoid a full checkout with sparse-checkout (supported since Git
2.25). This way, we can avoid deflating Git blob objects into file paths
on disk for everything except (a) files at the toplevel and (b) files
specified by the "git sparse-checkout set <PATH> [...<PATH>]" command.
The toplevel files are included as part of "cone mode" which is the
recommended mode for sparse checkouts. In our case we specify the
".prow" folder (inRepoConfigDirName) which is the only path to make sure
we get those inrepoconfig files checked out.

Sadly, for Git versions 2.32 (which we currently use in our components),
the toplevel files are *not* included by default. So we have to specify
it separately, which is why we have inRepoConfigFileName also in
SparseCheckoutDirs. In Git 2.41 (probably since 2.36) specifying
inRepoConfigFileName will make Git exit with an error because Git will
see that the `.prow.yaml` is a file, not a folder [1]. So before we
upgrade our components to use Git 2.36+ we have to delete the
inRepoConfigFileName from this list.

[1] git/git@4ce5043
  • Loading branch information
Linus Arver committed Aug 17, 2023
1 parent 21f8136 commit d44c183
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 8 deletions.
23 changes: 21 additions & 2 deletions prow/config/inrepoconfig.go
Expand Up @@ -43,6 +43,25 @@ const (
inRepoConfigDirName = ".prow"
)

var inrepoconfigRepoOpts = git.RepoOpts{
// Technically we only need inRepoConfigDirName (".prow") because the
// default "cone mode" of sparse checkouts already include files at the
// toplevel (which would include ".prow.yaml").
//
// TODO (listx): The version of git shipped in kubekins-e2e (itself
// derived from the bootstrap image) uses git version 2.30.2, which does
// not populate files at the toplevel. So we have to also set a sparse
// checkout of ".prow.yaml". Later when that image is updated, we can
// remove the use of inRepoConfigFileName (".prow.yaml"), so that the
// unit tests in CI can pass. As for the Prow components themselves,
// they use a different version of Git based on alpine (see .ko.yaml in
// the root).
SparseCheckoutDirs: &[]string{inRepoConfigDirName, inRepoConfigFileName},
// The sparse checkout would avoid creating another copy of Git objects
// from the mirror clone into the secondary clone.
ShareObjectsWithSourceRepo: true,
}

var inrepoconfigMetrics = struct {
gitCloneDuration *prometheus.HistogramVec
gitOtherDuration *prometheus.HistogramVec
Expand Down Expand Up @@ -119,7 +138,7 @@ func prowYAMLGetter(
}

timeBeforeClone := time.Now()
repo, err := gc.ClientFor(orgRepo.Org, orgRepo.Repo)
repo, err := gc.ClientForWithRepoOpts(orgRepo.Org, orgRepo.Repo, inrepoconfigRepoOpts)
inrepoconfigMetrics.gitCloneDuration.WithLabelValues(orgRepo.Org, orgRepo.Repo).Observe((float64(time.Since(timeBeforeClone).Seconds())))
if err != nil {
return nil, fmt.Errorf("failed to clone repo for %q: %w", identifier, err)
Expand Down Expand Up @@ -379,7 +398,7 @@ func (c *InRepoConfigGitCache) ClientFor(org, repo string) (git.RepoClient, erro
if cached != nil || err != nil {
return cached, err
}
coreClient, err := c.ClientFactory.ClientFor(org, repo)
coreClient, err := c.ClientFactory.ClientForWithRepoOpts(org, repo, inrepoconfigRepoOpts)
if err != nil {
return nil, err
}
Expand Down
7 changes: 7 additions & 0 deletions prow/config/inrepoconfig_test.go
Expand Up @@ -681,6 +681,13 @@ func (cf *testClientFactory) ClientFor(org, repo string) (git.RepoClient, error)
return &fetchOnlyNoCleanRepoClient{cf.rcMap[repo]}, nil
}

func (cf *testClientFactory) ClientForWithRepoOpts(org, repo string, repoOpts git.RepoOpts) (git.RepoClient, error) {
cf.clientsCreated++
// Returning this RepoClient ensures that only Fetch() is called and that Close() is not.

return &fetchOnlyNoCleanRepoClient{cf.rcMap[repo]}, nil
}

type fetchOnlyNoCleanRepoClient struct {
git.RepoClient // This will be nil during testing, we override the functions that are allowed to be used.
}
Expand Down
6 changes: 6 additions & 0 deletions prow/git/v2/adapter.go
Expand Up @@ -59,6 +59,12 @@ func (a *clientFactoryAdapter) ClientFor(org, repo string) (RepoClient, error) {
return &repoClientAdapter{Repo: r}, err
}

// v1 clients do not support customizing pre-repo options.
func (a *clientFactoryAdapter) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) {
r, err := a.Client.Clone(org, repo)
return &repoClientAdapter{Repo: r}, err
}

type repoClientAdapter struct {
*git.Repo
}
Expand Down
39 changes: 34 additions & 5 deletions prow/git/v2/client_factory.go
Expand Up @@ -35,6 +35,10 @@ type ClientFactory interface {
ClientFromDir(org, repo, dir string) (RepoClient, error)
// ClientFor creates a client that operates on a new clone of the repo.
ClientFor(org, repo string) (RepoClient, error)
// ClientForWithRepoOpts is like ClientFor, but allows you to customize the
// setup of the cloned repo (such as sparse checkouts instead of using the
// default full clone).
ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error)

// Clean removes the caches used to generate clients
Clean() error
Expand Down Expand Up @@ -75,6 +79,21 @@ type ClientFactoryOpts struct {
Persist *bool
}

// These options are scoped to the repo, not the ClientFactory level. The reason
// for the separation is to allow a single process to have for example repos
// that are both sparsely checked out and non-sparsely checked out.
type RepoOpts struct {
// sparseCheckoutDirs is the list of directories that the working tree
// should have. If non-nil and empty, then the working tree only has files
// reachable from the root. If non-nil and non-empty, then those additional
// directories from the root are also checked out (populated) in the working
// tree, recursively.
SparseCheckoutDirs *[]string
// This is the `--share` flag to `git clone`. For cloning from a local
// source, it allows bypassing the copying of all objects.
ShareObjectsWithSourceRepo bool
}

// Apply allows to use a ClientFactoryOpts as Opt
func (cfo *ClientFactoryOpts) Apply(target *ClientFactoryOpts) {
if cfo.Host != "" {
Expand Down Expand Up @@ -278,15 +297,25 @@ func (c *clientFactory) ClientFromDir(org, repo, dir string) (RepoClient, error)
return client, err
}

// ClientFor returns a repository client for the specified repository.
// ClientFor wraps around ClientForWithRepoOpts using the default RepoOpts{}
// (empty value). Originally, ClientFor was not a wrapper at all and did the
// work inside ClientForWithRepoOpts itself, but it did this without RepoOpts.
// When RepoOpts was created, we made ClientFor wrap around
// ClientForWithRepoOpts to preserve behavior of existing callers of ClientFor.
func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) {
return c.ClientForWithRepoOpts(org, repo, RepoOpts{})
}

// ClientForWithRepoOpts returns a repository client for the specified repository.
// This function may take a long time if it is the first time cloning the repo.
// In that case, it must do a full git mirror clone. For large repos, this can
// take a while. Once that is done, it will do a git fetch instead of a clone,
// which will usually take at most a few seconds.
// take a while. Once that is done, it will do a git remote update (essentially
// git fetch) for the mirror clone, which will usually take at most a few
// seconds, before creating a secondary clone from this (updated) mirror.
//
// org and repo are used for determining where the repo is cloned, cloneURI
// overrides org/repo for cloning.
func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) {
func (c *clientFactory) ClientForWithRepoOpts(org, repo string, repoOpts RepoOpts) (RepoClient, error) {
cacheDir := path.Join(c.cacheDir, org, repo)
c.logger.WithFields(logrus.Fields{"org": org, "repo": repo, "dir": cacheDir}).Debug("Creating a client from the cache.")
cacheClientCacher, _, _, err := c.bootstrapClients(org, repo, cacheDir)
Expand Down Expand Up @@ -324,7 +353,7 @@ func (c *clientFactory) ClientFor(org, repo string) (RepoClient, error) {
}

// initialize the new derivative repo from the cache
if err := repoClientCloner.Clone(cacheDir); err != nil {
if err := repoClientCloner.CloneWithRepoOpts(cacheDir, repoOpts); err != nil {
return nil, err
}

Expand Down
43 changes: 42 additions & 1 deletion prow/git/v2/interactor.go
Expand Up @@ -88,6 +88,7 @@ type cacher interface {
type cloner interface {
// Clone clones the repository from a local path.
Clone(from string) error
CloneWithRepoOpts(from string, repoOpts RepoOpts) error
}

// MergeOpt holds options for git merge operations.
Expand Down Expand Up @@ -139,10 +140,50 @@ func (i *interactor) IsDirty() (bool, error) {

// Clone clones the repository from a local path.
func (i *interactor) Clone(from string) error {
repoOpts := RepoOpts{
SparseCheckoutDirs: nil,
ShareObjectsWithSourceRepo: false,
}
return i.CloneWithRepoOpts(from, repoOpts)
}

// CloneWithRepoOpts clones the repository from a local path, but additionally
// use any repository options (RepoOpts) to customize the clone behavior.
func (i *interactor) CloneWithRepoOpts(from string, repoOpts RepoOpts) error {
i.logger.Infof("Creating a clone of the repo at %s from %s", i.dir, from)
if out, err := i.executor.Run("clone", from, i.dir); err != nil {
cloneArgs := []string{"clone"}

if repoOpts.ShareObjectsWithSourceRepo {
cloneArgs = append(cloneArgs, "--shared")
}

// Handle sparse checkouts.
if repoOpts.SparseCheckoutDirs != nil {
cloneArgs = append(cloneArgs, "--sparse")
}
cloneArgs = append(cloneArgs, from)
cloneArgs = append(cloneArgs, i.dir)

if out, err := i.executor.Run(cloneArgs...); err != nil {
return fmt.Errorf("error creating a clone: %w %v", err, string(out))
}

// For sparse checkouts, we have to do some additional housekeeping after
// the clone is completed. We use Git's global "-C <directory>" flag to
// switch to that directory before running the "sparse-checkout" command,
// because otherwise the command will fail (because it will try to run the
// command in the $PWD, which is not the same as the just-created clone
// directory (i.dir)).
if repoOpts.SparseCheckoutDirs != nil {
if len(*repoOpts.SparseCheckoutDirs) == 0 {
return nil
}
sparseCheckoutArgs := []string{"-C", i.dir, "sparse-checkout", "set"}
sparseCheckoutArgs = append(sparseCheckoutArgs, (*repoOpts.SparseCheckoutDirs)...)
if out, err := i.executor.Run(sparseCheckoutArgs...); err != nil {
return fmt.Errorf("error setting it to a sparse checkout: %w %v", err, string(out))
}
}
return nil
}

Expand Down
112 changes: 112 additions & 0 deletions prow/git/v2/interactor_test.go
Expand Up @@ -93,6 +93,118 @@ func TestInteractor_Clone(t *testing.T) {
}
}

func TestInteractor_CloneWithRepoOpts(t *testing.T) {
var testCases = []struct {
name string
dir string
from string
remote RemoteResolver
repoOpts RepoOpts
responses map[string]execResponse
expectedCalls [][]string
expectedErr bool
}{
{
name: "blank RepoOpts",
dir: "/secondaryclone",
from: "/mirrorclone",
repoOpts: RepoOpts{},
responses: map[string]execResponse{
"clone /mirrorclone /secondaryclone": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "/mirrorclone", "/secondaryclone"},
},
expectedErr: false,
},
{
name: "shared git objects",
dir: "/secondaryclone",
from: "/mirrorclone",
repoOpts: RepoOpts{
SparseCheckoutDirs: nil,
ShareObjectsWithSourceRepo: true,
},
responses: map[string]execResponse{
"clone --shared /mirrorclone /secondaryclone": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "--shared", "/mirrorclone", "/secondaryclone"},
},
expectedErr: false,
},
{
name: "shared git objects and sparse checkout (toplevel only)",
dir: "/secondaryclone",
from: "/mirrorclone",
repoOpts: RepoOpts{
SparseCheckoutDirs: &[]string{},
ShareObjectsWithSourceRepo: true,
},
responses: map[string]execResponse{
"clone --shared --sparse /mirrorclone /secondaryclone": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "--shared", "--sparse", "/mirrorclone", "/secondaryclone"},
},
expectedErr: false,
},
{
name: "shared git objects and sparse checkout (toplevel+subdirs)",
dir: "/secondaryclone",
from: "/mirrorclone",
repoOpts: RepoOpts{
SparseCheckoutDirs: &[]string{"a", "b"},
ShareObjectsWithSourceRepo: true,
},
responses: map[string]execResponse{
"clone --shared --sparse /mirrorclone /secondaryclone": {
out: []byte(`ok`),
},
"-C /secondaryclone sparse-checkout set a b": {
out: []byte(`ok`),
},
},
expectedCalls: [][]string{
{"clone", "--shared", "--sparse", "/mirrorclone", "/secondaryclone"},
{"-C", "/secondaryclone", "sparse-checkout", "set", "a", "b"},
},
expectedErr: false,
},
}

for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
e := fakeExecutor{
records: [][]string{},
responses: testCase.responses,
}
i := interactor{
executor: &e,
remote: testCase.remote,
dir: testCase.dir,
logger: logrus.WithField("test", testCase.name),
}
actualErr := i.CloneWithRepoOpts(testCase.from, testCase.repoOpts)
if testCase.expectedErr && actualErr == nil {
t.Errorf("%s: expected an error but got none", testCase.name)
}
if !testCase.expectedErr && actualErr != nil {
t.Errorf("%s: expected no error but got one: %v", testCase.name, actualErr)
}
if actual, expected := e.records, testCase.expectedCalls; !reflect.DeepEqual(actual, expected) {
t.Errorf("%s: got incorrect git calls: %v", testCase.name, diff.ObjectReflectDiff(actual, expected))
}
})
}
}

func TestInteractor_MirrorClone(t *testing.T) {
var testCases = []struct {
name string
Expand Down

0 comments on commit d44c183

Please sign in to comment.