From fd00d79efca2a7d8b3b04ce9d1f4d988dc1d956b Mon Sep 17 00:00:00 2001 From: Guillaume Bouvignies Date: Wed, 26 Jul 2023 20:40:13 +0200 Subject: [PATCH] fix: Fix docker image pull hanging forever (#994) ## Description: ## Is this change user facing? NO ## References (if applicable): --- .../backend_creator/backend_creator.go | 2 +- .../docker/docker_manager/docker_manager.go | 24 ++++++++++++------- .../docker_images_validator.go | 5 ++-- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/backend_creator/backend_creator.go b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/backend_creator/backend_creator.go index c9e25ecaf2..c1c90bc5ba 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/backend_creator/backend_creator.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/backend_creator/backend_creator.go @@ -18,7 +18,7 @@ import ( ) const ( - dockerClientTimeout = 30 * time.Second + dockerClientTimeout = 5 * time.Minute ) // TODO Delete this when we split up KurtosisBackend into various parts diff --git a/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go b/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go index d38740365c..2b7e05133d 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go @@ -1096,7 +1096,7 @@ func (manager *DockerManager) isImageAvailableLocally(ctx context.Context, image } func (manager *DockerManager) pullImage(context context.Context, imageName string) (err error) { - logrus.Infof("Pulling image '%s'...", imageName) + logrus.Infof("Pulling image '%s'", imageName) err, retryWithLinuxAmd64 := pullImage(context, manager.dockerClient, imageName, defaultPlatform) if err == nil { return nil @@ -1105,10 +1105,12 @@ func (manager *DockerManager) pullImage(context context.Context, imageName strin return stacktrace.Propagate(err, "Tried pulling image '%v' but failed", imageName) } // we retry with linux/amd64 + logrus.Debugf("Retrying pulling image '%s' for '%s'", imageName, linuxAmd64) err, _ = pullImage(context, manager.dockerClient, imageName, linuxAmd64) if err != nil { return stacktrace.Propagate(err, "Had previously failed with a manifest error so tried pulling image '%v' for platform '%v' but failed", imageName, linuxAmd64) } + logrus.Warnf("Image '%s' successfully pulled for '%s' which is not the architecture this OS is running on.", imageName, linuxAmd64) return nil } @@ -1719,6 +1721,7 @@ func getEndpointSettingsForIpAddress(ipAddress string, alias string) *network.En } func pullImage(ctx context.Context, dockerClient *client.Client, imageName string, platform string) (error, bool) { + logrus.Tracef("Starting pulling '%s' for platform '%s'", imageName, platform) out, err := dockerClient.ImagePull(ctx, imageName, types.ImagePullOptions{ All: false, RegistryAuth: "", @@ -1729,18 +1732,21 @@ func pullImage(ctx context.Context, dockerClient *client.Client, imageName strin return stacktrace.Propagate(err, "Tried pulling image '%v' with platform '%v' but failed", imageName, platform), false } defer out.Close() + logrus.Tracef("Finished pulling '%s' for platform '%s'. Analyzing response to check for errors", imageName, platform) responseDecoder := json.NewDecoder(out) - var jsonMessage *jsonmessage.JSONMessage for { - if err = responseDecoder.Decode(&jsonMessage); err != nil { - if err == io.EOF { - break - } + jsonMessage := new(jsonmessage.JSONMessage) + err = responseDecoder.Decode(&jsonMessage) + if err == io.EOF { + break } - if jsonMessage.Error == nil { - continue + if err != nil { + return stacktrace.Propagate(err, "ImagePull for '%s' on platform '%s' failed with an unexpected error", imageName, platform), false + } + if jsonMessage.Error != nil { + return stacktrace.NewError("ImagePull failed with the following error '%v'", jsonMessage.Error.Message), strings.HasPrefix(jsonMessage.Error.Message, architectureErrorString) } - return stacktrace.NewError("ImagePull failed with the following error '%v'", jsonMessage.Error.Message), strings.HasPrefix(jsonMessage.Error.Message, architectureErrorString) } + logrus.Tracef("No error pulling '%s' for platform '%s'. Returning", imageName, platform) return nil, false } diff --git a/core/server/api_container/server/startosis_engine/startosis_validator/docker_images_validator.go b/core/server/api_container/server/startosis_engine/startosis_validator/docker_images_validator.go index 2db140ac24..74c37ddec3 100644 --- a/core/server/api_container/server/startosis_engine/startosis_validator/docker_images_validator.go +++ b/core/server/api_container/server/startosis_engine/startosis_validator/docker_images_validator.go @@ -45,15 +45,14 @@ func (validator *DockerImagesValidator) Validate(ctx context.Context, environmen wg := &sync.WaitGroup{} for image := range environment.requiredDockerImages { wg.Add(1) - logrus.Debugf("Starting the download of image: '%s'", image) go fetchImageFromBackend(ctx, wg, imageCurrentlyDownloading, validator.kurtosisBackend, image, pullErrors, imageDownloadStarted, imageDownloadFinished) } wg.Wait() - logrus.Debug("All image validation submitted, currently in progress.") } func fetchImageFromBackend(ctx context.Context, wg *sync.WaitGroup, imageCurrentlyDownloading chan bool, backend *backend_interface.KurtosisBackend, image string, pullErrors chan<- error, imageDownloadStarted chan<- string, imageDownloadFinished chan<- string) { + logrus.Debugf("Requesting the download of image: '%s'", image) defer wg.Done() imageCurrentlyDownloading <- true imageDownloadStarted <- image @@ -62,8 +61,10 @@ func fetchImageFromBackend(ctx context.Context, wg *sync.WaitGroup, imageCurrent imageDownloadFinished <- image }() + logrus.Debugf("Starting the download of image: '%s'", image) err := (*backend).FetchImage(ctx, image) if err != nil { + logrus.Warnf("Container image '%s' download failed. Error was: '%s'", image, err.Error()) pullErrors <- startosis_errors.NewValidationError("Failed fetching the required image '%v', make sure that the image exists and is public", image) } logrus.Debugf("Container image '%s' successfully downloaded", image)