diff --git a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go index 2242f8e3e3..d1948e7d5e 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go @@ -2,6 +2,7 @@ package docker_kurtosis_backend import ( "context" + "github.com/sirupsen/logrus" "io" "sync" @@ -408,14 +409,26 @@ func (backend *DockerKurtosisBackend) CreateLogsCollectorForEnclave( *logs_collector.LogsCollector, error, ) { - logsAggregator, err := backend.GetLogsAggregator(ctx) + var logsAggregator *logs_aggregator.LogsAggregator + maybeLogsAggregator, err := logs_aggregator_functions.GetLogsAggregator(ctx, backend.dockerManager) if err != nil { - return nil, stacktrace.Propagate(err, "An error occurred getting the logs aggregator; the logs collector cannot be run without a logs aggregator") - } - - if logsAggregator == nil || logsAggregator.GetStatus() != container.ContainerStatus_Running { - return nil, stacktrace.NewError("The logs aggregator is not running; the logs collector cannot be run without a running logs aggregator") + return nil, stacktrace.Propagate(err, "An error occurred getting the logs aggregator. The logs collector cannot be run without a logs aggregator.") + } + if maybeLogsAggregator == nil { + logrus.Warnf("Logs aggregator container does not exist. This is unexpected as docker should have restarted the container automatically.") + logrus.Warnf("This can be fixed by restarting the engine using `kurtosis engine restart` and attempting to create the enclave again.") + return nil, stacktrace.Propagate(err, "No logs aggregator container exists. The logs collector cannot be run without a logs aggregator.") + } + if maybeLogsAggregator.GetStatus() != container.ContainerStatus_Running { + logrus.Warnf("Logs aggregator exists but is not running. Instead container status is '%v'. This is unexpected as docker should have restarted the container automatically.", + maybeLogsAggregator.GetStatus()) + logrus.Warnf("This can be fixed by restarting the engine using `kurtosis engine restart` and attempting to create the enclave again.") + return nil, stacktrace.Propagate(err, + "The logs aggregator container exists but is not running. Instead container status is '%v'. The logs collector cannot be run without a logs aggregator.", + maybeLogsAggregator.GetStatus(), + ) } + logsAggregator = maybeLogsAggregator //Declaring the implementation logsCollectorContainer := fluentbit.NewFluentbitLogsCollectorContainer() diff --git a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/engine_functions/create_engine.go b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/engine_functions/create_engine.go index 79c3ebd36a..e2fabdad9d 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/engine_functions/create_engine.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/engine_functions/create_engine.go @@ -104,6 +104,12 @@ func CreateEngine( return nil, stacktrace.Propagate(err, "An error occurred attempting to create logging components for engine with GUID '%v' in Docker network with network id '%v'.", engineGuidStr, targetNetworkId) } + shouldRemoveLogsAggregator := true + defer func() { + if shouldRemoveLogsAggregator { + removeLogsAggregatorFunc() + } + }() // schedule log removal for log retention go func() { @@ -118,13 +124,6 @@ func CreateEngine( logRemover.Run() } }() - - shouldRemoveCentralizedLogComponents := true - defer func() { - if shouldRemoveCentralizedLogComponents { - removeLogsAggregatorFunc() - } - }() logrus.Infof("Centralized logs components started.") enclaveManagerUIPortSpec, err := port_spec.NewPortSpec(uint16(enclaveManagerUIPort), consts.EngineTransportProtocol, consts.HttpApplicationProtocol, defaultWait) @@ -265,7 +264,7 @@ func CreateEngine( return nil, stacktrace.Propagate(err, "An error occurred creating an engine object from container with GUID '%v'", containerId) } - shouldRemoveCentralizedLogComponents = false + shouldRemoveLogsAggregator = false shouldKillEngineContainer = false return result, nil } diff --git a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go index f7822deca2..9698cb2fd1 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go @@ -52,6 +52,10 @@ func (vector *vectorContainerConfigProvider) GetContainerArgs( ), } + // The logs aggregator should ALWAYS be running to ensure that no logs are lost for services in enclaves + // Thus, instruct docker to restart the container if it exits with non-zero status code for whatever reason + restartPolicy := docker_manager.RestartPolicy("on-failure") + createAndStartArgs := docker_manager.NewCreateAndStartContainerArgsBuilder( containerImage, containerName, @@ -66,6 +70,8 @@ func (vector *vectorContainerConfigProvider) GetContainerArgs( }, ).WithCmdArgs( overrideCmd, + ).WithRestartPolicy( + restartPolicy, ).Build() return createAndStartArgs, nil