Skip to content

Commit

Permalink
fix: restart log aggregator on failure (#1371)
Browse files Browse the repository at this point in the history
## Description:
This change adjusts the log aggregator container configuration so that
it's Docker always restarts the container when it detects it has exited
with non-zero exit code. It improve how we handle errors related to log
aggregator not existing when creating an enclave.

This is a bandaid that should address
#1311 but doesn't
address the issue entirely. More context in issue's discussion.

## Is this change user facing?
YES (users should not experience
#1311 anymore)

## References (if applicable):
#1311
  • Loading branch information
tedim52 committed Sep 26, 2023
1 parent 8d58f71 commit 7f171ce
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 14 deletions.
Expand Up @@ -2,6 +2,7 @@ package docker_kurtosis_backend

import (
"context"
"github.com/sirupsen/logrus"
"io"
"sync"

Expand Down Expand Up @@ -408,14 +409,26 @@ func (backend *DockerKurtosisBackend) CreateLogsCollectorForEnclave(
*logs_collector.LogsCollector,
error,
) {
logsAggregator, err := backend.GetLogsAggregator(ctx)
var logsAggregator *logs_aggregator.LogsAggregator
maybeLogsAggregator, err := logs_aggregator_functions.GetLogsAggregator(ctx, backend.dockerManager)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred getting the logs aggregator; the logs collector cannot be run without a logs aggregator")
}

if logsAggregator == nil || logsAggregator.GetStatus() != container.ContainerStatus_Running {
return nil, stacktrace.NewError("The logs aggregator is not running; the logs collector cannot be run without a running logs aggregator")
return nil, stacktrace.Propagate(err, "An error occurred getting the logs aggregator. The logs collector cannot be run without a logs aggregator.")
}
if maybeLogsAggregator == nil {
logrus.Warnf("Logs aggregator container does not exist. This is unexpected as docker should have restarted the container automatically.")
logrus.Warnf("This can be fixed by restarting the engine using `kurtosis engine restart` and attempting to create the enclave again.")
return nil, stacktrace.Propagate(err, "No logs aggregator container exists. The logs collector cannot be run without a logs aggregator.")
}
if maybeLogsAggregator.GetStatus() != container.ContainerStatus_Running {
logrus.Warnf("Logs aggregator exists but is not running. Instead container status is '%v'. This is unexpected as docker should have restarted the container automatically.",
maybeLogsAggregator.GetStatus())
logrus.Warnf("This can be fixed by restarting the engine using `kurtosis engine restart` and attempting to create the enclave again.")
return nil, stacktrace.Propagate(err,
"The logs aggregator container exists but is not running. Instead container status is '%v'. The logs collector cannot be run without a logs aggregator.",
maybeLogsAggregator.GetStatus(),
)
}
logsAggregator = maybeLogsAggregator

//Declaring the implementation
logsCollectorContainer := fluentbit.NewFluentbitLogsCollectorContainer()
Expand Down
Expand Up @@ -104,6 +104,12 @@ func CreateEngine(
return nil, stacktrace.Propagate(err,
"An error occurred attempting to create logging components for engine with GUID '%v' in Docker network with network id '%v'.", engineGuidStr, targetNetworkId)
}
shouldRemoveLogsAggregator := true
defer func() {
if shouldRemoveLogsAggregator {
removeLogsAggregatorFunc()
}
}()

// schedule log removal for log retention
go func() {
Expand All @@ -118,13 +124,6 @@ func CreateEngine(
logRemover.Run()
}
}()

shouldRemoveCentralizedLogComponents := true
defer func() {
if shouldRemoveCentralizedLogComponents {
removeLogsAggregatorFunc()
}
}()
logrus.Infof("Centralized logs components started.")

enclaveManagerUIPortSpec, err := port_spec.NewPortSpec(uint16(enclaveManagerUIPort), consts.EngineTransportProtocol, consts.HttpApplicationProtocol, defaultWait)
Expand Down Expand Up @@ -265,7 +264,7 @@ func CreateEngine(
return nil, stacktrace.Propagate(err, "An error occurred creating an engine object from container with GUID '%v'", containerId)
}

shouldRemoveCentralizedLogComponents = false
shouldRemoveLogsAggregator = false
shouldKillEngineContainer = false
return result, nil
}
Expand Up @@ -52,6 +52,10 @@ func (vector *vectorContainerConfigProvider) GetContainerArgs(
),
}

// The logs aggregator should ALWAYS be running to ensure that no logs are lost for services in enclaves
// Thus, instruct docker to restart the container if it exits with non-zero status code for whatever reason
restartPolicy := docker_manager.RestartPolicy("on-failure")

createAndStartArgs := docker_manager.NewCreateAndStartContainerArgsBuilder(
containerImage,
containerName,
Expand All @@ -66,6 +70,8 @@ func (vector *vectorContainerConfigProvider) GetContainerArgs(
},
).WithCmdArgs(
overrideCmd,
).WithRestartPolicy(
restartPolicy,
).Build()

return createAndStartArgs, nil
Expand Down

0 comments on commit 7f171ce

Please sign in to comment.