Skip to content

Commit

Permalink
fix: Run user service containers in --init mode for Docker (#965)
Browse files Browse the repository at this point in the history
## Description:
Hopefully fixes #959 
It also adds a retry mechanism on RemoveContainer when this specific
error is hit

## Is this change user facing?
NO
<!-- If yes, please add the "user facing" label to the PR -->
<!-- If yes, don't forget to include docs changes where relevant -->

## References (if applicable):
<!-- Add relevant Github Issues, Discord threads, or other helpful
information. -->
  • Loading branch information
Guillaume Bouvignies committed Jul 25, 2023
1 parent c421afd commit b8989a8
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 9 deletions.
Expand Up @@ -546,6 +546,8 @@ func createStartServiceOperation(
memoryAllocationMegabytes,
).WithSkipAddingToBridgeNetworkIfStaticIpIsSet(
skipAddingUserServiceToBridgeNetwork,
).WithContainerInitEnabled(
true,
)

if entrypointArgs != nil {
Expand Down
Expand Up @@ -27,6 +27,7 @@ type CreateAndStartContainerArgs struct {
memoryAllocationMegabytes uint64
loggingDriverConfig LoggingDriver
skipAddingToBridgeNetworkIfStaticIpIsSet bool
containerInitEnabled bool
}

// Builder for creating CreateAndStartContainerArgs object
Expand All @@ -51,6 +52,7 @@ type CreateAndStartContainerArgsBuilder struct {
memoryAllocationMegabytes uint64
loggingDriverCnfg LoggingDriver
skipAddingToBridgeNetworkIfStaticIpIsSet bool
containerInitEnabled bool
}

/*
Expand Down Expand Up @@ -82,6 +84,7 @@ func NewCreateAndStartContainerArgsBuilder(dockerImage string, name string, netw
memoryAllocationMegabytes: 0,
loggingDriverCnfg: nil,
skipAddingToBridgeNetworkIfStaticIpIsSet: false,
containerInitEnabled: false,
}
}

Expand All @@ -107,6 +110,7 @@ func (builder *CreateAndStartContainerArgsBuilder) Build() *CreateAndStartContai
memoryAllocationMegabytes: builder.memoryAllocationMegabytes,
loggingDriverConfig: builder.loggingDriverCnfg,
skipAddingToBridgeNetworkIfStaticIpIsSet: builder.skipAddingToBridgeNetworkIfStaticIpIsSet,
containerInitEnabled: builder.containerInitEnabled,
}
}

Expand Down Expand Up @@ -225,3 +229,13 @@ func (builder *CreateAndStartContainerArgsBuilder) WithSkipAddingToBridgeNetwork
builder.skipAddingToBridgeNetworkIfStaticIpIsSet = skipAddingToBridgeNetworkIfStaticIpIsSet
return builder
}

// WithContainerInitEnabled allows you to set the `--init` options when a container is started in Docker.
// `--init` option wraps the main container process with `tini`, making sure zombie processes are handled properly
// at the container level.
// If not used, a buggy container image can create zombie processes which can prevents docker from removing the
// container. tini makes sure it keeps track of all processes and can kill the zombie ones appropriately.
func (builder *CreateAndStartContainerArgsBuilder) WithContainerInitEnabled(containerInitEnabled bool) *CreateAndStartContainerArgsBuilder {
builder.containerInitEnabled = containerInitEnabled
return builder
}
Expand Up @@ -103,10 +103,12 @@ const (

minMemoryLimit = 6

containerIsNotRunningErrMsg = "is not running"

cannotKillContainerErrMsg = "cannot kill container"
zombieProcessesCannotRemoveContainerErrMsg = "is zombie and can not be killed"
defaultRemoveProcessesMaxRetries = uint8(3)
defaultRemoveContainerTimeBetweenRetries = 10 * time.Second

containerIsNotRunningErrMsg = "is not running"
cannotKillContainerErrMsg = "cannot kill container"
defaultKillContainerMaxRetries = uint8(3)
defaultKillContainerTimeBetweenRetries = 10 * time.Millisecond

Expand Down Expand Up @@ -472,7 +474,8 @@ func (manager *DockerManager) CreateAndStartContainer(
args.needsAccessToDockerHostMachine,
args.cpuAllocationMillicpus,
args.memoryAllocationMegabytes,
args.loggingDriverConfig)
args.loggingDriverConfig,
args.containerInitEnabled)
if err != nil {
return "", nil, stacktrace.Propagate(err, "Failed to configure host to container mappings from service.")
}
Expand Down Expand Up @@ -740,7 +743,7 @@ Args:
containerId: ID of Docker container to kill
*/
func (manager *DockerManager) KillContainer(ctx context.Context, containerId string) error {
if err := manager.killContainerWithRetriesWhenErrorResponseFromDeamon(
if err := manager.killContainerWithRetriesWhenErrorResponseFromDaemon(
ctx,
containerId,
defaultKillContainerMaxRetries,
Expand All @@ -767,12 +770,18 @@ Args:
containerId: ID of Docker container to remove
*/
func (manager *DockerManager) RemoveContainer(ctx context.Context, containerId string) error {
removeOpts := types.ContainerRemoveOptions{
removeOpts := &types.ContainerRemoveOptions{
RemoveVolumes: shouldRemoveAnonymousVolumesWhenRemovingContainers,
RemoveLinks: shouldRemoveLinksWhenRemovingContainers,
Force: shouldKillContainersWhenRemovingContainers,
}
if err := manager.dockerClient.ContainerRemove(ctx, containerId, removeOpts); err != nil {
err := manager.removeContainerWithRetriesOnFailureForZombieProcesses(
ctx,
containerId,
removeOpts,
defaultRemoveProcessesMaxRetries,
defaultRemoveContainerTimeBetweenRetries)
if err != nil {
return stacktrace.Propagate(err, "An error occurred removing container with ID '%v'", containerId)
}
return nil
Expand Down Expand Up @@ -1144,6 +1153,7 @@ func (manager *DockerManager) getContainerHostConfig(
cpuAllocationMillicpus uint64,
memoryAllocationMegabytes uint64,
loggingDriverConfig LoggingDriver,
useInit bool,
) (hostConfig *container.HostConfig, err error) {

bindsList := make([]string, 0, len(bindMounts))
Expand Down Expand Up @@ -1265,6 +1275,7 @@ func (manager *DockerManager) getContainerHostConfig(
// NOTE: Do NOT use PublishAllPorts here!!!! This will work if a Dockerfile doesn't have an EXPOSE directive, but
// if the Dockerfile *does* have an EXPOSE directive then _only_ the ports with EXPOSE will be published
// See also: https://www.ctl.io/developers/blog/post/docker-networking-rules/

containerHostConfigPtr := &container.HostConfig{
Binds: bindsList,
ContainerIDFile: "",
Expand Down Expand Up @@ -1308,7 +1319,7 @@ func (manager *DockerManager) getContainerHostConfig(
Mounts: nil,
MaskedPaths: nil,
ReadonlyPaths: nil,
Init: nil,
Init: &useInit,
}
return containerHostConfigPtr, nil
}
Expand Down Expand Up @@ -1358,7 +1369,7 @@ func (manager *DockerManager) getContainerCfg(
return nodeConfigPtr, nil
}

func (manager *DockerManager) killContainerWithRetriesWhenErrorResponseFromDeamon(
func (manager *DockerManager) killContainerWithRetriesWhenErrorResponseFromDaemon(
ctx context.Context,
containerId string,
maxRetries uint8,
Expand Down Expand Up @@ -1391,6 +1402,34 @@ func (manager *DockerManager) killContainerWithRetriesWhenErrorResponseFromDeamo
return stacktrace.Propagate(err, "An error occurred killing container with ID '%v'", containerId)
}

func (manager *DockerManager) removeContainerWithRetriesOnFailureForZombieProcesses(
ctx context.Context,
containerId string,
options *types.ContainerRemoveOptions,
maxRetries uint8,
timeBetweenRetries time.Duration,
) error {
var err error
for i := uint8(0); i < maxRetries; i++ {
if err = manager.dockerClient.ContainerRemove(ctx, containerId, *options); err != nil {

errMsg := strings.ToLower(err.Error())

// For some stupid reason, ContainerKill throws an error if the container isn't running (even though
// ContainerStop does not)
if strings.Contains(errMsg, zombieProcessesCannotRemoveContainerErrMsg) {
logrus.Warnf("Container with ID '%s' has zombie processes and cannot be removed. Removal will be retried in %f seconds", containerId, timeBetweenRetries.Seconds())
time.Sleep(timeBetweenRetries)
continue
} else {
return err
}
}
return nil
}
return stacktrace.Propagate(err, "All %d attempts to remove container with ID '%s' failed", maxRetries, containerId)
}

// Takes in a PortMap (as reported by Docker container inspect) and returns a map of the used ports -> host port binding on the expected interface
// If no bindings for the interface are found, len(output) < len(input)
func getHostPortBindingsOnExpectedInterface(hostPortBindingsOnAllInterfaces nat.PortMap) map[nat.Port]*nat.PortBinding {
Expand Down

0 comments on commit b8989a8

Please sign in to comment.