Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Run user service containers in --init mode for Docker #965

Merged
merged 2 commits into from Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -546,6 +546,8 @@ func createStartServiceOperation(
memoryAllocationMegabytes,
).WithSkipAddingToBridgeNetworkIfStaticIpIsSet(
skipAddingUserServiceToBridgeNetwork,
).WithContainerInitEnabled(
true,
)

if entrypointArgs != nil {
Expand Down
Expand Up @@ -27,6 +27,7 @@ type CreateAndStartContainerArgs struct {
memoryAllocationMegabytes uint64
loggingDriverConfig LoggingDriver
skipAddingToBridgeNetworkIfStaticIpIsSet bool
containerInitEnabled bool
}

// Builder for creating CreateAndStartContainerArgs object
Expand All @@ -51,6 +52,7 @@ type CreateAndStartContainerArgsBuilder struct {
memoryAllocationMegabytes uint64
loggingDriverCnfg LoggingDriver
skipAddingToBridgeNetworkIfStaticIpIsSet bool
containerInitEnabled bool
}

/*
Expand Down Expand Up @@ -82,6 +84,7 @@ func NewCreateAndStartContainerArgsBuilder(dockerImage string, name string, netw
memoryAllocationMegabytes: 0,
loggingDriverCnfg: nil,
skipAddingToBridgeNetworkIfStaticIpIsSet: false,
containerInitEnabled: false,
}
}

Expand All @@ -107,6 +110,7 @@ func (builder *CreateAndStartContainerArgsBuilder) Build() *CreateAndStartContai
memoryAllocationMegabytes: builder.memoryAllocationMegabytes,
loggingDriverConfig: builder.loggingDriverCnfg,
skipAddingToBridgeNetworkIfStaticIpIsSet: builder.skipAddingToBridgeNetworkIfStaticIpIsSet,
containerInitEnabled: builder.containerInitEnabled,
}
}

Expand Down Expand Up @@ -225,3 +229,13 @@ func (builder *CreateAndStartContainerArgsBuilder) WithSkipAddingToBridgeNetwork
builder.skipAddingToBridgeNetworkIfStaticIpIsSet = skipAddingToBridgeNetworkIfStaticIpIsSet
return builder
}

// WithContainerInitEnabled allows you to set the `--init` options when a container is started in Docker.
// `--init` option wraps the main container process with `tini`, making sure zombie processes are handled properly
// at the container level.
// If not used, a buggy container image can create zombie processes which can prevents docker from removing the
// container. tini makes sure it keeps track of all processes and can kill the zombie ones appropriately.
func (builder *CreateAndStartContainerArgsBuilder) WithContainerInitEnabled(containerInitEnabled bool) *CreateAndStartContainerArgsBuilder {
builder.containerInitEnabled = containerInitEnabled
return builder
}
Expand Up @@ -103,10 +103,12 @@ const (

minMemoryLimit = 6

containerIsNotRunningErrMsg = "is not running"

cannotKillContainerErrMsg = "cannot kill container"
zombieProcessesCannotRemoveContainerErrMsg = "is zombie and can not be killed"
defaultRemoveProcessesMaxRetries = uint8(3)
defaultRemoveContainerTimeBetweenRetries = 10 * time.Second

containerIsNotRunningErrMsg = "is not running"
cannotKillContainerErrMsg = "cannot kill container"
defaultKillContainerMaxRetries = uint8(3)
defaultKillContainerTimeBetweenRetries = 10 * time.Millisecond

Expand Down Expand Up @@ -472,7 +474,8 @@ func (manager *DockerManager) CreateAndStartContainer(
args.needsAccessToDockerHostMachine,
args.cpuAllocationMillicpus,
args.memoryAllocationMegabytes,
args.loggingDriverConfig)
args.loggingDriverConfig,
args.containerInitEnabled)
if err != nil {
return "", nil, stacktrace.Propagate(err, "Failed to configure host to container mappings from service.")
}
Expand Down Expand Up @@ -740,7 +743,7 @@ Args:
containerId: ID of Docker container to kill
*/
func (manager *DockerManager) KillContainer(ctx context.Context, containerId string) error {
if err := manager.killContainerWithRetriesWhenErrorResponseFromDeamon(
if err := manager.killContainerWithRetriesWhenErrorResponseFromDaemon(
ctx,
containerId,
defaultKillContainerMaxRetries,
Expand All @@ -767,12 +770,18 @@ Args:
containerId: ID of Docker container to remove
*/
func (manager *DockerManager) RemoveContainer(ctx context.Context, containerId string) error {
removeOpts := types.ContainerRemoveOptions{
removeOpts := &types.ContainerRemoveOptions{
RemoveVolumes: shouldRemoveAnonymousVolumesWhenRemovingContainers,
RemoveLinks: shouldRemoveLinksWhenRemovingContainers,
Force: shouldKillContainersWhenRemovingContainers,
}
if err := manager.dockerClient.ContainerRemove(ctx, containerId, removeOpts); err != nil {
err := manager.removeContainerWithRetriesOnFailureForZombieProcesses(
ctx,
containerId,
removeOpts,
defaultRemoveProcessesMaxRetries,
defaultRemoveContainerTimeBetweenRetries)
if err != nil {
return stacktrace.Propagate(err, "An error occurred removing container with ID '%v'", containerId)
}
return nil
Expand Down Expand Up @@ -1144,6 +1153,7 @@ func (manager *DockerManager) getContainerHostConfig(
cpuAllocationMillicpus uint64,
memoryAllocationMegabytes uint64,
loggingDriverConfig LoggingDriver,
useInit bool,
) (hostConfig *container.HostConfig, err error) {

bindsList := make([]string, 0, len(bindMounts))
Expand Down Expand Up @@ -1265,6 +1275,7 @@ func (manager *DockerManager) getContainerHostConfig(
// NOTE: Do NOT use PublishAllPorts here!!!! This will work if a Dockerfile doesn't have an EXPOSE directive, but
// if the Dockerfile *does* have an EXPOSE directive then _only_ the ports with EXPOSE will be published
// See also: https://www.ctl.io/developers/blog/post/docker-networking-rules/

containerHostConfigPtr := &container.HostConfig{
Binds: bindsList,
ContainerIDFile: "",
Expand Down Expand Up @@ -1308,7 +1319,7 @@ func (manager *DockerManager) getContainerHostConfig(
Mounts: nil,
MaskedPaths: nil,
ReadonlyPaths: nil,
Init: nil,
Init: &useInit,
}
return containerHostConfigPtr, nil
}
Expand Down Expand Up @@ -1358,7 +1369,7 @@ func (manager *DockerManager) getContainerCfg(
return nodeConfigPtr, nil
}

func (manager *DockerManager) killContainerWithRetriesWhenErrorResponseFromDeamon(
func (manager *DockerManager) killContainerWithRetriesWhenErrorResponseFromDaemon(
ctx context.Context,
containerId string,
maxRetries uint8,
Expand Down Expand Up @@ -1391,6 +1402,34 @@ func (manager *DockerManager) killContainerWithRetriesWhenErrorResponseFromDeamo
return stacktrace.Propagate(err, "An error occurred killing container with ID '%v'", containerId)
}

func (manager *DockerManager) removeContainerWithRetriesOnFailureForZombieProcesses(
ctx context.Context,
containerId string,
options *types.ContainerRemoveOptions,
maxRetries uint8,
timeBetweenRetries time.Duration,
) error {
var err error
for i := uint8(0); i < maxRetries; i++ {
if err = manager.dockerClient.ContainerRemove(ctx, containerId, *options); err != nil {

errMsg := strings.ToLower(err.Error())

// For some stupid reason, ContainerKill throws an error if the container isn't running (even though
// ContainerStop does not)
if strings.Contains(errMsg, zombieProcessesCannotRemoveContainerErrMsg) {
logrus.Warnf("Container with ID '%s' has zombie processes and cannot be removed. Removal will be retried in %f seconds", containerId, timeBetweenRetries.Seconds())
time.Sleep(timeBetweenRetries)
continue
} else {
return err
}
}
return nil
}
return stacktrace.Propagate(err, "All %d attempts to remove container with ID '%s' failed", maxRetries, containerId)
}

// Takes in a PortMap (as reported by Docker container inspect) and returns a map of the used ports -> host port binding on the expected interface
// If no bindings for the interface are found, len(output) < len(input)
func getHostPortBindingsOnExpectedInterface(hostPortBindingsOnAllInterfaces nat.PortMap) map[nat.Port]*nat.PortBinding {
Expand Down