Skip to content

Commit

Permalink
harden redis and sync service. (#712)
Browse files Browse the repository at this point in the history
* harden redis and sync service.

* increase default sync service redis pool size from `runtime.NumCPU()*10` (default) to `8192` to prevent starvation.
    * Starvation is very probable in the sidecar, given that we use the XREAD primitive, which blocks a connection.
    * The sidecar is especially exposed to this, as it is a single process that accumulates a lot of `XREAD` blocked conns to receive network change requests from instances.
    * `8192` is a suitable value because on `local:docker`, we won't run as many containers, and on `cluster:k8s`, sidecars only handle containers colocated on the node.
* increase ulimits on infra containers to 1M open file descriptors.
* expose pprof port of sidecar; enable GC traces for debuggability.
* upgrade to go-redis v7.2.0.

additionally: harden throttling logic in local:docker runner.

* address review comment.
  • Loading branch information
raulk committed Mar 18, 2020
1 parent 0298846 commit 9a98279
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 22 deletions.
2 changes: 2 additions & 0 deletions Dockerfile
Expand Up @@ -33,4 +33,6 @@ RUN mkdir -p /usr/local/bin
COPY --from=0 /testground /usr/local/bin/testground
ENV PATH="/usr/local/bin:${PATH}"

EXPOSE 6060

ENTRYPOINT [ "/usr/local/bin/testground" ]
3 changes: 2 additions & 1 deletion go.mod
Expand Up @@ -18,7 +18,8 @@ require (
github.com/containernetworking/cni v0.7.1
github.com/docker/distribution v2.7.1+incompatible // indirect
github.com/docker/docker v1.4.2-0.20200206084213-b5fc6ea92cde
github.com/docker/go-connections v0.4.0 // indirect
github.com/docker/go-connections v0.4.0
github.com/docker/go-units v0.4.0
github.com/go-playground/validator/v10 v10.1.0
github.com/google/uuid v1.1.1
github.com/gorilla/mux v1.7.3
Expand Down
6 changes: 4 additions & 2 deletions go.sum
Expand Up @@ -123,8 +123,8 @@ github.com/go-playground/universal-translator v0.17.0 h1:icxd5fm+REJzpZx7ZfpaD87
github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+Scu5vgOQjsIJAF8j9muTVoKLVtA=
github.com/go-playground/validator/v10 v10.1.0 h1:LNfPbVcg93V/91tkAQH8nbFbFn7u2X4hHnLMeRZHIMM=
github.com/go-playground/validator/v10 v10.1.0/go.mod h1:uOYAAleCW8F/7oMFd6aG0GOhaH6EGOAJShg8Id5JGkI=
github.com/go-redis/redis/v7 v7.0.0-beta.4 h1:p6z7Pde69EGRWvlC++y8aFcaWegyrKHzOBGo0zUACTQ=
github.com/go-redis/redis/v7 v7.0.0-beta.4/go.mod h1:xhhSbUMTsleRPur+Vgx9sUHtyN33bdjxY+9/0n9Ig8s=
github.com/go-redis/redis/v7 v7.2.0 h1:CrCexy/jYWZjW0AyVoHlcJUeZN19VWlbepTh1Vq6dJs=
github.com/go-redis/redis/v7 v7.2.0/go.mod h1:JDNMw23GTyLNC4GZu9njt15ctBQVn7xjRfnwdHj/Dcg=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4=
github.com/gogo/protobuf v1.0.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
Expand Down Expand Up @@ -493,6 +493,7 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR
golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191004110552-13f9640d40b9 h1:rjwSpXsdiK0dV8/Naq3kAw9ymfAeJIyd0upUIElB+lI=
golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 h1:bHNaocaoJxYBo5cw41UyTMLjYlb8wPY7+WFrnklbHOM=
Expand Down Expand Up @@ -532,6 +533,7 @@ golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3 h1:7TYNF4UdlohbFwpNH04CoPMp1cHUZgO1Ebq5r2hIjfo=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
Expand Down
35 changes: 27 additions & 8 deletions pkg/runner/local_docker.go
Expand Up @@ -14,6 +14,9 @@ import (
"sync"
"time"

"github.com/docker/go-connections/nat"
"github.com/docker/go-units"

"github.com/ipfs/testground/pkg/api"
"github.com/ipfs/testground/pkg/conv"
"github.com/ipfs/testground/pkg/docker"
Expand All @@ -34,6 +37,8 @@ import (
"golang.org/x/sync/errgroup"
)

// InfraMaxFilesUlimit is the "nofile" (open file descriptors) ulimit, used as
// both the soft and the hard limit, for the infra and sidecar containers.
const InfraMaxFilesUlimit int64 = 1048576

var (
_ api.Runner = (*LocalDockerRunner)(nil)
_ api.Healthchecker = (*LocalDockerRunner)(nil)
Expand Down Expand Up @@ -510,10 +515,11 @@ func (r *LocalDockerRunner) Run(ctx context.Context, input *api.RunInput, ow io.
)

ctx, cancel := context.WithCancel(ctx)
defer cancel()

log.Infow("starting containers", "count", len(containers))

g, _ := errgroup.WithContext(ctx)
g, gctx := errgroup.WithContext(ctx)
for _, id := range containers {
id := id
f := func() error {
Expand All @@ -525,7 +531,11 @@ func (r *LocalDockerRunner) Run(ctx context.Context, input *api.RunInput, ow io.
err := cli.ContainerStart(ctx, id, types.ContainerStartOptions{})
if err == nil {
log.Debugw("started container", "id", id)
started <- id
select {
case <-gctx.Done():
default:
started <- id
}
}
return err
}
Expand Down Expand Up @@ -599,13 +609,10 @@ func (r *LocalDockerRunner) Run(ctx context.Context, input *api.RunInput, ow io.

select {
case err = <-doneCh:
fmt.Println("done ch: ", err)
case <-ctx.Done():
fmt.Println("context done: ", ctx.Err())
err = ctx.Err()
}

cancel()
return &api.RunOutput{RunID: input.RunID}, err
}

Expand Down Expand Up @@ -702,6 +709,11 @@ func ensureInfraContainer(ctx context.Context, cli *client.Client, log *zap.Suga
HostConfig: &container.HostConfig{
NetworkMode: container.NetworkMode(networkID),
PublishAllPorts: true,
Resources: container.Resources{
Ulimits: []*units.Ulimit{
{Name: "nofile", Hard: InfraMaxFilesUlimit, Soft: InfraMaxFilesUlimit},
},
},
},
PullImageIfMissing: pull,
})
Expand All @@ -726,11 +738,13 @@ func ensureSidecarContainer(ctx context.Context, cli *client.Client, workDir str
ContainerConfig: &container.Config{
Image: "ipfs/testground:latest",
Entrypoint: []string{"testground"},
Cmd: []string{"sidecar", "--runner", "docker"},
Env: []string{"REDIS_HOST=testground-redis"},
Cmd: []string{"sidecar", "--runner", "docker", "--pprof"},
Env: []string{"REDIS_HOST=testground-redis", "GODEBUG=gctrace=1"},
},
HostConfig: &container.HostConfig{
NetworkMode: container.NetworkMode(controlNetworkID),
PublishAllPorts: true,
PortBindings: nat.PortMap{"6060": []nat.PortBinding{{HostIP: "0.0.0.0", HostPort: "0"}}},
NetworkMode: container.NetworkMode(controlNetworkID),
// To lookup namespaces. Can't use SandboxKey for some reason.
PidMode: "host",
// We need _both_ to actually get a network namespace handle.
Expand All @@ -743,6 +757,11 @@ func ensureSidecarContainer(ctx context.Context, cli *client.Client, workDir str
Source: dockerSock,
Target: "/var/run/docker.sock",
}},
Resources: container.Resources{
Ulimits: []*units.Ulimit{
{Name: "nofile", Hard: InfraMaxFilesUlimit, Soft: InfraMaxFilesUlimit},
},
},
},
PullImageIfMissing: false, // Don't pull from Docker Hub
})
Expand Down
13 changes: 8 additions & 5 deletions sdk/sync/common.go
Expand Up @@ -95,11 +95,14 @@ func redisClient(ctx context.Context) (client *redis.Client, err error) {

// TODO: will need to populate opts from an env variable.
opts := redis.Options{
MaxRetries: 5,
MinRetryBackoff: 1 * time.Second,
MaxRetryBackoff: 3 * time.Second,
DialTimeout: 10 * time.Second,
ReadTimeout: 10 * time.Second,
PoolSize: 8192,
MaxRetries: 5,
MinRetryBackoff: 1 * time.Second,
MaxRetryBackoff: 3 * time.Second,
DialTimeout: 10 * time.Second,
ReadTimeout: 10 * time.Second,
IdleCheckFrequency: 30 * time.Second,
MaxConnAge: 1 * time.Minute,
}

for _, h := range tryHosts {
Expand Down
2 changes: 1 addition & 1 deletion sdk/sync/go.mod
Expand Up @@ -3,7 +3,7 @@ module github.com/ipfs/testground/sdk/sync
go 1.14

require (
github.com/go-redis/redis/v7 v7.0.0-beta.4
github.com/go-redis/redis/v7 v7.2.0
github.com/ipfs/testground v0.1.0
github.com/ipfs/testground/sdk/runtime v0.1.0
github.com/libp2p/go-libp2p-core v0.2.3
Expand Down
4 changes: 4 additions & 0 deletions sdk/sync/go.sum
Expand Up @@ -76,6 +76,8 @@ github.com/go-playground/universal-translator v0.17.0/go.mod h1:UkSxE5sNxxRwHyU+
github.com/go-playground/validator/v10 v10.1.0/go.mod h1:uOYAAleCW8F/7oMFd6aG0GOhaH6EGOAJShg8Id5JGkI=
github.com/go-redis/redis/v7 v7.0.0-beta.4 h1:p6z7Pde69EGRWvlC++y8aFcaWegyrKHzOBGo0zUACTQ=
github.com/go-redis/redis/v7 v7.0.0-beta.4/go.mod h1:xhhSbUMTsleRPur+Vgx9sUHtyN33bdjxY+9/0n9Ig8s=
github.com/go-redis/redis/v7 v7.2.0 h1:CrCexy/jYWZjW0AyVoHlcJUeZN19VWlbepTh1Vq6dJs=
github.com/go-redis/redis/v7 v7.2.0/go.mod h1:JDNMw23GTyLNC4GZu9njt15ctBQVn7xjRfnwdHj/Dcg=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
github.com/gogo/protobuf v1.0.0/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
Expand Down Expand Up @@ -340,6 +342,7 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR
golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191004110552-13f9640d40b9 h1:rjwSpXsdiK0dV8/Naq3kAw9ymfAeJIyd0upUIElB+lI=
golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20191109021931-daa7c04131f5 h1:bHNaocaoJxYBo5cw41UyTMLjYlb8wPY7+WFrnklbHOM=
Expand Down Expand Up @@ -375,6 +378,7 @@ golang.org/x/sys v0.0.0-20190626221950-04f50cda93cb/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190922100055-0a153f010e69 h1:rOhMmluY6kLMhdnrivzec6lLgaVbMHMn2ISQXJeJ5EM=
golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200122134326-e047566fdf82 h1:ywK/j/KkyTHcdyYSZNXGjMwgmDSfjglYZ3vStQ/gSCU=
golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5 h1:LfCXLvNmTYH9kEmVgqbnsWfruoXZIrh4YBgqVHtDvw0=
Expand Down
47 changes: 47 additions & 0 deletions sdk/sync/redis_bench_test.go
@@ -0,0 +1,47 @@
package sync

import (
"context"
"fmt"
"sync"
"testing"
)

// BenchmarkBarrier benchmarks a watcher Barrier against concurrent writers.
//
// Each iteration spawns `workers` goroutines that together call
// writer.SignalEntry `target` times on a per-iteration state key
// ("yoda-<n>"), then blocks on the channel returned by watcher.Barrier for
// that state and target count. All workers are joined before the next
// iteration starts so goroutines never overlap iterations or outlive the
// deferred Close calls.
func BenchmarkBarrier(b *testing.B) {
	b.ReportAllocs()

	// Named closeRedis (not close) to avoid shadowing the builtin.
	closeRedis := ensureRedis(b)
	defer closeRedis()

	runenv := randomRunEnv()

	watcher, writer := MustWatcherWriter(context.Background(), runenv)
	defer watcher.Close()
	defer writer.Close()

	const (
		target  = 1000000
		workers = 10
		each    = target / workers
	)

	// Reset once, after the one-off setup. Resetting inside the b.N loop
	// would zero the elapsed time on every iteration, leaving only the last
	// iteration's time to be divided by b.N.
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		ctx, cancel := context.WithCancel(context.Background())
		state := State(fmt.Sprintf("yoda-%d", n))

		var wg sync.WaitGroup
		for i := 0; i < workers; i++ {
			wg.Add(1)
			go func(start, end int) {
				defer wg.Done()

				for j := start; j < end; j++ {
					// Return values are discarded, as in the original
					// benchmark. NOTE(review): consider surfacing errors.
					writer.SignalEntry(ctx, state)
				}
			}(i*each, (i+1)*each)
		}

		ch := watcher.Barrier(ctx, state, int64(target))
		<-ch

		// Cancel first so any worker still running is asked to stop, then
		// join them all before starting the next iteration.
		cancel()
		wg.Wait()
	}
}
10 changes: 5 additions & 5 deletions sdk/sync/redis_test.go
Expand Up @@ -25,8 +25,8 @@ func init() {

// Check if there's a running instance of redis, or start it otherwise. If we
// start an ad-hoc instance, the close function will terminate it.
func ensureRedis(t *testing.T) (close func()) {
t.Helper()
func ensureRedis(tb testing.TB) (close func()) {
tb.Helper()

// Try to obtain a client; if this fails, we'll attempt to start a redis
// instance.
Expand All @@ -38,20 +38,20 @@ func ensureRedis(t *testing.T) (close func()) {

cmd := exec.Command("redis-server", "-")
if err := cmd.Start(); err != nil {
t.Fatalf("failed to start redis: %s", err)
tb.Fatalf("failed to start redis: %s", err)
}

time.Sleep(1 * time.Second)

// Try to obtain a client again.
if client, err = redisClient(context.Background()); err != nil {
t.Fatalf("failed to obtain redis client despite starting instance: %v", err)
tb.Fatalf("failed to obtain redis client despite starting instance: %v", err)
}
defer client.Close()

return func() {
if err := cmd.Process.Kill(); err != nil {
t.Fatalf("failed while stopping test-scoped redis: %s", err)
tb.Fatalf("failed while stopping test-scoped redis: %s", err)
}
}
}
Expand Down

0 comments on commit 9a98279

Please sign in to comment.