Skip to content

Commit

Permalink
automatically expose pprof+prometheus endpoint. (#677)
Browse files Browse the repository at this point in the history
  • Loading branch information
raulk committed Mar 18, 2020
1 parent ef5694c commit e16f9ca
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 40 deletions.
3 changes: 2 additions & 1 deletion .dockerignore
Expand Up @@ -5,4 +5,5 @@
**/.gitignore
**/.gitkeep
**/.gitmodules
**/Dockerfile
**/Dockerfile
**/infra
13 changes: 9 additions & 4 deletions manifests/placebo.toml
Expand Up @@ -52,7 +52,12 @@ instances = { min = 1, max = 250, default = 1 }
[testcases.params]
some_param = { type = "int", desc = "some param", unit = "peers" }

# # seq 3
# [[testcases]]
# name = "panic"
# instances = { min = 1, max = 250, default = 1 }
# seq 3
[[testcases]]
name = "panic"
instances = { min = 1, max = 250, default = 1 }

# seq 4
[[testcases]]
name = "stall"
instances = { min = 1, max = 250, default = 1 }
1 change: 1 addition & 0 deletions pkg/build/golang/Dockerfile.template
Expand Up @@ -64,4 +64,5 @@ COPY --from=0 /tmp/delete_me /tmp/go-ipfs/ipfs* /usr/local/bin/
RUN rm -f /usr/local/bin/delete_me
ENV PATH="/usr/local/bin:${PATH}"

EXPOSE 6060
ENTRYPOINT [ "/testplan", "--vv"]
11 changes: 7 additions & 4 deletions pkg/docker/container.go
Expand Up @@ -3,6 +3,7 @@ package docker
import (
"context"
"errors"
"fmt"
"os"

"github.com/docker/docker/api/types"
Expand Down Expand Up @@ -127,13 +128,15 @@ func EnsureContainer(ctx context.Context, log *zap.SugaredLogger, cli *client.Cl
log.Infow("starting new container", "id", res.ID)

err = cli.ContainerStart(ctx, res.ID, types.ContainerStartOptions{})
if err == nil {
log.Infow("started container", "id", res.ID)
if err != nil {
return nil, false, err
}

log.Infow("started container", "id", res.ID)

c, err := cli.ContainerInspect(ctx, res.ID)
if err == nil {
log.Infow("started container", "id", res.ID)
if err != nil {
return nil, false, fmt.Errorf("failed to inspect container: %w", err)
}

return &c, true, err
Expand Down
9 changes: 7 additions & 2 deletions pkg/runner/local_docker.go
Expand Up @@ -588,6 +588,11 @@ func ensureControlNetwork(ctx context.Context, cli *client.Client, log *zap.Suga
ctx,
log, cli,
"testground-control",
// making internal=false enables us to expose ports to the host (e.g.
// pprof and prometheus). by itself, it would allow the container to
// access the Internet, and therefore would break isolation, but since
// we have sidecar overriding the default Docker ip routes, and
// suppressing such traffic, we're safe.
false,
network.IPAMConfig{
Subnet: controlSubnet,
Expand Down Expand Up @@ -635,14 +640,14 @@ func newDataNetwork(ctx context.Context, cli *client.Client, log *zap.SugaredLog
}

// ensure container is started
func ensureInfraContainer(ctx context.Context, cli *client.Client, log *zap.SugaredLogger, containerName string, imageName string, NetworkID string, pull bool) (id string, err error) {
func ensureInfraContainer(ctx context.Context, cli *client.Client, log *zap.SugaredLogger, containerName string, imageName string, networkID string, pull bool) (id string, err error) {
container, _, err := docker.EnsureContainer(ctx, log, cli, &docker.EnsureContainerOpts{
ContainerName: containerName,
ContainerConfig: &container.Config{
Image: imageName,
},
HostConfig: &container.HostConfig{
NetworkMode: container.NetworkMode(NetworkID),
NetworkMode: container.NetworkMode(networkID),
PublishAllPorts: true,
},
PullImageIfMissing: pull,
Expand Down
36 changes: 34 additions & 2 deletions pkg/sidecar/docker_reactor.go
Expand Up @@ -18,6 +18,16 @@ import (
"github.com/ipfs/testground/sdk/runtime"
)

// PublicAddr points to an IP address in the public range. It helps us discover
// the IP address of the gateway (i.e. the Docker host) on the control network
// (the learned route will be via the control network because, at this point,
// the only network that's attached to the container is the control network).
//
// Sidecar doesn't whitelist traffic to public addresses, but it special-cases
// traffic between the container and the host, so that pprof, metrics and other
// ports can be exposed to the Docker host.
var PublicAddr = net.ParseIP("1.1.1.1")

type DockerReactor struct {
routes []net.IP
manager *docker.Manager
Expand Down Expand Up @@ -163,8 +173,6 @@ func (d *DockerReactor) handleContainer(ctx context.Context, container *docker.C

// TODO: Some of this code could be factored out into helpers.

// Get the routes to redis. We need to keep these.

var controlRoutes []netlink.Route
for _, route := range d.routes {
nlroutes, err := netlinkHandle.RouteGet(route)
Expand All @@ -174,6 +182,30 @@ func (d *DockerReactor) handleContainer(ctx context.Context, container *docker.C
controlRoutes = append(controlRoutes, nlroutes...)
}

// Get the route to a public address. We will NOT be whitelisting traffic to
// public IPs, but this helps us discover the IP address of the Docker host
// on the control network. See the godoc on the PublicAddr var for more
// info.
pub, err := netlinkHandle.RouteGet(PublicAddr)
if err != nil {
return nil, fmt.Errorf("failed to resolve route for %s: %w", PublicAddr, err)
}

switch {
case len(pub) == 0:
logging.S().Warnw("failed to discover gateway/host address; no routes to public IPs", "container_id", container.ID)
case pub[0].Gw == nil:
logging.S().Warnw("failed to discover gateway/host address; gateway is nil", "route", pub[0], "container_id", container.ID)
default:
hostRoutes, err := netlinkHandle.RouteGet(pub[0].Gw)
if err != nil {
logging.S().Warnw("failed to add route for gateway/host address", "error", err, "route", pub[0], "container_id", container.ID)
break
}
logging.S().Infow("successfully resolved route to host", "container_id", container.ID)
controlRoutes = append(controlRoutes, hostRoutes...)
}

for id, link := range links {
if name, ok := reverseIndex[id]; ok {
// manage this network
Expand Down
4 changes: 4 additions & 0 deletions pkg/tgwriter/tgwriter.go
Expand Up @@ -44,6 +44,10 @@ type Error struct {
}

func (tgw *TgWriter) Write(p []byte) (n int, err error) {
if p == nil {
return 0, nil
}

pld := Msg{
Type: "progress",
Payload: p,
Expand Down
13 changes: 7 additions & 6 deletions plans/placebo/main.go
Expand Up @@ -12,6 +12,7 @@ import (
func main() {
runtime.Invoke(run)
}

func run(runenv *runtime.RunEnv) error {
if runenv.TestCaseSeq < 0 {
panic("test case sequence number not set")
Expand All @@ -21,25 +22,25 @@ func run(runenv *runtime.RunEnv) error {
case 0:
return nil
case 2:
// expose prometheus endpoint
listener := runenv.MustExportPrometheus()
defer listener.Close()

// create context for cancelation
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

// snapshot metrics every second and save them into "metrics" directory
err := runenv.HTTPPeriodicSnapshots(ctx, "http://"+listener.Addr().String(), time.Second, "metrics")
err := runenv.HTTPPeriodicSnapshots(ctx, "http://"+runtime.HTTPListenAddr+"/metrics", time.Second, "metrics")
if err != nil {
return err
}

time.Sleep(time.Second * 10)
time.Sleep(time.Second * 5)
return nil
case 3:
// panic
panic(errors.New("this is an intentional panic"))
case 4:
// stall
time.Sleep(24 * time.Hour)
return nil
default:
return fmt.Errorf("aborting")
}
Expand Down
20 changes: 0 additions & 20 deletions sdk/runtime/metrics.go
Expand Up @@ -3,15 +3,13 @@ package runtime
import (
"context"
"io"
"net"
"net/http"
"os"
"path"
"strconv"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

func NewCounter(runenv *RunEnv, name string, help string) prometheus.Counter {
Expand Down Expand Up @@ -50,24 +48,6 @@ func NewSummary(runenv *RunEnv, name string, help string, opts prometheus.Summar
return s
}

// MustExportPrometheus starts an HTTP server with the Prometheus handler.
// It starts on a random open port and returns the listener. It is the caller
// responsability to close the listener.
func (re *RunEnv) MustExportPrometheus() net.Listener {
listener, err := net.Listen("tcp", "127.0.0.1:0")
if err != nil {
panic(err)
}

go func() {
// avoid triggering golangci-lint for not checking
// the error.
_ = http.Serve(listener, promhttp.Handler())
}()

return listener
}

// HTTPPeriodicSnapshots periodically fetches the snapshots from the given address
// and outputs them to the out directory. Every file will be in the format timestamp.out.
func (re *RunEnv) HTTPPeriodicSnapshots(ctx context.Context, addr string, dur time.Duration, outDir string) error {
Expand Down
44 changes: 43 additions & 1 deletion sdk/runtime/runner.go
Expand Up @@ -3,12 +3,31 @@ package runtime
import (
"fmt"
"io"
"net"
"os"
"runtime/debug"
"strings"
"time"

"net/http"
_ "net/http/pprof"

"github.com/prometheus/client_golang/prometheus/promhttp"
)

const (
// These ports are the HTTP ports we'll attempt to bind to. If this instance
// is running in a Docker container, binding to 6060 is safe. If it's a
// local:exec run, these ports belong to the host, so starting more than one
// instance will lead to a collision. Therefore we fallback to 0.
HTTPPort = 6060
HTTPPortFallback = 0
)

// HTTPListenAddr will be set to the listener address _before_ the test case is
// invoked. If we were unable to start the listener, this value will be "".
var HTTPListenAddr string

// Invoke runs the passed test-case and reports the result.
func Invoke(tc func(*RunEnv) error) {
var (
Expand All @@ -17,6 +36,8 @@ func Invoke(tc func(*RunEnv) error) {
durationGauge = NewGauge(runenv, "plan_duration", "Run time (seconds)")
)

setupHTTPListener(runenv)

// The prometheus pushgateway has a customized scrape interval, which is used to hint to the
// prometheus operator at which interval the it should be scraped. This is currently set to 5s.
// To provide an updated metric in every scrape, jobs will push to the pushgateway at the same
Expand Down Expand Up @@ -80,7 +101,6 @@ func Invoke(tc func(*RunEnv) error) {

if err = errfile.Sync(); err != nil {
runenv.RecordCrash(fmt.Errorf("stderr file tee sync failed failed: %w", err))
return
}
}()

Expand All @@ -107,3 +127,25 @@ func Invoke(tc func(*RunEnv) error) {
_ = rd.Close()
<-doneCh
}

func setupHTTPListener(runenv *RunEnv) {
addr := fmt.Sprintf("0.0.0.0:%d", HTTPPort)
l, err := net.Listen("tcp", addr)
if err != nil {
addr = fmt.Sprintf("0.0.0.0:%d", HTTPPortFallback)
if l, err = net.Listen("tcp", addr); err != nil {
runenv.RecordMessage("error registering default http handler at: %s: %s", addr, err)
return
}
}

// DefaultServeMux already includes the pprof handler, add the
// Prometheus handler.
http.DefaultServeMux.Handle("/metrics", promhttp.Handler())

HTTPListenAddr = l.Addr().String()

runenv.RecordMessage("registering default http handler at: http://%s/ (pprof: http://%s/debug/pprof/)", HTTPListenAddr, HTTPListenAddr)

go http.Serve(l, nil)
}

0 comments on commit e16f9ca

Please sign in to comment.