From fd7aff3f96900b5518c45c577b4e33be186c0c09 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Mon, 13 Jun 2022 10:22:32 -0400 Subject: [PATCH 1/2] client: add `OTEL_RESOURCE_ATTRIBUTES` env var. Add new task hook to inject a `OTEL_RESOURCE_ATTRIBUTES` environment variable with Nomad attributes into tasks. The attributes set are related to the alloc and specific task that is running, the node where the alloc is running, and the job and eval that generated the alloc. These attributes are merged if the task already defines a `OTEL_RESOURCE_ATTRIBUTES` environment variable, or disabled if the value defined by the task is an empty string. --- client/allocrunner/taskrunner/otel_hook.go | 144 ++++++++++++++++++ .../allocrunner/taskrunner/otel_hook_test.go | 140 +++++++++++++++++ .../taskrunner/task_runner_hooks.go | 5 + go.mod | 1 + go.sum | 2 + website/content/docs/runtime/environment.mdx | 97 ++++++++++++ website/content/partials/envvars.mdx | 11 ++ 7 files changed, 400 insertions(+) create mode 100644 client/allocrunner/taskrunner/otel_hook.go create mode 100644 client/allocrunner/taskrunner/otel_hook_test.go diff --git a/client/allocrunner/taskrunner/otel_hook.go b/client/allocrunner/taskrunner/otel_hook.go new file mode 100644 index 000000000000..cadbac9bb744 --- /dev/null +++ b/client/allocrunner/taskrunner/otel_hook.go @@ -0,0 +1,144 @@ +package taskrunner + +import ( + "context" + "fmt" + "net/url" + + log "github.com/hashicorp/go-hclog" + multierror "github.com/hashicorp/go-multierror" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" + "github.com/hashicorp/nomad/nomad/structs" + "go.opentelemetry.io/otel/baggage" +) + +const envKeyOtelResourceAttrs = "OTEL_RESOURCE_ATTRIBUTES" + +type otelHookConfig struct { + logger log.Logger + alloc *structs.Allocation + node *structs.Node +} + +type otelHook struct { + alloc *structs.Allocation + node *structs.Node + logger log.Logger +} + +func newOtelHook(config *otelHookConfig) *otelHook { + hook := &otelHook{ 
+ alloc: config.alloc, + node: config.node, + } + hook.logger = config.logger.Named(hook.Name()). + With("alloc_id", config.alloc.ID) + + return hook +} + +func (h *otelHook) Name() string { + return "otel" +} + +func (h *otelHook) Prestart(_ context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error { + logger := h.logger.With("task", req.Task.Name) + + resourceAttrsEnv, ok := req.TaskEnv.EnvMap[envKeyOtelResourceAttrs] + if ok && resourceAttrsEnv == "" { + logger.Debug("skipping OTEL_RESOURCE_ATTRIBUTES environment variable") + return nil + } + + resourceAttrs, err := generateBaggage(h.alloc, req.Task, h.node) + if err != nil { + logger.Warn("failed to generate OTEL_RESOURCE_ATTRIBUTES environment variable", "error", err) + return nil + } + + if resourceAttrsEnv != "" { + logger.Debug("merging existing OTEL_RESOURCE_ATTRIBUTES environment variable values", "attrs", resourceAttrsEnv) + + taskBaggage, err := baggage.Parse(resourceAttrsEnv) + if err != nil { + logger.Warn("failed to parse task environment variable OTEL_RESOURCE_ATTRIBUTES as baggage", + "otel_resource_attributes", resourceAttrsEnv, "error", err) + } else { + for _, m := range taskBaggage.Members() { + k, v := m.Key(), m.Value() + logger.Trace("found member", "key", k, "value", v) + + // TODO(luiz): don't create new member once baggage.Members() + // returns values with `hasData` set to `true`. + // https://github.com/open-telemetry/opentelemetry-go/issues/3164 + member, err := baggage.NewMember(k, v) + if err != nil { + logger.Warn("failed to create new baggage member", "key", k, "value", v, "error", err) + continue + } + + resourceAttrs, err = resourceAttrs.SetMember(member) + if err != nil { + logger.Warn("failed to set new baggage member", "key", k, "value", v, "error", err) + continue + } + } + } + } + + // TODO(luiz): remove decode step once the Otel SDK handles it internally. 
+ // https://github.com/open-telemetry/opentelemetry-go/pull/2963 + attrs, err := url.QueryUnescape(resourceAttrs.String()) + if err != nil { + attrs = resourceAttrs.String() + } + resp.Env = map[string]string{ + envKeyOtelResourceAttrs: attrs, + } + return nil +} + +func generateBaggage(alloc *structs.Allocation, task *structs.Task, node *structs.Node) (baggage.Baggage, error) { + mErr := new(multierror.Error) // must be non-nil: newMember and multierror.Append record errors in place through this pointer; a nil pointer silently drops them + job := alloc.Job + members := []baggage.Member{ + newMember("nomad.alloc.createTime", fmt.Sprintf("%v", alloc.CreateTime), mErr), + newMember("nomad.alloc.id", alloc.ID, mErr), + newMember("nomad.alloc.name", alloc.Name, mErr), + newMember("nomad.eval.id", alloc.EvalID, mErr), + newMember("nomad.group.name", alloc.TaskGroup, mErr), + newMember("nomad.job.id", job.ID, mErr), + newMember("nomad.job.name", job.Name, mErr), + newMember("nomad.job.region", job.Region, mErr), + newMember("nomad.job.type", job.Type, mErr), + newMember("nomad.namespace", alloc.Namespace, mErr), + newMember("nomad.node.id", node.ID, mErr), + newMember("nomad.node.name", node.Name, mErr), + newMember("nomad.node.datacenter", node.Datacenter, mErr), + newMember("nomad.task.name", task.Name, mErr), + newMember("nomad.task.driver", task.Driver, mErr), + } + if job.ParentID != "" { + members = append(members, newMember("nomad.job.parentId", job.ParentID, mErr)) + } + if node.NodeClass != "" { + members = append(members, newMember("nomad.node.class", node.NodeClass, mErr)) + } + if err := mErr.ErrorOrNil(); err != nil { + return baggage.Baggage{}, err + } + + b, err := baggage.New(members...) 
+ if err != nil { + _ = multierror.Append(mErr, err) + } + return b, mErr.ErrorOrNil() +} + +func newMember(key, value string, mErr *multierror.Error) baggage.Member { + m, err := baggage.NewMember(key, value) + if err != nil { + _ = multierror.Append(mErr, err) + } + return m +} diff --git a/client/allocrunner/taskrunner/otel_hook_test.go b/client/allocrunner/taskrunner/otel_hook_test.go new file mode 100644 index 000000000000..9d506d2def47 --- /dev/null +++ b/client/allocrunner/taskrunner/otel_hook_test.go @@ -0,0 +1,140 @@ +package taskrunner + +import ( + "context" + "fmt" + "os" + "testing" + + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" + "github.com/hashicorp/nomad/client/taskenv" + "github.com/hashicorp/nomad/nomad/mock" + "go.opentelemetry.io/otel/baggage" + + "github.com/shoenig/test/must" +) + +// Statically assert the otel hook implements the expected interfaces +var _ interfaces.TaskPrestartHook = &otelHook{} + +func TestTaskRunner_OtelHook(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + taskEnv map[string]string + expectNomadAttrs bool + expectAdditionalAttrs map[string]string + }{ + { + name: "tasks have otel resource attributes env var", + expectNomadAttrs: true, + }, + { + name: "disable otel resource attributes env var", + taskEnv: map[string]string{ + envKeyOtelResourceAttrs: "", + }, + expectNomadAttrs: false, + }, + { + name: "merge otel resource attributes env var", + taskEnv: map[string]string{ + envKeyOtelResourceAttrs: "test=true", + }, + expectNomadAttrs: true, + expectAdditionalAttrs: map[string]string{ + "test": "true", + }, + }, + { + name: "invalid values are ignored", + taskEnv: map[string]string{ + envKeyOtelResourceAttrs: "not-valid", + }, + expectNomadAttrs: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + alloc := mock.Alloc() + node 
:= mock.Node() + task := mock.Job().TaskGroups[0].Tasks[0] + + otelHook := newOtelHook(&otelHookConfig{ + logger: hclog.NewNullLogger(), + alloc: alloc, + node: node, + }) + + // Set up task environment with additional test values. + builder := taskenv.NewBuilder(node, alloc, task, "global") + taskEnv := builder.Build() + for k, v := range tc.taskEnv { + taskEnv.EnvMap[k] = v + } + + // Run hook. + req := &interfaces.TaskPrestartRequest{ + TaskEnv: taskEnv, + TaskDir: &allocdir.TaskDir{Dir: os.TempDir()}, + Task: task, + } + resp := interfaces.TaskPrestartResponse{} + err := otelHook.Prestart(context.Background(), req, &resp) + must.NoError(t, err) + + // Read and parse resulting OTEL_RESOURCE_ATTRIBUTES env var. + got := resp.Env[envKeyOtelResourceAttrs] + b, err := baggage.Parse(got) + must.NoError(t, err) + + if tc.expectNomadAttrs { + must.Eq(t, b.Member("nomad.alloc.id").Value(), alloc.ID) + must.Eq(t, b.Member("nomad.alloc.name").Value(), alloc.Name) + must.Eq(t, b.Member("nomad.alloc.createTime").Value(), fmt.Sprintf("%v", alloc.CreateTime)) + must.Eq(t, b.Member("nomad.eval.id").Value(), alloc.EvalID) + must.Eq(t, b.Member("nomad.job.id").Value(), alloc.Job.ID) + must.Eq(t, b.Member("nomad.job.name").Value(), alloc.Job.Name) + must.Eq(t, b.Member("nomad.job.region").Value(), alloc.Job.Region) + must.Eq(t, b.Member("nomad.job.type").Value(), alloc.Job.Type) + must.Eq(t, b.Member("nomad.namespace").Value(), alloc.Namespace) + must.Eq(t, b.Member("nomad.node.id").Value(), node.ID) + must.Eq(t, b.Member("nomad.node.name").Value(), node.Name) + must.Eq(t, b.Member("nomad.node.datacenter").Value(), node.Datacenter) + must.Eq(t, b.Member("nomad.task.name").Value(), task.Name) + must.Eq(t, b.Member("nomad.task.driver").Value(), task.Driver) + + if alloc.Job.ParentID != "" { + must.Eq(t, b.Member("nomad.job.parentId").Value(), alloc.Job.ParentID) + } else { + must.Eq(t, b.Member("nomad.job.parentId"), baggage.Member{}) + } + + if node.NodeClass != "" { + must.Eq(t, 
b.Member("nomad.node.class").Value(), node.NodeClass) + } else { + must.Eq(t, b.Member("nomad.node.class"), baggage.Member{}) + } + } else { + must.Eq(t, got, "") + } + + if len(tc.expectAdditionalAttrs) > 0 { + for k, v := range tc.expectAdditionalAttrs { + must.Eq(t, b.Member(k).Value(), v) + } + } else { + for _, m := range b.Members() { + // If no additional values are expected, all attributes + // must be related to Nomad. + must.StrContains(t, m.Key(), "nomad") + } + } + }) + } +} diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index 5c15e3a793a1..d69729f1308e 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -68,6 +68,11 @@ func (tr *TaskRunner) initHooks() { newArtifactHook(tr, tr.getter, hookLogger), newStatsHook(tr, tr.clientConfig.StatsCollectionInterval, hookLogger), newDeviceHook(tr.devicemanager, hookLogger), + newOtelHook(&otelHookConfig{ + logger: hookLogger, + alloc: tr.Alloc(), + node: tr.clientConfig.Node, + }), } // If the task has a CSI stanza, add the hook. 
+ diff --git a/go.mod b/go.mod index 3c78b5b4c6bc..dbe05f45032e 100644 --- a/go.mod +++ b/go.mod @@ -117,6 +117,7 @@ require ( github.com/zclconf/go-cty v1.8.0 github.com/zclconf/go-cty-yaml v1.0.2 go.etcd.io/bbolt v1.3.6 + go.opentelemetry.io/otel v1.9.0 go.uber.org/goleak v1.1.12 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d golang.org/x/exp v0.0.0-20220609121020-a51bd0440498 diff --git a/go.sum b/go.sum index 6cf4e9064318..3178fbaefed5 100644 --- a/go.sum +++ b/go.sum @@ -1294,6 +1294,8 @@ go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0 h1:gqCw0LfLxScz8irSi8exQc7fyQ0fKQU/qnC/X8+V/1M= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= +go.opentelemetry.io/otel v1.9.0 h1:8WZNQFIB2a71LnANS9JeyidJKKGOOremcUtb/OtHISw= +go.opentelemetry.io/otel v1.9.0/go.mod h1:np4EoPGzoPs3O67xUVNoPPcmSvsfOxNlNA4F4AC+0Eo= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= diff --git a/website/content/docs/runtime/environment.mdx b/website/content/docs/runtime/environment.mdx index 90ed25160ef4..daea33f2a169 100644 --- a/website/content/docs/runtime/environment.mdx +++ b/website/content/docs/runtime/environment.mdx @@ -112,7 +112,104 @@ using the `exec`, `raw_exec`, and `java` task drivers. The variables that are passed to the tasks can be controlled using the client configuration [`env.denylist`][]. +## OpenTelemetry resource attributes + +Nomad will automatically include the `OTEL_RESOURCE_ATTRIBUTES` environment +variable according to the [OpenTelemetry resource SDK +specification][otel_resource_spec]. The attribute keys use the `nomad.` scope. 
+ +| Attribute | Type | Description | Examples | +| ------------------------ | -------- | --------------------------- | -------- | +| `nomad.alloc.createTime` | `string` | The time the allocation was created. | `1663012858195456000` | +| `nomad.alloc.id` | `string` | The ID of the allocation. | `6cd78c59-b6ec-cc9d-c5df-09aa630bf5d6` | +| `nomad.alloc.name` | `string` | The name of the allocation. | `otel-demo.demo-client[0]` | +| `nomad.eval.id` | `string` | The evaluation ID that generated the allocation. | `02a24dc2-30ac-eaa9-9330-d2125dc5a3a0` | +| `nomad.group.name` | `string` | The name of the group that generated this allocation. | `demo-client` | +| `nomad.job.id` | `string` | The ID of the job that generated this allocation. | `otel-demo` | +| `nomad.job.name` | `string` | The name of the job that generated this allocation. | `otel-demo` | +| `nomad.job.parentId` | `string` | The ID of the parent job if applicable. | `parent-periodic-job` | +| `nomad.job.region` | `string` | The region of the job that generated this allocation. | `global` | +| `nomad.job.type` | `string` | The type of the job that generated this allocation. | `service` | +| `nomad.namespace` | `string` | The namespace where this allocation is running. | `default` | +| `nomad.node.class` | `string` | The node class of the node running this allocation, if defined. | `dc1` | +| `nomad.node.datacenter` | `string` | The datacenter of the node running this allocation. | `dc1` | +| `nomad.node.id` | `string` | The ID of the node running this allocation. | `8e441a6a-3885-26f0-1017-03e13605cfe7` | +| `nomad.node.name` | `string` | The name of the node running this allocation. | `nomad-1` | +| `nomad.task.driver` | `string` | The driver used by the task. | `exec` | +| `nomad.task.name` | `string` | The name of the task. 
| `client` | + +If a task already defines the `OTEL_RESOURCE_ATTRIBUTES` environment variable +in its [`env`][task_env] configuration, Nomad will merge the default values +above with the values defined by the task, with the task values taking +precedence. + +```hcl +job "example" { + # ... + group "cache" { + # ... + task "redis" { + # ... + env { + OTEL_RESOURCE_ATTRIBUTES = "app.version=1" + } + } + } +} +``` + +``` +OTEL_RESOURCE_ATTRIBUTES=app.version=1,nomad.alloc.id=d3551994-fb78-d73e-db8c-bc050f45fa0a,nomad.alloc.name=.... +``` + +If `OTEL_RESOURCE_ATTRIBUTES` is defined in a [`template`][template_env], it +will overwrite the Nomad default values. + +```hcl +job "example" { + # ... + group "cache" { + # ... + task "redis" { + # ... + template { + data = < + + + OTEL_RESOURCE_ATTRIBUTES + + + Encoded baggage of OpenTelemetry resource attributes related to Nomad. + Refer to + OpenTelemetry resource attributes + for more information. + + From 672882c1a9988a653533bb3314b80b030d40fd00 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Mon, 12 Sep 2022 20:49:32 -0400 Subject: [PATCH 2/2] changelog: add entry for #14556 --- .changelog/14556.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/14556.txt diff --git a/.changelog/14556.txt b/.changelog/14556.txt new file mode 100644 index 000000000000..efbd4b59f09f --- /dev/null +++ b/.changelog/14556.txt @@ -0,0 +1,3 @@ +```release-note:improvement +client: expose Nomad attributes to allocations using the `OTEL_RESOURCE_ATTRIBUTES` environment variable +```