From 96ac3daacda6f7b1ff199adc84999d39a78c8812 Mon Sep 17 00:00:00 2001 From: Jina Jain Date: Sun, 7 Sep 2025 22:40:05 -0700 Subject: [PATCH 1/9] initial commit for clusterobservability CRD --- apis/v1alpha1/clusterobservability_types.go | 295 ++++++++ apis/v1alpha1/zz_generated.deepcopy.go | 260 +++++++ ...emetry-operator.clusterserviceversion.yaml | 31 +- ...entelemetry.io_clusterobservabilities.yaml | 198 ++++++ ...emetry-operator.clusterserviceversion.yaml | 31 +- ...entelemetry.io_clusterobservabilities.yaml | 198 ++++++ ...entelemetry.io_clusterobservabilities.yaml | 190 +++++ config/crd/kustomization.yaml | 1 + config/rbac/role.yaml | 26 +- ...ability_v1alpha1_clusterobservability.yaml | 55 ++ docs/api/clusterobservabilities.md | 576 ++++++++++++++++ docs/cluster-observability.md | 361 ++++++++++ go.sum | 27 +- internal/config/cli.go | 7 + .../clusterobservability_controller.go | 650 ++++++++++++++++++ internal/controllers/common.go | 4 + .../clusterobservability.go | 461 +++++++++++++ .../config/configs/agent-collector-base.yaml | 73 ++ .../configs/cluster-collector-base.yaml | 22 + .../openshift/agent-collector-overrides.yaml | 80 +++ .../cluster-collector-overrides.yaml | 29 + .../clusterobservability/config/loader.go | 459 +++++++++++++ .../clusterobservability/openshift_scc.go | 102 +++ internal/manifests/params.go | 21 +- .../status/clusterobservability/handle.go | 489 +++++++++++++ main.go | 16 + pkg/featuregate/featuregate.go | 7 + 27 files changed, 4601 insertions(+), 68 deletions(-) create mode 100644 apis/v1alpha1/clusterobservability_types.go create mode 100644 bundle/community/manifests/opentelemetry.io_clusterobservabilities.yaml create mode 100644 bundle/openshift/manifests/opentelemetry.io_clusterobservabilities.yaml create mode 100644 config/crd/bases/opentelemetry.io_clusterobservabilities.yaml create mode 100644 config/samples/clusterobservability_v1alpha1_clusterobservability.yaml create mode 100644 docs/api/clusterobservabilities.md 
create mode 100644 docs/cluster-observability.md create mode 100644 internal/controllers/clusterobservability_controller.go create mode 100644 internal/manifests/clusterobservability/clusterobservability.go create mode 100644 internal/manifests/clusterobservability/config/configs/agent-collector-base.yaml create mode 100644 internal/manifests/clusterobservability/config/configs/cluster-collector-base.yaml create mode 100644 internal/manifests/clusterobservability/config/configs/distros/openshift/agent-collector-overrides.yaml create mode 100644 internal/manifests/clusterobservability/config/configs/distros/openshift/cluster-collector-overrides.yaml create mode 100644 internal/manifests/clusterobservability/config/loader.go create mode 100644 internal/manifests/clusterobservability/openshift_scc.go create mode 100644 internal/status/clusterobservability/handle.go diff --git a/apis/v1alpha1/clusterobservability_types.go b/apis/v1alpha1/clusterobservability_types.go new file mode 100644 index 0000000000..63b3bc040f --- /dev/null +++ b/apis/v1alpha1/clusterobservability_types.go @@ -0,0 +1,295 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ObservabilitySignal represents the type of observability signal. +// +kubebuilder:validation:Enum=logs;traces;metrics;profiles +type ObservabilitySignal string + +const ( + ObservabilitySignalLogs ObservabilitySignal = "logs" + ObservabilitySignalTraces ObservabilitySignal = "traces" + ObservabilitySignalMetrics ObservabilitySignal = "metrics" + ObservabilitySignalProfiles ObservabilitySignal = "profiles" +) + +// OTLPHTTPExporter defines OTLP HTTP exporter configuration. +// This structure mirrors the official OpenTelemetry Collector otlphttpexporter configuration. +type OTLPHTTPExporter struct { + // Endpoint is the target base URL to send data to (e.g., https://example.com:4318). 
+ // +optional + Endpoint string `json:"endpoint,omitempty"` + + // TracesEndpoint is the target URL to send trace data to (e.g., https://example.com:4318/v1/traces). + // If this setting is present the endpoint setting is ignored for traces. + // +optional + TracesEndpoint string `json:"traces_endpoint,omitempty"` + + // MetricsEndpoint is the target URL to send metric data to (e.g., https://example.com:4318/v1/metrics). + // If this setting is present the endpoint setting is ignored for metrics. + // +optional + MetricsEndpoint string `json:"metrics_endpoint,omitempty"` + + // LogsEndpoint is the target URL to send log data to (e.g., https://example.com:4318/v1/logs). + // If this setting is present the endpoint setting is ignored for logs. + // +optional + LogsEndpoint string `json:"logs_endpoint,omitempty"` + + // ProfilesEndpoint is the target URL to send profile data to (e.g., https://example.com:4318/v1development/profiles). + // If this setting is present the endpoint setting is ignored for profiles. + // +optional + ProfilesEndpoint string `json:"profiles_endpoint,omitempty"` + + // TLS defines TLS configuration for the exporter. + // +optional + TLS *TLSConfig `json:"tls,omitempty"` + + // Timeout is the HTTP request time limit (e.g., "30s", "1m"). Default is 30s. + // +optional + Timeout string `json:"timeout,omitempty"` + + // ReadBufferSize for HTTP client. Default is 0. + // +optional + // +kubebuilder:validation:Minimum=0 + ReadBufferSize *int `json:"read_buffer_size,omitempty"` + + // WriteBufferSize for HTTP client. Default is 512 * 1024. + // +optional + // +kubebuilder:validation:Minimum=0 + WriteBufferSize *int `json:"write_buffer_size,omitempty"` + + // SendingQueue defines configuration for the sending queue. + // +optional + SendingQueue *SendingQueueConfig `json:"sending_queue,omitempty"` + + // RetryOnFailure defines retry configuration for failed requests. 
+ // +optional + RetryOnFailure *RetryConfig `json:"retry_on_failure,omitempty"` + + // Encoding defines the encoding to use for the messages. + // Valid options: proto, json. Default is proto. + // +optional + // +kubebuilder:validation:Enum=proto;json + Encoding string `json:"encoding,omitempty"` + + // Compression defines the compression algorithm to use. + // By default gzip compression is enabled. Use "none" to disable. + // +optional + // +kubebuilder:validation:Enum=gzip;none;"" + Compression string `json:"compression,omitempty"` + + // Headers defines additional headers to be sent with each request. + // +optional + Headers map[string]string `json:"headers,omitempty"` +} + +// TLSConfig defines TLS configuration for the OTLP HTTP exporter. +// This mirrors the OpenTelemetry Collector configtls settings. +type TLSConfig struct { + // CAFile is the path to the CA certificate file for server verification. + // +optional + CAFile string `json:"ca_file,omitempty"` + + // CertFile is the path to the client certificate file for mutual TLS. + // +optional + CertFile string `json:"cert_file,omitempty"` + + // KeyFile is the path to the client private key file for mutual TLS. + // +optional + KeyFile string `json:"key_file,omitempty"` + + // Insecure controls whether to use insecure transport. Default is false. + // +optional + Insecure bool `json:"insecure,omitempty"` + + // ServerName for TLS handshake. If empty, uses the hostname from endpoint. + // +optional + ServerName string `json:"server_name,omitempty"` +} + +// SendingQueueConfig defines configuration for the sending queue. +type SendingQueueConfig struct { + // Enabled controls whether the queue is enabled. Default is true. + // +optional + Enabled *bool `json:"enabled,omitempty"` + + // NumConsumers is the number of consumers that dequeue batches. Default is 10. 
+ // +optional + // +kubebuilder:validation:Minimum=1 + NumConsumers *int `json:"num_consumers,omitempty"` + + // QueueSize is the maximum number of batches allowed in queue at a given time. Default is 1000. + // +optional + // +kubebuilder:validation:Minimum=1 + QueueSize *int `json:"queue_size,omitempty"` +} + +// RetryConfig defines retry configuration for failed requests. +type RetryConfig struct { + // Enabled controls whether retry is enabled. Default is true. + // +optional + Enabled *bool `json:"enabled,omitempty"` + + // InitialInterval is the initial retry interval (e.g., "5s"). Default is 5s. + // +optional + InitialInterval string `json:"initial_interval,omitempty"` + + // RandomizationFactor is the randomization factor for retry intervals (e.g., "0.5"). Default is 0.5. + // +optional + RandomizationFactor string `json:"randomization_factor,omitempty"` + + // Multiplier is the multiplier for retry intervals (e.g., "1.5"). Default is 1.5. + // +optional + Multiplier string `json:"multiplier,omitempty"` + + // MaxInterval is the maximum retry interval (e.g., "30s"). Default is 30s. + // +optional + MaxInterval string `json:"max_interval,omitempty"` + + // MaxElapsedTime is the maximum elapsed time for retries (e.g., "5m"). Default is 5m. + // +optional + MaxElapsedTime string `json:"max_elapsed_time,omitempty"` +} + +// ClusterObservabilitySpec defines the desired state of ClusterObservability. +// This follows a simplified design using a single OTLP HTTP exporter for all signals. +type ClusterObservabilitySpec struct { + // Signals defines which observability signals to collect and export. + // Must contain at least one signal type from: logs, traces, metrics, profiles + // +required + // +kubebuilder:validation:MinItems=1 + // +listType=set + Signals []ObservabilitySignal `json:"signals"` + + // Exporter defines the OTLP HTTP exporter configuration for all signals. + // The collector will automatically append appropriate paths for each signal type. 
+ // +required + Exporter OTLPHTTPExporter `json:"exporter"` +} + +// ClusterObservabilityConditionType represents the type of condition. +type ClusterObservabilityConditionType string + +const ( + // ClusterObservabilityConditionReady indicates whether the ClusterObservability is ready. + ClusterObservabilityConditionReady ClusterObservabilityConditionType = "Ready" + // ClusterObservabilityConditionConfigured indicates whether the ClusterObservability is configured. + ClusterObservabilityConditionConfigured ClusterObservabilityConditionType = "Configured" + // ClusterObservabilityConditionConflicted indicates that multiple ClusterObservability resources exist. + ClusterObservabilityConditionConflicted ClusterObservabilityConditionType = "Conflicted" +) + +const ( + // ClusterObservabilityFinalizer is the finalizer used for ClusterObservability resources. + ClusterObservabilityFinalizer = "clusterobservability.opentelemetry.io/finalizer" +) + +// ClusterObservabilityCondition represents a condition of a ClusterObservability. +type ClusterObservabilityCondition struct { + // Type of condition. + // +required + Type ClusterObservabilityConditionType `json:"type"` + + // Status of the condition. + // +required + Status metav1.ConditionStatus `json:"status"` + + // Last time the condition transitioned from one status to another. + // +optional + LastTransitionTime metav1.Time `json:"lastTransitionTime,omitempty"` + + // The reason for the condition's last transition. + // +optional + Reason string `json:"reason,omitempty"` + + // A human readable message indicating details about the transition. + // +optional + Message string `json:"message,omitempty"` + + // ObservedGeneration represents the .metadata.generation that the condition was set based upon. + // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` +} + +// ClusterObservabilityStatus defines the observed state of ClusterObservability. 
+type ClusterObservabilityStatus struct { + // Conditions represent the latest available observations of the ClusterObservability state. + // +optional + // +listType=map + // +listMapKey=type + Conditions []ClusterObservabilityCondition `json:"conditions,omitempty"` + + // ObservedGeneration is the most recent generation observed for this ClusterObservability. + // It corresponds to the ClusterObservability's generation, which is updated on mutation + // by the API Server. + // +optional + ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // Phase represents the current phase of the ClusterObservability. + // +optional + Phase string `json:"phase,omitempty"` + + // Message provides additional information about the current state. + // +optional + Message string `json:"message,omitempty"` + + // ComponentsStatus provides status information about individual observability components. + // +optional + ComponentsStatus map[string]ComponentStatus `json:"componentsStatus,omitempty"` + + // ConfigVersions tracks the version hashes of the configuration files used. + // This enables detection of config changes when the operator is upgraded. + // +optional + ConfigVersions map[string]string `json:"configVersions,omitempty"` +} + +// ComponentStatus represents the status of an individual component. +type ComponentStatus struct { + // Ready indicates whether the component is ready. + // +optional + Ready bool `json:"ready,omitempty"` + + // Message provides additional information about the component status. + // +optional + Message string `json:"message,omitempty"` + + // LastUpdated is the last time this status was updated. 
+ // +optional + LastUpdated metav1.Time `json:"lastUpdated,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Signals",type="string",JSONPath=".spec.signals",description="Observability signals" +// +kubebuilder:printcolumn:name="Endpoint",type="string",JSONPath=".spec.exporter.endpoint",description="OTLP exporter endpoint" +// +kubebuilder:printcolumn:name="Phase",type="string",JSONPath=".status.phase",description="Current phase" +// +kubebuilder:printcolumn:name="Age",type="date",JSONPath=".metadata.creationTimestamp" +// +operator-sdk:csv:customresourcedefinitions:displayName="Cluster Observability" +// +operator-sdk:csv:customresourcedefinitions:resources={{Pod,v1},{Deployment,apps/v1},{ConfigMap,v1},{Service,v1},{DaemonSet,apps/v1}} + +// ClusterObservability is the Schema for the clusterobservabilities API. +type ClusterObservability struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec ClusterObservabilitySpec `json:"spec,omitempty"` + Status ClusterObservabilityStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// ClusterObservabilityList contains a list of ClusterObservability. +type ClusterObservabilityList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ClusterObservability `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ClusterObservability{}, &ClusterObservabilityList{}) +} diff --git a/apis/v1alpha1/zz_generated.deepcopy.go b/apis/v1alpha1/zz_generated.deepcopy.go index 2168d86fe3..e0e5ca4cdb 100644 --- a/apis/v1alpha1/zz_generated.deepcopy.go +++ b/apis/v1alpha1/zz_generated.deepcopy.go @@ -122,6 +122,154 @@ func (in *AutoscalerSpec) DeepCopy() *AutoscalerSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ClusterObservability) DeepCopyInto(out *ClusterObservability) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterObservability. +func (in *ClusterObservability) DeepCopy() *ClusterObservability { + if in == nil { + return nil + } + out := new(ClusterObservability) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterObservability) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterObservabilityCondition) DeepCopyInto(out *ClusterObservabilityCondition) { + *out = *in + in.LastTransitionTime.DeepCopyInto(&out.LastTransitionTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterObservabilityCondition. +func (in *ClusterObservabilityCondition) DeepCopy() *ClusterObservabilityCondition { + if in == nil { + return nil + } + out := new(ClusterObservabilityCondition) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterObservabilityList) DeepCopyInto(out *ClusterObservabilityList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ClusterObservability, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterObservabilityList. 
+func (in *ClusterObservabilityList) DeepCopy() *ClusterObservabilityList { + if in == nil { + return nil + } + out := new(ClusterObservabilityList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ClusterObservabilityList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ClusterObservabilitySpec) DeepCopyInto(out *ClusterObservabilitySpec) { + *out = *in + if in.Signals != nil { + in, out := &in.Signals, &out.Signals + *out = make([]ObservabilitySignal, len(*in)) + copy(*out, *in) + } + in.Exporter.DeepCopyInto(&out.Exporter) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterObservabilitySpec. +func (in *ClusterObservabilitySpec) DeepCopy() *ClusterObservabilitySpec { + if in == nil { + return nil + } + out := new(ClusterObservabilitySpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *ClusterObservabilityStatus) DeepCopyInto(out *ClusterObservabilityStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]ClusterObservabilityCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.ComponentsStatus != nil { + in, out := &in.ComponentsStatus, &out.ComponentsStatus + *out = make(map[string]ComponentStatus, len(*in)) + for key, val := range *in { + (*out)[key] = *val.DeepCopy() + } + } + if in.ConfigVersions != nil { + in, out := &in.ConfigVersions, &out.ConfigVersions + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterObservabilityStatus. +func (in *ClusterObservabilityStatus) DeepCopy() *ClusterObservabilityStatus { + if in == nil { + return nil + } + out := new(ClusterObservabilityStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComponentStatus) DeepCopyInto(out *ComponentStatus) { + *out = *in + in.LastUpdated.DeepCopyInto(&out.LastUpdated) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComponentStatus. +func (in *ComponentStatus) DeepCopy() *ComponentStatus { + if in == nil { + return nil + } + out := new(ComponentStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ConfigMapsSpec) DeepCopyInto(out *ConfigMapsSpec) { *out = *in @@ -526,6 +674,53 @@ func (in *NodeJS) DeepCopy() *NodeJS { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *OTLPHTTPExporter) DeepCopyInto(out *OTLPHTTPExporter) { + *out = *in + if in.TLS != nil { + in, out := &in.TLS, &out.TLS + *out = new(TLSConfig) + **out = **in + } + if in.ReadBufferSize != nil { + in, out := &in.ReadBufferSize, &out.ReadBufferSize + *out = new(int) + **out = **in + } + if in.WriteBufferSize != nil { + in, out := &in.WriteBufferSize, &out.WriteBufferSize + *out = new(int) + **out = **in + } + if in.SendingQueue != nil { + in, out := &in.SendingQueue, &out.SendingQueue + *out = new(SendingQueueConfig) + (*in).DeepCopyInto(*out) + } + if in.RetryOnFailure != nil { + in, out := &in.RetryOnFailure, &out.RetryOnFailure + *out = new(RetryConfig) + (*in).DeepCopyInto(*out) + } + if in.Headers != nil { + in, out := &in.Headers, &out.Headers + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new OTLPHTTPExporter. +func (in *OTLPHTTPExporter) DeepCopy() *OTLPHTTPExporter { + if in == nil { + return nil + } + out := new(OTLPHTTPExporter) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ObservabilitySpec) DeepCopyInto(out *ObservabilitySpec) { *out = *in @@ -1275,6 +1470,26 @@ func (in *Resource) DeepCopy() *Resource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RetryConfig) DeepCopyInto(out *RetryConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryConfig. 
+func (in *RetryConfig) DeepCopy() *RetryConfig { + if in == nil { + return nil + } + out := new(RetryConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Sampler) DeepCopyInto(out *Sampler) { *out = *in @@ -1305,6 +1520,36 @@ func (in *ScaleSubresourceStatus) DeepCopy() *ScaleSubresourceStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SendingQueueConfig) DeepCopyInto(out *SendingQueueConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.NumConsumers != nil { + in, out := &in.NumConsumers, &out.NumConsumers + *out = new(int) + **out = **in + } + if in.QueueSize != nil { + in, out := &in.QueueSize, &out.QueueSize + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SendingQueueConfig. +func (in *SendingQueueConfig) DeepCopy() *SendingQueueConfig { + if in == nil { + return nil + } + out := new(SendingQueueConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TLS) DeepCopyInto(out *TLS) { *out = *in @@ -1320,6 +1565,21 @@ func (in *TLS) DeepCopy() *TLS { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TLSConfig) DeepCopyInto(out *TLSConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TLSConfig. 
+func (in *TLSConfig) DeepCopy() *TLSConfig { + if in == nil { + return nil + } + out := new(TLSConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TargetAllocator) DeepCopyInto(out *TargetAllocator) { *out = *in diff --git a/bundle/community/manifests/opentelemetry-operator.clusterserviceversion.yaml b/bundle/community/manifests/opentelemetry-operator.clusterserviceversion.yaml index 7181a252e6..503f9ffbf4 100644 --- a/bundle/community/manifests/opentelemetry-operator.clusterserviceversion.yaml +++ b/bundle/community/manifests/opentelemetry-operator.clusterserviceversion.yaml @@ -99,7 +99,7 @@ metadata: categories: Logging & Tracing,Monitoring certified: "false" containerImage: ghcr.io/open-telemetry/opentelemetry-operator/opentelemetry-operator - createdAt: "2025-10-27T18:11:36Z" + createdAt: "2025-11-03T16:15:22Z" description: Provides the OpenTelemetry components, including the Collector operators.operatorframework.io/builder: operator-sdk-v1.29.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -111,6 +111,9 @@ spec: apiservicedefinitions: {} customresourcedefinitions: owned: + - kind: ClusterObservability + name: clusterobservabilities.opentelemetry.io + version: v1alpha1 - description: Instrumentation is the spec for OpenTelemetry instrumentation. 
displayName: OpenTelemetry Instrumentation kind: Instrumentation @@ -429,18 +432,10 @@ spec: - apiGroups: - opentelemetry.io resources: + - clusterobservabilities - instrumentations - - opentelemetrycollectors - verbs: - - get - - list - - patch - - update - - watch - - apiGroups: - - opentelemetry.io - resources: - opampbridges + - opentelemetrycollectors - targetallocators - targetallocators/finalizers verbs: @@ -454,12 +449,14 @@ spec: - apiGroups: - opentelemetry.io resources: + - clusterobservabilities/finalizers - opampbridges/finalizers verbs: - update - apiGroups: - opentelemetry.io resources: + - clusterobservabilities/status - opampbridges/status - opentelemetrycollectors/finalizers - opentelemetrycollectors/status @@ -493,6 +490,18 @@ spec: - patch - update - watch + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - authentication.k8s.io resources: diff --git a/bundle/community/manifests/opentelemetry.io_clusterobservabilities.yaml b/bundle/community/manifests/opentelemetry.io_clusterobservabilities.yaml new file mode 100644 index 0000000000..bd55f9fcd6 --- /dev/null +++ b/bundle/community/manifests/opentelemetry.io_clusterobservabilities.yaml @@ -0,0 +1,198 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + creationTimestamp: null + labels: + app.kubernetes.io/name: opentelemetry-operator + name: clusterobservabilities.opentelemetry.io +spec: + group: opentelemetry.io + names: + kind: ClusterObservability + listKind: ClusterObservabilityList + plural: clusterobservabilities + singular: clusterobservability + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Observability signals + jsonPath: .spec.signals + name: Signals + type: string + - description: OTLP exporter endpoint + jsonPath: 
.spec.exporter.endpoint + name: Endpoint + type: string + - description: Current phase + jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + type: object + spec: + properties: + exporter: + properties: + compression: + enum: + - gzip + - none + - "" + type: string + encoding: + enum: + - proto + - json + type: string + endpoint: + type: string + headers: + additionalProperties: + type: string + type: object + logs_endpoint: + type: string + metrics_endpoint: + type: string + profiles_endpoint: + type: string + read_buffer_size: + minimum: 0 + type: integer + retry_on_failure: + properties: + enabled: + type: boolean + initial_interval: + type: string + max_elapsed_time: + type: string + max_interval: + type: string + multiplier: + type: string + randomization_factor: + type: string + type: object + sending_queue: + properties: + enabled: + type: boolean + num_consumers: + minimum: 1 + type: integer + queue_size: + minimum: 1 + type: integer + type: object + timeout: + type: string + tls: + properties: + ca_file: + type: string + cert_file: + type: string + insecure: + type: boolean + key_file: + type: string + server_name: + type: string + type: object + traces_endpoint: + type: string + write_buffer_size: + minimum: 0 + type: integer + type: object + signals: + items: + enum: + - logs + - traces + - metrics + - profiles + type: string + minItems: 1 + type: array + x-kubernetes-list-type: set + required: + - exporter + - signals + type: object + status: + properties: + componentsStatus: + additionalProperties: + properties: + lastUpdated: + format: date-time + type: string + message: + type: string + ready: + type: boolean + type: object + type: object + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + type: string + 
observedGeneration: + format: int64 + type: integer + reason: + type: string + status: + type: string + type: + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + configVersions: + additionalProperties: + type: string + type: object + message: + type: string + observedGeneration: + format: int64 + type: integer + phase: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/bundle/openshift/manifests/opentelemetry-operator.clusterserviceversion.yaml b/bundle/openshift/manifests/opentelemetry-operator.clusterserviceversion.yaml index 13f823e8de..aca4d312c0 100644 --- a/bundle/openshift/manifests/opentelemetry-operator.clusterserviceversion.yaml +++ b/bundle/openshift/manifests/opentelemetry-operator.clusterserviceversion.yaml @@ -99,7 +99,7 @@ metadata: categories: Logging & Tracing,Monitoring certified: "false" containerImage: ghcr.io/open-telemetry/opentelemetry-operator/opentelemetry-operator - createdAt: "2025-10-27T18:11:36Z" + createdAt: "2025-11-03T16:15:22Z" description: Provides the OpenTelemetry components, including the Collector operators.operatorframework.io/builder: operator-sdk-v1.29.0 operators.operatorframework.io/project_layout: go.kubebuilder.io/v3 @@ -111,6 +111,9 @@ spec: apiservicedefinitions: {} customresourcedefinitions: owned: + - kind: ClusterObservability + name: clusterobservabilities.opentelemetry.io + version: v1alpha1 - description: Instrumentation is the spec for OpenTelemetry instrumentation. 
displayName: OpenTelemetry Instrumentation kind: Instrumentation @@ -429,18 +432,10 @@ spec: - apiGroups: - opentelemetry.io resources: + - clusterobservabilities - instrumentations - - opentelemetrycollectors - verbs: - - get - - list - - patch - - update - - watch - - apiGroups: - - opentelemetry.io - resources: - opampbridges + - opentelemetrycollectors - targetallocators - targetallocators/finalizers verbs: @@ -454,12 +449,14 @@ spec: - apiGroups: - opentelemetry.io resources: + - clusterobservabilities/finalizers - opampbridges/finalizers verbs: - update - apiGroups: - opentelemetry.io resources: + - clusterobservabilities/status - opampbridges/status - opentelemetrycollectors/finalizers - opentelemetrycollectors/status @@ -493,6 +490,18 @@ spec: - patch - update - watch + - apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - authentication.k8s.io resources: diff --git a/bundle/openshift/manifests/opentelemetry.io_clusterobservabilities.yaml b/bundle/openshift/manifests/opentelemetry.io_clusterobservabilities.yaml new file mode 100644 index 0000000000..bd55f9fcd6 --- /dev/null +++ b/bundle/openshift/manifests/opentelemetry.io_clusterobservabilities.yaml @@ -0,0 +1,198 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + creationTimestamp: null + labels: + app.kubernetes.io/name: opentelemetry-operator + name: clusterobservabilities.opentelemetry.io +spec: + group: opentelemetry.io + names: + kind: ClusterObservability + listKind: ClusterObservabilityList + plural: clusterobservabilities + singular: clusterobservability + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Observability signals + jsonPath: .spec.signals + name: Signals + type: string + - description: OTLP exporter endpoint + jsonPath: 
.spec.exporter.endpoint + name: Endpoint + type: string + - description: Current phase + jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + type: object + spec: + properties: + exporter: + properties: + compression: + enum: + - gzip + - none + - "" + type: string + encoding: + enum: + - proto + - json + type: string + endpoint: + type: string + headers: + additionalProperties: + type: string + type: object + logs_endpoint: + type: string + metrics_endpoint: + type: string + profiles_endpoint: + type: string + read_buffer_size: + minimum: 0 + type: integer + retry_on_failure: + properties: + enabled: + type: boolean + initial_interval: + type: string + max_elapsed_time: + type: string + max_interval: + type: string + multiplier: + type: string + randomization_factor: + type: string + type: object + sending_queue: + properties: + enabled: + type: boolean + num_consumers: + minimum: 1 + type: integer + queue_size: + minimum: 1 + type: integer + type: object + timeout: + type: string + tls: + properties: + ca_file: + type: string + cert_file: + type: string + insecure: + type: boolean + key_file: + type: string + server_name: + type: string + type: object + traces_endpoint: + type: string + write_buffer_size: + minimum: 0 + type: integer + type: object + signals: + items: + enum: + - logs + - traces + - metrics + - profiles + type: string + minItems: 1 + type: array + x-kubernetes-list-type: set + required: + - exporter + - signals + type: object + status: + properties: + componentsStatus: + additionalProperties: + properties: + lastUpdated: + format: date-time + type: string + message: + type: string + ready: + type: boolean + type: object + type: object + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + type: string + 
observedGeneration: + format: int64 + type: integer + reason: + type: string + status: + type: string + type: + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + configVersions: + additionalProperties: + type: string + type: object + message: + type: string + observedGeneration: + format: int64 + type: integer + phase: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} +status: + acceptedNames: + kind: "" + plural: "" + conditions: null + storedVersions: null diff --git a/config/crd/bases/opentelemetry.io_clusterobservabilities.yaml b/config/crd/bases/opentelemetry.io_clusterobservabilities.yaml new file mode 100644 index 0000000000..2e93aff948 --- /dev/null +++ b/config/crd/bases/opentelemetry.io_clusterobservabilities.yaml @@ -0,0 +1,190 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: clusterobservabilities.opentelemetry.io +spec: + group: opentelemetry.io + names: + kind: ClusterObservability + listKind: ClusterObservabilityList + plural: clusterobservabilities + singular: clusterobservability + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Observability signals + jsonPath: .spec.signals + name: Signals + type: string + - description: OTLP exporter endpoint + jsonPath: .spec.exporter.endpoint + name: Endpoint + type: string + - description: Current phase + jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + properties: + apiVersion: + type: string + kind: + type: string + metadata: + type: object + spec: + properties: + exporter: + properties: + compression: + enum: + - gzip + - none + - "" + type: string + encoding: + enum: + - proto + - json + type: string + 
endpoint: + type: string + headers: + additionalProperties: + type: string + type: object + logs_endpoint: + type: string + metrics_endpoint: + type: string + profiles_endpoint: + type: string + read_buffer_size: + minimum: 0 + type: integer + retry_on_failure: + properties: + enabled: + type: boolean + initial_interval: + type: string + max_elapsed_time: + type: string + max_interval: + type: string + multiplier: + type: string + randomization_factor: + type: string + type: object + sending_queue: + properties: + enabled: + type: boolean + num_consumers: + minimum: 1 + type: integer + queue_size: + minimum: 1 + type: integer + type: object + timeout: + type: string + tls: + properties: + ca_file: + type: string + cert_file: + type: string + insecure: + type: boolean + key_file: + type: string + server_name: + type: string + type: object + traces_endpoint: + type: string + write_buffer_size: + minimum: 0 + type: integer + type: object + signals: + items: + enum: + - logs + - traces + - metrics + - profiles + type: string + minItems: 1 + type: array + x-kubernetes-list-type: set + required: + - exporter + - signals + type: object + status: + properties: + componentsStatus: + additionalProperties: + properties: + lastUpdated: + format: date-time + type: string + message: + type: string + ready: + type: boolean + type: object + type: object + conditions: + items: + properties: + lastTransitionTime: + format: date-time + type: string + message: + type: string + observedGeneration: + format: int64 + type: integer + reason: + type: string + status: + type: string + type: + type: string + required: + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + configVersions: + additionalProperties: + type: string + type: object + message: + type: string + observedGeneration: + format: int64 + type: integer + phase: + type: string + type: object + type: object + served: true + storage: true + subresources: + status: 
{} diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml index 7c7d535ff4..6e9fcc44b5 100644 --- a/config/crd/kustomization.yaml +++ b/config/crd/kustomization.yaml @@ -6,6 +6,7 @@ resources: - bases/opentelemetry.io_instrumentations.yaml - bases/opentelemetry.io_opampbridges.yaml - bases/opentelemetry.io_targetallocators.yaml +- bases/opentelemetry.io_clusterobservabilities.yaml # +kubebuilder:scaffold:crdkustomizeresource # patches here are for enabling the conversion webhook for each CRD diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index df0e9aa6de..ecddc921ac 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -137,18 +137,10 @@ rules: - apiGroups: - opentelemetry.io resources: + - clusterobservabilities - instrumentations - - opentelemetrycollectors - verbs: - - get - - list - - patch - - update - - watch -- apiGroups: - - opentelemetry.io - resources: - opampbridges + - opentelemetrycollectors - targetallocators - targetallocators/finalizers verbs: @@ -162,12 +154,14 @@ rules: - apiGroups: - opentelemetry.io resources: + - clusterobservabilities/finalizers - opampbridges/finalizers verbs: - update - apiGroups: - opentelemetry.io resources: + - clusterobservabilities/status - opampbridges/status - opentelemetrycollectors/finalizers - opentelemetrycollectors/status @@ -201,3 +195,15 @@ rules: - patch - update - watch +- apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - create + - delete + - get + - list + - patch + - update + - watch diff --git a/config/samples/clusterobservability_v1alpha1_clusterobservability.yaml b/config/samples/clusterobservability_v1alpha1_clusterobservability.yaml new file mode 100644 index 0000000000..cd529f79b4 --- /dev/null +++ b/config/samples/clusterobservability_v1alpha1_clusterobservability.yaml @@ -0,0 +1,55 @@ +# Copyright The OpenTelemetry Authors +# SPDX-License-Identifier: Apache-2.0 + +# Example ClusterObservability configuration +# 
This resource manages a complete observability stack for your Kubernetes cluster +apiVersion: opentelemetry.io/v1alpha1 +kind: ClusterObservability +metadata: + name: cluster-observability + namespace: opentelemetry-operator-system +spec: + # Signals to collect - must include at least one + signals: + - traces + - metrics + - logs + + # OTLP HTTP exporter configuration + exporter: + # Base endpoint for all signals (e.g., https://otel-backend.example.com:4318) + endpoint: "https://otel-backend.example.com:4318" + + # Optional: Override endpoint for specific signals + # traces_endpoint: "https://trace-backend.example.com/v2/trace/otlp" + # metrics_endpoint: "https://metrics-backend.example.com/v2/datapoint/otlp" + # logs_endpoint: "https://logs-backend.example.com/v1/logs" + + # Optional: Add headers (e.g., for authentication) + headers: + "Authorization": "Bearer your-token-here" + + # Optional: Compression (gzip or none) + compression: "gzip" + + # Optional: Request timeout + timeout: "30s" + + # Optional: TLS configuration + # Note: File mounting for ca_file, cert_file, key_file is not yet supported + # tls: + # insecure: false + # server_name: "otel-backend.example.com" + + # Optional: Retry configuration + # retry_on_failure: + # enabled: true + # initial_interval: "5s" + # max_interval: "30s" + # max_elapsed_time: "5m" + + # Optional: Sending queue configuration + # sending_queue: + # enabled: true + # num_consumers: 10 + # queue_size: 1000 diff --git a/docs/api/clusterobservabilities.md b/docs/api/clusterobservabilities.md new file mode 100644 index 0000000000..5281f0cd47 --- /dev/null +++ b/docs/api/clusterobservabilities.md @@ -0,0 +1,576 @@ +# API Reference + +Packages: + +- [opentelemetry.io/v1alpha1](#opentelemetryiov1alpha1) + +# opentelemetry.io/v1alpha1 + +Resource Types: + +- [ClusterObservability](#clusterobservability) + + + + +## ClusterObservability +[↩ Parent](#opentelemetryiov1alpha1 ) + + + + + + +ClusterObservability is the Schema for the 
clusterobservabilities API. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
apiVersionstringopentelemetry.io/v1alpha1true
kindstringClusterObservabilitytrue
metadataobjectRefer to the Kubernetes API documentation for the fields of the `metadata` field.true
specobject + ClusterObservabilitySpec defines the desired state of ClusterObservability. +This follows a simplified design using a single OTLP HTTP exporter for all signals.
+
false
statusobject + ClusterObservabilityStatus defines the observed state of ClusterObservability.
+
false
+ + +### ClusterObservability.spec +[↩ Parent](#clusterobservability) + + + +ClusterObservabilitySpec defines the desired state of ClusterObservability. +This follows a simplified design using a single OTLP HTTP exporter for all signals. + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
exporterobject + Exporter defines the OTLP HTTP exporter configuration for all signals. +The collector will automatically append appropriate paths for each signal type.
+
true
signals[]enum + Signals defines which observability signals to collect and export. +Must contain at least one signal type from: logs, traces, metrics, profiles
+
true
+ + +### ClusterObservability.spec.exporter +[↩ Parent](#clusterobservabilityspec) + + + +Exporter defines the OTLP HTTP exporter configuration for all signals. +The collector will automatically append appropriate paths for each signal type. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
compressionenum + Compression defines the compression algorithm to use. +By default gzip compression is enabled. Use "none" to disable.
+
+ Enum: gzip, none,
+
false
encodingenum + Encoding defines the encoding to use for the messages. +Valid options: proto, json. Default is proto.
+
+ Enum: proto, json
+
false
endpointstring + Endpoint is the target base URL to send data to (e.g., https://example.com:4318).
+
false
headersmap[string]string + Headers defines additional headers to be sent with each request.
+
false
logs_endpointstring + LogsEndpoint is the target URL to send log data to (e.g., https://example.com:4318/v1/logs). +If this setting is present the endpoint setting is ignored for logs.
+
false
metrics_endpointstring + MetricsEndpoint is the target URL to send metric data to (e.g., https://example.com:4318/v1/metrics). +If this setting is present the endpoint setting is ignored for metrics.
+
false
profiles_endpointstring + ProfilesEndpoint is the target URL to send profile data to (e.g., https://example.com:4318/v1/development/profiles). +If this setting is present the endpoint setting is ignored for profiles.
+
false
read_buffer_sizeinteger + ReadBufferSize for HTTP client. Default is 0.
+
+ Minimum: 0
+
false
retry_on_failureobject + RetryOnFailure defines retry configuration for failed requests.
+
false
sending_queueobject + SendingQueue defines configuration for the sending queue.
+
false
timeoutstring + Timeout is the HTTP request time limit (e.g., "30s", "1m"). Default is 30s.
+
false
tlsobject + TLS defines TLS configuration for the exporter.
+
false
traces_endpointstring + TracesEndpoint is the target URL to send trace data to (e.g., https://example.com:4318/v1/traces). +If this setting is present the endpoint setting is ignored for traces.
+
false
write_buffer_sizeinteger + WriteBufferSize for HTTP client. Default is 512 * 1024.
+
+ Minimum: 0
+
false
+ + +### ClusterObservability.spec.exporter.retry_on_failure +[↩ Parent](#clusterobservabilityspecexporter) + + + +RetryOnFailure defines retry configuration for failed requests. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean + Enabled controls whether retry is enabled. Default is true.
+
false
initial_intervalstring + InitialInterval is the initial retry interval (e.g., "5s"). Default is 5s.
+
false
max_elapsed_timestring + MaxElapsedTime is the maximum elapsed time for retries (e.g., "5m"). Default is 5m.
+
false
max_intervalstring + MaxInterval is the maximum retry interval (e.g., "30s"). Default is 30s.
+
false
multiplierstring + Multiplier is the multiplier for retry intervals (e.g., "1.5"). Default is 1.5.
+
false
randomization_factorstring + RandomizationFactor is the randomization factor for retry intervals (e.g., "0.5"). Default is 0.5.
+
false
+ + +### ClusterObservability.spec.exporter.sending_queue +[↩ Parent](#clusterobservabilityspecexporter) + + + +SendingQueue defines configuration for the sending queue. + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
enabledboolean + Enabled controls whether the queue is enabled. Default is true.
+
false
num_consumersinteger + NumConsumers is the number of consumers that dequeue batches. Default is 10.
+
+ Minimum: 1
+
false
queue_sizeinteger + QueueSize is the maximum number of batches allowed in queue at a given time. Default is 1000.
+
+ Minimum: 1
+
false
+ + +### ClusterObservability.spec.exporter.tls +[↩ Parent](#clusterobservabilityspecexporter) + + + +TLS defines TLS configuration for the exporter. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
ca_filestring + CAFile is the path to the CA certificate file for server verification.
+
false
cert_filestring + CertFile is the path to the client certificate file for mutual TLS.
+
false
insecureboolean + Insecure controls whether to use insecure transport. Default is false.
+
false
key_filestring + KeyFile is the path to the client private key file for mutual TLS.
+
false
server_namestring + ServerName for TLS handshake. If empty, uses the hostname from endpoint.
+
false
+ + +### ClusterObservability.status +[↩ Parent](#clusterobservability) + + + +ClusterObservabilityStatus defines the observed state of ClusterObservability. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
componentsStatusmap[string]object + ComponentsStatus provides status information about individual observability components.
+
false
conditions[]object + Conditions represent the latest available observations of the ClusterObservability state.
+
false
configVersionsmap[string]string + ConfigVersions tracks the version hashes of the configuration files used. +This enables detection of config changes when operator is upgraded.
+
false
messagestring + Message provides additional information about the current state.
+
false
observedGenerationinteger + ObservedGeneration is the most recent generation observed for this ClusterObservability. +It corresponds to the ClusterObservability's generation, which is updated on mutation +by the API Server.
+
+ Format: int64
+
false
phasestring + Phase represents the current phase of the ClusterObservability.
+
false
+ + +### ClusterObservability.status.componentsStatus[key] +[↩ Parent](#clusterobservabilitystatus) + + + +ComponentStatus represents the status of an individual component. + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
lastUpdatedstring + LastUpdated is the last time this status was updated.
+
+ Format: date-time
+
false
messagestring + Message provides additional information about the component status.
+
false
readyboolean + Ready indicates whether the component is ready.
+
false
+ + +### ClusterObservability.status.conditions[index] +[↩ Parent](#clusterobservabilitystatus) + + + +ClusterObservabilityCondition represents a condition of a ClusterObservability. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionRequired
statusstring + Status of the condition.
+
true
typestring + Type of condition.
+
true
lastTransitionTimestring + Last time the condition transitioned from one status to another.
+
+ Format: date-time
+
false
messagestring + A human readable message indicating details about the transition.
+
false
observedGenerationinteger + ObservedGeneration represents the .metadata.generation that the condition was set based upon.
+
+ Format: int64
+
false
reasonstring + The reason for the condition's last transition.
+
false
\ No newline at end of file diff --git a/docs/cluster-observability.md b/docs/cluster-observability.md new file mode 100644 index 0000000000..d38561623b --- /dev/null +++ b/docs/cluster-observability.md @@ -0,0 +1,361 @@ +# ClusterObservability Controller + +ClusterObservability provides a streamlined way to deploy and manage OpenTelemetry observability components across an entire Kubernetes cluster with a single Custom Resource. + +## Overview + +ClusterObservability automatically creates and manages: +- **Agent Collector**: DaemonSet for node-level metrics, logs, and host OTLP receiver +- **Cluster Collector**: Deployment for cluster-level k8s metrics and events +- **Auto-Instrumentation**: Single Instrumentation CR for application instrumentation (points to local agent) + +The controller uses a **controller-of-controllers pattern**, creating `OpenTelemetryCollector` and `Instrumentation` CRs that are managed by their respective controllers. + +## Quick Start + +```yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: ClusterObservability +metadata: + name: cluster-observability + namespace: opentelemetry-operator-system +spec: + signals: ["metrics", "traces", "logs"] + # OTLP HTTP exporter only + exporter: + endpoint: "https://otel-backend.example.com:4318" + # Optional: override endpoint for specific signals + traces_endpoint: "https://trace-backend.example.com/v2/trace/otlp" + metrics_endpoint: "https://metrics-backend.example.com/v2/datapoint/otlp" + logs_endpoint: "https://logs-backend.example.com/v1/logs" + headers: + "Authorization": "Bearer your-token" + compression: "gzip" + timeout: "30s" +``` + +## Architecture + +```mermaid +graph TB + CO[ClusterObservability CR] --> Controller[ClusterObservability Controller] + + Controller --> OTLC1[OpenTelemetryCollector CR
Agent DaemonSet] + Controller --> OTLC2[OpenTelemetryCollector CR
Cluster Deployment] + Controller --> INSTR[Instrumentation CR
Single Instance
Points to local agent] + + OTLC1 --> OTC1[OpenTelemetryCollector Controller] + OTLC2 --> OTC2[OpenTelemetryCollector Controller] + INSTR --> IC[Instrumentation Controller] + + OTC1 --> DS[Agent DaemonSet] + OTC2 --> DEP[Cluster Deployment] + IC --> POD[Instrumented Pods] + + subgraph "OpenShift Integration" + Controller --> SCC[Security Context Constraints] + DS --> KubeletCA[Kubelet CA Certificate] + end + + subgraph "Configuration System" + Controller --> ConfigLoader[Config Loader] + ConfigLoader --> BaseConfig[Base Configs] + ConfigLoader --> DistroConfig[Distro Overrides] + end +``` + +## Controller Interaction Flow + +```mermaid +sequenceDiagram + participant User + participant K8s as Kubernetes API + participant Controller as ClusterObservability Controller + participant ConfigLoader as Config Loader + participant Status as Status Handler + + User->>K8s: Create ClusterObservability CR + K8s->>Controller: Watch Event (Create) + + Controller->>Controller: Add Finalizer + Controller->>Controller: Validate Singleton + + alt Multiple ClusterObservability CRs + Controller->>Status: Mark as Conflicted + Status->>K8s: Update Status (Conflicted) + else Single Active CR + Controller->>ConfigLoader: Load Collector Configs + ConfigLoader-->>Controller: Agent & Cluster Configs + + Controller->>K8s: Create Agent OpenTelemetryCollector CR + Controller->>K8s: Create Cluster OpenTelemetryCollector CR + Controller->>K8s: Create Instrumentation CR + + opt OpenShift Environment (cached detection) + Controller->>K8s: Create Security Context Constraints + end + + Controller->>Status: Check Component Health + Status->>K8s: Query Component Status + Status-->>Controller: Health Status + + Controller->>Status: Update Status (Ready/NotReady) + Status->>K8s: Update ClusterObservability Status + end + + Note over Controller: Continuous Reconciliation + K8s->>Controller: Watch Event (Update/Delete) + Controller->>Controller: Reconcile Changes +``` + +## Feature Gate + 
+ClusterObservability is controlled by the `operator.clusterobservability` feature gate: + +```bash +# Enable ClusterObservability +./manager --feature-gates=+operator.clusterobservability + +# Check if enabled +./manager --help | grep cluster-observability +``` + +## CRD Configuration + +ClusterObservability has a simple spec with two main fields at present: + +```go +type ClusterObservabilitySpec struct { + Signals []ObservabilitySignal // "logs", "metrics", "traces", "profiles" + Exporter OTLPHTTPExporter // OTLP HTTP exporter configuration +} +``` + +The `exporter` field uses the `otlphttp` exporter from the OpenTelemetry Collector. + +### Basic Example +```yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: ClusterObservability +metadata: + name: cluster-observability + namespace: opentelemetry-operator-system +spec: + signals: ["metrics", "traces"] + exporter: + endpoint: "https://otel.example.com:4318" + headers: + "Authorization": "Bearer your-token" + timeout: "30s" +``` + +**Note**: TLS certificate file mounting (`ca_file`, `cert_file`, `key_file`) is not supported yet, even though the CRD schema accepts these fields. + + +## Conflict Detection Example + +The controller only allows one active ClusterObservability resource in the cluster. When multiple resources are detected, the **oldest resource** (by creation timestamp) remains active while others are marked as conflicted. If resources have identical creation timestamps, the resource with the lexicographically smaller namespace/name becomes active. 
+ +When a second ClusterObservability resource is created, it gets marked with `Conflicted` status: + +```yaml +Name: cluster-observability-2 +Namespace: opentelemetry-operator-system +Labels: +Annotations: +API Version: opentelemetry.io/v1alpha1 +Kind: ClusterObservability +Metadata: + Creation Timestamp: 2025-09-06T03:30:28Z + Generation: 1 + Resource Version: 7935997 + UID: 969895d1-ab6b-429e-b740-f8381ab3ce32 +Spec: + Exporter: + Endpoint: http://otlp-collector.opentelemetry-demo.svc.cluster.local:4317 + Headers: + X - Deployment: clusterobservability-test + Signals: + traces + metrics +Status: + Conditions: + Last Transition Time: 2025-09-06T03:30:28Z + Message: Multiple ClusterObservability resources exist in cluster + Reason: Configured + Status: True + Type: Conflicted + Message: Multiple ClusterObservability resources detected. Only the oldest resource is active. + Observed Generation: 1 + Phase: Conflicted +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Info 6s (x2 over 6s) cluster-observability status updated - resource is conflicted + Warning Conflicted 4s (x25 over 6s) cluster-observability Multiple ClusterObservability resources detected. 
Only opentelemetry-operator-system/cluster-observability (oldest) is active +``` + +## Configuration System + +ClusterObservability uses an embedded YAML-based configuration system that supports different Kubernetes distributions: + +``` +internal/manifests/clusterobservability/config/configs/ +├── agent-collector-base.yaml # Base agent collector config +├── cluster-collector-base.yaml # Base cluster collector config +└── distros/ + └── openshift/ + ├── agent-collector-overrides.yaml + └── cluster-collector-overrides.yaml +``` + +### Agent Collector Configuration + +Agent collectors run as a DaemonSet with `hostNetwork: true` and currently collect the following: +- **OTLP Receiver**: Receives traces/metrics from auto-instrumented apps (gRPC:4317, HTTP:4318) +- **Kubelet Stats**: Pod and container metrics via `kubeletstats` receiver +- **Container Logs**: Application logs via `filelog` receiver + +The agent collector exposes OTLP ports on the host network, allowing instrumented applications to send telemetry to their local node's collector using `$(OTEL_NODE_IP):4317` or `$(OTEL_NODE_IP):4318`. + +### Cluster Collector Configuration + +Cluster collectors run as a Deployment and collect: +- **Cluster Metrics**: Via `k8s_cluster` receiver + +## Auto-Instrumentation + +ClusterObservability creates a **single Instrumentation CR** in the same namespace as the ClusterObservability resource. Users reference this CR from their application annotations. The instrumentation configuration is controlled by operator settings. 
+ + +### Example ClusterObservability Object + +```yaml +Name: cluster-observability +Namespace: opentelemetry-operator-system +Labels: <none> +Annotations: <none> +API Version: opentelemetry.io/v1alpha1 +Kind: ClusterObservability +Metadata: + Creation Timestamp: 2025-09-06T03:36:05Z + Finalizers: + clusterobservability.opentelemetry.io/finalizer + Generation: 1 + Resource Version: 7948019 + UID: b3dc8d25-b345-4cb4-8bbc-516e5e7b1dc7 +Spec: + Exporter: + Compression: gzip + Headers: + Content - Type: application/x-protobuf + X - SF - TOKEN: fake-token + metrics_endpoint: https://ingest.us0.signalfx.com/v2/datapoint/otlp + Timeout: 30s + traces_endpoint: https://ingest.us0.signalfx.com/v2/trace/otlp + Signals: + traces + metrics +Status: + Components Status: + Agent: + Last Updated: 2025-09-06T03:40:24Z + Message: Agent collector DaemonSet not ready: 0/3 pods ready + Cluster: + Last Updated: 2025-09-06T03:40:24Z + Message: Cluster collector Deployment ready: 1/1 replicas ready + Ready: true + Instrumentation: + Last Updated: 2025-09-06T03:40:24Z + Message: Instrumentation CR ready: opentelemetry-operator-system/default-instrumentation + Ready: true + Conditions: + Last Transition Time: 2025-09-06T03:36:05Z + Message: ClusterObservability configuration applied successfully + Reason: Configured + Status: True + Type: Configured + Last Transition Time: 2025-09-06T03:36:05Z + Message: Collector configuration has been updated - managed collectors will be reconciled + Reason: ConfigChanged + Status: True + Type: ConfigurationUpdated + Config Versions: + Agent - Collector - Openshift: d3945a86e3b61a9bb578b8340cf9679a486b4cde13332b7f216b6d85874ea6ee + Cluster - Collector - Openshift: 4ac402eda083f315297e410b2dccb1698cb5ae10ebedc8ad5eb860a5aeda66a1 + Message: Some components are not ready + Observed Generation: 1 + Phase: Pending +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal ConfigChanged 4m19s (x2 over 4m19s) cluster-observability Collector 
configuration has changed, updating managed resources + Normal Info 4m17s (x23 over 4m19s) cluster-observability applied status changes +``` + +### How Users Apply Auto-Instrumentation + +Users trigger auto-instrumentation by adding annotations that reference the single Instrumentation CR: + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-app + namespace: my-apps +spec: + template: + metadata: + annotations: + # Reference the single Instrumentation CR using namespace/name format + instrumentation.opentelemetry.io/inject-java: "opentelemetry-operator-system/default-instrumentation" + spec: + containers: + - name: app + image: my-java-app:latest +``` + +**Pattern**: `namespace/instrumentation-name` where `namespace` is where ClusterObservability is deployed. + +## Troubleshooting + +### Check Controller Status +```bash +kubectl logs deployment/opentelemetry-operator-controller-manager -n opentelemetry-operator-system +``` + +### Manual RBAC Workaround +If automatic RBAC creation (`--create-rbac-permissions=true`) isn't working, you may need to apply manual RBAC permissions. This is a known issue being investigated. 
+ +```bash +# Apply manual ClusterRole and ClusterRoleBinding for collectors +kubectl apply -f deploy-test/cluster-observability-manual-rbac.yaml +``` + +### Check ClusterObservability Status +```bash +kubectl get clusterobservabilities -n opentelemetry-operator-system +kubectl describe clusterobservability cluster-observability -n opentelemetry-operator-system +``` + +### Check Component Health +```bash +# OpentelemetryCollector CR +kubectl get opentelemetrycollector -l app.kubernetes.io/managed-by=opentelemetry-operator -n opentelemetry-operator-system + +# Agent collectors +kubectl get daemonsets -l app.kubernetes.io/managed-by=opentelemetry-operator -n opentelemetry-operator-system + +# Cluster collectors +kubectl get deployments -l app.kubernetes.io/managed-by=opentelemetry-operator -n opentelemetry-operator-system + +# Auto-instrumentation +kubectl get instrumentations -n opentelemetry-operator-system +``` + +### Check Events +```bash +kubectl get events --field-selector reason=Conflicted +kubectl get events --field-selector involvedObject.kind=ClusterObservability +``` \ No newline at end of file diff --git a/go.sum b/go.sum index 7ccbf510e1..416f3e7b82 100644 --- a/go.sum +++ b/go.sum @@ -543,67 +543,42 @@ go.opentelemetry.io/collector/featuregate v1.37.0 h1:CjsHzjktiqq/dxid4Xkhuf3yD6o go.opentelemetry.io/collector/featuregate v1.37.0/go.mod h1:Y/KsHbvREENKvvN9RlpiWk/IGBK+CATBYzIIpU7nccc= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0/go.mod h1:NfchwuyNoMcZ5MLHwPrODwUF1HWCXWrL31s8gSAdIKY= -go.opentelemetry.io/contrib/otelconf v0.17.0 h1:Yh9uifPSe8yiksLshMbeAXGm/ZRmo7LD7Di+/yd1L5w= -go.opentelemetry.io/contrib/otelconf v0.17.0/go.mod h1:8dHKS6uMiZlvmrA7MGUtb4HwnX+ukdF5iS3p2UPKvLE= go.opentelemetry.io/contrib/otelconf v0.18.0 h1:ciF2Gf00BWs0DnexKFZXcxg9kJ8r3SUW1LOzW3CsKA8= go.opentelemetry.io/contrib/otelconf 
v0.18.0/go.mod h1:FcP7k+JLwBLdOxS6qY6VQ/4b5VBntI6L6o80IMwhAeI= go.opentelemetry.io/otel v1.38.0 h1:RkfdswUDRimDg0m2Az18RKOsnI8UDzppJAtj01/Ymk8= go.opentelemetry.io/otel v1.38.0/go.mod h1:zcmtmQ1+YmQM9wrNsTGV/q/uyusom3P8RxwExxkZhjM= -go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.13.0 h1:z6lNIajgEBVtQZHjfw2hAccPEBDs+nx58VemmXWa2ec= -go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.13.0/go.mod h1:+kyc3bRx/Qkq05P6OCu3mTEIOxYRYzoIg+JsUp5X+PM= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0 h1:OMqPldHt79PqWKOMYIAQs3CxAi7RLgPxwfFSwr4ZxtM= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.14.0/go.mod h1:1biG4qiqTxKiUCtoWDPpL3fB3KxVwCiGw81j3nKMuHE= -go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.13.0 h1:zUfYw8cscHHLwaY8Xz3fiJu+R59xBnkgq2Zr1lwmK/0= -go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.13.0/go.mod h1:514JLMCcFLQFS8cnTepOk6I09cKWJ5nGHBxHrMJ8Yfg= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.14.0 h1:QQqYw3lkrzwVsoEX0w//EhH/TCnpRdEenKBOOEIMjWc= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.14.0/go.mod h1:gSVQcr17jk2ig4jqJ2DX30IdWH251JcNAecvrqTxH1s= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.37.0 h1:zG8GlgXCJQd5BU98C0hZnBbElszTmUgCNCfYneaDL0A= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.37.0/go.mod h1:hOfBCz8kv/wuq73Mx2H2QnWokh/kHZxkh6SNF2bdKtw= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0 h1:vl9obrcoWVKp/lwl8tRE33853I8Xru9HFbw/skNeLs8= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.38.0/go.mod h1:GAXRxmLJcVM3u22IjTg74zWBrRCKq8BnOqUVLodpcpw= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.38.0 h1:Oe2z/BCg5q7k4iXC3cqJxKYg0ieRiOqF0cecFYdPTwk= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.38.0/go.mod h1:ZQM5lAJpOsKnYagGg/zV2krVqTtaVdYdDkhMoX6Oalg= -go.opentelemetry.io/otel/exporters/otlp/otlptrace 
v1.37.0 h1:Ahq7pZmv87yiyn3jeFz/LekZmPLLdKejuO3NcK9MssM= -go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.37.0/go.mod h1:MJTqhM0im3mRLw1i8uGHnCvUEeS7VwRyxlLC78PA18M= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.37.0 h1:EtFWSnwW9hGObjkIdmlnWSydO+Qs8OwzfzXLUPg4xOc= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.37.0/go.mod h1:QjUEoiGCPkvFZ/MjK6ZZfNOS6mfVEVKYE99dFhuN2LI= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.37.0 h1:bDMKF3RUSxshZ5OjOTi8rsHGaPKsAt76FaqgvIUySLc= -go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.37.0/go.mod h1:dDT67G/IkA46Mr2l9Uj7HsQVwsjASyV9SjGofsiUZDA= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0 h1:aTL7F04bJHUlztTsNGJ2l+6he8c+y/b//eR0jjjemT4= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.38.0/go.mod h1:kldtb7jDTeol0l3ewcmd8SDvx3EmIE7lyvqbasU3QC4= go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= -go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.13.0 h1:yEX3aC9KDgvYPhuKECHbOlr5GLwH6KTjLJ1sBSkkxkc= -go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.13.0/go.mod h1:/GXR0tBmmkxDaCUGahvksvp66mx4yh5+cFXgSlhg0vQ= go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.14.0 h1:B/g+qde6Mkzxbry5ZZag0l7QrQBCtVm7lVjaLgmpje8= go.opentelemetry.io/otel/exporters/stdout/stdoutlog v0.14.0/go.mod 
h1:mOJK8eMmgW6ocDJn6Bn11CcZ05gi3P8GylBXEkZtbgA= -go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.37.0 h1:6VjV6Et+1Hd2iLZEPtdV7vie80Yyqf7oikJLjQ/myi0= -go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.37.0/go.mod h1:u8hcp8ji5gaM/RfcOo8z9NMnf1pVLfVY7lBY2VOGuUU= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.38.0 h1:wm/Q0GAAykXv83wzcKzGGqAnnfLFyFe7RslekZuv+VI= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.38.0/go.mod h1:ra3Pa40+oKjvYh+ZD3EdxFZZB0xdMfuileHAm4nNN7w= -go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.37.0 h1:SNhVp/9q4Go/XHBkQ1/d5u9P/U+L1yaGPoi0x+mStaI= -go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.37.0/go.mod h1:tx8OOlGH6R4kLV67YaYO44GFXloEjGPZuMjEkaaqIp4= go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.38.0 h1:kJxSDN4SgWWTjG/hPp3O7LCGLcHXFlvS2/FFOrwL+SE= go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.38.0/go.mod h1:mgIOzS7iZeKJdeB8/NYHrJ48fdGc71Llo5bJ1J4DWUE= -go.opentelemetry.io/otel/log v0.13.0 h1:yoxRoIZcohB6Xf0lNv9QIyCzQvrtGZklVbdCoyb7dls= -go.opentelemetry.io/otel/log v0.13.0/go.mod h1:INKfG4k1O9CL25BaM1qLe0zIedOpvlS5Z7XgSbmN83E= go.opentelemetry.io/otel/log v0.14.0 h1:2rzJ+pOAZ8qmZ3DDHg73NEKzSZkhkGIua9gXtxNGgrM= go.opentelemetry.io/otel/log v0.14.0/go.mod h1:5jRG92fEAgx0SU/vFPxmJvhIuDU9E1SUnEQrMlJpOno= go.opentelemetry.io/otel/metric v1.38.0 h1:Kl6lzIYGAh5M159u9NgiRkmoMKjvbsKtYRwgfrA6WpA= go.opentelemetry.io/otel/metric v1.38.0/go.mod h1:kB5n/QoRM8YwmUahxvI3bO34eVtQf2i4utNVLr9gEmI= go.opentelemetry.io/otel/sdk v1.38.0 h1:l48sr5YbNf2hpCUj/FoGhW9yDkl+Ma+LrVl8qaM5b+E= go.opentelemetry.io/otel/sdk v1.38.0/go.mod h1:ghmNdGlVemJI3+ZB5iDEuk4bWA3GkTpW+DOoZMYBVVg= -go.opentelemetry.io/otel/sdk/log v0.13.0 h1:I3CGUszjM926OphK8ZdzF+kLqFvfRY/IIoFq/TjwfaQ= -go.opentelemetry.io/otel/sdk/log v0.13.0/go.mod h1:lOrQyCCXmpZdN7NchXb6DOZZa1N5G1R2tm5GMMTpDBw= go.opentelemetry.io/otel/sdk/log v0.14.0 h1:JU/U3O7N6fsAXj0+CXz21Czg532dW2V4gG1HE/e8Zrg= go.opentelemetry.io/otel/sdk/log 
v0.14.0/go.mod h1:imQvII+0ZylXfKU7/wtOND8Hn4OpT3YUoIgqJVksUkM= -go.opentelemetry.io/otel/sdk/log/logtest v0.13.0 h1:9yio6AFZ3QD9j9oqshV1Ibm9gPLlHNxurno5BreMtIA= -go.opentelemetry.io/otel/sdk/log/logtest v0.13.0/go.mod h1:QOGiAJHl+fob8Nu85ifXfuQYmJTFAvcrxL6w5/tu168= go.opentelemetry.io/otel/sdk/log/logtest v0.14.0 h1:Ijbtz+JKXl8T2MngiwqBlPaHqc4YCaP/i13Qrow6gAM= +go.opentelemetry.io/otel/sdk/log/logtest v0.14.0/go.mod h1:dCU8aEL6q+L9cYTqcVOk8rM9Tp8WdnHOPLiBgp0SGOA= go.opentelemetry.io/otel/sdk/metric v1.38.0 h1:aSH66iL0aZqo//xXzQLYozmWrXxyFkBJ6qT5wthqPoM= go.opentelemetry.io/otel/sdk/metric v1.38.0/go.mod h1:dg9PBnW9XdQ1Hd6ZnRz689CbtrUp0wMMs9iPcgT9EZA= go.opentelemetry.io/otel/trace v1.38.0 h1:Fxk5bKrDZJUH+AMyyIXGcFAPah0oRcT+LuNtJrmcNLE= diff --git a/internal/config/cli.go b/internal/config/cli.go index 81ebf77cee..57c0175684 100644 --- a/internal/config/cli.go +++ b/internal/config/cli.go @@ -145,6 +145,13 @@ func ApplyCLI(cfg *Config) error { cfg.Zap.LevelFormat, _ = f.GetString("zap-level-format") case "enable-webhooks": cfg.EnableWebhooks, _ = f.GetBool("enable-webhooks") + case "create-rbac-permissions": + val, _ := f.GetBool("create-rbac-permissions") + if val { + cfg.CreateRBACPermissions = autoRBAC.Available + } else { + cfg.CreateRBACPermissions = autoRBAC.NotAvailable + } } } }) diff --git a/internal/controllers/clusterobservability_controller.go b/internal/controllers/clusterobservability_controller.go new file mode 100644 index 0000000000..0079900df9 --- /dev/null +++ b/internal/controllers/clusterobservability_controller.go @@ -0,0 +1,650 @@ +// Copyright The OpenTelemetry Authors +// SPDX-License-Identifier: Apache-2.0 + +package controllers + +import ( + "context" + "fmt" + "time" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + apiequality "k8s.io/apimachinery/pkg/api/equality" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + 
"k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/handler" + + "github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1" + "github.com/open-telemetry/opentelemetry-operator/apis/v1beta1" + "github.com/open-telemetry/opentelemetry-operator/internal/autodetect/openshift" + "github.com/open-telemetry/opentelemetry-operator/internal/config" + "github.com/open-telemetry/opentelemetry-operator/internal/manifests" + "github.com/open-telemetry/opentelemetry-operator/internal/manifests/clusterobservability" + coStatus "github.com/open-telemetry/opentelemetry-operator/internal/status/clusterobservability" +) + +// ClusterObservabilityReconciler reconciles a ClusterObservability object. +type ClusterObservabilityReconciler struct { + client.Client + recorder record.EventRecorder + scheme *runtime.Scheme + log logr.Logger + config config.Config +} + +// ClusterObservabilityReconcilerParams is the set of options to build a new ClusterObservabilityReconciler. 
+type ClusterObservabilityReconcilerParams struct {
+	client.Client
+	Recorder record.EventRecorder
+	Scheme   *runtime.Scheme
+	Log      logr.Logger
+	Config   config.Config
+}
+
+// getParams bundles the reconciler's dependencies and the given instance into
+// the manifests.Params consumed by the manifest builder and the status handler.
+func (r *ClusterObservabilityReconciler) getParams(instance v1alpha1.ClusterObservability) manifests.Params {
+	return manifests.Params{
+		Config:               r.config,
+		Client:               r.Client,
+		ClusterObservability: instance,
+		Log:                  r.log,
+		Scheme:               r.scheme,
+		Recorder:             r.recorder,
+	}
+}
+
+// NewClusterObservabilityReconciler builds a reconciler from the given params.
+func NewClusterObservabilityReconciler(params ClusterObservabilityReconcilerParams) *ClusterObservabilityReconciler {
+	reconciler := &ClusterObservabilityReconciler{
+		Client:   params.Client,
+		scheme:   params.Scheme,
+		log:      params.Log,
+		recorder: params.Recorder,
+		config:   params.Config,
+	}
+	return reconciler
+}
+
+//+kubebuilder:rbac:groups=opentelemetry.io,resources=clusterobservabilities,verbs=get;list;watch;create;update;patch;delete
+//+kubebuilder:rbac:groups=opentelemetry.io,resources=clusterobservabilities/status,verbs=get;update;patch
+//+kubebuilder:rbac:groups=opentelemetry.io,resources=clusterobservabilities/finalizers,verbs=update
+//+kubebuilder:rbac:groups=opentelemetry.io,resources=opentelemetrycollectors,verbs=get;list;watch;create;update;patch;delete
+//+kubebuilder:rbac:groups=opentelemetry.io,resources=instrumentations,verbs=get;list;watch;create;update;patch;delete
+//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch
+//+kubebuilder:rbac:groups="",resources=events,verbs=create;patch
+//+kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch
+//+kubebuilder:rbac:groups=apps,resources=daemonsets,verbs=get;list;watch
+//+kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch
+//+kubebuilder:rbac:groups=security.openshift.io,resources=securitycontextconstraints,verbs=get;list;watch;create;update;patch;delete
+
+// Reconcile is part of the main kubernetes reconciliation loop which aims to
+// move the current state of the cluster closer to the desired state.
+//
+// Flow: fetch the instance, delegate to handleDeletion when it is being
+// deleted, enforce the singleton constraint, ensure the finalizer is present,
+// build the desired objects and apply them grouped by kind (OpenTelemetry CRs,
+// unstructured objects, everything else), then record the reconcile status.
+func (r *ClusterObservabilityReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	log := r.log.WithValues("clusterobservability", req.NamespacedName)
+
+	var instance v1alpha1.ClusterObservability
+	if err := r.Client.Get(ctx, req.NamespacedName, &instance); err != nil {
+		if !apierrors.IsNotFound(err) {
+			log.Error(err, "unable to fetch ClusterObservability")
+		}
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	// Handle deletion
+	if deletionTimestamp := instance.GetDeletionTimestamp(); deletionTimestamp != nil {
+		return r.handleDeletion(ctx, log, &instance)
+	}
+
+	// Validate singleton constraint
+	isActive, conflictErr := r.validateSingleton(ctx, log, &instance)
+	if conflictErr != nil {
+		return ctrl.Result{}, conflictErr
+	}
+
+	if !isActive {
+		// This instance is conflicted, update status and skip reconciliation
+		params := r.getParams(instance)
+		return coStatus.HandleReconcileStatus(ctx, log, params, fmt.Errorf("multiple ClusterObservability resources detected"))
+	}
+
+	// TODO: Add upgrade support
+	// TODO: Support management state like OpenTelemetryCollector
+
+	// A failure to detect config changes is logged but does not abort the
+	// reconcile: the result is only used for the log line and event below.
+	configChanged, configErr := coStatus.DetectConfigChanges(&instance)
+	if configErr != nil {
+		log.Error(configErr, "failed to detect config changes")
+	}
+
+	if configChanged {
+		log.Info("Configuration changes detected - triggering full reconciliation")
+		r.recorder.Event(&instance, corev1.EventTypeNormal, "ConfigChanged",
+			"Collector configuration has changed, updating managed resources")
+	}
+
+	// Add finalizer to ensure proper resource cleanup. AddFinalizer reports
+	// whether it changed the object, so an update is only sent when needed.
+	if controllerutil.AddFinalizer(&instance, v1alpha1.ClusterObservabilityFinalizer) {
+		if err := r.Update(ctx, &instance); err != nil {
+			return ctrl.Result{}, err
+		}
+	}
+
+	log.V(2).Info("Reconciling ClusterObservability managed resources")
+
+	params := r.getParams(instance)
+
+	desiredObjects, buildErr := clusterobservability.Build(params)
+	if buildErr != nil {
+		return ctrl.Result{}, buildErr
+	}
+
+	var openTelemetryCRs []client.Object
+	var unstructuredObjects []client.Object
+	var regularObjects []client.Object
+
+	for _, obj := range desiredObjects {
+		switch obj.(type) {
+		case *v1beta1.OpenTelemetryCollector, *v1alpha1.Instrumentation:
+			openTelemetryCRs = append(openTelemetryCRs, obj)
+		case *unstructured.Unstructured:
+			unstructuredObjects = append(unstructuredObjects, obj)
+		default:
+			regularObjects = append(regularObjects, obj)
+		}
+	}
+
+	// Handle OpenTelemetry CRs - their controllers manage the underlying resources
+	for _, crObj := range openTelemetryCRs {
+		if err := r.reconcileOpenTelemetryResource(ctx, log, crObj); err != nil {
+			// Identify the object by Go type and namespaced name; formatting the
+			// ObjectKind itself does not produce a readable identifier.
+			return ctrl.Result{}, fmt.Errorf("failed to reconcile OpenTelemetry CR %T %s: %w", crObj, client.ObjectKeyFromObject(crObj), err)
+		}
+	}
+
+	// Handle Unstructured objects (like OpenShift SCC) separately to avoid deep copy issues
+	for _, unstructuredObj := range unstructuredObjects {
+		if err := r.reconcileUnstructuredResource(ctx, log, unstructuredObj); err != nil {
+			return ctrl.Result{}, fmt.Errorf("failed to reconcile unstructured resource %s: %w", unstructuredObj.GetName(), err)
+		}
+	}
+	// Handle regular Kubernetes resources (currently none - OpenTelemetry CRs handle their own resources)
+	if len(regularObjects) > 0 {
+		ownedObjects, err := r.findClusterObservabilityOwnedObjects(ctx, params)
+		if err != nil {
+			return ctrl.Result{}, err
+		}
+		err = reconcileDesiredObjects(ctx, r.Client, log, &params.ClusterObservability, params.Scheme, regularObjects, ownedObjects)
+		if err != nil {
+			return ctrl.Result{}, err
+		}
+	}
+	return coStatus.HandleReconcileStatus(ctx, log, params, nil)
+}
+
+// reconcileOpenTelemetryResource creates/updates OpenTelemetry CRs.
+// Their respective controllers handle the underlying Kubernetes resources.
+// TODO: fix issue with resourceVersion becoming stale due to updates from OpenTelemetryCollector/Instrumentation controllers.
+//
+// An update is only issued when the Spec differs; in that case Spec, Labels
+// and Annotations of the latest stored object are overwritten with the desired
+// values inside a retry-on-conflict loop.
+// NOTE(review): a change limited to labels or annotations does not trigger an
+// update because only Spec is compared - confirm this is intended.
+func (r *ClusterObservabilityReconciler) reconcileOpenTelemetryResource(ctx context.Context, log logr.Logger, desired client.Object) error {
+	key := client.ObjectKeyFromObject(desired)
+
+	var existing client.Object
+	switch desired.(type) {
+	case *v1beta1.OpenTelemetryCollector:
+		existing = &v1beta1.OpenTelemetryCollector{}
+	case *v1alpha1.Instrumentation:
+		existing = &v1alpha1.Instrumentation{}
+	default:
+		return fmt.Errorf("unsupported CRD type: %T", desired)
+	}
+
+	getErr := r.Get(ctx, key, existing)
+
+	if getErr != nil {
+		if apierrors.IsNotFound(getErr) {
+			if createErr := r.Create(ctx, desired); createErr != nil {
+				return fmt.Errorf("failed to create %s %s: %w", desired.GetObjectKind().GroupVersionKind().Kind, key, createErr)
+			}
+			log.Info("Created CR", "kind", desired.GetObjectKind().GroupVersionKind().Kind, "name", key.Name, "namespace", key.Namespace)
+			return nil
+		}
+		return fmt.Errorf("failed to get %s %s: %w", desired.GetObjectKind().GroupVersionKind().Kind, key, getErr)
+	}
+	switch existingCRD := existing.(type) {
+	case *v1beta1.OpenTelemetryCollector:
+		desiredCRD := desired.(*v1beta1.OpenTelemetryCollector)
+		if !apiequality.Semantic.DeepEqual(existingCRD.Spec, desiredCRD.Spec) {
+			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+				latest := &v1beta1.OpenTelemetryCollector{}
+				if err := r.Get(ctx, key, latest); err != nil {
+					return err
+				}
+
+				// Only update if still different (another controller might have updated it)
+				if apiequality.Semantic.DeepEqual(latest.Spec, desiredCRD.Spec) {
+					log.Info("OpenTelemetryCollector already matches desired state", "name", key.Name, "namespace", key.Namespace)
+					return nil
+				}
+
+				// Update the latest version with our desired changes
+				latest.Spec = desiredCRD.Spec
+				latest.Labels = desiredCRD.Labels
+				latest.Annotations = desiredCRD.Annotations
+
+				return r.Update(ctx, latest)
+			})
+
+			if err != nil {
+				return fmt.Errorf("failed to update OpenTelemetryCollector %s: %w", key, err)
+			}
+
+			log.Info("Updated OpenTelemetryCollector", "name", key.Name, "namespace", key.Namespace)
+		}
+
+	case *v1alpha1.Instrumentation:
+		desiredCRD := desired.(*v1alpha1.Instrumentation)
+		if !apiequality.Semantic.DeepEqual(existingCRD.Spec, desiredCRD.Spec) {
+			err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+				latest := &v1alpha1.Instrumentation{}
+				if err := r.Get(ctx, key, latest); err != nil {
+					return err
+				}
+
+				// Only update if still different (another controller might have updated it)
+				if apiequality.Semantic.DeepEqual(latest.Spec, desiredCRD.Spec) {
+					log.Info("Instrumentation already matches desired state", "name", key.Name, "namespace", key.Namespace)
+					return nil
+				}
+
+				// Update the latest version with our desired changes
+				latest.Spec = desiredCRD.Spec
+				latest.Labels = desiredCRD.Labels
+				latest.Annotations = desiredCRD.Annotations
+
+				return r.Update(ctx, latest)
+			})
+
+			if err != nil {
+				return fmt.Errorf("failed to update Instrumentation %s: %w", key, err)
+			}
+
+			log.Info("Updated Instrumentation", "name", key.Name, "namespace", key.Namespace)
+		}
+
+	default:
+		return fmt.Errorf("unsupported CRD type: %T", existing)
+	}
+
+	return nil
+}
+
+// reconcileUnstructuredResource handles Unstructured objects (like OpenShift SCCs)
+// without deep copy issues that occur with complex nested data.
+//
+// It returns an error instead of panicking when obj is not an
+// *unstructured.Unstructured. The object is created when it does not exist and
+// updated (reusing the stored resourceVersion) when it differs from the stored
+// one.
+// NOTE(review): the comparison covers the whole object map, including
+// server-populated metadata, so it likely reports a difference on every
+// reconcile - confirm and narrow the comparison if so.
+func (r *ClusterObservabilityReconciler) reconcileUnstructuredResource(ctx context.Context, log logr.Logger, obj client.Object) error {
+	unstructuredObj, ok := obj.(*unstructured.Unstructured)
+	if !ok {
+		return fmt.Errorf("expected *unstructured.Unstructured, got %T", obj)
+	}
+
+	// Create a new Unstructured object for fetching existing resource
+	// This avoids deep copy issues with the desired object
+	existing := &unstructured.Unstructured{}
+	existing.SetGroupVersionKind(unstructuredObj.GroupVersionKind())
+
+	key := client.ObjectKeyFromObject(unstructuredObj)
+	getErr := r.Client.Get(ctx, key, existing)
+	if getErr != nil && !apierrors.IsNotFound(getErr) {
+		return fmt.Errorf("failed to get existing unstructured resource %s: %w", unstructuredObj.GetName(), getErr)
+	}
+
+	if apierrors.IsNotFound(getErr) {
+		// Create new resource
+		if createErr := r.Client.Create(ctx, unstructuredObj); createErr != nil {
+			return fmt.Errorf("failed to create unstructured resource %s: %w", unstructuredObj.GetName(), createErr)
+		}
+		log.Info("Created unstructured resource",
+			"kind", unstructuredObj.GetKind(),
+			"name", unstructuredObj.GetName())
+		return nil
+	}
+
+	// Check if update is needed by comparing the stored and desired objects
+	if apiequality.Semantic.DeepEqual(existing.Object, unstructuredObj.Object) {
+		return nil
+	}
+
+	// Carry over the stored resourceVersion so the update is accepted.
+	unstructuredObj.SetResourceVersion(existing.GetResourceVersion())
+	if updateErr := r.Client.Update(ctx, unstructuredObj); updateErr != nil {
+		return fmt.Errorf("failed to update unstructured resource %s: %w", unstructuredObj.GetName(), updateErr)
+	}
+	log.Info("Updated unstructured resource",
+		"kind", unstructuredObj.GetKind(),
+		"name", unstructuredObj.GetName())
+
+	return nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+// It registers the field index first, then watches ClusterObservability,
+// Namespaces (mapped to every ClusterObservability) and the owned CR types.
+func (r *ClusterObservabilityReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	err := r.SetupCaches(mgr)
+	if err != nil {
+		return err
+	}
+
+	ownedResources := r.GetOwnedResourceTypes()
+	builder := ctrl.NewControllerManagedBy(mgr).
+		For(&v1alpha1.ClusterObservability{}).
+		Watches(
+			&corev1.Namespace{},
+			handler.EnqueueRequestsFromMapFunc(r.findClusterObservabilityForNamespace),
+		)
+
+	for _, resource := range ownedResources {
+		builder.Owns(resource)
+	}
+
+	return builder.Complete(r)
+}
+
+// SetupCaches sets up field indexing for efficient owned object queries.
+// Each owned resource type is indexed under ".metadata.owner" by the name of
+// its controlling ClusterObservability; objects controlled by anything else
+// are not indexed.
+func (r *ClusterObservabilityReconciler) SetupCaches(mgr ctrl.Manager) error {
+	const clusterObservabilityResourceOwnerKey = ".metadata.owner"
+
+	ownedResources := r.GetOwnedResourceTypes()
+	for _, resource := range ownedResources {
+		if err := mgr.GetCache().IndexField(context.Background(), resource, clusterObservabilityResourceOwnerKey, func(rawObj client.Object) []string {
+			owner := metav1.GetControllerOf(rawObj)
+			if owner == nil {
+				return nil
+			}
+			// Make sure it's a ClusterObservability
+			if owner.APIVersion != v1alpha1.GroupVersion.String() || owner.Kind != "ClusterObservability" {
+				return nil
+			}
+			return []string{owner.Name}
+		}); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// findClusterObservabilityForNamespace finds ClusterObservability instances when namespaces change.
+// Every existing ClusterObservability is enqueued regardless of which
+// namespace changed. The context supplied by the event handler is used for the
+// list call so that it is cancelled together with the handler.
+func (r *ClusterObservabilityReconciler) findClusterObservabilityForNamespace(ctx context.Context, obj client.Object) []ctrl.Request {
+	var clusterObservabilityList v1alpha1.ClusterObservabilityList
+	if err := r.List(ctx, &clusterObservabilityList); err != nil {
+		r.log.Error(err, "failed to list ClusterObservability resources")
+		return nil
+	}
+
+	items := clusterObservabilityList.Items
+	if len(items) == 0 {
+		return nil
+	}
+
+	requests := make([]ctrl.Request, 0, len(items))
+	for i := range items {
+		requests = append(requests, ctrl.Request{
+			NamespacedName: client.ObjectKeyFromObject(&items[i]),
+		})
+	}
+	return requests
+}
+
+// validateSingleton ensures only one ClusterObservability resource is active in the cluster.
+// Returns true if this instance is the active one, false if conflicted.
+//
+// Resources with a deletion timestamp are ignored. Among the remaining ones
+// the oldest by creation timestamp wins; equal timestamps are broken by the
+// lexicographically smallest "namespace/name". A warning event is recorded on
+// every conflicted resource.
+func (r *ClusterObservabilityReconciler) validateSingleton(ctx context.Context, log logr.Logger, instance *v1alpha1.ClusterObservability) (bool, error) {
+	var clusterObservabilityList v1alpha1.ClusterObservabilityList
+	if err := r.List(ctx, &clusterObservabilityList); err != nil {
+		log.Error(err, "failed to list ClusterObservability resources for singleton validation")
+		return false, err
+	}
+
+	// Filter out deleted resources and find the oldest active resource
+	var activeResources []v1alpha1.ClusterObservability
+	for _, co := range clusterObservabilityList.Items {
+		if co.DeletionTimestamp == nil {
+			activeResources = append(activeResources, co)
+		}
+	}
+
+	if len(activeResources) <= 1 {
+		// No conflict, this is the only active resource
+		return true, nil
+	}
+
+	// Multiple resources exist, determine which one should be active
+	// Use oldest by creation timestamp as the winner
+	// If timestamps are equal, use lexicographical name comparison as tie-breaker
+	oldestResource := &activeResources[0]
+	for i := 1; i < len(activeResources); i++ {
+		candidate := &activeResources[i]
+
+		if candidate.CreationTimestamp.Before(&oldestResource.CreationTimestamp) {
+			oldestResource = candidate
+		} else if candidate.CreationTimestamp.Equal(&oldestResource.CreationTimestamp) {
+			candidateKey := candidate.Namespace + "/" + candidate.Name
+			oldestKey := oldestResource.Namespace + "/" + oldestResource.Name
+			if candidateKey < oldestKey {
+				oldestResource = candidate
+			}
+		}
+	}
+
+	isWinner := oldestResource.UID == instance.UID
+
+	if !isWinner {
+		// This resource is conflicted, emit an event and update status
+		r.recorder.Event(instance, corev1.EventTypeWarning, "Conflicted",
+			fmt.Sprintf("Multiple ClusterObservability resources detected. Only %s/%s (oldest) is active",
+				oldestResource.Namespace, oldestResource.Name))
+		log.Info("ClusterObservability resource is conflicted",
+			"active", fmt.Sprintf("%s/%s", oldestResource.Namespace, oldestResource.Name),
+			"conflicted", fmt.Sprintf("%s/%s", instance.Namespace, instance.Name))
+		return false, nil
+	}
+
+	// This resource is the winner, emit events for conflicted ones.
+	// Index into the slice so the recorder receives a pointer to the element
+	// itself rather than to a per-iteration copy.
+	for i := range activeResources {
+		conflicted := &activeResources[i]
+		if conflicted.UID != instance.UID {
+			r.recorder.Event(conflicted, corev1.EventTypeWarning, "Conflicted",
+				fmt.Sprintf("Multiple ClusterObservability resources detected. Only %s/%s (oldest) is active",
+					instance.Namespace, instance.Name))
+		}
+	}
+	log.Info("ClusterObservability resource is active", "conflicted-count", len(activeResources)-1)
+
+	return true, nil
+}
+
+// handleDeletion handles the cleanup of ClusterObservability resources and managed objects.
+// Once cleanup succeeds, the finalizer is removed from a freshly fetched copy
+// of the instance so the API server can complete the deletion.
+func (r *ClusterObservabilityReconciler) handleDeletion(ctx context.Context, log logr.Logger, instance *v1alpha1.ClusterObservability) (ctrl.Result, error) {
+	log.Info("Handling ClusterObservability deletion")
+
+	if !controllerutil.ContainsFinalizer(instance, v1alpha1.ClusterObservabilityFinalizer) {
+		// Finalizer already removed, nothing to do
+		return ctrl.Result{}, nil
+	}
+
+	// Clean up all managed resources
+	if err := r.cleanupManagedResources(ctx, log, instance); err != nil {
+		log.Error(err, "failed to cleanup managed resources")
+		r.recorder.Event(instance, corev1.EventTypeWarning, "CleanupFailed",
+			fmt.Sprintf("Failed to cleanup managed resources: %v", err))
+		// The failure is already logged and recorded as an event. Return a nil
+		// error so the 30s delay is honoured: controller-runtime ignores the
+		// Result whenever a non-nil error is returned alongside it.
+		return ctrl.Result{RequeueAfter: time.Second * 30}, nil
+	}
+
+	// Remove finalizer to allow deletion
+	latest := &v1alpha1.ClusterObservability{}
+	if err := r.Get(ctx, client.ObjectKeyFromObject(instance), latest); err != nil {
+		log.Error(err, "failed to get latest ClusterObservability for finalizer removal")
+		return ctrl.Result{}, err
+	}
+
+	controllerutil.RemoveFinalizer(latest, v1alpha1.ClusterObservabilityFinalizer)
+	if err := r.Update(ctx, latest); err != nil {
+		log.Error(err, "failed to remove finalizer")
+		return ctrl.Result{}, err
+	}
+
+	log.Info("Successfully cleaned up ClusterObservability resources")
+	r.recorder.Event(instance, corev1.EventTypeNormal, "Deleted", "ClusterObservability and all managed resources cleaned up")
+
+	return ctrl.Result{}, nil
+}
+
+// cleanupManagedResources deletes all resources managed by ClusterObservability.
+// The steps run in order (collectors, instrumentation, cluster-scoped
+// resources) and the first failure aborts the remaining ones.
+func (r *ClusterObservabilityReconciler) cleanupManagedResources(ctx context.Context, log logr.Logger, instance *v1alpha1.ClusterObservability) error {
+	// Clean up OpenTelemetryCollector CRs (both agent and cluster collectors)
+	if err := r.cleanupCollectors(ctx, log, instance); err != nil {
+		return fmt.Errorf("failed to cleanup collectors: %w", err)
+	}
+
+	// Clean up the single Instrumentation CR in operator namespace
+	if err := r.cleanupInstrumentations(ctx, log, instance); err != nil {
+		return fmt.Errorf("failed to cleanup instrumentations: %w", err)
+	}
+
+	// Clean up cluster-scoped resources (ClusterRole, ClusterRoleBinding)
+	if err := r.cleanupClusterScopedResources(ctx, log, instance); err != nil {
+		return fmt.Errorf("failed to cleanup cluster-scoped resources: %w", err)
+	}
+
+	log.Info("All managed resources cleaned up successfully")
+	return nil
+}
+
+// cleanupCollectors removes OpenTelemetryCollector CRs managed by ClusterObservability.
+// Collector names are "<instance name>-<suffix>" in the instance's namespace;
+// collectors that no longer exist are skipped.
+func (r *ClusterObservabilityReconciler) cleanupCollectors(ctx context.Context, log logr.Logger, instance *v1alpha1.ClusterObservability) error {
+	// Build the names from the shared suffix constants, as
+	// cleanupClusterScopedResources does, instead of repeating the literals.
+	collectors := []string{
+		fmt.Sprintf("%s-%s", instance.Name, clusterobservability.AgentCollectorSuffix),
+		fmt.Sprintf("%s-%s", instance.Name, clusterobservability.ClusterCollectorSuffix),
+	}
+
+	for _, name := range collectors {
+		collector := &v1beta1.OpenTelemetryCollector{}
+		key := types.NamespacedName{Name: name, Namespace: instance.Namespace}
+
+		if err := r.Get(ctx, key, collector); err != nil {
+			if apierrors.IsNotFound(err) {
+				continue // Already deleted
+			}
+			return fmt.Errorf("failed to get collector %s: %w", name, err)
+		}
+
+		// A NotFound here means the collector vanished between Get and Delete.
+		if err := r.Delete(ctx, collector); err != nil && !apierrors.IsNotFound(err) {
+			return fmt.Errorf("failed to delete collector %s: %w", name, err)
+		}
+
+		log.Info("Deleted OpenTelemetryCollector", "name", name)
+	}
+
+	return nil
+}
+
+// cleanupInstrumentations removes the single Instrumentation CR managed by ClusterObservability.
+// The CR is looked up in the instance's namespace and is only deleted when
+// isOwnedByClusterObservability confirms it belongs to this instance.
+func (r *ClusterObservabilityReconciler) cleanupInstrumentations(ctx context.Context, log logr.Logger, instance *v1alpha1.ClusterObservability) error {
+	// Use the shared constant rather than repeating the literal name.
+	instrumentationName := clusterobservability.DefaultInstrumentationName
+
+	instrumentation := &v1alpha1.Instrumentation{}
+	key := types.NamespacedName{Name: instrumentationName, Namespace: instance.Namespace}
+
+	if err := r.Get(ctx, key, instrumentation); err != nil {
+		if apierrors.IsNotFound(err) {
+			return nil // Already deleted or never created
+		}
+		return fmt.Errorf("failed to get instrumentation %s in namespace %s: %w", instrumentationName, instance.Namespace, err)
+	}
+
+	// Check if this instrumentation is managed by our ClusterObservability instance
+	if !isOwnedByClusterObservability(instrumentation, instance) {
+		return nil // Not our resource
+	}
+
+	if err := r.Delete(ctx, instrumentation); err != nil && !apierrors.IsNotFound(err) {
+		return fmt.Errorf("failed to delete instrumentation %s in namespace %s: %w", instrumentationName, instance.Namespace, err)
+	}
+
+	log.Info("Deleted Instrumentation", "name", instrumentationName, "namespace", instance.Namespace)
+	return nil
+}
+
+// cleanupClusterScopedResources removes cluster-scoped resources that can't use owner references.
+// It only acts when OpenShift routes were detected as available, in which case
+// it deletes the "<instance name>-agent-hostaccess" SecurityContextConstraints.
+func (r *ClusterObservabilityReconciler) cleanupClusterScopedResources(ctx context.Context, log logr.Logger, instance *v1alpha1.ClusterObservability) error {
+	if r.config.OpenShiftRoutesAvailability != openshift.RoutesAvailable {
+		return nil
+	}
+
+	agentCollectorName := fmt.Sprintf("%s-%s", instance.Name, clusterobservability.AgentCollectorSuffix)
+	sccName := fmt.Sprintf("%s-hostaccess", agentCollectorName)
+
+	scc := &unstructured.Unstructured{}
+	scc.SetGroupVersionKind(schema.GroupVersionKind{
+		Group:   "security.openshift.io",
+		Version: "v1",
+		Kind:    "SecurityContextConstraints",
+	})
+	scc.SetName(sccName)
+
+	err := r.Delete(ctx, scc)
+	switch {
+	case apierrors.IsNotFound(err):
+		// Nothing to delete; do not report a deletion that never happened.
+	case err != nil:
+		return fmt.Errorf("failed to delete SecurityContextConstraints %s: %w", sccName, err)
+	default:
+		log.Info("Deleted SecurityContextConstraints", "name", sccName)
+	}
+
+	return nil
+}
+
+// isOwnedByClusterObservability checks if a resource is managed by the given ClusterObservability instance.
+// The object must carry the operator's managed-by label, the
+// cluster-observability component label, and an owner reference whose UID
+// matches the instance.
+func isOwnedByClusterObservability(obj client.Object, instance *v1alpha1.ClusterObservability) bool {
+	// Reading from a nil map is safe and yields "", which never matches.
+	labels := obj.GetLabels()
+
+	if labels["app.kubernetes.io/managed-by"] != "opentelemetry-operator" {
+		return false
+	}
+
+	if labels["app.kubernetes.io/component"] != clusterobservability.ComponentClusterObservability {
+		return false
+	}
+
+	for _, owner := range obj.GetOwnerReferences() {
+		if owner.UID == instance.UID {
+			return true
+		}
+	}
+
+	return false
+}
+
+// GetOwnedResourceTypes returns CRs directly created by ClusterObservability.
+// Note: We only track OpenTelemetry CRs we create, not the underlying K8s resources
+// (those are managed by OpenTelemetryCollector controller).
+func (r *ClusterObservabilityReconciler) GetOwnedResourceTypes() []client.Object {
+	return []client.Object{
+		&v1beta1.OpenTelemetryCollector{},
+		&v1alpha1.Instrumentation{},
+	}
+}
+
+// findClusterObservabilityOwnedObjects finds OpenTelemetry CRs owned by ClusterObservability for cleanup.
+// It lists every owned resource type in the instance's namespace, filtered by
+// the ".metadata.owner" field index on the instance name, and returns the
+// results keyed by object UID.
+func (r *ClusterObservabilityReconciler) findClusterObservabilityOwnedObjects(ctx context.Context, params manifests.Params) (map[types.UID]client.Object, error) {
+	// Must match the index key registered in SetupCaches.
+	const clusterObservabilityResourceOwnerKey = ".metadata.owner"
+	ownedObjects := map[types.UID]client.Object{}
+
+	listOpts := []client.ListOption{
+		client.InNamespace(params.ClusterObservability.Namespace),
+		client.MatchingFields{clusterObservabilityResourceOwnerKey: params.ClusterObservability.Name},
+	}
+
+	ownedObjectTypes := r.GetOwnedResourceTypes()
+	for _, objectType := range ownedObjectTypes {
+		objs, err := getList(ctx, r.Client, objectType, listOpts...)
+		if err != nil {
+			return nil, err
+		}
+		// Merge the per-type results into one map.
+		for uid, object := range objs {
+			ownedObjects[uid] = object
+		}
+	}
+
+	return ownedObjects, nil
+}
diff --git a/internal/controllers/common.go b/internal/controllers/common.go
index 7e70127fca..166096833a 100644
--- a/internal/controllers/common.go
+++ b/internal/controllers/common.go
@@ -31,6 +31,10 @@ func isNamespaceScoped(obj client.Object) bool {
 	case *rbacv1.ClusterRole, *rbacv1.ClusterRoleBinding:
 		return false
 	default:
+		// Check for OpenShift SecurityContextConstraints (unstructured)
+		if obj.GetObjectKind().GroupVersionKind().Kind == "SecurityContextConstraints" {
+			return false
+		}
 		return true
 	}
 }
diff --git a/internal/manifests/clusterobservability/clusterobservability.go b/internal/manifests/clusterobservability/clusterobservability.go
new file mode 100644
index 0000000000..605106b416
--- /dev/null
+++ b/internal/manifests/clusterobservability/clusterobservability.go
@@ -0,0 +1,461 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package clusterobservability
+
+import (
+	"fmt"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"github.com/open-telemetry/opentelemetry-operator/apis/v1alpha1"
+	"github.com/open-telemetry/opentelemetry-operator/apis/v1beta1"
+	"github.com/open-telemetry/opentelemetry-operator/internal/autodetect/openshift"
+	"github.com/open-telemetry/opentelemetry-operator/internal/manifests"
+	"github.com/open-telemetry/opentelemetry-operator/internal/manifests/clusterobservability/config"
+	"github.com/open-telemetry/opentelemetry-operator/internal/manifests/manifestutils"
+)
+
+const (
+	// ComponentClusterObservability is the component name passed to the label
+	// helper and set as the "app.kubernetes.io/component" label on built collectors.
+	ComponentClusterObservability = "cluster-observability"
+
+	// Collector name suffixes.
+	// Collector names are built as "<ClusterObservability name>-<suffix>".
+	AgentCollectorSuffix   = "agent"
+	ClusterCollectorSuffix = "cluster"
+
+	// Default instrumentation name for managed namespaces.
+	DefaultInstrumentationName = "default-instrumentation"
+)
+
+// getCollectorImage returns a sensible default collector image when build-time version is not set.
+// Only an exact match on the 0.0.0-tagged core collector image is replaced;
+// any other value is returned unchanged.
+func getCollectorImage(configuredImage string) string {
+	// If the configured image has a 0.0.0 tag (fallback during development builds)
+	// replace it with latest
+	// Note that the replacement is the contrib image, not the core one.
+	if configuredImage == "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector:0.0.0" {
+		return "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:latest"
+	}
+	return configuredImage
+}
+
+// Build creates the manifest for the ClusterObservability resource.
+func Build(params manifests.Params) ([]client.Object, error) { + var resourceManifests []client.Object + + // Build agent-level collector (DaemonSet) + agentCollector, err := buildAgentCollector(params) + if err != nil { + return nil, fmt.Errorf("failed to build agent collector: %w", err) + } + if agentCollector != nil { + resourceManifests = append(resourceManifests, agentCollector) + } + + // Build cluster-level collector (Deployment) + clusterCollector, err := buildClusterCollector(params) + if err != nil { + return nil, fmt.Errorf("failed to build cluster collector: %w", err) + } + if clusterCollector != nil { + resourceManifests = append(resourceManifests, clusterCollector) + } + + // Build Instrumentation CRs for all namespaces + instrumentations, err := buildInstrumentations(params) + if err != nil { + return nil, fmt.Errorf("failed to build instrumentation CRs: %w", err) + } + resourceManifests = append(resourceManifests, instrumentations...) + + // Build OpenShift Security Context Constraints if on OpenShift + if isOpenShiftEnvironment(params) { + sccResources := buildOpenShiftSCC(params) + resourceManifests = append(resourceManifests, sccResources...) + } + + return resourceManifests, nil +} + +// buildAgentCollector creates an OpenTelemetryCollector CR for agent-level collection. 
+func buildAgentCollector(params manifests.Params) (*v1beta1.OpenTelemetryCollector, error) { + co := params.ClusterObservability + + // Load configuration using the config loader + configLoader := config.NewConfigLoader() + + // Detect Kubernetes distribution + distroProvider := configLoader.DetectDistroProvider(params.Config) + + // Load the configuration + collectorConfig, err := configLoader.LoadCollectorConfig( + config.AgentCollectorType, + distroProvider, + co.Spec.Signals, + co.Spec, + ) + if err != nil { + return nil, fmt.Errorf("failed to load agent collector config: %w", err) + } + + // Validate the configuration + if err := configLoader.ValidateConfig(collectorConfig); err != nil { + return nil, fmt.Errorf("agent collector config validation failed: %w", err) + } + + agentCollectorName := fmt.Sprintf("%s-%s", co.Name, AgentCollectorSuffix) + labels := manifestutils.Labels(co.ObjectMeta, agentCollectorName, params.Config.CollectorImage, ComponentClusterObservability, params.Config.LabelsFilter) + labels["app.kubernetes.io/managed-by"] = "opentelemetry-operator" + labels["app.kubernetes.io/component"] = ComponentClusterObservability + + agentCollector := &v1beta1.OpenTelemetryCollector{ + ObjectMeta: metav1.ObjectMeta{ + Name: agentCollectorName, + Namespace: co.Namespace, + Labels: labels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: co.APIVersion, + Kind: co.Kind, + Name: co.Name, + UID: co.UID, + Controller: &[]bool{true}[0], + BlockOwnerDeletion: &[]bool{true}[0], + }, + }, + }, + Spec: v1beta1.OpenTelemetryCollectorSpec{ + Mode: v1beta1.ModeDaemonSet, + Config: collectorConfig, + OpenTelemetryCommonFields: v1beta1.OpenTelemetryCommonFields{ + Image: getCollectorImage(params.Config.CollectorImage), + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &[]bool{false}[0], + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + RunAsNonRoot: &[]bool{true}[0], + SeccompProfile: 
&corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + PodSecurityContext: &corev1.PodSecurityContext{ + RunAsNonRoot: &[]bool{true}[0], + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + // Enable host networking for DaemonSet to allow direct port access + HostNetwork: true, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "host-dev", + MountPath: "/hostfs/dev", + ReadOnly: true, + }, + { + Name: "host-etc", + MountPath: "/hostfs/etc", + ReadOnly: true, + }, + { + Name: "host-proc", + MountPath: "/hostfs/proc", + ReadOnly: true, + }, + { + Name: "host-run-udev-data", + MountPath: "/hostfs/run/udev/data", + ReadOnly: true, + }, + { + Name: "host-sys", + MountPath: "/hostfs/sys", + ReadOnly: true, + }, + { + Name: "host-var-run-utmp", + MountPath: "/hostfs/var/run/utmp", + ReadOnly: true, + }, + { + Name: "host-usr-lib-osrelease", + MountPath: "/hostfs/usr/lib/os-release", + ReadOnly: true, + }, + { + Name: "var-log-pods", + MountPath: "/var/log/pods", + ReadOnly: true, + }, + { + Name: "var-lib-docker-containers", + MountPath: "/var/lib/docker/containers", + ReadOnly: true, + }, + // OpenShift kubelet CA certificate mount (direct file) + { + Name: "kubelet-serving-ca", + MountPath: "/etc/kubelet-serving-ca/ca-bundle.crt", + ReadOnly: true, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "host-dev", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/dev", + }, + }, + }, + { + Name: "host-etc", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/etc", + }, + }, + }, + { + Name: "host-proc", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/proc", + }, + }, + }, + { + Name: "host-run-udev-data", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/run/udev/data", + }, + }, + }, + { + Name: "host-sys", + VolumeSource: corev1.VolumeSource{ 
+ HostPath: &corev1.HostPathVolumeSource{ + Path: "/sys", + }, + }, + }, + { + Name: "host-var-run-utmp", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/run/utmp", + }, + }, + }, + { + Name: "host-usr-lib-osrelease", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/usr/lib/os-release", + }, + }, + }, + { + Name: "var-log-pods", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/log/pods", + }, + }, + }, + { + Name: "var-lib-docker-containers", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/var/lib/docker/containers", + }, + }, + }, + // OpenShift kubelet CA certificate volume via hostPath + { + Name: "kubelet-serving-ca", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/etc/kubernetes/kubelet-ca.crt", + Type: &[]corev1.HostPathType{corev1.HostPathFile}[0], + }, + }, + }, + }, + }, + }, + } + + return agentCollector, nil +} + +// buildClusterCollector creates an OpenTelemetryCollector CR for cluster-level collection. 
+func buildClusterCollector(params manifests.Params) (*v1beta1.OpenTelemetryCollector, error) { + co := params.ClusterObservability + + // Load configuration using the config loader + configLoader := config.NewConfigLoader() + + // Detect Kubernetes distribution + distroProvider := configLoader.DetectDistroProvider(params.Config) + + // Load the configuration + collectorConfig, err := configLoader.LoadCollectorConfig( + config.ClusterCollectorType, + distroProvider, + co.Spec.Signals, + co.Spec, + ) + if err != nil { + return nil, fmt.Errorf("failed to load cluster collector config: %w", err) + } + + // Validate the configuration + if err := configLoader.ValidateConfig(collectorConfig); err != nil { + return nil, fmt.Errorf("cluster collector config validation failed: %w", err) + } + + replicas := int32(1) + clusterCollectorName := fmt.Sprintf("%s-%s", co.Name, ClusterCollectorSuffix) + clusterLabels := manifestutils.Labels(co.ObjectMeta, clusterCollectorName, params.Config.CollectorImage, ComponentClusterObservability, params.Config.LabelsFilter) + clusterLabels["app.kubernetes.io/managed-by"] = "opentelemetry-operator" + clusterLabels["app.kubernetes.io/component"] = ComponentClusterObservability + + clusterCollector := &v1beta1.OpenTelemetryCollector{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterCollectorName, + Namespace: co.Namespace, + Labels: clusterLabels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: co.APIVersion, + Kind: co.Kind, + Name: co.Name, + UID: co.UID, + Controller: &[]bool{true}[0], + BlockOwnerDeletion: &[]bool{true}[0], + }, + }, + }, + Spec: v1beta1.OpenTelemetryCollectorSpec{ + Mode: v1beta1.ModeDeployment, + Config: collectorConfig, + OpenTelemetryCommonFields: v1beta1.OpenTelemetryCommonFields{ + Image: getCollectorImage(params.Config.CollectorImage), + Replicas: &replicas, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: &[]bool{false}[0], + Capabilities: &corev1.Capabilities{ + Drop: 
[]corev1.Capability{"ALL"}, + }, + RunAsNonRoot: &[]bool{true}[0], + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + PodSecurityContext: &corev1.PodSecurityContext{ + RunAsNonRoot: &[]bool{true}[0], + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + }, + }, + } + + return clusterCollector, nil +} + +// buildInstrumentations creates a single Instrumentation CR in the operator namespace +// Users can reference it via instrumentation.opentelemetry.io/ns annotation. +func buildInstrumentations(params manifests.Params) ([]client.Object, error) { + co := params.ClusterObservability + + // Build OTLP exporter endpoint for instrumentation + endpoint, err := buildInstrumentationEndpoint(co.Spec) + if err != nil { + return nil, fmt.Errorf("failed to build instrumentation endpoint: %w", err) + } + + // Create a single Instrumentation in the same namespace as the ClusterObservability resource + instrumentationLabels := manifestutils.Labels(co.ObjectMeta, DefaultInstrumentationName, "", ComponentClusterObservability, params.Config.LabelsFilter) + instrumentationLabels["app.kubernetes.io/managed-by"] = "opentelemetry-operator" + instrumentationLabels["app.kubernetes.io/component"] = ComponentClusterObservability + + instrumentation := &v1alpha1.Instrumentation{ + ObjectMeta: metav1.ObjectMeta{ + Name: DefaultInstrumentationName, + Namespace: co.Namespace, // Same namespace as ClusterObservability + Labels: instrumentationLabels, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: co.APIVersion, + Kind: co.Kind, + Name: co.Name, + UID: co.UID, + Controller: &[]bool{true}[0], + BlockOwnerDeletion: &[]bool{true}[0], + }, + }, + }, + Spec: v1alpha1.InstrumentationSpec{ + Exporter: v1alpha1.Exporter{ + Endpoint: endpoint, + }, + Propagators: []v1alpha1.Propagator{ + v1alpha1.TraceContext, + v1alpha1.Baggage, + v1alpha1.B3, + v1alpha1.Jaeger, + }, + Sampler: 
v1alpha1.Sampler{ + Type: v1alpha1.ParentBasedTraceIDRatio, + Argument: "1.0", + }, + }, + } + + // Enable instrumentation based on operator configuration + if params.Config.EnableJavaAutoInstrumentation { + instrumentation.Spec.Java = v1alpha1.Java{ + Image: params.Config.AutoInstrumentationJavaImage, + } + } + if params.Config.EnableNodeJSAutoInstrumentation { + instrumentation.Spec.NodeJS = v1alpha1.NodeJS{ + Image: params.Config.AutoInstrumentationNodeJSImage, + } + } + if params.Config.EnablePythonAutoInstrumentation { + instrumentation.Spec.Python = v1alpha1.Python{ + Image: params.Config.AutoInstrumentationPythonImage, + } + } + if params.Config.EnableDotNetAutoInstrumentation { + instrumentation.Spec.DotNet = v1alpha1.DotNet{ + Image: params.Config.AutoInstrumentationDotNetImage, + } + } + if params.Config.EnableGoAutoInstrumentation { + instrumentation.Spec.Go = v1alpha1.Go{ + Image: params.Config.AutoInstrumentationGoImage, + } + } + + return []client.Object{instrumentation}, nil +} + +// buildInstrumentationEndpoint builds the OTLP endpoint for instrumentation. +func buildInstrumentationEndpoint(spec v1alpha1.ClusterObservabilitySpec) (string, error) { + // Point to local node's agent collector + endpoint := "http://$(OTEL_NODE_IP):4317" + + return endpoint, nil +} + +// isOpenShiftEnvironment detects if we're running in an OpenShift environment using cached config. 
+func isOpenShiftEnvironment(params manifests.Params) bool { + return params.Config.OpenShiftRoutesAvailability == openshift.RoutesAvailable +} diff --git a/internal/manifests/clusterobservability/config/configs/agent-collector-base.yaml b/internal/manifests/clusterobservability/config/configs/agent-collector-base.yaml new file mode 100644 index 0000000000..48a82f33bc --- /dev/null +++ b/internal/manifests/clusterobservability/config/configs/agent-collector-base.yaml @@ -0,0 +1,73 @@ +# Base configuration for agent collectors (DaemonSet) +# Collects kubelet stats, container logs, and OTLP data from auto-instrumentation +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + kubeletstats: + collection_interval: 30s + auth_type: serviceAccount + endpoint: "https://${env:K8S_NODE_NAME}:10250" + metric_groups: + - container + - pod + - node + - volume + filelog: + include: + - "/var/log/pods/*/*/*.log" + exclude: + - "/var/log/pods/*/otc-container/*.log" + start_at: end + include_file_path: true + include_file_name: false + operators: + - type: router + id: get-format + routes: + - output: parser-docker + expr: 'body matches "^\\{"' + - output: parser-crio + expr: 'body matches "^[^ Z]+ "' + - output: parser-containerd + expr: 'body matches "^[^ Z]+Z"' + +processors: + batch: {} + resourcedetection: + detectors: ["env", "system", "k8snode"] + timeout: 2s + k8sattributes: + auth_type: serviceAccount + passthrough: false + filter: + node_from_env_var: K8S_NODE_NAME + extract: + metadata: + - "k8s.namespace.name" + - "k8s.deployment.name" + - "k8s.statefulset.name" + - "k8s.daemonset.name" + - "k8s.cronjob.name" + - "k8s.job.name" + - "k8s.node.name" + - "k8s.pod.name" + - "k8s.pod.uid" + - "k8s.pod.start_time" + pod_association: + - sources: + - from: resource_attribute + name: "k8s.pod.ip" + - sources: + - from: resource_attribute + name: "k8s.pod.uid" + - sources: + - from: connection + +exporters: {} # Will be populated 
by controller + +service: + pipelines: {} # Will be populated by controller based on enabled signals \ No newline at end of file diff --git a/internal/manifests/clusterobservability/config/configs/cluster-collector-base.yaml b/internal/manifests/clusterobservability/config/configs/cluster-collector-base.yaml new file mode 100644 index 0000000000..e78fe4712e --- /dev/null +++ b/internal/manifests/clusterobservability/config/configs/cluster-collector-base.yaml @@ -0,0 +1,22 @@ +# Base configuration for cluster collectors (Deployment) +# Collects cluster-level metrics and events +receivers: + k8s_cluster: + auth_type: serviceAccount + collection_interval: 30s + node_conditions_to_report: ["Ready", "MemoryPressure", "DiskPressure", "PIDPressure"] + allocatable_types_to_report: ["cpu", "memory", "storage", "pods"] + + k8s_events: + auth_type: serviceAccount + +processors: + batch: {} + resourcedetection: + detectors: ["env", "system"] + timeout: 2s + +exporters: {} # Will be populated by controller + +service: + pipelines: {} # Will be populated by controller based on enabled signals \ No newline at end of file diff --git a/internal/manifests/clusterobservability/config/configs/distros/openshift/agent-collector-overrides.yaml b/internal/manifests/clusterobservability/config/configs/distros/openshift/agent-collector-overrides.yaml new file mode 100644 index 0000000000..5c2fb04828 --- /dev/null +++ b/internal/manifests/clusterobservability/config/configs/distros/openshift/agent-collector-overrides.yaml @@ -0,0 +1,80 @@ +# OpenShift specific overrides for agent collector +# These settings will be merged with agent-collector-base.yaml + +receivers: + kubeletstats: + collection_interval: 10s + auth_type: serviceAccount + endpoint: "https://${env:K8S_NODE_NAME}:10250" + ca_file: /etc/kubelet-serving-ca/ca-bundle.crt + metric_groups: + - container + - pod + - node + - volume + extra_metadata_labels: + - container.id + + # OpenShift uses CRI-O container runtime + filelog: + 
include: + - "/var/log/pods/*/*/*.log" + - "/var/log/containers/*.log" + exclude: + - "/var/log/pods/openshift-*/*/*.log" # Skip OpenShift system logs + - "/var/log/pods/kube-*/*/*.log" # Skip kube-system logs + - "/var/log/pods/*/otc-container/*.log" + start_at: end + include_file_path: true + include_file_name: false + operators: + - type: router + id: get-format + routes: + - output: parser-crio + expr: 'body matches "^[^ Z]+ "' + - output: parser-docker + expr: 'body matches "^\\{"' + - type: json_parser + id: parser-docker + if: 'attributes["log_type"] == "docker"' + - type: regex_parser + id: parser-crio + if: 'attributes["log_type"] == "crio"' + regex: '^(?P