Skip to content

Commit

Permalink
Ensure PodDisruptionBudgetAtLimit alert is silenced
Browse files Browse the repository at this point in the history
Signed-off-by: machadovilaca <machadovilaca@gmail.com>
  • Loading branch information
machadovilaca committed Jul 11, 2024
1 parent 8e008fe commit 54b08c2
Show file tree
Hide file tree
Showing 11 changed files with 330 additions and 69 deletions.
18 changes: 11 additions & 7 deletions cmd/hyperconverged-cluster-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ import (
hcoutil "github.com/kubevirt/hyperconverged-cluster-operator/pkg/util"
)

const openshiftMonitoringNamespace = "openshift-monitoring"

// Change below variables to serve metrics on different host or port.
var (
logger = logf.Log.WithName("hyperconverged-operator-cmd")
Expand Down Expand Up @@ -172,12 +174,11 @@ func main() {
os.Exit(1)
}

if err = (&observability.Reconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
}).SetupWithManager(mgr); err != nil {
logger.Error(err, "unable to create controller", "controller", "Observability")
os.Exit(1)
if ci.IsOpenshift() {
if err = observability.SetupWithManager(mgr); err != nil {
logger.Error(err, "unable to create controller", "controller", "Observability")
os.Exit(1)
}
}

err = createPriorityClass(ctx, mgr)
Expand Down Expand Up @@ -258,7 +259,10 @@ func getCacheOption(operatorNamespace string, isMonitoringAvailable, isOpenshift

cacheOptionsByObjectForOpenshift := map[client.Object]cache.ByObject{
&openshiftroutev1.Route{}: {
Field: namespaceSelector,
Namespaces: map[string]cache.Config{
operatorNamespace: {},
openshiftMonitoringNamespace: {},
},
},
&imagev1.ImageStream{}: {
Label: labelSelector,
Expand Down
46 changes: 30 additions & 16 deletions controllers/observability/observability_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@ import (
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/source"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager"
)

var (
Expand All @@ -20,32 +21,34 @@ var (
)

// Reconciler is the reconciler of the observability controller. Its Reconcile
// method makes sure the KubeVirt PodDisruptionBudgetAtLimit alerts are
// silenced in Alertmanager.
type Reconciler struct {
// NOTE(review): NewReconciler does not populate the two fields below; in this
// diff view they look like the pre-change fields — confirm they still belong.
client.Client
Scheme *runtime.Scheme

// config is the REST configuration handed to NewReconciler; its BearerToken
// is passed to the Alertmanager API client.
config *rest.Config
// events receives the generic events emitted by startEventLoop.
events chan event.GenericEvent

// amApi is the Alertmanager API client. It starts out nil and is created on
// the first call that needs it, then reused.
amApi *alertmanager.Api
}

// Reconcile runs a single observability pass. It ensures the KubeVirt
// PodDisruptionBudgetAtLimit alerts are silenced and returns the error from
// that step, if any, together with an empty result. The context and request
// arguments are ignored.
func (r *Reconciler) Reconcile(_ context.Context, _ ctrl.Request) (ctrl.Result, error) {
log.Info("Reconciling Observability")

// The only reconcile step at the moment: keep the KubeVirt PDB alert silenced.
if err := r.ensurePodDisruptionBudgetAtLimitIsSilenced(); err != nil {
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
}

func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
// NewReconciler builds a Reconciler that keeps the given REST config and owns
// a freshly created, unbuffered channel of generic events. The Alertmanager
// client is left unset.
func NewReconciler(config *rest.Config) *Reconciler {
	r := new(Reconciler)
	r.config = config
	r.events = make(chan event.GenericEvent)

	return r
}

func SetupWithManager(mgr ctrl.Manager) error {
log.Info("Setting up controller")

r.events = make(chan event.GenericEvent)
go func() {
for {
r.events <- event.GenericEvent{
Object: &metav1.PartialObjectMetadata{},
}
time.Sleep(periodicity)
}
}()
r := NewReconciler(mgr.GetConfig())
r.startEventLoop()

return ctrl.NewControllerManagedBy(mgr).
Named("observability").
Expand All @@ -55,3 +58,14 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
)).
Complete(r)
}

// startEventLoop spawns a background goroutine that sends an empty generic
// event on r.events, sleeps for periodicity, and repeats. The goroutine has no
// stop condition; each send blocks until the event is received.
func (r *Reconciler) startEventLoop() {
	emit := func() {
		r.events <- event.GenericEvent{Object: &metav1.PartialObjectMetadata{}}
	}

	go func() {
		// Send first, then sleep before the next send.
		for ; ; time.Sleep(periodicity) {
			emit()
		}
	}()
}
32 changes: 0 additions & 32 deletions controllers/observability/observability_controller_test.go

This file was deleted.

13 changes: 0 additions & 13 deletions controllers/observability/observability_suite_test.go

This file was deleted.

107 changes: 107 additions & 0 deletions controllers/observability/pod_disruption_budget_at_limit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package observability

import (
"crypto/tls"
"crypto/x509"
"fmt"
"net/http"
"os"
"time"

"github.com/kubevirt/hyperconverged-cluster-operator/pkg/alertmanager"
)

const (
alertmanagerSvcHost = "alertmanager-main.openshift-monitoring.svc.cluster.local:9094"
tlsCertPath = "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt"
)

// ensurePodDisruptionBudgetAtLimitIsSilenced makes sure Alertmanager holds an
// active silence for the PodDisruptionBudgetAtLimit alerts raised for KubeVirt
// disruption budgets, creating one when none is found.
//
// The Alertmanager client is built on first use and cached on the reconciler.
// A wrapped error is returned when the client cannot be built or when listing
// or creating silences fails.
//
// NOTE(review): FindPodDisruptionBudgetAtLimitSilence does not look at IsEqual
// on the regex matcher, so a silence created earlier with a negated regex
// would still be reported as "already silenced" — confirm whether such
// silences exist and need to be replaced.
func (r *Reconciler) ensurePodDisruptionBudgetAtLimitIsSilenced() error {
	if r.amApi == nil {
		var err error
		r.amApi, err = r.NewAlertmanagerApi()
		if err != nil {
			return fmt.Errorf("failed to initialize alertmanager api: %w", err)
		}
	}

	amSilences, err := r.amApi.ListSilences()
	if err != nil {
		return fmt.Errorf("failed to list alertmanager silences: %w", err)
	}

	if FindPodDisruptionBudgetAtLimitSilence(amSilences) != nil {
		log.Info("KubeVirt PodDisruptionBudgetAtLimit alerts are already silenced")
		return nil
	}

	silence := alertmanager.Silence{
		Comment:   "Silence KubeVirt PodDisruptionBudgetAtLimit alerts",
		CreatedBy: "hyperconverged-cluster-operator",
		EndsAt:    "3000-01-01T00:00:00Z",
		Matchers: []alertmanager.Matcher{
			{
				IsEqual: true,
				Name:    "alertname",
				Value:   "PodDisruptionBudgetAtLimit",
			},
			{
				// Alertmanager reads isRegex=true together with
				// isEqual=false as a negated regex (!~), which would
				// silence every PDB alert *except* the KubeVirt ones.
				// Setting IsEqual explicitly keeps this a positive match
				// no matter how the zero value is serialized.
				IsEqual: true,
				IsRegex: true,
				Name:    "poddisruptionbudget",
				Value:   "kubevirt-disruption-budget-.*",
			},
		},
		StartsAt: time.Now().Format(time.RFC3339),
	}

	if err := r.amApi.CreateSilence(silence); err != nil {
		return fmt.Errorf("failed to create alertmanager silence: %w", err)
	}
	log.Info("Silenced PodDisruptionBudgetAtLimit alerts")

	return nil
}

// NewAlertmanagerApi builds an Alertmanager API client for alertmanagerSvcHost.
// The client trusts the CA bundle stored at tlsCertPath and is given the
// bearer token from the reconciler's REST config.
//
// It returns an error when the CA bundle cannot be read or holds no usable PEM
// certificate.
//
// NOTE(review): the bearer token is read from r.config once, at construction.
// If the token is rotated while the operator runs, the cached client presumably
// keeps the old value — confirm and refresh if needed.
func (r *Reconciler) NewAlertmanagerApi() (*alertmanager.Api, error) {
	// Upper bound for a single Alertmanager request, so an unresponsive
	// endpoint cannot block the reconcile loop indefinitely.
	const requestTimeout = 30 * time.Second

	caCert, err := os.ReadFile(tlsCertPath)
	if err != nil {
		return nil, fmt.Errorf("failed to read ca cert: %w", err)
	}

	caCertPool := x509.NewCertPool()
	// AppendCertsFromPEM reports whether any certificate was parsed; an empty
	// pool would make every TLS handshake fail later with an opaque error.
	if !caCertPool.AppendCertsFromPEM(caCert) {
		return nil, fmt.Errorf("failed to parse ca cert: no valid PEM certificate found in %s", tlsCertPath)
	}

	httpClient := http.Client{
		Timeout: requestTimeout,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{
				RootCAs:    caCertPool,
				MinVersion: tls.VersionTLS12,
			},
		},
	}

	return alertmanager.NewAPI(httpClient, alertmanagerSvcHost, r.config.BearerToken), nil
}

// FindPodDisruptionBudgetAtLimitSilence returns the first silence in amSilences
// whose state is "active" and whose matchers include both
//   - an equality matcher on alertname with value PodDisruptionBudgetAtLimit, and
//   - a regex matcher on poddisruptionbudget with value
//     kubevirt-disruption-budget-.*
//
// The returned pointer refers to a copy of the slice element. nil is returned
// when no silence qualifies.
func FindPodDisruptionBudgetAtLimitSilence(amSilences []alertmanager.Silence) *alertmanager.Silence {
	for _, candidate := range amSilences {
		if candidate.Status.State == "active" {
			hasAlertName, hasBudgetRegex := false, false

			for _, m := range candidate.Matchers {
				switch m.Name {
				case "alertname":
					if m.IsEqual && m.Value == "PodDisruptionBudgetAtLimit" {
						hasAlertName = true
					}
				case "poddisruptionbudget":
					if m.IsRegex && m.Value == "kubevirt-disruption-budget-.*" {
						hasBudgetRegex = true
					}
				}
			}

			if hasAlertName && hasBudgetRegex {
				return &candidate
			}
		}
	}

	return nil
}
7 changes: 7 additions & 0 deletions deploy/cluster_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1145,6 +1145,13 @@ rules:
- list
- watch
- update
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/api
verbs:
- '*'
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,13 @@ spec:
- list
- watch
- update
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/api
verbs:
- '*'
serviceAccountName: hyperconverged-cluster-operator
- rules:
- apiGroups:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
certified: "false"
console.openshift.io/disable-operand-delete: "true"
containerImage: quay.io/kubevirt/hyperconverged-cluster-operator:1.13.0-unstable
createdAt: "2024-07-11 05:05:11"
createdAt: "2024-07-11 09:38:14"
description: A unified operator deploying and controlling KubeVirt and its supporting
operators with opinionated defaults
features.operators.openshift.io/cnf: "false"
Expand Down Expand Up @@ -539,6 +539,13 @@ spec:
- list
- watch
- update
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- alertmanagers/api
verbs:
- '*'
serviceAccountName: hyperconverged-cluster-operator
- rules:
- apiGroups:
Expand Down
Loading

0 comments on commit 54b08c2

Please sign in to comment.