Skip to content

Commit

Permalink
WIP: Set degraded OperatorStatus based on critical alerts firing in
Browse files Browse the repository at this point in the history
monitoring stack

TODO: severity=critical
  • Loading branch information
lilic committed Apr 30, 2020
1 parent c8c4d41 commit 9f8a60a
Show file tree
Hide file tree
Showing 2 changed files with 150 additions and 11 deletions.
47 changes: 36 additions & 11 deletions pkg/operator/operator.go
Expand Up @@ -15,7 +15,7 @@
package operator

import (
"strings"
"fmt"
"time"

configv1 "github.com/openshift/api/config/v1"
Expand All @@ -31,6 +31,7 @@ import (

"github.com/openshift/cluster-monitoring-operator/pkg/client"
"github.com/openshift/cluster-monitoring-operator/pkg/manifests"
"github.com/openshift/cluster-monitoring-operator/pkg/status"
"github.com/openshift/cluster-monitoring-operator/pkg/tasks"
)

Expand All @@ -55,6 +56,7 @@ type Operator struct {
telemetryMatches []string
remoteWrite bool

config *rest.Config
client *client.Client

cmapInf cache.SharedIndexInformer
Expand Down Expand Up @@ -83,6 +85,7 @@ func New(config *rest.Config, version, namespace, namespaceUserWorkload, namespa
namespace: namespace,
namespaceUserWorkload: namespaceUserWorkload,
client: c,
config: config,
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cluster-monitoring"),
}

Expand Down Expand Up @@ -335,21 +338,43 @@ func (o *Operator) sync(key string) error {
klog.Errorf("error occurred while setting status to in progress: %v", err)
}

taskName, err := tl.RunAll()
_, err = tl.RunAll()
if err != nil {
klog.Infof("Updating ClusterOperator status to failed. Err: %v", err)
failedTaskReason := strings.Join(strings.Fields(taskName+"Failed"), "")
reportErr := o.client.StatusReporter().SetFailed(err, failedTaskReason)
if reportErr != nil {
klog.Errorf("error occurred while setting status to failed: %v", reportErr)
// TODO: instrument the tasks better:
// - set clear msg what failed
// - set SLO for tasks that failed
// - e.g. SLO alert for cluster-monitoring-operator tasks
degraded, msg, err := status.IsDegraded(o.config)
if err != nil {
fmt.Println(err)
}
if degraded {
fmt.Println("alert?")
failedError := errors.New("monitoring stack alert is degraded, we are breaching our SLO - alerts for our important components are firing")
reportErr := o.client.StatusReporter().SetFailed(failedError, msg)
if reportErr != nil {
klog.Errorf("error occurred while setting status to failed: %v", reportErr)
}
}
return err
}

klog.Info("Updating ClusterOperator status to done.")
err = o.client.StatusReporter().SetDone()
degraded, msg, err := status.IsDegraded(o.config)
if err != nil {
klog.Errorf("error occurred while setting status to done: %v", err)
fmt.Println(err)
}
if degraded {
fmt.Println("alert?")
failedError := errors.New("monitoring stack alert is firing, we are breaching our SLA")
reportErr := o.client.StatusReporter().SetFailed(failedError, msg)
if reportErr != nil {
klog.Errorf("error occurred while setting status to failed: %v", reportErr)
}
} else if !degraded {
klog.Info("Updating ClusterOperator status to done.")
err = o.client.StatusReporter().SetDone()
if err != nil {
klog.Errorf("error occurred while setting status to done: %v", err)
}
}

return nil
Expand Down
114 changes: 114 additions & 0 deletions pkg/status/status.go
@@ -0,0 +1,114 @@
// Copyright 2019 The Cluster Monitoring Operator Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package status

import (
"fmt"
"strings"

"github.com/Jeffail/gabs"
routev1 "github.com/openshift/client-go/route/clientset/versioned/typed/route/v1"
"github.com/openshift/cluster-monitoring-operator/test/e2e/framework"
"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)

// IsDegraded reports whether any critical alerts are firing in the
// openshift-monitoring or openshift-user-workload-monitoring namespaces.
// A failure to query for alerts is itself treated as degraded, since it
// implies a problem somewhere in the cluster. The returned string is a
// short CamelCase reason suitable for a ClusterOperator status condition.
func IsDegraded(config *rest.Config) (bool, string, error) {
	firing, reason, err := alertsFiring(config)
	switch {
	case err != nil:
		return true, reason, errors.Wrap(err, "could not query for alerts firing")
	case firing:
		return true, "MonitoringAlertsFiring", errors.New("alerts around monitoring stack are firing")
	default:
		return false, "", nil
	}
}

// alertsFiring reports whether any critical alerts are currently firing in
// the openshift-monitoring namespace. It queries the thanos-querier route
// using the cluster-monitoring-operator service account token. On error the
// returned string is a short CamelCase reason suitable for use as a
// ClusterOperator status condition reason; on success with alerts firing it
// is "AlertsFiring".
func alertsFiring(config *rest.Config) (bool, string, error) {
	// Prometheus client depends on setup above.
	// So far only necessary for prometheusK8sClient.
	openshiftRouteClient, err := routev1.NewForConfig(config)
	if err != nil {
		return false, "OpenShiftRouteClientError", errors.Wrap(err, "creating openshiftRouteClient failed")
	}
	kubeClient, err := kubernetes.NewForConfig(config)
	if err != nil {
		return false, "KubeConfigError", errors.Wrap(err, "creating kubeClient failed")
	}
	token, err := getServiceAccountToken(kubeClient, "openshift-monitoring", "cluster-monitoring-operator")
	if err != nil {
		return false, "ServiceAccountTokenMissing", err
	}
	thanosQuerierClient, err := framework.NewPrometheusClientFromRoute(
		openshiftRouteClient,
		"openshift-monitoring", "thanos-querier",
		token,
	)
	if err != nil {
		return false, "ThanosQuerierClientError", errors.Wrap(err, "creating ThanosQuerierClient failed")
	}
	// TODO: replace with actual alerts that we care about
	// critical in openshift-monitoring and openshift-user-workload-monitoring
	// if user workload is enabled then check that namespace as well
	// Any critical monitoring alert
	body, err := thanosQuerierClient.PrometheusQuery(`ALERTS{namespace="openshift-monitoring", severity="critical"}`)
	if err != nil {
		return false, "ThanosQuerierQueryFailed", errors.Wrap(err, "querying Thanos for alerts failed")
	}

	res, err := gabs.ParseJSON(body)
	if err != nil {
		return false, "ThanosQuerierResponseInvalid", errors.Wrap(err, "parsing Thanos query response failed")
	}

	// A non-empty data.result array means at least one matching alert series
	// is currently firing.
	count, err := res.ArrayCountP("data.result")
	if err != nil {
		return false, "ThanosQuerierResponseInvalid", errors.Wrap(err, "counting alert query results failed")
	}

	if count > 0 {
		return true, "AlertsFiring", nil
	}

	return false, "", nil
}

// getServiceAccountToken returns the bearer token for the named service
// account in the given namespace, found by scanning the namespace's secrets
// for one whose name contains "<name>-token-".
func getServiceAccountToken(kubeClient *kubernetes.Clientset, namespace, name string) (string, error) {
	secrets, err := kubeClient.CoreV1().Secrets(namespace).List(metav1.ListOptions{})
	if err != nil {
		return "", err
	}
	tokenPrefix := fmt.Sprintf("%s-token-", name)
	for _, secret := range secrets.Items {
		// Secrets carrying the openshift.io/create-dockercfg-secrets
		// annotation hold the token for the internal registry; skip them.
		if _, isDockerToken := secret.Annotations["openshift.io/create-dockercfg-secrets"]; isDockerToken {
			continue
		}
		if strings.Contains(secret.Name, tokenPrefix) {
			return string(secret.Data["token"]), nil
		}
	}
	return "", errors.Errorf("cannot find token for %s/%s service account", namespace, name)
}

0 comments on commit 9f8a60a

Please sign in to comment.