Skip to content

Commit

Permalink
fix: add prometheus metric label "k8sgpt" (#364)
Browse files Browse the repository at this point in the history
* fix: add prometheus metric label "k8sgpt"

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add k8sgpt label on grafana dashboard panel

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

---------

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>
  • Loading branch information
JuHyung-Son committed Mar 1, 2024
1 parent 539cba1 commit 4deaf1f
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 39 deletions.
117 changes: 83 additions & 34 deletions controllers/k8sgpt_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,30 +49,30 @@ const (
var (
// Metrics
// k8sgptReconcileErrorCount is a metric for the number of errors during reconcile
k8sgptReconcileErrorCount = prometheus.NewCounter(prometheus.CounterOpts{
k8sgptReconcileErrorCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "k8sgpt_reconcile_error_count",
Help: "The total number of errors during reconcile",
})
}, []string{"k8sgpt"})
// k8sgptNumberOfResults is a metric for the number of results
k8sgptNumberOfResults = prometheus.NewGauge(prometheus.GaugeOpts{
k8sgptNumberOfResults = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "k8sgpt_number_of_results",
Help: "The total number of results",
})
}, []string{"k8sgpt"})
// k8sgptNumberOfResultsByType is a metric for the number of results by type
k8sgptNumberOfResultsByType = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "k8sgpt_number_of_results_by_type",
Help: "The total number of results by type",
}, []string{"kind", "name"})
}, []string{"kind", "name", "k8sgpt"})
// k8sgptNumberOfBackendAICalls is a metric for the number of backend AI calls
k8sgptNumberOfBackendAICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "k8sgpt_number_of_backend_ai_calls",
Help: "The total number of backend AI calls",
}, []string{"backend", "deployment", "namespace"})
}, []string{"backend", "deployment", "namespace", "k8sgpt"})
// k8sgptNumberOfFailedBackendAICalls is a metric for the number of failed backend AI calls
k8sgptNumberOfFailedBackendAICalls = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "k8sgpt_number_of_failed_backend_ai_calls",
Help: "The total number of failed backend AI calls",
}, []string{"backend", "deployment", "namespace"})
}, []string{"backend", "deployment", "namespace", "k8sgpt"})
// analysisRetryCount is for the number of analysis failures
analysisRetryCount int
// allowBackendAIRequest is a circuit breaker that switches backend AI calls on or off
Expand Down Expand Up @@ -102,7 +102,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
err := r.Get(ctx, req.NamespacedName, k8sgptConfig)
if err != nil {
// Error reading the object - requeue the request.
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return ctrl.Result{}, client.IgnoreNotFound(err)
}

Expand All @@ -114,7 +116,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if !utils.ContainsString(k8sgptConfig.GetFinalizers(), FinalizerName) {
controllerutil.AddFinalizer(k8sgptConfig, FinalizerName)
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -125,12 +129,16 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// Delete any external resources associated with the instance
err := resources.Sync(ctx, r.Client, *k8sgptConfig, resources.DestroyOp)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
controllerutil.RemoveFinalizer(k8sgptConfig, FinalizerName)
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -144,7 +152,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
MaxRetries: 5,
}
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -154,12 +164,16 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
err = r.Get(ctx, client.ObjectKey{Namespace: k8sgptConfig.Namespace,
Name: k8sgptConfig.Name}, &deployment)
if client.IgnoreNotFound(err) != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
err = resources.Sync(ctx, r.Client, *k8sgptConfig, resources.SyncOp)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}

Expand All @@ -179,7 +193,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
imageRepository, k8sgptConfig.Spec.Version)
err = r.Update(ctx, &deployment)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}

Expand All @@ -189,15 +205,19 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
// If the deployment is active, we will query it directly for analysis data
address, err := kclient.GenerateAddress(ctx, r.Client, k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
// Log address
fmt.Printf("K8sGPT address: %s\n", address)

k8sgptClient, err := kclient.NewClient(address)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}

Expand All @@ -207,14 +227,18 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if k8sgptConfig.Spec.RemoteCache != nil {
err = k8sgptClient.AddConfig(k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
if k8sgptConfig.Spec.Integrations != nil {
err = k8sgptClient.AddIntegration(k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand All @@ -225,7 +249,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
k8sgptNumberOfFailedBackendAICalls.With(prometheus.Labels{
"backend": k8sgptConfig.Spec.AI.Backend,
"deployment": deployment.Name,
"namespace": deployment.Namespace}).Inc()
"namespace": deployment.Namespace,
"k8sgpt": k8sgptConfig.Name,
}).Inc()

if k8sgptConfig.Spec.AI.BackOff.Enabled {
if analysisRetryCount > k8sgptConfig.Spec.AI.BackOff.MaxRetries {
Expand All @@ -236,7 +262,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
analysisRetryCount++
}
}
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
// Reset analysisRetryCount
Expand All @@ -247,14 +275,21 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
k8sgptNumberOfBackendAICalls.With(prometheus.Labels{
"backend": k8sgptConfig.Spec.AI.Backend,
"deployment": deployment.Name,
"namespace": deployment.Namespace}).Inc()
"namespace": deployment.Namespace,
"k8sgpt": k8sgptConfig.Name,
}).Inc()
}

// Parse the k8sgpt-deployment response into a list of results
k8sgptNumberOfResults.Set(float64(len(response.Results)))
k8sgptNumberOfResults.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Set(float64(len(response.Results)))

rawResults, err := resources.MapResults(*r.Integrations, response.Results, *k8sgptConfig)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
// Prior to creating or updating any results we will delete any stale results that
Expand All @@ -266,7 +301,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
"k8sgpts.k8sgpt.ai/namespace": k8sgptConfig.Namespace,
}))
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
if len(resultList.Items) > 0 {
Expand All @@ -276,12 +313,15 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if _, ok := rawResults[result.Name]; !ok {
err = r.Delete(ctx, &result)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
} else {
k8sgptNumberOfResultsByType.With(prometheus.Labels{
"kind": result.Spec.Kind,
"name": result.Name,
"kind": result.Spec.Kind,
"name": result.Name,
"k8sgpt": k8sgptConfig.Name,
}).Dec()
}
}
Expand All @@ -292,15 +332,18 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
for _, result := range rawResults {
operation, err := resources.CreateOrUpdateResult(ctx, r.Client, result)
if err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)

}
// Update metrics
if operation == resources.CreatedResult {
k8sgptNumberOfResultsByType.With(prometheus.Labels{
"kind": result.Spec.Kind,
"name": result.Name,
"kind": result.Spec.Kind,
"name": result.Name,
"k8sgpt": k8sgptConfig.Name,
}).Inc()
} else if operation == resources.UpdatedResult {
fmt.Printf("Updated successfully %s \n", result.Name)
Expand Down Expand Up @@ -334,7 +377,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
Name: k8sgptConfig.Spec.Sink.Secret.Name,
}
if err := r.Get(ctx, secretNamespacedName, secret); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(fmt.Errorf("could not find sink secret: %w", err), false)
}

Expand All @@ -353,7 +398,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if sinkEnabled {
if res.Status.LifeCycle != string(resources.NoOpResult) || res.Status.Webhook == "" {
if err := sinkType.Emit(res.Spec); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
res.Status.Webhook = k8sgptConfig.Spec.Sink.Endpoint
Expand All @@ -363,7 +410,9 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
res.Status.Webhook = ""
}
if err := r.Status().Update(ctx, &res); err != nil {
k8sgptReconcileErrorCount.Inc()
k8sgptReconcileErrorCount.With(prometheus.Labels{
"k8sgpt": k8sgptConfig.Name,
}).Inc()
return r.finishReconcile(err, false)
}
}
Expand Down
2 changes: 1 addition & 1 deletion grafana/custom-metrics/custom-metrics-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": true,
"expr": "sum(rate(k8sgpt_reconcile_error_count{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod)",
"expr": "sum(rate(k8sgpt_reconcile_error_count{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod, k8sgpt)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
Expand Down
8 changes: 4 additions & 4 deletions grafana/k8sgpt-overview.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum by (kind) (k8sgpt_number_of_results_by_type)",
"expr": "sum by (kind, k8sgpt) (k8sgpt_number_of_results_by_type)",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down Expand Up @@ -192,7 +192,7 @@
"uid": "prometheus"
},
"editorMode": "builder",
"expr": "sum(k8sgpt_number_of_results)",
"expr": "sum by (k8sgpt) (k8sgpt_number_of_results)",
"range": true,
"refId": "A"
}
Expand Down Expand Up @@ -279,7 +279,7 @@
"uid": "prometheus"
},
"editorMode": "builder",
"expr": "count by(kind) (k8sgpt_number_of_results_by_type)",
"expr": "count by(kind, k8sgpt) (k8sgpt_number_of_results_by_type)",
"hide": false,
"range": true,
"refId": "A"
Expand Down Expand Up @@ -701,4 +701,4 @@
"uid": "U82QyO8Vz",
"version": 8,
"weekStart": ""
}
}

0 comments on commit 4deaf1f

Please sign in to comment.