---
## Regatta configuration
# -- nameOverride: Provide a name in place of `regatta`
nameOverride: "" # default: `"regatta"`
# -- fullnameOverride: String to fully override `"regatta.fullname"`
fullnameOverride: ""
# Image configuration
# Note: Use overrides with caution since other Regatta versions might not be compatible with this helm chart!
image:
  # -- repository: Default image repository
  repository: ghcr.io/jamf/regatta
  # -- imagePullPolicy: ref: https://kubernetes.io/docs/concepts/containers/images/#image-pull-policy
  imagePullPolicy: IfNotPresent
  # -- tag: Override to use different image version
  # Quoted so that number-like tags (e.g. "1.20") can never be parsed as floats.
  tag: "0.1.0"
# -- imagePullSecrets: For the Regatta image
imagePullSecrets: []
# -- replicas: Defines number of Regatta replicas
# Note: This value must match the number of raft initial members `raft.initialMembers`.
replicas: 1
# Specifies the pod disruption budget
## ref: https://kubernetes.io/docs/tasks/run-application/configure-pdb/
podDisruptionBudget:
  # -- enabled: If true, the pdb object is created
  enabled: false
  # -- minAvailable: Sets the minAvailable field of the pdb object
  # NOTE(review): the default of 2 exceeds the default `replicas: 1`; when enabling
  # the PDB, adjust `replicas` and `minAvailable` together — confirm intended defaults.
  minAvailable: 2
# -- resources: Define the resources of the pods
resources: {}
#  requests:
#    cpu: 1
#    memory: 4Gi
#  limits:
#    cpu: 2
#    memory: 8Gi
# -- startupProbe: Defines the startupProbe for the Regatta container
## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-startup-probes
startupProbe:
  initialDelaySeconds: 90
  periodSeconds: 60
  timeoutSeconds: 5
  failureThreshold: 3
  successThreshold: 1
# -- readinessProbe: Defines the readinessProbe for the Regatta container
## ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes
readinessProbe:
  initialDelaySeconds: 5
  periodSeconds: 30
  timeoutSeconds: 5
  failureThreshold: 3
  successThreshold: 1
# -- priorityClassName: Defines the priorityClassName of the Regatta pods.
# Leave empty string if you don't want to use this feature.
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/
priorityClassName: ""
# -- tolerations: Defines tolerations for the Regatta pods
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
tolerations: []
#  - key: "key1"
#    operator: "Equal"
#    value: "value1"
#    effect: "NoSchedule"
# -- nodeSelector: Map of nodeSelector labels for the Regatta pods
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
nodeSelector: {}
#  nodeLabel: value
# Settings of Regatta pods anti-affinity
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
podAntiAffinity:
  # -- enabled: Enable or disable the pod anti-affinity
  enabled: false
  # -- topologyKey: Use to override the topologyKey value
  topologyKey: kubernetes.io/hostname
# -- additionalPodLabels: Optional map of additional pod labels
additionalPodLabels: {}
# -- podAnnotations: Optional map of pod annotations
podAnnotations: {}
# Allows definition of the persistentVolumeClaim.spec of the Regatta StatefulSet
persistentVolumeClaim:
  # -- spec: The full content of the persistentVolumeClaim.spec
  spec:
    accessModes:
      - ReadWriteOnce
    resources:
      requests:
        storage: 10Gi
# ServiceAccount configuration
serviceAccount:
  # -- create: Create the ServiceAccount for regatta
  create: true
  # -- name: ServiceAccount name override
  name: "" # default: `"regatta.fullname"`
# Regatta main gRPC API configuration
api:
  # -- port: gRPC API port
  port: 8443
  tls:
    # -- mode:
    # May be one of:
    # - certificate: the certificate is generated by `cert-manager.io/v1/Certificate` object
    # - plaintext: enter `cert` and `key` content directly into values
    # - none: no certificate nor secret is created, you need to provide a secret separately
    #
    # Secret example:
    #
    # apiVersion: v1
    # kind: Secret
    # metadata:
    #   name: regatta-api-cert
    # data:
    #   tls.crt: Y2xpZW50LWNlcnQK
    #   tls.key: Y2xpZW50LWtleQo=
    mode: plaintext
    # -- issuerRef: IssuerRef configuration that is passed to the Certificate object
    # Note: applicable only if `mode: certificate`
    issuerRef: {}
    # Example issuerRef configuration:
    #  kind: ClusterIssuer
    #  name: issuer-name
    # -- cert: TLS cert in plaintext
    # Note: applicable only if `mode: plaintext`
    cert: |
      plaintext server certificate
    # -- key: TLS key in plaintext
    # Note: applicable only if `mode: plaintext`
    key: |
      plaintext server certificate key
  # -- externalLoadBalancer: If enabled, the Service of type LoadBalancer is created
  ## ref: https://kubernetes.io/docs/concepts/services-networking/service/#loadbalancer
  externalLoadBalancer:
    # -- enabled: true/false
    enabled: false
    # -- externalDomain: External Regatta API domain name
    externalDomain: regatta.example.com
    # -- annotations: Service annotations
    annotations: {}
    # Example annotations for K8S cluster running in AWS:
    #
    #  external-dns.alpha.kubernetes.io/hostname: regatta.example.com
    #  service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "900"
    #  service.beta.kubernetes.io/aws-load-balancer-backend-protocol: ssl
    # -- loadBalancerSourceRanges: external access whitelist, available on AWS only
    ## ref: https://kubernetes.io/docs/concepts/services-networking/service/#aws-nlb-support
    # This Kubernetes Service field is a list of CIDR strings, so the empty default is `[]`, not `{}`.
    loadBalancerSourceRanges: []
    #  - 0.0.0.0/0
# -- metricsPort: Regatta metrics port
metricsPort: 8079
# -- mode: Regatta mode
# Can be either leader or follower.
mode: leader
# Regatta replication setup
replication:
  # -- server: The replication server may be used when Regatta is in the leader mode (`mode: leader`).
  # Follower Regatta replicates data from this server.
  server:
    # -- enabled: Enables the replication server
    enabled: true
    # -- port: Replication server port
    port: 8444
    # -- externalDomain: External replication server domain name
    externalDomain: "leader.regatta.example.com"
    # -- serviceAnnotations: Replication server LoadBalancer service annotations
    serviceAnnotations: {}
    #  external-dns.alpha.kubernetes.io/hostname: leader.regatta.example.com
    #  service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "900"
    tls:
      # -- mode:
      # May be one of:
      # - certificate: the certificate is generated by `cert-manager.io/v1/Certificate` object
      # - plaintext: enter `cert` and `key` content directly into values
      # - none: no certificate nor secret is created, you need to provide a secret separately
      #
      # Secret example:
      #
      # apiVersion: v1
      # kind: Secret
      # metadata:
      #   name: regatta-replication-cert
      # data:
      #   tls.crt: Y2xpZW50LWNlcnQK
      #   tls.key: Y2xpZW50LWtleQo=
      mode: plaintext
      # -- issuerRef: issuerRef configuration that is passed to the Certificate object
      # Note: applicable only if `mode: certificate`
      issuerRef: {}
      # Example issuerRef configuration:
      #  kind: ClusterIssuer
      #  name: issuer-name
      # -- cert: TLS certificate in plaintext
      # Note: Applicable only if `mode: plaintext`
      cert: |
        plaintext server certificate
      # -- ca: CA in plaintext
      # Note: Applicable only if `mode: plaintext`
      ca: |
        plaintext server ca
      # -- key: TLS key in plaintext
      # Note: Applicable only if `mode: plaintext`
      key: |
        plaintext server certificate key
  # -- leaderAddress: The address of the leader to replicate from
  # Note: Applicable only if the Regatta mode is follower (`mode: follower`)
  leaderAddress: "leader.regatta.example.com"
  # -- maxSnapshotRecvBytesPerSecond: Maximum number of bytes received per second by the snapshot API client,
  # default value 0 means unlimited.
  maxSnapshotRecvBytesPerSecond: 0
  # -- logRpcTimeout: The log RPC timeout.
  logRpcTimeout: 5m
reflectionAPI:
  # -- enabled: Whether reflection API is provided. Should be false in production.
  enabled: false
# Maintenance API configuration
maintenance:
  # -- secretKind
  # May be one of:
  # - sealedSecret: Use if you have SealedSecrets support on your cluster. (https://sealed-secrets.netlify.app/)
  # - plaintext: Use to create Opaque Secret from the plaintext.
  # - none: Do not create the secret with the token at all. The secret must be provided externally.
  #
  # Secret example:
  #
  # apiVersion: v1
  # kind: Secret
  # metadata:
  #   name: regatta-maintenance-token
  # data:
  #   token: c2VjcmV0LXRva2Vu
  #
  secretKind: plaintext
  # -- token:
  # Depending on value of `secretKind`
  # - sealedSecret: enter the encrypted value
  # - plaintext: enter the plaintext secret value
  # - none: the field is ignored
  token: "secret-token"
  server:
    # -- enabled: Maintenance API enabled
    enabled: true
    # -- port: Port of maintenance server to listen on
    port: 8445
    tls:
      # -- mode:
      # May be one of:
      # - certificate: the certificate is generated by `cert-manager.io/v1/Certificate` object
      # - plaintext: enter `cert` and `key` content directly into values
      # - none: no certificate nor secret is created, you need to provide a secret separately
      #
      # Secret example:
      #
      # apiVersion: v1
      # kind: Secret
      # metadata:
      #   name: regatta-maintenance-cert
      # data:
      #   tls.crt: Y2xpZW50LWNlcnQK
      #   tls.key: Y2xpZW50LWtleQo=
      mode: plaintext
      # -- issuerRef: issuerRef configuration that is passed to the Certificate object
      # Note: Applicable only if `mode: certificate`
      issuerRef: {}
      # Example issuerRef configuration:
      #  kind: ClusterIssuer
      #  name: issuer-name
      # -- cert: TLS certificate in plaintext
      # Note: Applicable only if `mode: plaintext`
      cert: |
        plaintext server certificate
      # -- key: TLS key in plaintext
      # Note: Applicable only if `mode: plaintext`
      key: |
        plaintext server certificate key
  # -- Controls the creation of the backup CronJob that uses the Regatta maintenance API
  # Note: the `maintenance.server.enabled` must be set to `true`
  backup:
    # -- enabled: Enable the backup CronJob
    # Note: the maintenance server must be enabled
    enabled: true
    # -- successfulJobsHistoryLimit: CronJob config field
    successfulJobsHistoryLimit: 4
    # -- failedJobsHistoryLimit: CronJob config field
    failedJobsHistoryLimit: 2
    # -- schedule: Cron expression defining how often the backup is executed
    schedule: "0 */4 * * *"
    # -- bucket: Address of the s3 bucket where to upload backup
    bucket: "s3-bucket-name"
# Kafka client configuration
# Note: Kafka client available only if Regatta is in the leader mode (`mode: leader`)
kafka:
  # -- enabled: Enables Kafka client
  enabled: false
  # -- brokers: The list of Kafka brokers
  brokers: ""
  # -- checkTopics: Checks the configured topics for existence if set to true
  checkTopics: false
  # -- dialerTimeout: Kafka dialer timeout
  dialerTimeout: 10s
  # -- groupID: Kafka consumer group ID
  groupID: ""
  # -- topics: Comma-separated list of Kafka topics to consume
  topics: ""
  # -- tls: Kafka client TLS configuration
  tls:
    # -- enabled: Enables kafka client TLS
    enabled: false
    # -- secretKind
    # May be one of:
    # - sealedSecret: Use if you have SealedSecrets support on your cluster. (https://sealed-secrets.netlify.app/)
    # - plaintext: Use to create Opaque Secret from the plaintext.
    # - none: Do not create the secret with the token at all. The secret must be provided externally.
    #
    # Secret example:
    #
    # apiVersion: v1
    # kind: Secret
    # metadata:
    #   name: regatta-kafka-cert
    # data:
    #   ca.crt: c2VydmVyLWNlcnQK
    #   tls.crt: Y2xpZW50LWNlcnQK
    #   tls.key: Y2xpZW50LWtleQo=
    #
    secretKind: plaintext
    # -- serverCert:
    # Depending on value of `secretKind`
    # - sealedSecret: enter the encrypted value
    # - plaintext: enter the plaintext secret value
    # - none: the field is ignored
    serverCert: |
      server-cert
    # -- clientCert:
    # Depending on value of `secretKind`
    # - sealedSecret: enter the encrypted value
    # - plaintext: enter the plaintext secret value
    # - none: the field is ignored
    clientCert: |
      client-cert
    # -- clientKey:
    # Depending on value of `secretKind`
    # - sealedSecret: enter the encrypted value
    # - plaintext: enter the plaintext secret value
    # - none: the field is ignored
    clientKey: |
      client-key
  # -- tables: Comma-separated list of Regatta tables
  tables: table-1,table-2
# Raft configuration
raft:
  # -- initialMembers: Raft cluster initial members defines a mapping of node IDs to their respective Raft address.
  # The node ID must be Integer >= 1. Example for the initial 3 node cluster setup on the localhost:
  # "1=127.0.0.1:5012,2=127.0.0.1:5013,3=127.0.0.1:5014"
  initialMembers: "1=regatta-0.regatta.regatta.svc.cluster.local:5012"
  # -- rtt: Defines the average Round Trip Time (RTT) between two NodeHost instances.
  # Such an RTT interval is internally used as a logical clock tick, Raft heartbeat and election intervals
  # are both defined in terms of how many such RTT intervals. Note that RTTMillisecond is the combined delays
  # between two NodeHost instances including all delays caused by network transmission,
  # delays caused by NodeHost queuing and processing.
  # Specified as Go's duration string (https://pkg.go.dev/maze.io/x/duration#ParseDuration).
  rtt: 50ms
  # -- snapshotEntries: SnapshotEntries defines how often the state machine should be snapshotted automatically.
  # It is defined in terms of the number of applied Raft log entries.
  snapshotEntries: 10000
  # -- compactionOverhead: Defines the number of most recent entries to keep after each Raft log compaction.
  # Raft log compaction is performed automatically every time when a snapshot is created.
  compactionOverhead: 5000
  # TODO: revise raft.rtt
  # -- heartbeatRTT: The number of message RTT between heartbeats. Message RTT is defined by raft.rtt.
  # The Raft paper suggest the heartbeat interval to be close to the average RTT between nodes.
  # As an example, assuming raft.rtt is 100 millisecond, to set the heartbeat interval to be every 200 milliseconds,
  # then heartbeatRTT should be set to 2.
  heartbeatRTT: 4
  # TODO: revise raft.rtt
  # -- electionRTT: The minimum number of message RTT between elections. Message RTT is defined by raft.rtt.
  # The Raft paper suggests it to be a magnitude greater than heartbeatRTT, which is the interval between two heartbeats.
  # In Raft, the actual interval between elections is randomized to be between electionRTT and 2 * electionRTT.
  # As an example, assuming raft.rtt is 100 millisecond, to set the election interval to be 1 second,
  # then electionRTT should be set to 10.
  electionRTT: 100
  # -- dragonboatSoftSettings: ref: https://github.com/lni/dragonboat/blob/v3.3.6/internal/settings/soft.go#L27
  # Passed through as a JSON document inside a literal block scalar.
  dragonboatSoftSettings: |
    {
      "TaskBatchSize": 128,
      "PerConnectionSendBufSize": 10485760,
      "MaxConcurrentStreamingSnapshot": 1
    }
# Storage configuration
storage:
  # -- blockCacheSize: Shared block cache size in bytes. The cache is used to hold uncompressed blocks of data in memory
  blockCacheSize: 796917760
# Experimental features
experimental:
  # -- tanlogdb: Whether experimental LogDB implementation Tan is used in-place of Pebble based one
  tanlogdb: false
# -- serviceMonitorEnabled: ServiceMonitor object is created if set to true
serviceMonitorEnabled: false
# Defines the PrometheusRule object
prometheusRules:
  # -- enabled: PrometheusRule object created if true
  enabled: false
  # -- groups: List of the PrometheusRule groups
  # NOTE(review): the GRPCFailedRequests example below reads `.Values.prometheusRules.grpcFailedRequests`,
  # which is not defined in this file; define it (e.g. `grpcFailedRequests: 10`) before enabling that
  # rule — confirm against the chart templates.
  groups: []
  # Example groups:
  #
  #  - name: regatta.rules
  #    rules:
  #      # RAFT
  #      - alert: RaftLeaderNotAvailable
  #        expr: min(sum(dragonboat_raftnode_has_leader{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}) by (shardid,job,namespace)) by (job,namespace) < 2
  #        for: 5m
  #        labels:
  #          severity: critical
  #        annotations:
  #          description: '{{ $labels.job }}: Raft leader not available, cluster is not able to serve requests properly.'
  #          summary: '{{ $labels.job }}: Raft leader not available'
  #      - alert: RaftClusterNodeNotAvailableCritical
  #        expr: min(sum(dragonboat_raftnode_has_leader{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}) by (shardid,job,namespace)) by (job,namespace) < 3
  #        for: 30m
  #        labels:
  #          severity: critical
  #        annotations:
  #          description: '{{ $labels.job }}: Raft cluster has only {{ $value }} nodes for time period longer than 30m.'
  #          summary: '{{ $labels.job }}: Raft cluster node not available'
  #      - alert: RaftClusterNodeNotAvailableWarn
  #        expr: min(sum(dragonboat_raftnode_has_leader{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}) by (shardid,job,namespace)) by (job,namespace) < 3
  #        for: 2m
  #        labels:
  #          severity: warning
  #        annotations:
  #          description: '{{ $labels.job }}: Raft cluster has only {{ $value }} nodes.'
  #          summary: '{{ $labels.job }}: Raft cluster node not available'
  #      - alert: TooManyRaftnodeCampaignsLaunched
  #        expr: sum(increase(dragonboat_raftnode_campaign_launched_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m])) by (shardid,nodeid,job,namespace,pod) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: There was {{ $value }} raft node campaign launched in the last 5 minutes for {{ $labels.job }}
  #          summary: '{{ $labels.job }}: Too many Raft node campaign launched'
  #      - alert: TooManyRaftnodeCampaignsSkipped
  #        expr: sum(increase(dragonboat_raftnode_campaign_skipped_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m])) by (shardid,nodeid,job,namespace,pod) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: There was {{ $value }} raft node campaign skipped in the last 5 minutes for {{ $labels.job }}
  #          summary: '{{ $labels.job }}: Too many Raft node campaign skipped'
  #      - alert: TooManyRaftNodeProposalsDropped
  #        expr: sum(increase(dragonboat_raftnode_proposal_dropped_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m])) by (shardid,nodeid,job,namespace,pod) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Dropped {{ $value }} raft node proposals in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many raft node proposals dropped'
  #      - alert: TooManyRaftNodeReadIndexesDropped
  #        expr: sum(increase(dragonboat_raftnode_read_index_dropped_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m])) by (shardid,nodeid,job,namespace,pod) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Dropped {{ $value }} raft node read index in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many raft node read indexes dropped'
  #      - alert: TooManyRaftNodeReplicationsRejected
  #        expr: sum(increase(dragonboat_raftnode_replication_rejected_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m])) by (shardid,nodeid,job,namespace,pod) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Rejected {{ $value }} replications in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many raft node replications rejected'
  #      - alert: TooManyFailedTransportMessageConnectionAttempts
  #        expr: increase(dragonboat_transport_failed_message_connection_attempt_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m]) > 20
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Failed {{ $value }} transport message connection attempts in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many failed transport message connection attempts'
  #      - alert: TooManyFailedSnapshotConnectionAttempts
  #        expr: increase(dragonboat_transport_failed_snapshot_connection_attempt_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m]) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Failed {{ $value }} snapshot connection attempts in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many failed snapshot connection attempts'
  #      - alert: TooManyTransportMessageSendFailures
  #        # NOTE(review): this expr reuses the snapshot-connection-attempt metric of the previous rule;
  #        # it presumably should use a message-send-failure counter — verify against dragonboat's exported metrics.
  #        expr: increase(dragonboat_transport_failed_snapshot_connection_attempt_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m]) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Failed {{ $value }} transport message send in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many failed transport message send'
  #      - alert: TooManyTransportReceiveMessagesDropped
  #        expr: increase(dragonboat_transport_received_message_dropped_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m]) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Dropped {{ $value }} transport message receive in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many dropped transport message receive'
  #      - alert: TooManyTransportSnapshotSendFailures
  #        expr: increase(dragonboat_transport_snapshot_send_failure_total{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}"}[5m]) > 50
  #        for: 10m
  #        labels:
  #          severity: warning
  #          type: raftTransport
  #        annotations:
  #          description: 'Failed {{ $value }} transport snapshot send in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: Too many failed transport snapshot send'
  #
  #      # GRPC
  #      - alert: GRPCFailedRequests
  #        expr: increase(grpc_server_handled_total{namespace="{{ .Release.Namespace }}",grpc_code!~"OK|NotFound|InvalidArgument", job="{{ template "regatta.name" . }}",grpc_service!="grpc.reflection.v1alpha.ServerReflection"}[5m]) > {{ .Values.prometheusRules.grpcFailedRequests }}
  #        for: 1m
  #        labels:
  #          severity: warning
  #        annotations:
  #          description: 'Failed {{ $value }} GRPC requests in 5 minutes for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: GRPC request error'
  #      - alert: TooSlowGRPCResponseP99
  #        expr: histogram_quantile(0.99, sum by (job,namespace,pod,grpc_method,le) (rate(grpc_server_handling_seconds_bucket{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}",grpc_type="unary",grpc_service="regatta.v1.KV",grpc_method="Range"}[5m]))) > 0.5
  #        for: 1m
  #        labels:
  #          severity: critical
  #        annotations:
  #          description: '99 quantile of regatta.v1.KV response time is {{ $value }} s for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: GRPC response is too slow (P99)'
  #      - alert: TooSlowGRPCResponseP50
  #        expr: histogram_quantile(0.50, sum by (job,namespace,pod,grpc_method,le) (rate(grpc_server_handling_seconds_bucket{namespace="{{ .Release.Namespace }}",job="{{ template "regatta.name" . }}",grpc_type="unary",grpc_service="regatta.v1.KV",grpc_method="Range"}[5m]))) > 0.05
  #        for: 1m
  #        labels:
  #          severity: warning
  #        annotations:
  #          description: '50 quantile of regatta.v1.KV response time is {{ $value }} s for {{ $labels.job }}'
  #          summary: '{{ $labels.job }}: GRPC response is too slow (P50)'