From cc282b5c73b3d150cfefa82a0cb71db443f7af42 Mon Sep 17 00:00:00 2001
From: Injun Song
Date: Tue, 13 Jul 2021 00:10:47 +0900
Subject: [PATCH] *: some fixes for e2e test on krane-based k8s (#443)

**change resources for `deploy/k8s/dev`**

To test Varlog in a Krane-based cluster, the cluster should have at least 20 nodes. Recommended numbers are as follows: three MRs, three or four SNs, and a replication factor of three.

**DaemonSet doesn't respect podAntiAffinity**

See https://github.com/kubernetes/kubernetes/issues/29276. We should attach the label `varlog-type=telemetry` to nodes running jaeger, prometheus, otel-collector, and grafana. The DaemonSets for MR and SN won't be deployed to those nodes. The e2e testing module ignores nodes labeled with `varlog-type=telemetry`.

Resolves [#VARLOG-509](VARLOG-509).
---
 deploy/k8s/base/jaeger.yaml              |  6 +++---
 deploy/k8s/base/mr.yaml                  |  5 +++++
 deploy/k8s/base/otel-collector.yaml      |  4 ++--
 deploy/k8s/base/prometheus.yaml          |  7 ++++---
 deploy/k8s/base/sn.yaml                  |  5 +++++
 deploy/k8s/dev/jaeger-patch.yaml         | 10 +++++-----
 deploy/k8s/dev/otel-agent-patch.yaml     | 18 +++++++++---------
 deploy/k8s/dev/otel-collector-patch.yaml | 10 +++++-----
 deploy/k8s/dev/prometheus-patch.yaml     | 10 +++++-----
 deploy/k8s/dev/varlog-mr-patch.yaml      |  8 ++++----
 deploy/k8s/dev/varlog-sn-patch.yaml      |  8 ++++----
 test/e2e/e2e_long_test.go                |  8 ++++----
 test/e2e/k8s_util.go                     | 18 ++++++++++++++++--
 13 files changed, 71 insertions(+), 46 deletions(-)

diff --git a/deploy/k8s/base/jaeger.yaml b/deploy/k8s/base/jaeger.yaml
index d7780d63c..10df1f207 100644
--- a/deploy/k8s/base/jaeger.yaml
+++ b/deploy/k8s/base/jaeger.yaml
@@ -49,19 +49,19 @@ spec:
           port: 14250
       dnsPolicy: ClusterFirst
       affinity:
-        podAffinity:
+        podAntiAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
           - labelSelector:
               matchExpressions:
               - key: app
-                operator: NotIn
+                operator: In
                 values:
                 - "varlog-mr"
                 - "varlog-sn"
                 - "varlog-vms"
                 - "prometheus"
               - key: "component"
-                operator: NotIn
+                operator: In
                 values:
                 - "otel-collector"
             topologyKey: "kubernetes.io/hostname"
diff --git a/deploy/k8s/base/mr.yaml b/deploy/k8s/base/mr.yaml
index c32205f70..0b4aa7276 100644
--- a/deploy/k8s/base/mr.yaml
+++ b/deploy/k8s/base/mr.yaml
@@ -86,6 +86,11 @@ spec:
                 operator: In
                 values:
                 - varlog-mr
+              - key: varlog-type
+                operator: NotIn
+                values:
+                - "telemetry"
+      # The following podAntiAffinity is not effective for DaemonSets. See https://github.com/kubernetes/kubernetes/issues/29276.
       podAntiAffinity:
         requiredDuringSchedulingIgnoredDuringExecution:
         - labelSelector:
diff --git a/deploy/k8s/base/otel-collector.yaml b/deploy/k8s/base/otel-collector.yaml
index 64279e8ff..b8cede821 100644
--- a/deploy/k8s/base/otel-collector.yaml
+++ b/deploy/k8s/base/otel-collector.yaml
@@ -158,12 +158,12 @@ spec:
             path: otel-collector-config.yaml
       dnsPolicy: ClusterFirst
       affinity:
-        podAffinity:
+        podAntiAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
           - labelSelector:
               matchExpressions:
               - key: app
-                operator: NotIn
+                operator: In
                 values:
                 - "varlog-mr"
                 - "varlog-sn"
diff --git a/deploy/k8s/base/prometheus.yaml b/deploy/k8s/base/prometheus.yaml
index 36f3dc807..70ab7221f 100644
--- a/deploy/k8s/base/prometheus.yaml
+++ b/deploy/k8s/base/prometheus.yaml
@@ -76,18 +76,19 @@ spec:
       - name: prometheus-storage-volume
         emptyDir: {}
       affinity:
-        podAffinity:
+        podAntiAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
           - labelSelector:
               matchExpressions:
               - key: app
-                operator: NotIn
+                operator: In
                 values:
                 - "varlog-mr"
                 - "varlog-sn"
                 - "varlog-vms"
+                - "jaeger"
               - key: "component"
-                operator: NotIn
+                operator: In
                 values:
                 - "otel-collector"
             topologyKey: "kubernetes.io/hostname"
diff --git a/deploy/k8s/base/sn.yaml b/deploy/k8s/base/sn.yaml
index a8ea3504d..f48548ba2 100644
--- a/deploy/k8s/base/sn.yaml
+++ b/deploy/k8s/base/sn.yaml
@@ -85,6 +85,11 @@ spec:
                 operator: In
                 values:
                 - varlog-sn
+              - key: varlog-type
+                operator: NotIn
+                values:
+                - "telemetry"
+      # The following podAntiAffinity is not effective for DaemonSets. See https://github.com/kubernetes/kubernetes/issues/29276.
       podAntiAffinity:
         requiredDuringSchedulingIgnoredDuringExecution:
         - labelSelector:
diff --git a/deploy/k8s/dev/jaeger-patch.yaml b/deploy/k8s/dev/jaeger-patch.yaml
index 5b76045f6..cbe9cd87c 100644
--- a/deploy/k8s/dev/jaeger-patch.yaml
+++ b/deploy/k8s/dev/jaeger-patch.yaml
@@ -6,15 +6,15 @@ metadata:
   labels:
     app: jaeger
 spec:
-  replicas: 0
+  replicas: 1
   template:
     spec:
       containers:
       - name: jaeger
         resources:
           limits:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 4Gi
           requests:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 4Gi
diff --git a/deploy/k8s/dev/otel-agent-patch.yaml b/deploy/k8s/dev/otel-agent-patch.yaml
index 35abe69bd..f2b3dbbeb 100644
--- a/deploy/k8s/dev/otel-agent-patch.yaml
+++ b/deploy/k8s/dev/otel-agent-patch.yaml
@@ -9,12 +9,12 @@ metadata:
 spec:
   template:
     spec:
-      affinity:
-        nodeAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-            nodeSelectorTerms:
-            - matchExpressions:
-              - key: type
-                operator: In
-                values:
-                - skip-otel-agent
+      containers:
+      - name: otel-agent
+        resources:
+          limits:
+            cpu: 1000m
+            memory: 1Gi
+          requests:
+            cpu: 1000m
+            memory: 1Gi
diff --git a/deploy/k8s/dev/otel-collector-patch.yaml b/deploy/k8s/dev/otel-collector-patch.yaml
index 614517ffd..ab71ff55d 100644
--- a/deploy/k8s/dev/otel-collector-patch.yaml
+++ b/deploy/k8s/dev/otel-collector-patch.yaml
@@ -7,15 +7,15 @@ metadata:
     app: opentelemetry
     component: otel-collector
 spec:
-  replicas: 0
+  replicas: 1
   template:
     spec:
       containers:
       - name: otel-collector
         resources:
           limits:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 4Gi
           requests:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 4Gi
diff --git a/deploy/k8s/dev/prometheus-patch.yaml b/deploy/k8s/dev/prometheus-patch.yaml
index 48a37450e..b508c636e 100644
--- a/deploy/k8s/dev/prometheus-patch.yaml
+++ b/deploy/k8s/dev/prometheus-patch.yaml
@@ -6,15 +6,15 @@ metadata:
   labels:
     app: prometheus
 spec:
-  replicas: 0
+  replicas: 1
   template:
     spec:
       containers:
       - name: prometheus
         resources:
           limits:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 4Gi
           requests:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 4Gi
diff --git a/deploy/k8s/dev/varlog-mr-patch.yaml b/deploy/k8s/dev/varlog-mr-patch.yaml
index 88b5ceeda..79bde9e3d 100644
--- a/deploy/k8s/dev/varlog-mr-patch.yaml
+++ b/deploy/k8s/dev/varlog-mr-patch.yaml
@@ -23,11 +23,11 @@ spec:
       - name: varlog-mr
         resources:
           limits:
-            cpu: 3000m
-            memory: 3Gi
+            cpu: 4000m
+            memory: 4Gi
           requests:
-            cpu: 3000m
-            memory: 3Gi
+            cpu: 4000m
+            memory: 4Gi
         env:
         - name: VMR_HOME
           value: "/varlog/mr"
diff --git a/deploy/k8s/dev/varlog-sn-patch.yaml b/deploy/k8s/dev/varlog-sn-patch.yaml
index 993981ce9..62b5416bb 100644
--- a/deploy/k8s/dev/varlog-sn-patch.yaml
+++ b/deploy/k8s/dev/varlog-sn-patch.yaml
@@ -23,11 +23,11 @@ spec:
      - name: varlog-sn
         resources:
           limits:
-            cpu: 3000m
-            memory: 3000Mi
+            cpu: 4000m
+            memory: 4000Mi
           requests:
-            cpu: 3000m
-            memory: 3000Mi
+            cpu: 4000m
+            memory: 4000Mi
         env:
         - name: VOLUMES
           value: "/varlog/sn"
diff --git a/test/e2e/e2e_long_test.go b/test/e2e/e2e_long_test.go
index 6dbf56bdd..ee8ce462f 100644
--- a/test/e2e/e2e_long_test.go
+++ b/test/e2e/e2e_long_test.go
@@ -20,11 +20,11 @@ import (
 
 func TestK8sVarlogAppendLongTime(t *testing.T) {
 	const (
 		testTimeout  = 15 * time.Minute
-		numRepFactor = 2
+		numRepFactor = 3
 		numMRs       = 3
-		numSNs       = 2
-		numLSs       = 1
-		numClients   = 50
+		numSNs       = 9
+		numLSs       = 3
+		numClients   = 10
 		clusterID    = types.ClusterID(1)
 	)
diff --git a/test/e2e/k8s_util.go b/test/e2e/k8s_util.go
index 80573b4ac..6fdd033d9 100644
--- a/test/e2e/k8s_util.go
+++ b/test/e2e/k8s_util.go
@@ -49,6 +49,9 @@ const (
 	IngressNginxNamespace = "ingress-nginx"
 
 	ENV_REP_FACTOR = "REP_FACTOR"
+
+	// TelemetryLabelValue marks nodes that run telemetry components (jaeger, prometheus, otel-collector, grafana).
+	TelemetryLabelValue = "telemetry"
 )
 
 type K8sVarlogPodGetter interface {
@@ -444,7 +447,10 @@ func (k8s *K8sVarlogCluster) RemoveLabelAll() (err error) {
 	}
 
 	for _, node := range nodes.Items {
-		if _, ok := node.Labels[TypeLabelKey]; ok {
+		if labelValue, ok := node.Labels[TypeLabelKey]; ok {
+			if labelValue == TelemetryLabelValue {
+				continue
+			}
 			if erri := k8s.RemoveLabel(node.GetName(), TypeLabelKey); erri != nil {
 				err = multierr.Append(err, erri)
 			}
@@ -582,6 +588,14 @@ func (k8s *K8sVarlogCluster) clearMRDatas() error {
 		return err
 	}
 
+	targetNodes := 0
+	for _, node := range nodes.Items {
+		if _, ok := node.Labels[TypeLabelKey]; ok {
+			continue
+		}
+		targetNodes++
+	}
+
 	for _, node := range nodes.Items {
 		if _, ok := node.Labels[TypeLabelKey]; ok {
 			continue
@@ -595,7 +609,7 @@ func (k8s *K8sVarlogCluster) clearMRDatas() error {
 
 	if err := testutil.CompareWaitErrorWithRetryIntervalN(1000, 10*time.Second, func() (bool, error) {
 		numPods, err := k8s.numPodsReady(VarlogNamespace, podSelector)
-		return numPods == len(nodes.Items), errors.Wrap(err, "k8s")
+		return numPods == targetNodes, errors.Wrap(err, "k8s")
 	}); err != nil {
 		return err
 	}
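For anyone reproducing this setup, the "ignore telemetry nodes" behavior described above boils down to filtering nodes on the `varlog-type` label. A standalone client-go sketch, not code from this PR (the kubeconfig path and the hard-coded selector are assumptions), that lists only the nodes the MR/SN DaemonSets may still land on:

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a clientset from the local kubeconfig (path is an assumption).
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		panic(err)
	}

	// Exclude nodes labeled varlog-type=telemetry, mirroring the nodeAffinity
	// rule added to mr.yaml and sn.yaml in this patch.
	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
		LabelSelector: "varlog-type!=telemetry",
	})
	if err != nil {
		panic(err)
	}
	for _, node := range nodes.Items {
		fmt.Println(node.GetName())
	}
}
```

The same selector works on the command line (`kubectl get nodes -l 'varlog-type!=telemetry'`), and the telemetry nodes themselves can be labeled up front with `kubectl label nodes <node> varlog-type=telemetry` before the DaemonSets roll out.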