diff --git a/.github/workflows/cni-plugin-integration.yml b/.github/workflows/cni-plugin-integration.yml
index 45824531..0256c591 100644
--- a/.github/workflows/cni-plugin-integration.yml
+++ b/.github/workflows/cni-plugin-integration.yml
@@ -8,6 +8,8 @@ on:
       - cni-plugin/integration/flannel/Dockerfile-tester
       - cni-plugin/integration/run.sh
       - cni-plugin/**
+      - cni-repair-controller/**
+      - justfile*
 
 jobs:
   cni-flannel-test:
@@ -46,3 +48,11 @@ jobs:
       - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11
       - name: Run CNI ordering tests
         run: just cni-plugin-test-ordering
+  repair-controller:
+    timeout-minutes: 15
+    runs-on: ubuntu-latest
+    steps:
+      - uses: linkerd/dev/actions/setup-tools@v42
+      - uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac
+      - name: Run repair-controller tests
+        run: just cni-repair-controller-integration
diff --git a/cni-plugin/integration/calico-k3s-images.json b/cni-plugin/integration/calico-k3s-images.json
new file mode 100644
index 00000000..4288dd9b
--- /dev/null
+++ b/cni-plugin/integration/calico-k3s-images.json
@@ -0,0 +1,11 @@
+{
+  "name": "docker.io/rancher/k3s",
+  "channels": {
+    "stable": "v1.27.6-k3s1",
+    "latest": "v1.27.6-k3s1",
+    "v1.27": "v1.27.6-k3s1"
+  },
+  "digests": {
+    "v1.27.6-k3s1": "sha256:9486bbb9ca9b81c098ecd07f1c45441e143dab12577e22cf062586edcfd9d952"
+  }
+}
diff --git a/cni-repair-controller/integration/linkerd-cni-config.yml b/cni-repair-controller/integration/linkerd-cni-config.yml
new file mode 100644
index 00000000..3a8cb0f3
--- /dev/null
+++ b/cni-repair-controller/integration/linkerd-cni-config.yml
@@ -0,0 +1,7 @@
+# This config adds an extra initContainer that delays the start of linkerd-cni
+# by 15s, allowing time for the pause DaemonSet to start before the full CNI
+# config is ready, so that its pods enter a failure mode
+extraInitContainers:
+- name: sleep
+  image: alpine:3.19.0
+  command: ["/bin/sh", "-c", "sleep 15"]
diff --git a/cni-repair-controller/integration/pause-ds.yml b/cni-repair-controller/integration/pause-ds.yml
new file mode 100644
index 00000000..1b8e5d7c
--- /dev/null
+++ b/cni-repair-controller/integration/pause-ds.yml
@@ -0,0 +1,19 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: pause
+spec:
+  selector:
+    matchLabels:
+      app: pause-app
+  template:
+    metadata:
+      annotations:
+        linkerd.io/inject: enabled
+      labels:
+        app: pause-app
+    spec:
+      priorityClassName: system-node-critical
+      containers:
+      - name: pause-container
+        image: k8s.gcr.io/pause
diff --git a/cni-repair-controller/integration/run.sh b/cni-repair-controller/integration/run.sh
new file mode 100755
index 00000000..ba6fc294
--- /dev/null
+++ b/cni-repair-controller/integration/run.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# shellcheck disable=SC2086
+function step() {
+    repeat=$(seq 1 ${#1})
+    printf "%0.s#" $repeat
+    printf "#####\n# %s...\n" "$1"
+    printf "%0.s#" $repeat
+    printf "#####\n"
+}
+
+if [[ ! "$1" =~ (.*):(.*) ]]; then
"$1" =~ (.*):(.*) ]]; then + echo 'Usage: run.sh name:tag' + exit 1 +fi +cni_plugin_image=${BASH_REMATCH[1]} +cni_image_version=${BASH_REMATCH[2]} + +cd "${BASH_SOURCE[0]%/*}" + +step 'Installing Calico' +kubectl apply -f https://k3d.io/v5.1.0/usage/advanced/calico.yaml +kubectl --namespace=kube-system wait --for=condition=available --timeout=120s \ + deploy/calico-kube-controllers + +step 'Installing latest linkerd edge' +scurl https://run.linkerd.io/install-edge | sh +export PATH=$PATH:$HOME/.linkerd2/bin +linkerd install --crds | kubectl apply -f - +# The linkerd-cni-config.yml config adds an extra initContainer that will make +# linkerd-cni to delay its start for 15s, so to allow time for the pause +# DaemonSet to start before the full CNI config is ready and enter a failure +# mode +linkerd install-cni \ + --use-wait-flag \ + --cni-image "$cni_plugin_image" \ + --cni-image-version "$cni_image_version" \ + --set repairController.enabled=true \ + -f linkerd-cni-config.yml \ + | kubectl apply -f - +linkerd check --pre --linkerd-cni-enabled +linkerd install --linkerd-cni-enabled | kubectl apply -f - +linkerd check + +step 'Installing pause DaemonSet' +kubectl apply -f pause-ds.yml +kubectl wait --for=condition=ready --timeout=120s -l app=pause-app po + +step 'Adding a node' +cluster=$(just-k3d --evaluate K3D_CLUSTER_NAME) +image=$(just --evaluate cni-plugin-image) +k3d node create node2 --cluster "$cluster" +k3d image import --cluster "$cluster" "$image" + +step 'Checking new DS replica fails with code 95' +sleep 10 +kubectl wait \ + --for=jsonpath='{.status.initContainerStatuses[0].lastState.terminated.exitCode}'=95 \ + --field-selector=spec.nodeName=k3d-node2-0 \ + pod + +step 'Checking new DS replica gets replaced' +for _ in {1..5}; do + if kubectl wait --for=condition=ready --timeout=10s -l app=pause-app po; then + break + fi +done +kubectl wait --for=condition=ready --timeout=10s -l app=pause-app po; diff --git a/justfile b/justfile index ae99ea4f..ba3cb506 100644 --- a/justfile +++ b/justfile @@ -17,7 +17,7 @@ lint: sh-lint md-lint rs-clippy action-lint action-dev-check go-lint *flags: (proxy-init-lint flags) (cni-plugin-lint flags) -test: rs-test proxy-init-test-unit proxy-init-test-integration +test: rs-test proxy-init-test-unit proxy-init-test-integration cni-repair-controller-integration # Check whether the Go code is formatted. go-fmt-check: @@ -82,6 +82,15 @@ cni-repair-controller *args: TARGETCRATE=linkerd-cni-repair-controller \ {{ just_executable() }} --justfile=justfile-rust {{ args }} +# The K3S_IMAGES_JSON file used instructs the creation of a cluster on version +# v1.27.6-k3s1, because after that Calico won't work. +# See https://github.com/k3d-io/k3d/issues/1375 +cni-repair-controller-integration $K3S_IMAGES_JSON='./cni-plugin/integration/calico-k3s-images.json': (cni-repair-controller "package") build-cni-plugin-image + @{{ just_executable() }} K3D_CREATE_FLAGS='{{ _K3D_CREATE_FLAGS_NO_CNI }}' _k3d-cni-create + @just-k3d use + @just-k3d import {{ cni-plugin-image }} + ./cni-repair-controller/integration/run.sh {{ cni-plugin-image }} + ## ## cni-plugin ## @@ -178,8 +187,11 @@ _cni-plugin-test-integration: # Run cni-plugin integration tests using calico, in a dedicated k3d environment # NOTE: we have to rely on a different set of dependencies here; specifically # `k3d-create` instead of `_k3d-ready`, since without a CNI DNS pods won't -# start -cni-plugin-test-integration-calico: +# start. 
+# The K3S_IMAGES_JSON file pins the cluster to k3s version v1.27.6-k3s1,
+# because Calico doesn't work on later versions.
+# See https://github.com/k3d-io/k3d/issues/1375
+cni-plugin-test-integration-calico $K3S_IMAGES_JSON='./cni-plugin/integration/calico-k3s-images.json':
     @{{ just_executable() }} \
         CNI_TEST_SCENARIO='calico' \
         K3D_CLUSTER_NAME='l5d-calico-test' \