diff --git a/.github/actions/rosa-cli-setup/action.yml b/.github/actions/rosa-cli-setup/action.yml index 2a5a34c4..aab09a67 100644 --- a/.github/actions/rosa-cli-setup/action.yml +++ b/.github/actions/rosa-cli-setup/action.yml @@ -50,3 +50,4 @@ runs: run: | ADMIN_PASSWORD=$(aws secretsmanager get-secret-value --region $SECRET_MANAGER_REGION --secret-id $KEYCLOAK_MASTER_PASSWORD_SECRET_NAME --query SecretString --output text --no-cli-pager) echo "::add-mask::$ADMIN_PASSWORD" + echo "KEYCLOAK_ADMIN_PASSWORD=${ADMIN_PASSWORD}" >> $GITHUB_ENV diff --git a/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml b/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml index 2b59fb1c..19c2abab 100644 --- a/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml +++ b/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml @@ -53,7 +53,7 @@ jobs: createCluster: false secrets: inherit - run-scaling-benchmark-with-peristent-sessions: + run-scaling-benchmark-with-persistent-sessions: needs: keycloak-deploy-with-persistent-sessions uses: ./.github/workflows/rosa-scaling-benchmark.yml with: @@ -61,3 +61,41 @@ jobs: skipCreateDataset: true outputArchiveSuffix: 'persistent-sessions' secrets: inherit + + keycloak-undeploy-with-persistent-sessions: + needs: run-scaling-benchmark-with-persistent-sessions + name: Undeploy Keycloak deployment on the multi-az cluster + if: github.event_name != 'schedule' || github.repository == 'keycloak/keycloak-benchmark' + uses: ./.github/workflows/rosa-multi-az-cluster-undeploy.yml + with: + clusterPrefix: gh-keycloak # ${{ env.CLUSTER_PREFIX }} -- unfortunately 'env.' doesn't work here + skipAuroraDeletion: true + secrets: inherit + + keycloak-deploy-active-active: + needs: keycloak-undeploy-with-persistent-sessions + name: ROSA Scheduled Create Active/Active cluster with Persistent Sessions + if: github.event_name != 'schedule' || github.repository == 'keycloak/keycloak-benchmark' + uses: ./.github/workflows/rosa-multi-az-cluster-create.yml + with: + clusterPrefix: gh-keycloak # ${{ env.CLUSTER_PREFIX }} -- unfortunately 'env.' doesn't work here + enablePersistentSessions: true + createCluster: false + activeActive: true + secrets: inherit + + run-functional-tests-active-active: + needs: keycloak-deploy-active-active + uses: ./.github/workflows/rosa-run-crossdc-func-tests.yml + with: + activeActive: true + clusterPrefix: gh-keycloak # ${{ env.CLUSTER_PREFIX }} -- unfortunately 'env.' doesn't work here + secrets: inherit + + run-scaling-benchmark-active-active: + needs: run-functional-tests-active-active + uses: ./.github/workflows/rosa-scaling-benchmark.yml + with: + clusterName: gh-keycloak-a # ${{ env.CLUSTER_PREFIX }}-a -- unfortunately 'env.' doesn't work here + outputArchiveSuffix: 'active-active' + secrets: inherit diff --git a/.github/workflows/rosa-multi-az-cluster-create.yml b/.github/workflows/rosa-multi-az-cluster-create.yml index d0223c66..70765c70 100644 --- a/.github/workflows/rosa-multi-az-cluster-create.yml +++ b/.github/workflows/rosa-multi-az-cluster-create.yml @@ -16,6 +16,10 @@ on: keycloakRepository: description: 'The repository to deploy Keycloak from. 
If not set nightly image is used' type: string + activeActive: + description: 'When true, deploy an Active/Active Keycloak deployment' + type: boolean + default: false enablePersistentSessions: description: 'To enable Persistent user and client sessions to the DB' type: boolean @@ -32,16 +36,20 @@ on: description: 'The AWS region to create both clusters in. Defaults to "vars.AWS_DEFAULT_REGION" if omitted.' type: string createCluster: - description: 'Check to Create Cluster' + description: 'Check to Create Cluster.' type: boolean default: true + keycloakRepository: + description: 'The repository to deploy Keycloak from. If not set nightly image is used' + type: string + activeActive: + description: 'When true, deploy an Active/Active Keycloak deployment' + type: boolean + default: false enablePersistentSessions: description: 'To enable Persistent user and client sessions to the DB' type: boolean default: false - keycloakRepository: - description: 'The repository to deploy Keycloak from. If not set nightly image is used' - type: string keycloakBranch: description: 'The branch to deploy Keycloak from. If not set nightly image is used' type: string @@ -109,6 +117,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Setup OpenTofu + uses: opentofu/setup-opentofu@v1 + with: + tofu_wrapper: false + - name: Setup ROSA CLI uses: ./.github/actions/rosa-cli-setup with: @@ -140,6 +153,7 @@ jobs: ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b - name: Create Route53 Loadbalancer + if: ${{ !inputs.activeActive }} working-directory: provision/rosa-cross-dc run: | task route53 > route53 @@ -150,10 +164,51 @@ jobs: ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b - - name: Deploy + - name: Deploy Active/Passive + if: ${{ !inputs.activeActive }} working-directory: provision/rosa-cross-dc run: task env: + AURORA_CLUSTER: ${{ env.CLUSTER_PREFIX }} + AURORA_REGION: ${{ env.REGION }} + ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a + ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b + KC_ACTIVE_ACTIVE: ${{ inputs.activeActive }} + KC_CPU_REQUESTS: 6 + KC_INSTANCES: 3 + KC_DISABLE_STICKY_SESSION: true + KC_PERSISTENT_SESSIONS: ${{ env.KC_PERSISTENT_SESSIONS }} + KC_MEMORY_REQUESTS_MB: 3000 + KC_MEMORY_LIMITS_MB: 4000 + KC_DB_POOL_INITIAL_SIZE: 30 + KC_DB_POOL_MAX_SIZE: 30 + KC_DB_POOL_MIN_SIZE: 30 + KC_DATABASE: "aurora-postgres" + MULTI_AZ: "true" + KC_REPOSITORY: ${{ inputs.keycloakRepository }} + KC_BRANCH: ${{ inputs.keycloakBranch }} + + - name: Create Accelerator Loadbalancer + if: ${{ inputs.activeActive }} + working-directory: provision/rosa-cross-dc + run: | + task global-accelerator-create 2>&1 | tee accelerator + echo "ACCELERATOR_DNS=$(grep -Po 'ACCELERATOR DNS: \K.*' accelerator)" >> $GITHUB_ENV + echo "ACCELERATOR_WEBHOOK=$(grep -Po 'ACCELERATOR WEBHOOK: \K.*' accelerator)" >> $GITHUB_ENV + env: + ACCELERATOR_NAME: ${{ env.CLUSTER_PREFIX }} + ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a + ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b + + - name: Deploy Active/Active + if: ${{ inputs.activeActive }} + working-directory: provision/rosa-cross-dc + run: task active-active + env: + ACCELERATOR_DNS: ${{ env.ACCELERATOR_DNS }} + ACCELERATOR_WEBHOOK_URL: ${{ env.ACCELERATOR_WEBHOOK }} + ACCELERATOR_WEBHOOK_USERNAME: "keycloak" + ACCELERATOR_WEBHOOK_PASSWORD: ${{ env.KEYCLOAK_ADMIN_PASSWORD }} AURORA_CLUSTER: ${{ env.CLUSTER_PREFIX }} AURORA_REGION: ${{ env.REGION }} ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a diff --git 
a/.github/workflows/rosa-multi-az-cluster-delete.yml b/.github/workflows/rosa-multi-az-cluster-delete.yml index 71a8a5db..93faccb3 100644 --- a/.github/workflows/rosa-multi-az-cluster-delete.yml +++ b/.github/workflows/rosa-multi-az-cluster-delete.yml @@ -11,7 +11,7 @@ on: type: string jobs: - route53: + loadbalancer: runs-on: ubuntu-latest steps: - name: Checkout repository @@ -40,12 +40,24 @@ jobs: echo "SUBDOMAIN=$(echo $KEYCLOAK_URL | grep -oP '(?<=client.).*?(?=.keycloak-benchmark.com)')" >> $GITHUB_ENV - name: Delete Route53 Records - run: | - ./provision/aws/route53/route53_delete.sh + run: ./provision/aws/route53/route53_delete.sh env: SUBDOMAIN: ${{ env.SUBDOMAIN }} + - name: Set ACCELERATOR_DNS env variable for Global Accelerator processing + run: | + echo "ACCELERATOR_DNS=${KEYCLOAK_URL#"https://"}" >> $GITHUB_ENV + + - name: Delete Global Accelerator + run: ./provision/aws/global-accelerator/accelerator_multi_az_delete.sh + env: + ACCELERATOR_DNS: ${{ env.ACCELERATOR_DNS }} + CLUSTER_1: ${{ inputs.clusterPrefix }}-a + CLUSTER_2: ${{ inputs.clusterPrefix }}-b + KEYCLOAK_NAMESPACE: runner-keycloak + cluster1: + needs: loadbalancer uses: ./.github/workflows/rosa-cluster-delete.yml with: clusterName: ${{ inputs.clusterPrefix }}-a @@ -53,6 +65,7 @@ jobs: secrets: inherit cluster2: + needs: loadbalancer uses: ./.github/workflows/rosa-cluster-delete.yml with: clusterName: ${{ inputs.clusterPrefix }}-b diff --git a/.github/workflows/rosa-run-crossdc-func-tests.yml b/.github/workflows/rosa-run-crossdc-func-tests.yml index ad9eec70..4f9dd14a 100644 --- a/.github/workflows/rosa-run-crossdc-func-tests.yml +++ b/.github/workflows/rosa-run-crossdc-func-tests.yml @@ -6,12 +6,20 @@ on: clusterPrefix: description: 'The prefix used when creating the Cross DC clusters' type: string + activeActive: + description: 'Must be true when testing against an Active/Active Keycloak deployment' + type: boolean + default: false workflow_dispatch: inputs: clusterPrefix: description: 'The prefix used when creating the Cross DC clusters' type: string + activeActive: + description: 'Must be true when testing against an Active/Active Keycloak deployment' + type: boolean + default: false concurrency: # Only run once for the latest commit per ref and cancel other (previous) runs. 
@@ -32,6 +40,7 @@ jobs: distribution: 'temurin' java-version: '17' cache: 'maven' + - name: Cache Maven Wrapper uses: actions/cache@v4 with: @@ -40,6 +49,7 @@ jobs: key: ${{ runner.os }}-maven-wrapper-${{ hashFiles('**/maven-wrapper.properties') }} restore-keys: | ${{ runner.os }}-maven-wrapper- + - name: Setup ROSA CLI uses: ./.github/actions/rosa-cli-setup with: @@ -47,30 +57,26 @@ jobs: aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-default-region: ${{ vars.AWS_DEFAULT_REGION }} rosa-token: ${{ secrets.ROSA_TOKEN }} + - name: Login to OpenShift cluster A uses: ./.github/actions/oc-keycloak-login with: clusterName: ${{ inputs.clusterPrefix }}-a - - name: Get DC1 URLs + + - name: Get DC1 Context shell: bash - run: | - KEYCLOAK_DC1_URL=https://$(kubectl get routes -n "${{ env.PROJECT }}" aws-health-route -o jsonpath='{.spec.host}') - echo "KEYCLOAK_DC1_URL=$KEYCLOAK_DC1_URL" >> "$GITHUB_ENV" - LOAD_BALANCER_URL=https://$(kubectl get routes -n "${{ env.PROJECT }}" -l app=keycloak -o jsonpath='{.items[*].spec.host}') - echo "LOAD_BALANCER_URL=$LOAD_BALANCER_URL" >> "$GITHUB_ENV" - ISPN_DC1_URL=https://$(kubectl get routes -n "${{ env.PROJECT }}" -l app=infinispan-service-external -o jsonpath='{.items[*].spec.host}') - echo "ISPN_DC1_URL=$ISPN_DC1_URL" >> "$GITHUB_ENV" + run: echo "KUBERNETES_1_CONTEXT=$(kubectl config current-context)" >> "$GITHUB_ENV" + - name: Login to OpenShift cluster B uses: ./.github/actions/oc-keycloak-login with: clusterName: ${{ inputs.clusterPrefix }}-b - - name: Get DC2 URLs + + - name: Get DC2 Context shell: bash - run: | - KEYCLOAK_DC2_URL=https://$(kubectl get routes -n "${{ env.PROJECT }}" aws-health-route -o jsonpath='{.spec.host}') - echo "KEYCLOAK_DC2_URL=$KEYCLOAK_DC2_URL" >> "$GITHUB_ENV" - ISPN_DC2_URL=https://$(kubectl get routes -n "${{ env.PROJECT }}" -l app=infinispan-service-external -o jsonpath='{.items[*].spec.host}') - echo "ISPN_DC2_URL=$ISPN_DC2_URL" >> "$GITHUB_ENV" + run: echo "KUBERNETES_2_CONTEXT=$(kubectl config current-context)" >> "$GITHUB_ENV" + - name: Run CrossDC functional tests - run: | - ./provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/run-crossdc-tests.sh + run: ./provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/run-crossdc-tests.sh + env: + ACTIVE_ACTIVE: ${{ inputs.activeActive }} diff --git a/.gitignore b/.gitignore index 135dcd98..61d50340 100644 --- a/.gitignore +++ b/.gitignore @@ -101,3 +101,4 @@ provision/environment_data.json **/*.tfstate* **/*.terraform* !**/*.terraform.lock.hcl +provision/opentofu/modules/aws/accelerator/builds/* diff --git a/doc/kubernetes/collector/build.sh b/doc/kubernetes/collector/build.sh index 94f24176..50ced4ed 100755 --- a/doc/kubernetes/collector/build.sh +++ b/doc/kubernetes/collector/build.sh @@ -81,6 +81,10 @@ helm template --debug ${STARTDIR}/../../../provision/infinispan/ispn-helm \ --set metrics.histograms=false \ --set hotrodPassword="strong-password" \ --set cacheDefaults.crossSiteMode=SYNC \ + --set acceleratorDNS=a3da6a6cbd4e27b02.awsglobalaccelerator.com \ + --set alertmanager.webhook.username=keycloak \ + --set alertmanager.webhook.password=changeme \ + --set alertmanager.webhook.url=https://tjqr2vgc664b6noj6vugprakoq0oausj.lambda-url.eu-west-1.on.aws/ \ > ${BUILDDIR}/helm/ispn-site-a.yaml # Infinispan site B deployment diff --git a/doc/kubernetes/modules/ROOT/examples/stonith_lambda.py b/doc/kubernetes/modules/ROOT/examples/stonith_lambda.py new file mode 120000 index 00000000..f9d0835c --- /dev/null +++ 
b/doc/kubernetes/modules/ROOT/examples/stonith_lambda.py @@ -0,0 +1 @@ +../../../../../provision/opentofu/modules/aws/accelerator/src/stonith_lambda.py \ No newline at end of file diff --git a/doc/kubernetes/modules/ROOT/pages/prerequisite/prerequisite-openshift.adoc b/doc/kubernetes/modules/ROOT/pages/prerequisite/prerequisite-openshift.adoc index c0641c18..eeacd3d1 100644 --- a/doc/kubernetes/modules/ROOT/pages/prerequisite/prerequisite-openshift.adoc +++ b/doc/kubernetes/modules/ROOT/pages/prerequisite/prerequisite-openshift.adoc @@ -76,9 +76,10 @@ oc login https://api.****:6443 -u **** NOTE: The session will expire approximately once a day, and you'll need to re-login. -== Enable user workload monitoring +== Enable alert routing for user-defined projects + +By default, OpenShift HCP doesn't enable alert routing for user-defined projects. -By default, OpenShift doesn't monitor user workloads. Apply the following ConfigMap link:{github-files}/provision/openshift/cluster-monitoring-config.yaml[cluster-monitoring-config.yaml] which is located in the `/provision/openshift` folder to OpenShift: [source,bash] @@ -93,14 +94,11 @@ After this has been deployed, several new pods spin up in the *openshift-user-wo kubectl get pods -n openshift-user-workload-monitoring ---- -The metrics and targets are then available in the menu entry *Observe* in the OpenShift console. - -Additional steps are necessary to enable persistent volumes for the recorded metrics. +Alerts defined in a `PrometheusRule` CR are then available to view in the menu entry *Observe->Alerting* in the OpenShift console. Further reading: -* https://docs.openshift.com/container-platform/4.12/monitoring/configuring-the-monitoring-stack.html[Configure OpenShift monitoring stack] -* https://docs.openshift.com/container-platform/4.12/monitoring/enabling-monitoring-for-user-defined-projects.html[Enabling monitoring for user-defined projects] +* https://docs.openshift.com/rosa/observability/monitoring/enabling-alert-routing-for-user-defined-projects.html[Enabling alert routing for user-defined projects] [#switching-between-different-kubernetes-clusters] == Switching between different Kubernetes clusters diff --git a/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc b/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc new file mode 100644 index 00000000..dc193f11 --- /dev/null +++ b/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc @@ -0,0 +1,73 @@ += Bring Active/Active site online +:description: This guide describes how to bring an Active/Active site online so that it can process client requests. + +{description} + +== When to use this procedure + +This procedure describes how to re-add a Keycloak site to the Global Accelerator after it has previously been taken offline, +so that it can once again service client requests. + +== Procedure + +Follow these steps to re-add a Keycloak site to the AWS Global Accelerator so that it can handle client requests. + +=== Global Accelerator + +. Determine the ARN of the Network Load Balancer (NLB) associated with the site to be brought online ++ +include::partial$nlb-arn.adoc[] ++ +. 
Update the Accelerator EndpointGroup to include both sites + +include::partial$accelerator-endpoint-group.adoc[] ++ +.Output: +[source,bash] +---- +{ + "EndpointGroups": [ + { + "EndpointGroupArn": "arn:aws:globalaccelerator::606671647913:accelerator/d280fc09-3057-4ab6-9330-6cbf1f450748/listener/8769072f/endpoint-group/a30b64ec1700", + "EndpointGroupRegion": "eu-west-1", + "EndpointDescriptions": [ + { + "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a3c75f239541c4a6e9c48cf8d48d602f/5ba333e87019ccf0", + "Weight": 50, + "HealthState": "HEALTHY", + "ClientIPPreservationEnabled": false + } + ], + "TrafficDialPercentage": 100.0, + "HealthCheckPort": 443, + "HealthCheckProtocol": "TCP", + "HealthCheckIntervalSeconds": 30, + "ThresholdCount": 3 + } + ] +} +---- ++ +.. Update the EndpointGroup to include the existing Endpoint and the NLB retrieved in step 1. ++ +.Command: +[source,bash] +---- +aws globalaccelerator update-endpoint-group \ + --endpoint-group-arn arn:aws:globalaccelerator::606671647913:accelerator/d280fc09-3057-4ab6-9330-6cbf1f450748/listener/8769072f/endpoint-group/a30b64ec1700 \ + --region us-west-2 \ + --endpoint-configurations ' + [ + { + "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a3c75f239541c4a6e9c48cf8d48d602f/5ba333e87019ccf0", + "Weight": 50, + "ClientIPPreservationEnabled": false + }, + { + "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a49e56e51e16843b9a3bc686327c907b/9b786f80ed4eba3d", + "Weight": 50, + "ClientIPPreservationEnabled": false + } + ] +' +---- diff --git a/doc/kubernetes/modules/ROOT/pages/running/index.adoc b/doc/kubernetes/modules/ROOT/pages/running/index.adoc index 168df34a..b732aea3 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/index.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/index.adoc @@ -13,4 +13,11 @@ Once they had been published as part of the Keycloak 23 release, they have been * xref:running/infinispan-deployment.adoc[] * xref:running/loadbalancing.adoc[] +* xref:running/split-brain-stonith.adoc[] * xref:running/timeout_tunning.adoc[] + +[#operational-procedures] +== Operational procedures not yet published on keycloak.org + +* xref:running/take-active-site-offline.adoc[] +* xref:running/bring-active-site-online.adoc[] diff --git a/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc b/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc index dff186bd..cb367892 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc @@ -19,6 +19,10 @@ accelerator to ensure that requests are routed to Keycloak clusters deployed in AWS Global Accelerator prevents clients from erroneously caching IP addresses of failed Keycloak deployments which is possible with DNS based failover strategies. +In the event of a Keycloak site failing, the Accelerator ensures that all client requests are routed to the remaining +healthy site. If both sites are marked as unhealthy, then the Accelerator will "fail-open" and forward requests to a site +chosen at random. 
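Whether the Accelerator currently considers each site healthy can be checked from the CLI. The following is a sketch, assuming `ACCELERATOR_NAME` is set to the name given to the Global Accelerator created in the steps below; note that Global Accelerator API calls must always be issued against the us-west-2 region:

[source,bash]
----
# Resolve the Accelerator and its listener, then list each endpoint's HealthState
ACCELERATOR_ARN=$(aws globalaccelerator list-accelerators \
  --query "Accelerators[?Name=='${ACCELERATOR_NAME}'].AcceleratorArn" \
  --region us-west-2 --output text)
LISTENER_ARN=$(aws globalaccelerator list-listeners \
  --accelerator-arn ${ACCELERATOR_ARN} \
  --query "Listeners[0].ListenerArn" \
  --region us-west-2 --output text)
aws globalaccelerator list-endpoint-groups \
  --listener-arn ${LISTENER_ARN} \
  --query "EndpointGroups[0].EndpointDescriptions[*].[EndpointId,HealthState]" \
  --region us-west-2 --output table
----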
+ .AWS Global Accelerator Failover image::accelerator/accelerator-multi-az.dio.svg[] @@ -48,12 +52,13 @@ cat < metadata: name: accelerator-loadbalancer annotations: + service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags: accelerator=${ACCELERATOR_NAME},site=${CLUSTER_NAME},namespace=${NAMESPACE}# <2> service.beta.kubernetes.io/aws-load-balancer-type: "nlb" service.beta.kubernetes.io/aws-load-balancer-healthcheck-path: "/lb-check" service.beta.kubernetes.io/aws-load-balancer-healthcheck-protocol: "https" - service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval: "10"# <2> - service.beta.kubernetes.io/aws-load-balancer-healthcheck-healthy-threshold: "3"# <3> - service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold: "3"# <4> + service.beta.kubernetes.io/aws-load-balancer-healthcheck-interval: "10"# <3> + service.beta.kubernetes.io/aws-load-balancer-healthcheck-healthy-threshold: "3"# <4> + service.beta.kubernetes.io/aws-load-balancer-healthcheck-unhealthy-threshold: "3"# <5> spec: ports: - name: https @@ -69,9 +74,11 @@ cat < EOF ---- <1> `$NAMESPACE` should be replaced with the namespace of your Keycloak deployment -<2> How frequently the healthcheck probe is executed in seconds -<3> How many healthchecks must pass for the NLB to be considered healthy -<4> How many healthchecks must fail for the NLB to be considered unhealthy +<2> Add additional tags to the resources created by AWS so that we can retrieve them later. `ACCELERATOR_NAME` should be +the name of the Global Accelerator created in subsequent steps and `CLUSTER_NAME` should be the name of the current site. +<3> How frequently the healthcheck probe is executed in seconds +<4> How many healthchecks must pass for the NLB to be considered healthy +<5> How many healthchecks must fail for the NLB to be considered unhealthy + .. Take note of the DNS hostname as this will be required later: + diff --git a/doc/kubernetes/modules/ROOT/pages/running/split-brain-stonith.adoc b/doc/kubernetes/modules/ROOT/pages/running/split-brain-stonith.adoc new file mode 100644 index 00000000..ff8c5b8c --- /dev/null +++ b/doc/kubernetes/modules/ROOT/pages/running/split-brain-stonith.adoc @@ -0,0 +1,282 @@ += Deploy an AWS Lambda to guard against Split-Brain +:description: This guide explains how to reduce the impact when split-brain scenarios occur between two sites in an Active/Active deployment. + +{description} + +== Architecture +In the event of a network communication failure between the two sites in a Multi-AZ Active/Active deployment, it is no +longer possible for the two sites to replicate session state between themselves, and the two sites +will become increasingly out-of-sync. As it's possible for subsequent Keycloak requests to be routed to different +sites, this may lead to unexpected behaviour, as previous updates will not have been applied to both sites. + +Typically, in such scenarios, a quorum is used to determine which sites are marked as online or offline; however, as Active/Active +deployments consist of only two sites, this is not possible. Instead, we leverage the STONITH (Shoot The Other In The Head) +pattern to ensure that once a split-brain has been detected, only one site can continue to serve user requests. + +STONITH is implemented by using https://prometheus.io/docs/alerting/latest/overview/[Prometheus Alerts] to call an AWS +Lambda-based webhook whenever one of the sites is unable to connect to the other site. 
The triggered Lambda +function inspects the current Global Accelerator configuration and removes the site reported to be offline. + +In a true split-brain scenario, where both sites are still up but network communication is down, it's probable that both +sites will trigger the webhook. We guard against this by ensuring that only a single Lambda instance can be executed at +a given time. + +== Prerequisites + +* ROSA HCP-based Multi-AZ Keycloak deployment +* AWS CLI installed +* AWS Global Accelerator Loadbalancer + +== Procedure +. Enable OpenShift user alert routing ++ +.Command: +[source,bash] +---- +kubectl apply -f - << EOF +apiVersion: v1 +kind: ConfigMap +metadata: + name: user-workload-monitoring-config + namespace: openshift-user-workload-monitoring +data: + config.yaml: | + alertmanager: + enabled: true + enableAlertmanagerConfig: true +EOF +kubectl -n openshift-user-workload-monitoring rollout status --watch statefulset.apps/alertmanager-user-workload +---- ++ +. [[aws-secret]]Decide upon a username/password combination which will be used to authenticate the Lambda webhook and create an AWS Secret storing the password ++ +.Command: +[source,bash] +---- +aws secretsmanager create-secret \ + --name webhook-password \ # <1> + --secret-string changeme \ # <2> + --region eu-west-1 # <3> +---- +<1> The name of the secret +<2> The password to be used for authentication +<3> The AWS region that hosts the secret ++ +. Create the Role used to execute the Lambda. ++ +.Command: +[source,bash] +---- +FUNCTION_NAME= # <1> +ROLE_ARN=$(aws iam create-role \ + --role-name ${FUNCTION_NAME} \ + --assume-role-policy-document \ + '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Service": "lambda.amazonaws.com" + }, + "Action": "sts:AssumeRole" + } + ] + }' \ + --query 'Role.Arn' \ + --region eu-west-1 \#<2> + --output text +) +---- +<1> A name of your choice to associate with the Lambda and related resources +<2> The AWS Region hosting your Kubernetes clusters ++ +. Create and attach the 'LambdaSecretManager' Policy so that the Lambda can access AWS Secrets ++ +.Command: +[source,bash] +---- +POLICY_ARN=$(aws iam create-policy \ + --policy-name LambdaSecretManager \ + --policy-document \ + '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "secretsmanager:GetSecretValue" + ], + "Resource": "*" + } + ] + }' \ + --query 'Policy.Arn' \ + --output text +) +aws iam attach-role-policy \ + --role-name ${FUNCTION_NAME} \ + --policy-arn ${POLICY_ARN} +---- ++ +. Attach the `ElasticLoadBalancingReadOnly` policy so that the Lambda can query the provisioned Network Load Balancers ++ +.Command: +[source,bash] +---- +aws iam attach-role-policy \ + --role-name ${FUNCTION_NAME} \ + --policy-arn arn:aws:iam::aws:policy/ElasticLoadBalancingReadOnly +---- ++ +. Attach the `GlobalAcceleratorFullAccess` policy so that the Lambda can update the Global Accelerator EndpointGroup ++ +.Command: +[source,bash] +---- +aws iam attach-role-policy \ + --role-name ${FUNCTION_NAME} \ + --policy-arn arn:aws:iam::aws:policy/GlobalAcceleratorFullAccess +---- ++ +. 
Create a Lambda ZIP file containing the required STONITH logic ++ +.Command: +[source,bash] +---- +LAMBDA_ZIP=/tmp/lambda.zip +cat << EOF > /tmp/lambda.py + +include::example$stonith_lambda.py[tag=stonith-start] + expected_user = 'keycloak' # <1> + secret_name = 'webhook-password' # <2> + secret_region = 'eu-west-1' # <3> +include::example$stonith_lambda.py[tag=stonith-end] + +EOF +zip -FS --junk-paths ${LAMBDA_ZIP} /tmp/lambda.py +---- +<1> The username required to authenticate Lambda requests +<2> The AWS secret containing the password created in <<aws-secret>> +<3> The AWS region which stores the password secret ++ +. Create the Lambda function. ++ +.Command: +[source,bash] +---- +aws lambda create-function \ + --function-name ${FUNCTION_NAME} \ + --zip-file fileb://${LAMBDA_ZIP} \ + --handler lambda.handler \ + --runtime python3.12 \ + --role ${ROLE_ARN} \ + --region eu-west-1 #<1> +---- +<1> The AWS Region hosting your Kubernetes clusters ++ +. Expose a Function URL so the Lambda can be triggered as a webhook ++ +.Command: +[source,bash] +---- +aws lambda create-function-url-config \ + --function-name ${FUNCTION_NAME} \ + --auth-type NONE \ + --region eu-west-1 #<1> +---- +<1> The AWS Region hosting your Kubernetes clusters ++ +. Allow public invocations of the Function URL ++ +.Command: +[source,bash] +---- +aws lambda add-permission \ + --action "lambda:InvokeFunctionUrl" \ + --function-name ${FUNCTION_NAME} \ + --principal "*" \ + --statement-id FunctionURLAllowPublicAccess \ + --function-url-auth-type NONE \ + --region eu-west-1 # <1> +---- +<1> The AWS Region hosting your Kubernetes clusters ++ +. Retrieve the Lambda Function URL ++ +.Command: +[source,bash] +---- +aws lambda get-function-url-config \ + --function-name ${FUNCTION_NAME} \ + --query "FunctionUrl" \ + --region eu-west-1 \#<1> + --output text +---- +<1> The AWS region where the Lambda was created ++ +.Output: +[source,bash] +---- +https://tjqr2vgc664b6noj6vugprakoq0oausj.lambda-url.eu-west-1.on.aws +---- +. In each Kubernetes cluster, configure Prometheus Alert routing to trigger the Lambda on split-brain ++ +.Command: +[source,bash] +---- +ACCELERATOR_NAME= # The name of the Global Accelerator used by your deployment +NAMESPACE= # The namespace hosting your Infinispan CR +LOCAL_SITE= # The site defined by spec.service.sites.local.name in your Infinispan CR +REMOTE_SITE= # The site defined by spec.service.sites.locations[0].name in your Infinispan CR + +kubectl apply -n ${NAMESPACE} -f - << EOF +include::example$helm/ispn-site-a.yaml[tag=stonith-secret] +--- +include::example$helm/ispn-site-a.yaml[tag=stonith-alert-manager-config] +--- +include::example$helm/ispn-site-a.yaml[tag=stonith-prometheus-rule] +EOF +---- +<1> The username required to authenticate Lambda requests +<2> The password required to authenticate Lambda requests +<3> The Lambda Function URL +<4> The namespace value should be the namespace hosting the Infinispan CR and the site should be the remote site defined +by `spec.service.sites.locations[0].name` in your Infinispan CR +<5> The name of your local site defined by `spec.service.sites.local.name` in your Infinispan CR +<6> The DNS of your Global Accelerator + +== Verify + +To test that the Prometheus alert triggers the webhook as expected, perform the following steps to simulate a split-brain: + +. 
In each of your clusters execute the following: ++ +.Command: +[source,bash] +---- +kubectl -n openshift-operators scale --replicas=0 deployment/infinispan-operator-controller-manager #<1> +kubectl -n openshift-operators rollout status -w deployment/infinispan-operator-controller-manager +kubectl -n ${NAMESPACE} scale --replicas=0 deployment/infinispan-router #<2> +kubectl -n ${NAMESPACE} rollout status -w deployment/infinispan-router +---- +<1> Scale down the Infinispan Operator so that <2> does not result in the deployment being recreated by the operator +<2> Scale down the Gossip Router deployment. Replace `$\{NAMESPACE}` with the namespace containing your Infinispan server ++ +. Verify the `SiteOffline` alert has been fired on a cluster ++ +. Inspect the Global Accelerator EndpointGroup in the AWS console; only a single endpoint should be present ++ +. Scale up the Infinispan Operator and Gossip Router to re-establish a connection between sites: ++ +.Command: +[source,bash] +---- +kubectl -n openshift-operators scale --replicas=1 deployment/infinispan-operator-controller-manager +kubectl -n openshift-operators rollout status -w deployment/infinispan-operator-controller-manager +kubectl -n ${NAMESPACE} scale --replicas=1 deployment/infinispan-router #<1> +kubectl -n ${NAMESPACE} rollout status -w deployment/infinispan-router +---- +<1> Replace `$\{NAMESPACE}` with the namespace containing your Infinispan server ++ +. Inspect the `vendor_jgroups_site_view_status` metric in each site. A value of `1` indicates that the site is reachable. ++ +. Update the Accelerator EndpointGroup to contain both Endpoints diff --git a/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc b/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc new file mode 100644 index 00000000..224542c6 --- /dev/null +++ b/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc @@ -0,0 +1,74 @@ += Take Active/Active site offline +:description: This guide describes how to take an Active/Active site offline so that it no longer processes client requests. + +{description} + +== When to use this procedure + +During the deployment lifecycle, it might be necessary to temporarily take one of the Active/Active sites offline +for maintenance or to allow for software upgrades. + +== Procedure + +Follow these steps to remove a site from the AWS Global Accelerator so that no traffic can be routed to it. + +=== Global Accelerator + +. Determine the ARN of the Network Load Balancer (NLB) associated with the site to be kept online ++ +include::partial$nlb-arn.adoc[] ++ +. 
Update the Accelerator EndpointGroup to only include a single site ++ +include::partial$accelerator-endpoint-group.adoc[] ++ +.Output: +[source,bash] +---- +{ + "EndpointGroups": [ + { + "EndpointGroupArn": "arn:aws:globalaccelerator::606671647913:accelerator/d280fc09-3057-4ab6-9330-6cbf1f450748/listener/8769072f/endpoint-group/a30b64ec1700", + "EndpointGroupRegion": "eu-west-1", + "EndpointDescriptions": [ + { + "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a49e56e51e16843b9a3bc686327c907b/9b786f80ed4eba3d", + "Weight": 50, + "HealthState": "HEALTHY", + "ClientIPPreservationEnabled": false + }, + { + "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a3c75f239541c4a6e9c48cf8d48d602f/5ba333e87019ccf0", + "Weight": 50, + "HealthState": "HEALTHY", + "ClientIPPreservationEnabled": false + } + ], + "TrafficDialPercentage": 100.0, + "HealthCheckPort": 443, + "HealthCheckProtocol": "TCP", + "HealthCheckIntervalSeconds": 30, + "ThresholdCount": 3 + } + ] +} +---- ++ +.. Update the EndpointGroup to only include the NLB retrieved in step 1. ++ +.Command: +[source,bash] +---- +aws globalaccelerator update-endpoint-group \ + --endpoint-group-arn arn:aws:globalaccelerator::606671647913:accelerator/d280fc09-3057-4ab6-9330-6cbf1f450748/listener/8769072f/endpoint-group/a30b64ec1700 \ + --region us-west-2 \ + --endpoint-configurations ' + [ + { + "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a49e56e51e16843b9a3bc686327c907b/9b786f80ed4eba3d", + "Weight": 50, + "ClientIPPreservationEnabled": false + } + ] +' +---- diff --git a/doc/kubernetes/modules/ROOT/pages/testing/index.adoc b/doc/kubernetes/modules/ROOT/pages/testing/index.adoc index 5673b4a2..9b9c0226 100644 --- a/doc/kubernetes/modules/ROOT/pages/testing/index.adoc +++ b/doc/kubernetes/modules/ROOT/pages/testing/index.adoc @@ -23,17 +23,26 @@ From the Testsuite root directory run the below command to run the tests ---- mvn clean install -DcrossDCTests \ - -Dinfinispan.dc1.url= -Dkeycloak.dc1.url= \ - -Dinfinispan.dc2.url= -Dkeycloak.dc2.url= \ - -Dinfinispan.password= + -Dmain.password= \#<1> + -Ddeployment.type=active-active|active-passive \#<2> + -Ddeployment.namespace=runner-keycloak \#<3> + -Dkubernetes.1.context= \#<4> + -Dkubernetes.2.context= #<5> ---- +<1> The main password of the Keycloak deployment +<2> The type of deployment to be tested; either "active-active" or "active-passive". If omitted, defaults to "active-passive". +<3> The namespace containing the Keycloak deployment in the Kubernetes clusters specified in 4 & 5. +<4> The Kubeconfig context of cluster 1 +<5> The Kubeconfig context of cluster 2 -Alternatively could use the `run-crossdc-tests.sh` (located in the Testsuite root) directory to execute the tests when using a ROSA style provisioning setup to fetch the `ISPN_PASSWORD` on the fly, or by setting it manually. +Alternatively you can use the `run-crossdc-tests.sh` script (located in the Testsuite root directory) to execute the tests when +using a ROSA style provisioning setup to fetch the `MAIN_PASSWORD` on the fly. 
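The wrapper script is not reproduced here; as a rough sketch, assuming it does little more than forward the environment variables below onto the Maven properties described above (the actual script may differ):

[source,bash]
----
#!/usr/bin/env bash
# Hypothetical sketch: map the documented environment variables onto the
# cross-DC testsuite's Maven properties. ACTIVE_ACTIVE selects the deployment
# type; MAIN_PASSWORD may be fetched on the fly in a ROSA provisioning setup.
if [ "${ACTIVE_ACTIVE}" = "true" ]; then
  DEPLOYMENT_TYPE=active-active
else
  DEPLOYMENT_TYPE=active-passive
fi
mvn clean install -DcrossDCTests \
  -Dmain.password="${MAIN_PASSWORD}" \
  -Ddeployment.type="${DEPLOYMENT_TYPE}" \
  -Ddeployment.namespace="${DEPLOYMENT_NAMESPACE:-runner-keycloak}" \
  -Dkubernetes.1.context="${KUBERNETES_1_CONTEXT}" \
  -Dkubernetes.2.context="${KUBERNETES_2_CONTEXT}"
----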
Example usage: - ---- -ISPN_DC1_URL= ISPN_DC2_URL= \ -KEYCLOAK_DC1_URL= KEYCLOAK_DC2_URL= \ +ACTIVE_ACTIVE=true|false \ +DEPLOYMENT_NAMESPACE=runner-keycloak \ +KUBERNETES_1_CONTEXT= \ +KUBERNETES_2_CONTEXT= \ ./run-crossdc-tests.sh ---- diff --git a/doc/kubernetes/modules/ROOT/partials/accelerator-endpoint-group.adoc b/doc/kubernetes/modules/ROOT/partials/accelerator-endpoint-group.adoc new file mode 100644 index 00000000..8d729013 --- /dev/null +++ b/doc/kubernetes/modules/ROOT/partials/accelerator-endpoint-group.adoc @@ -0,0 +1,23 @@ +.. List the current endpoints in the Global Accelerator's EndpointGroup ++ +.Command: +[source,bash] +---- +ACCELERATOR_NAME= # <1> +ACCELERATOR_ARN=$(aws globalaccelerator list-accelerators \ + --query "Accelerators[?Name=='${ACCELERATOR_NAME}'].AcceleratorArn" \ + --region us-west-2 \ # <2> + --output text +) +LISTENER_ARN=$(aws globalaccelerator list-listeners \ + --accelerator-arn ${ACCELERATOR_ARN} \ + --query "Listeners[*].ListenerArn" \ + --region us-west-2 \ + --output text +) +aws globalaccelerator list-endpoint-groups \ + --listener-arn ${LISTENER_ARN} \ + --region us-west-2 +---- +<1> The name of the Accelerator to be updated +<2> The region must always be set to us-west-2 when querying AWS Global Accelerators diff --git a/doc/kubernetes/modules/ROOT/partials/nlb-arn.adoc b/doc/kubernetes/modules/ROOT/partials/nlb-arn.adoc new file mode 100644 index 00000000..b880612e --- /dev/null +++ b/doc/kubernetes/modules/ROOT/partials/nlb-arn.adoc @@ -0,0 +1,19 @@ +.Command: +[source,bash] +---- +NAMESPACE= # <1> +REGION= # <2> +HOSTNAME=$(kubectl -n $NAMESPACE get svc accelerator-loadbalancer --template="{{range .status.loadBalancer.ingress}}{{.hostname}}{{end}}") +aws elbv2 describe-load-balancers \ + --query "LoadBalancers[?DNSName=='${HOSTNAME}'].LoadBalancerArn" \ + --region ${REGION} \ + --output text +---- +<1> The Kubernetes namespace containing the Keycloak deployment +<2> The AWS Region hosting the Kubernetes cluster ++ +.Output: +[source,bash] +---- +arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a49e56e51e16843b9a3bc686327c907b/9b786f80ed4eba3d +---- diff --git a/provision/aws/global-accelerator/accelerator_multi_az_create.sh b/provision/aws/global-accelerator/accelerator_multi_az_create.sh index aae61e24..adfbe679 100755 --- a/provision/aws/global-accelerator/accelerator_multi_az_create.sh +++ b/provision/aws/global-accelerator/accelerator_multi_az_create.sh @@ -27,9 +27,9 @@ function waitForHostname() { function createLoadBalancer() { export CLUSTER_NAME=$1 - REGION=$2 - SVC_NAME=$3 - NAMESPACE=$4 + SVC_NAME=$2 + NAMESPACE=$3 + ACCELERATOR_NAME=$4 bash ${SCRIPT_DIR}/../rosa_oc_login.sh > /dev/null oc create namespace ${NAMESPACE} > /dev/null || true @@ -39,6 +39,7 @@ function createLoadBalancer() { metadata: name: ${SVC_NAME} annotations: + service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags: accelerator=${ACCELERATOR_NAME},site=${CLUSTER_NAME},namespace=${NAMESPACE} service.beta.kubernetes.io/aws-load-balancer-type: "nlb" service.beta.kubernetes.io/aws-load-balancer-healthcheck-path: "/lb-check" service.beta.kubernetes.io/aws-load-balancer-healthcheck-protocol: "https" @@ -59,25 +60,10 @@ function createLoadBalancer() { type: LoadBalancer EOF LB_DNS=$(waitForHostname ${SVC_NAME} ${NAMESPACE}) - LB_ARN=$(aws elbv2 describe-load-balancers \ - --query "LoadBalancers[?DNSName=='${LB_DNS}'].LoadBalancerArn" \ - --region ${REGION} \ - --output text - ) - echo ${LB_ARN} } requiredEnv ACCELERATOR_NAME 
CLUSTER_1 CLUSTER_2 KEYCLOAK_NAMESPACE -EXISTING_ACCELERATOR=$(aws globalaccelerator list-accelerators \ - --query "Accelerators[?Name=='${ACCELERATOR_NAME}'].AcceleratorArn" \ - --output text -) -if [ -n "${EXISTING_ACCELERATOR}" ]; then - echo "Global Accelerator already exists with name '${ACCELERATOR_NAME}'" - exit 1 -fi - CLUSTER_1_REGION=$(rosa describe cluster -c ${CLUSTER_1} -o json | jq -r .region.id) CLUSTER_2_REGION=$(rosa describe cluster -c ${CLUSTER_2} -o json | jq -r .region.id) @@ -86,51 +72,17 @@ if [[ "${CLUSTER_1_REGION}" != "${CLUSTER_2_REGION}" ]]; then exit 1 fi -ENDPOINT_GROUP_REGION=${CLUSTER_1_REGION} - -CLUSTER_1_ENDPOINT_ARN=$(createLoadBalancer ${CLUSTER_1} ${CLUSTER_1_REGION} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE}) -CLUSTER_2_ENDPOINT_ARN=$(createLoadBalancer ${CLUSTER_2} ${CLUSTER_2_REGION} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE}) - -ACCELERATOR=$(aws globalaccelerator create-accelerator \ - --name ${ACCELERATOR_NAME} \ - --query 'Accelerator' \ - --ip-address-type DUAL_STACK \ - --output json -) - -ACCELERATOR_ARN=$(echo ${ACCELERATOR} | jq -r .AcceleratorArn) -ACCELERATOR_DNS=$(echo ${ACCELERATOR} | jq -r .DnsName) -ACCELERATOR_DUAL_STACK_DNS=$(echo ${ACCELERATOR} | jq -r .DualStackDnsName) - -LISTENER_ARN=$(aws globalaccelerator create-listener \ - --accelerator-arn ${ACCELERATOR_ARN} \ - --port-ranges '[{"FromPort":443,"ToPort":443}]' \ - --protocol TCP \ - --query 'Listener.ListenerArn' \ - --output text -) - -ENDPOINTS=$(echo ' -[ - { - "EndpointId": "'${CLUSTER_1_ENDPOINT_ARN}'", - "Weight": 50, - "ClientIPPreservationEnabled": false - }, - { - "EndpointId": "'${CLUSTER_2_ENDPOINT_ARN}'", - "Weight": 50, - "ClientIPPreservationEnabled": false - } -]' | jq -c . -) +createLoadBalancer ${CLUSTER_1} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE} ${ACCELERATOR_NAME} +createLoadBalancer ${CLUSTER_2} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE} ${ACCELERATOR_NAME} -ENDPOINT_GROUP_ARN=$(aws globalaccelerator create-endpoint-group \ - --listener-arn ${LISTENER_ARN} \ - --traffic-dial-percentage 100 \ - --endpoint-configurations ${ENDPOINTS} \ - --endpoint-group-region ${ENDPOINT_GROUP_REGION} -) +TOFU_CMD="tofu apply -auto-approve \ + -var aws_region=${CLUSTER_1_REGION} \ + -var lb_service_name="${KEYCLOAK_NAMESPACE}/${ACCELERATOR_LB_NAME}" \ + -var name=${ACCELERATOR_NAME} \ + -var site_a=${CLUSTER_1} \ + -var site_b=${CLUSTER_2}" -echo "ACCELERATOR DNS: ${ACCELERATOR_DNS}" -echo "ACCELERATOR DUAL_STACK DNS: ${ACCELERATOR_DUAL_STACK_DNS}" +cd ${SCRIPT_DIR}/../../opentofu/modules/aws/accelerator +source ${SCRIPT_DIR}/../../opentofu/create.sh ${ACCELERATOR_NAME} "${TOFU_CMD}" +echo "ACCELERATOR DNS: $(tofu output -json | jq -r .dns_name.value)" +echo "ACCELERATOR WEBHOOK: $(tofu output -json | jq -r .webhook_url.value)" diff --git a/provision/aws/global-accelerator/accelerator_multi_az_delete.sh b/provision/aws/global-accelerator/accelerator_multi_az_delete.sh index 45df6263..0cdd344c 100755 --- a/provision/aws/global-accelerator/accelerator_multi_az_delete.sh +++ b/provision/aws/global-accelerator/accelerator_multi_az_delete.sh @@ -17,7 +17,23 @@ function deleteLoadBalancer() { oc delete -n ${NAMESPACE} svc ${SVC_NAME} || true } -requiredEnv ACCELERATOR_NAME +if [ -z "${ACCELERATOR_NAME}" ]; then + if [ -z "${ACCELERATOR_DNS}" ]; then + echo "ACCELERATOR_NAME or ACCELERATOR_DNS must be set" + exit 1 + fi + ACCELERATOR_NAME=$(aws globalaccelerator list-accelerators \ + --query "Accelerators[?ends_with(DnsName, '${ACCELERATOR_DNS}')].Name" \ + --output 
text + ) + if [ -z "${ACCELERATOR_NAME}" ]; then + echo "Unable to find Global Accelerator with DnsName '${ACCELERATOR_DNS}'" + exit 1 + fi +fi + +cd ${SCRIPT_DIR}/../../opentofu/modules/aws/accelerator +bash ${SCRIPT_DIR}/../../opentofu/destroy.sh ${ACCELERATOR_NAME} DELETE_LB=${DELETE_LB:=true} if [ "${DELETE_LB}" = true ]; then @@ -26,50 +42,3 @@ if [ "${DELETE_LB}" = true ]; then deleteLoadBalancer ${CLUSTER_1} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE} deleteLoadBalancer ${CLUSTER_2} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE} fi - -ACCELERATOR_ARN=$(aws globalaccelerator list-accelerators \ - --query "Accelerators[?Name=='${ACCELERATOR_NAME}'].AcceleratorArn" \ - --output text -) - -if [ -z "${ACCELERATOR_ARN}" ]; then - echo "${ACCELERATOR_NAME} not found" - exit 0 -fi - -aws globalaccelerator update-accelerator \ - --accelerator-arn ${ACCELERATOR_ARN} \ - --no-enabled - -LISTENER_ARN=$(aws globalaccelerator list-listeners \ - --accelerator-arn ${ACCELERATOR_ARN} \ - --query "Listeners[0].ListenerArn" \ - --output text -) - -if [[ "${LISTENER_ARN}" != "None" ]]; then - ENDPOINT_GROUP_ARN=$(aws globalaccelerator list-endpoint-groups \ - --listener-arn ${LISTENER_ARN} \ - --query 'EndpointGroups[].EndpointGroupArn' \ - --output text - ) - - if [[ -n "${ENDPOINT_GROUP_ARN}" ]]; then - aws globalaccelerator delete-endpoint-group \ - --endpoint-group-arn ${ENDPOINT_GROUP_ARN} - fi - - aws globalaccelerator delete-listener \ - --listener-arn ${LISTENER_ARN} -fi - -count=0 -until acceleratorDisabled ${ACCELERATOR_ARN} || (( count++ >= 300 )); do - sleep 1 -done - -if [ $count -gt 300 ]; then - echo "Timeout waiting for accelerator ${ACCELERATOR_ARN} to be removed" - exit 1 -fi -aws globalaccelerator delete-accelerator --accelerator-arn ${ACCELERATOR_ARN} diff --git a/provision/aws/rosa_common.sh b/provision/aws/rosa_common.sh new file mode 100755 index 00000000..68c77fa8 --- /dev/null +++ b/provision/aws/rosa_common.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +function requiredEnv() { + for ENV in $@; do + if [ -z "${!ENV}" ]; then + echo "${ENV} variable must be set" + exit 1 + fi + done +} + +# Wait for k8s resource to exist. 
See: https://github.com/kubernetes/kubernetes/issues/83242 +function waitFor() { + xtrace=$(set +o|grep xtrace); set +x + local ns=${1?namespace is required}; shift + local type=${1?type is required}; shift + + echo "Waiting for $type $*" + until oc -n "$ns" get "$type" "$@" -o=jsonpath='{.items[0].metadata.name}' >/dev/null 2>&1; do + echo "Waiting for $type $*" + sleep 1 + done + eval "$xtrace" +} diff --git a/provision/aws/rosa_create_cluster.sh b/provision/aws/rosa_create_cluster.sh index eca19efb..e881717a 100755 --- a/provision/aws/rosa_create_cluster.sh +++ b/provision/aws/rosa_create_cluster.sh @@ -9,16 +9,8 @@ if [ -f ./.env ]; then source ./.env fi -function requiredEnv() { - for ENV in $@; do - if [ -z "${!ENV}" ]; then - echo "${ENV} variable must be set" - exit 1 - fi - done -} - SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source ${SCRIPT_DIR}/rosa_common.sh AWS_ACCOUNT=${AWS_ACCOUNT:-$(aws sts get-caller-identity --query "Account" --output text)} @@ -43,10 +35,7 @@ else echo "Installing ROSA cluster ${CLUSTER_NAME}" cd ${SCRIPT_DIR}/../opentofu/modules/rosa/hcp - tofu init WORKSPACE=${CLUSTER_NAME}-${REGION} - tofu workspace new ${WORKSPACE} || true - export TF_WORKSPACE=${WORKSPACE} AVAILABILITY_ZONES=${AVAILABILITY_ZONES:-"${REGION}a"} @@ -69,9 +58,7 @@ else TOFU_CMD+=" -var replicas=${REPLICAS}" fi - echo ${TOFU_CMD} - ${TOFU_CMD} - + bash ${SCRIPT_DIR}/../opentofu/create.sh ${WORKSPACE} "${TOFU_CMD}" fi SCALING_MACHINE_POOL=$(rosa list machinepools -c "${CLUSTER_NAME}" -o json | jq -r '.[] | select(.id == "scaling") | .id') @@ -91,4 +78,9 @@ cd ${SCRIPT_DIR} ./rosa_install_openshift_logging.sh +echo "Enabling user alert routing." +oc apply -f ${SCRIPT_DIR}/../openshift/cluster-monitoring-config.yaml +waitFor openshift-user-workload-monitoring statefulset alertmanager-user-workload +oc -n openshift-user-workload-monitoring rollout status --watch --timeout=2m statefulset.apps/alertmanager-user-workload + echo "Cluster ${CLUSTER_NAME} is ready." diff --git a/provision/aws/rosa_install_openshift_logging.sh b/provision/aws/rosa_install_openshift_logging.sh index c8608d16..4d45ab42 100755 --- a/provision/aws/rosa_install_openshift_logging.sh +++ b/provision/aws/rosa_install_openshift_logging.sh @@ -6,19 +6,8 @@ if [[ "$RUNNER_DEBUG" == "1" ]]; then set -x fi -# Wait for k8s resource to exist. See: https://github.com/kubernetes/kubernetes/issues/83242 -waitFor() { - xtrace=$(set +o|grep xtrace); set +x - local ns=${1?namespace is required}; shift - local type=${1?type is required}; shift - - echo "Waiting for $type $*" - until oc -n "$ns" get "$type" "$@" -o=jsonpath='{.items[0].metadata.name}' >/dev/null 2>&1; do - echo "Waiting for $type $*" - sleep 1 - done - eval "$xtrace" -} +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +source ${SCRIPT_DIR}/rosa_common.sh echo "Installing openshift operator." 
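The shared `waitFor` helper exists because `oc wait` and `oc rollout status` fail immediately when the target resource has not been created yet (see the Kubernetes issue linked in the comment), so callers first poll until a matching resource exists. A typical usage, mirroring the sequence in `rosa_create_cluster.sh` above (a sketch; paths assume the repository root as the working directory):

[source,bash]
----
# Block until the first matching resource exists, then wait for its rollout;
# without the polling step, `oc rollout status` would fail outright if the
# StatefulSet had not been created yet.
source provision/aws/rosa_common.sh
waitFor openshift-user-workload-monitoring statefulset alertmanager-user-workload
oc -n openshift-user-workload-monitoring rollout status --watch --timeout=2m \
  statefulset.apps/alertmanager-user-workload
----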
diff --git a/provision/infinispan/Taskfile.yaml b/provision/infinispan/Taskfile.yaml index 36dc6dfb..9c816b00 100644 --- a/provision/infinispan/Taskfile.yaml +++ b/provision/infinispan/Taskfile.yaml @@ -58,3 +58,13 @@ tasks: desc: "Deletes the Infinispan CR from cross-site deployment" cmd: task: utils:{{.TASK}} + + crossdc-split: + desc: "Simulate a split-brain by taking down the GossipRouter on each site" + cmd: + task: utils:{{.TASK}} + + crossdc-heal: + desc: "Heal a simulated split-brain by recreating the GossipRouter on each site" + cmd: + task: utils:{{.TASK}} diff --git a/provision/infinispan/Utils.yaml b/provision/infinispan/Utils.yaml index 3100cb26..ae73d0ae 100644 --- a/provision/infinispan/Utils.yaml +++ b/provision/infinispan/Utils.yaml @@ -104,6 +104,10 @@ tasks: --set image={{.CROSS_DC_IMAGE}} --set fd.interval={{.CROSS_DC_FD_INTERVAL}} --set fd.timeout={{.CROSS_DC_FD_TIMEOUT}} + --set acceleratorDNS={{ .ACCELERATOR_DNS }} + --set alertmanager.webhook.url={{ .ACCELERATOR_WEBHOOK_URL }} + --set alertmanager.webhook.username={{ .ACCELERATOR_WEBHOOK_USERNAME }} + --set alertmanager.webhook.password={{ .ACCELERATOR_WEBHOOK_PASSWORD }} {{if eq .KC_PERSISTENT_SESSIONS "true"}}--values ispn-helm/persistent-session-caches.yaml{{end}} ./ispn-helm preconditions: @@ -505,3 +509,67 @@ tasks: - defer: KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n {{ .NAMESPACE }} delete Batch/take-offline || true - defer: KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n {{ .NAMESPACE }} logs job/take-offline || true - KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n {{ .NAMESPACE }} wait --for=jsonpath='{.status.phase}'=Succeeded Batch/take-offline + + crossdc-split: + internal: true + desc: "Simulate a split-brain by taking down the GossipRouter on each site" + requires: + vars: + - NAMESPACE + - ROSA_CLUSTER_NAME_1 + - ROSA_CLUSTER_NAME_2 + cmds: + - task: gossip-router-kill + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: gossip-router-kill + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + + crossdc-heal: + internal: true + desc: "Heal a simulated split-brain by recreating the GossipRouter on each site" + requires: + vars: + - NAMESPACE + - ROSA_CLUSTER_NAME_1 + - ROSA_CLUSTER_NAME_2 + cmds: + - task: gossip-router-resurrect + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: gossip-router-resurrect + vars: + NAMESPACE: "{{.NAMESPACE}}" + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + + gossip-router-kill: + internal: true + requires: + vars: + - NAMESPACE + - ROSA_CLUSTER_NAME + cmds: + - task: rosa-oc-login + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}" + - | + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n openshift-operators scale --replicas=0 deployment/infinispan-operator-controller-manager + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n openshift-operators rollout status -w deployment/infinispan-operator-controller-manager + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n ${NAMESPACE} scale --replicas=0 deployment/infinispan-router + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n ${NAMESPACE} rollout status -w deployment/infinispan-router + + gossip-router-resurrect: + internal: true + requires: + vars: + - ROSA_CLUSTER_NAME + - NAMESPACE + cmds: + - task: rosa-oc-login + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME}}" + - | + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n openshift-operators scale --replicas=1 
deployment/infinispan-operator-controller-manager + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n openshift-operators rollout status -w deployment/infinispan-operator-controller-manager + KUBECONFIG=".task/kubecfg/{{.ROSA_CLUSTER_NAME}}" kubectl -n ${NAMESPACE} rollout status -w deployment/infinispan-router diff --git a/provision/infinispan/ispn-helm/templates/infinispan-alerts.yaml b/provision/infinispan/ispn-helm/templates/infinispan-alerts.yaml new file mode 100644 index 00000000..c3ba1636 --- /dev/null +++ b/provision/infinispan/ispn-helm/templates/infinispan-alerts.yaml @@ -0,0 +1,57 @@ +{{ if .Values.acceleratorDNS }} +# tag::stonith-secret[] +apiVersion: v1 +kind: Secret +type: kubernetes.io/basic-auth +metadata: + name: webhook-credentials +stringData: + username: '{{ .Values.alertmanager.webhook.username }}' # <1> + password: '{{ .Values.alertmanager.webhook.password }}' # <2> +# end::stonith-secret[] +--- +# tag::stonith-alert-manager-config[] +apiVersion: monitoring.coreos.com/v1beta1 +kind: AlertmanagerConfig +metadata: + name: example-routing +spec: + route: + receiver: default + matchers: + - matchType: = + name: alertname + value: SiteOffline + receivers: + - name: default + webhookConfigs: + - url: '{{ .Values.alertmanager.webhook.url }}' # <3> + httpConfig: + basicAuth: + username: + key: username + name: webhook-credentials + password: + key: password + name: webhook-credentials + tlsConfig: + insecureSkipVerify: true +# end::stonith-alert-manager-config[] +--- +# tag::stonith-prometheus-rule[] +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: xsite-status +spec: + groups: + - name: xsite-status + rules: + - alert: SiteOffline + expr: 'min by (namespace, site) (vendor_jgroups_site_view_status{namespace="{{.Release.Namespace}}",site="{{ .Values.crossdc.remote.name }}"}) == 0' # <4> + labels: + severity: critical + reporter: {{ .Values.crossdc.local.name }} # <5> + accelerator: {{ .Values.acceleratorDNS }} # <6> +# end::stonith-prometheus-rule[] +{{ end }} diff --git a/provision/infinispan/ispn-helm/values.yaml b/provision/infinispan/ispn-helm/values.yaml index 0ef3aed3..cb90f3de 100644 --- a/provision/infinispan/ispn-helm/values.yaml +++ b/provision/infinispan/ispn-helm/values.yaml @@ -3,7 +3,7 @@ # Declare variables to be passed into your templates. 
replicas: 3 -defaultImage: quay.io/infinispan/server:15.0.4.Final +defaultImage: quay.io/infinispan-test/server:15.0.x cacheDefaults: owners: 2 # SYNC or ASYNC @@ -54,3 +54,9 @@ metrics: fd: interval: 2000 timeout: 15000 +acceleratorDNS: +alertmanager: + webhook: + url: '' + username: '' + password: '' diff --git a/provision/openshift/Taskfile.yaml b/provision/openshift/Taskfile.yaml index 8c7f388d..ada8897c 100644 --- a/provision/openshift/Taskfile.yaml +++ b/provision/openshift/Taskfile.yaml @@ -296,3 +296,12 @@ tasks: - .task/remote-store-host - .task/remote-store-username - .task/remote-store-password + + user-alert-routing: + deps: + - common:split + - common:env + cmds: + - kubectl apply -f cluster-monitoring-config.yaml + sources: + - cluster-monitoring-config.yaml diff --git a/provision/openshift/cluster-monitoring-config.yaml b/provision/openshift/cluster-monitoring-config.yaml index 0d768886..0fe408a0 100644 --- a/provision/openshift/cluster-monitoring-config.yaml +++ b/provision/openshift/cluster-monitoring-config.yaml @@ -1,8 +1,10 @@ apiVersion: v1 kind: ConfigMap metadata: - name: cluster-monitoring-config - namespace: openshift-monitoring + name: user-workload-monitoring-config + namespace: openshift-user-workload-monitoring data: config.yaml: | - enableUserWorkload: true + alertmanager: + enabled: true + enableAlertmanagerConfig: true diff --git a/provision/opentofu/create.sh b/provision/opentofu/create.sh new file mode 100755 index 00000000..ddd7648d --- /dev/null +++ b/provision/opentofu/create.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -e + +if [[ "$RUNNER_DEBUG" == "1" ]]; then + set -x +fi + +WORKSPACE=$1 +TOFU_CMD=$2 + +echo "Workspace: ${WORKSPACE}" +tofu init +tofu workspace new ${WORKSPACE} || echo "Workspace ${WORKSPACE} already exists" +export TF_WORKSPACE=${WORKSPACE} +echo ${TOFU_CMD} +${TOFU_CMD} diff --git a/provision/opentofu/destroy.sh b/provision/opentofu/destroy.sh index d7d9a9b9..fa8c781d 100755 --- a/provision/opentofu/destroy.sh +++ b/provision/opentofu/destroy.sh @@ -6,7 +6,7 @@ if [[ "$RUNNER_DEBUG" == "1" ]]; then fi WORKSPACE=$1 -echo ${WORKSPACE} +echo "Workspace: ${WORKSPACE}" tofu init if tofu workspace select ${WORKSPACE}; then tofu state pull diff --git a/provision/opentofu/modules/aws/accelerator/.terraform.lock.hcl b/provision/opentofu/modules/aws/accelerator/.terraform.lock.hcl new file mode 100644 index 00000000..230722e3 --- /dev/null +++ b/provision/opentofu/modules/aws/accelerator/.terraform.lock.hcl @@ -0,0 +1,74 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/hashicorp/aws" { + version = "5.48.0" + constraints = ">= 5.38.0" + hashes = [ + "h1:CzRLrszN3gtE7/fuxQcPzJM0MIzaE4pViSlOHk6QlkI=", + "zh:212b33b4270a4f20025dec83b181b0e8044ef382491e0c89ad07c64d6dfacff0", + "zh:2dd2dadd6fc8752edb6241bdac1bdd49ce64384527dc335a021d61d3870a0393", + "zh:3d449e369958ab3d0afe2db6be5de22061f8635fe176771c98af41a7f770f1b5", + "zh:3dd6ca9a102c6164683800d8b1b5def29a51d575b5223063961125de81cca136", + "zh:422586cf2ea78f8464c97b95f153acdc84b660b2eb474a100338e360593e2d84", + "zh:70ea10113b724cc69f83e2c1fd65d7d304aaf6bd9f6a45cd1622a5f36506690c", + "zh:84a48c4a7eb8498beb9f5d78bef5e58516e11a8df131042fb43d3dec62dc899b", + "zh:9724c095fb8d8d7695769a828e6cc0de95da264487c91af39a645713b293323c", + "zh:ad9117ef8c7fd8e26aab482a286aa2e641e4887d1816117caa1fd7eaff6a050c", + "zh:ff32af11624e5104fd4ddd38cecd1beb09da9a7be7f49b0d496080667882b90e", + ] +} + +provider "registry.opentofu.org/hashicorp/external" { + version = "2.3.3" + constraints = ">= 1.0.0" + hashes = [ + "h1:bDJy8Mj5PMTEuxm6Wu9A9dATBL+mQDmHx8NnLzjvCcc=", + "zh:1ec36864a1872abdfd1c53ba3c6837407564ac0d86ab80bf4fdc87b41106fe68", + "zh:2117e0edbdc88f0d22fe02fe6b2cfbbbc5d5ce40f8f58e484d8d77d64dd7340f", + "zh:4bcfdacd8e2508c16e131de9072cecd359e0ade3b8c6798a049883f37a5872ea", + "zh:4da71bc601a37bf8b7413c142d43f5f28e97e531d4836ee8624f41b9fb62e250", + "zh:55b9eebac79a46f88db5615f1ee0ac4c3f9351caa4eb8542171ef5d87de60338", + "zh:74d64afaef190321f8ddf1c4a9c6489d6cf51098704a2456c1553406e8306328", + "zh:8a357e51a0ec69872fafc64da3c6a1039277d325255ef5a264b727d83995d18b", + "zh:aacd2e6c13fe19115d51cd28a40a28da017bb48c2e18dec4460d1c37506b1495", + "zh:e19c8bdf0e059341d008a50f9138c44009e9ebb3a8047a300e6bc63ed8af8ea0", + "zh:fafa9639d8b8402e35f3864c6cfb0762ec57cc365a8f383e2acf81105b1b9eea", + ] +} + +provider "registry.opentofu.org/hashicorp/local" { + version = "2.5.1" + constraints = ">= 1.0.0" + hashes = [ + "h1:GgW5qncKu4KnXLE1ZYv5iwmhSYtTNzsOvJAOQIyFR7E=", + "zh:031c2c2070672b7e78e0aa15560839278dc57fe7cf1e58a617ac13c67b31d5fb", + "zh:1ef64ea4f8382cd538a76f3d319f405d18130dc3280f1c16d6aaa52a188ecaa4", + "zh:422ce45691b2f384dbd4596fdc8209d95cb43d85a82aaa0173089d38976d6e96", + "zh:7415fbd8da72d9363ba55dd8115837714f9534f5a9a518ec42268c2da1b9ed2f", + "zh:92aa22d071339c8ef595f18a9f9245c287266c80689f5746b26e10eaed04d542", + "zh:9cd0d99f5d3be835d6336c19c4057af6274e193e677ecf6370e5b0de12b4aafe", + "zh:a8c1525b389be5809a97f02aa7126e491ba518f97f57ed3095a3992f2134bb8f", + "zh:b336fa75f72643154b07c09b3968e417a41293358a54fe03efc0db715c5451e6", + "zh:c66529133599a419123ad2e42874afbd9aba82bd1de2b15cc68d2a1e665d4c8e", + "zh:c7568f75ba6cb7c3660b69eaab8b0e4278533bd9a7a4c33ee6590cc7e69743ea", + ] +} + +provider "registry.opentofu.org/hashicorp/null" { + version = "3.2.2" + constraints = ">= 2.0.0" + hashes = [ + "h1:xN1tSeF/rUBfaddk/AVqk4i65z/MMM9uVZWd2cWCCH0=", + "zh:00e5877d19fb1c1d8c4b3536334a46a5c86f57146fd115c7b7b4b5d2bf2de86d", + "zh:1755c2999e73e4d73f9de670c145c9a0dc5a373802799dff06a0e9c161354163", + "zh:2b29d706353bc9c4edda6a2946af3322abe94372ffb421d81fa176f1e57e33be", + "zh:34f65259c6d2bd51582b6da536e782b181b23725782b181193b965f519fbbacd", + "zh:370f6eb744475926a1fa7464d82d46ad83c2e1148b4b21681b4cec4d75b97969", + "zh:5950bdb23b4fcc6431562d7eba3dea37844aa4220c4da2eb898ae3e4d1b64ec4", + "zh:8f3d5c8d4b9d497fec36953a227f80c76d37fc8431b683a23fb1c42b9cccbf8a", + "zh:8f6eb5e65c047bf490ad3891efecefc488503b65898d4ee106f474697ba257d7", + "zh:a7040eed688316fe00379574c72bb8c47dbe2638b038bb705647cbf224de8f72", + 
"zh:e561f28df04d9e51b75f33004b7767a53c45ad96e3375d86181ba1363bffbc77", + ] +} diff --git a/provision/opentofu/modules/aws/accelerator/main.tf b/provision/opentofu/modules/aws/accelerator/main.tf new file mode 100644 index 00000000..31c3f1d6 --- /dev/null +++ b/provision/opentofu/modules/aws/accelerator/main.tf @@ -0,0 +1,100 @@ +data "aws_lb" "site_a" { + provider = aws.clusters + tags = { + "kubernetes.io/service-name" = var.lb_service_name + "site" = var.site_a + } +} + +data "aws_lb" "site_b" { + provider = aws.clusters + tags = { + "kubernetes.io/service-name" = var.lb_service_name + "site" = var.site_b + } +} + +# https://github.com/terraform-aws-modules/terraform-aws-global-accelerator +module "global_accelerator" { + source = "terraform-aws-modules/global-accelerator/aws" + + name = var.name + + listeners = { + listener_1 = { + endpoint_group = { + endpoint_group_region = var.aws_region + traffic_dial_percentage = 100 + + endpoint_configuration = [ + { + client_ip_preservation_enabled = false + endpoint_id = data.aws_lb.site_a.arn + weight = 50 + }, { + client_ip_preservation_enabled = false + endpoint_id = data.aws_lb.site_b.arn + weight = 50 + } + ] + } + port_ranges = [ + { + from_port = 443 + to_port = 443 + } + ] + protocol = "TCP" + } + } + + tags = { + accelerator = var.name + } +} + +# https://github.com/terraform-aws-modules/terraform-aws-lambda +module "lambda_function" { + source = "terraform-aws-modules/lambda/aws" + + providers = { + aws = aws.clusters + } + + function_name = var.name + handler = "stonith_lambda.handler" + runtime = "python3.12" + source_path = "src/stonith_lambda.py" + create_lambda_function_url = true + timeout = 15 + + attach_policies = true + policies = [ + "arn:aws:iam::aws:policy/ElasticLoadBalancingReadOnly", + "arn:aws:iam::aws:policy/GlobalAcceleratorFullAccess" + ] + number_of_policies = 2 + + attach_policy_jsons = true + policy_jsons = [ + <<-EOT + { + "Version": "2012-10-17", + "Statement": [ + { + "Action": [ + "secretsmanager:GetSecretValue" + ], + "Effect": "Allow", + "Resource": ["*"] + } + ] + } + EOT + ] + number_of_policy_jsons = 1 + + tags = { + accelerator = var.name + } +} diff --git a/provision/opentofu/modules/aws/accelerator/output.tf b/provision/opentofu/modules/aws/accelerator/output.tf new file mode 100644 index 00000000..cf72b3e7 --- /dev/null +++ b/provision/opentofu/modules/aws/accelerator/output.tf @@ -0,0 +1,27 @@ +output "dns_name" { + value = module.global_accelerator.dns_name +} + +output "webhook_url" { + value = module.lambda_function.lambda_function_url +} + +output "input_lb_service_name" { + value = var.lb_service_name +} + +output "input_name" { + value = var.name +} + +output "input_aws_region" { + value = var.aws_region +} + +output "input_site_a" { + value = var.site_a +} + +output "input_site_b" { + value = var.site_b +} diff --git a/provision/opentofu/modules/aws/accelerator/provider.tf b/provision/opentofu/modules/aws/accelerator/provider.tf new file mode 100644 index 00000000..926c4c0f --- /dev/null +++ b/provision/opentofu/modules/aws/accelerator/provider.tf @@ -0,0 +1,30 @@ +terraform { + backend "s3" { + bucket = "kcb-tf-state" + key = "accelerator" + region = "eu-west-1" + encrypt = true + dynamodb_table = "app-state" + } + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.38.0" + } + } + + required_version = ">= 1.4.0" +} + +provider "aws" { + # us-west-2 must be used for all global accelerator commands + region = "us-west-2" + # Force set sts_region to preventing hanging 
on invalid regions + sts_region = "us-east-1" +} + +provider "aws" { + region = var.aws_region + alias = "clusters" +} diff --git a/provision/opentofu/modules/aws/accelerator/src/stonith_lambda.py b/provision/opentofu/modules/aws/accelerator/src/stonith_lambda.py new file mode 100755 index 00000000..185f34e4 --- /dev/null +++ b/provision/opentofu/modules/aws/accelerator/src/stonith_lambda.py @@ -0,0 +1,118 @@ +# tag::stonith-start[] +import boto3 +import jmespath +import json + +from base64 import b64decode +from urllib.parse import unquote + + +# Raised when the Authorization header is not valid HTTP Basic auth +class DecodeError(Exception): + pass + + +def handle_site_offline(labels): + a_client = boto3.client('globalaccelerator', region_name='us-west-2') + + accelerator = jmespath.search("Accelerators[?DnsName=='%s']" % labels['accelerator'], a_client.list_accelerators()) + if not accelerator: + raise Exception("Unable to find Global Accelerator with DNS '%s'" % labels['accelerator']) + + accelerator_arn = accelerator[0]['AcceleratorArn'] + listener_arn = a_client.list_listeners(AcceleratorArn=accelerator_arn)['Listeners'][0]['ListenerArn'] + + endpoint_group = a_client.list_endpoint_groups(ListenerArn=listener_arn)['EndpointGroups'][0] + endpoints = endpoint_group['EndpointDescriptions'] + + # Only update accelerator endpoints if two entries exist + if len(endpoints) > 1: + # If the reporter endpoint is not healthy then do nothing for now + # A Lambda will eventually be triggered by the other offline site for this reporter + reporter = labels['reporter'] + reporter_endpoint = [e for e in endpoints if endpoint_belongs_to_site(e, reporter)][0] + if reporter_endpoint['HealthState'] == 'UNHEALTHY': + print(f"Ignoring SiteOffline alert as reporter '{reporter}' endpoint is marked UNHEALTHY") + return + + offline_site = labels['site'] + endpoints = [e for e in endpoints if not endpoint_belongs_to_site(e, offline_site)] + del reporter_endpoint['HealthState'] + a_client.update_endpoint_group( + EndpointGroupArn=endpoint_group['EndpointGroupArn'], + EndpointConfigurations=endpoints + ) + print(f"Removed site={offline_site} from Accelerator EndpointGroup") + else: + print("Ignoring SiteOffline alert as only one Endpoint is defined in the EndpointGroup") + + +def endpoint_belongs_to_site(endpoint, site): + lb_arn = endpoint['EndpointId'] + region = lb_arn.split(':')[3] + client = boto3.client('elbv2', region_name=region) + tags = client.describe_tags(ResourceArns=[lb_arn])['TagDescriptions'][0]['Tags'] + for tag in tags: + if tag['Key'] == 'site': + return tag['Value'] == site + # The load balancer carries no 'site' tag at all + return False + + +def get_secret(secret_name, region_name): + session = boto3.session.Session() + client = session.client( + service_name='secretsmanager', + region_name=region_name + ) + return client.get_secret_value(SecretId=secret_name)['SecretString'] + + +def decode_basic_auth_header(encoded_str): + split = encoded_str.strip().split(' ') + if len(split) == 2: + if split[0].strip().lower() == 'basic': + try: + username, password = b64decode(split[1]).decode().split(':', 1) + except: + raise DecodeError + else: + raise DecodeError + else: + raise DecodeError + + return unquote(username), unquote(password) + + +def handler(event, context): + print(json.dumps(event)) + + authorization = event['headers'].get('authorization') + if authorization is None: + print("'Authorization' header missing from request") + return { + "statusCode": 401 + } + +# end::stonith-start[] + expected_user = 'keycloak' + secret_name = 'keycloak-master-password' + secret_region = 'eu-central-1' +# tag::stonith-end[] + expected_pass = get_secret(secret_name,
secret_region) + username, password = decode_basic_auth_header(authorization) + # Reject the request unless both the username and the password match + if username != expected_user or password != expected_pass: + print('Invalid username/password combination') + return { + "statusCode": 403 + } + + body = event.get('body') + if body is None: + raise Exception('Empty request body') + + body = json.loads(body) + print(json.dumps(body)) + for alert in body['alerts']: + labels = alert['labels'] + if labels['alertname'] == 'SiteOffline': + handle_site_offline(labels) + + return { + "statusCode": 204 + } +# end::stonith-end[] diff --git a/provision/opentofu/modules/aws/accelerator/variables.tf b/provision/opentofu/modules/aws/accelerator/variables.tf new file mode 100644 index 00000000..7c1c5680 --- /dev/null +++ b/provision/opentofu/modules/aws/accelerator/variables.tf @@ -0,0 +1,21 @@ +variable "lb_service_name" { + type = string +} + +variable "name" { + type = string + nullable = false +} + +variable "aws_region" { + type = string + nullable = false +} + +variable "site_a" { + type = string +} + +variable "site_b" { + type = string +} diff --git a/provision/rosa-cross-dc/Taskfile.yaml b/provision/rosa-cross-dc/Taskfile.yaml index 1538684c..08e33ab4 100644 --- a/provision/rosa-cross-dc/Taskfile.yaml +++ b/provision/rosa-cross-dc/Taskfile.yaml @@ -261,7 +261,6 @@ tasks: vars: - ROSA_CLUSTER_NAME - KC_HOSTNAME_OVERRIDE - - KC_HEALTH_HOSTNAME cmds: - task: create-{{ ternary "global" "single" (eq .CROSS_DC_MODE "ASYNC") }}-peering-connection vars: @@ -319,8 +318,59 @@ preconditions: - test -f {{.ISPN_DIR}}/.task/kubecfg/{{.ROSA_CLUSTER_NAME}} + active-active: + desc: "Deploys Infinispan, Aurora DB and Keycloak in an Active/Active deployment using ROSA clusters" + deps: + - common:split + - common:env + requires: + vars: + - ROSA_CLUSTER_NAME_1 + - ROSA_CLUSTER_NAME_2 + - AURORA_CLUSTER + - ACCELERATOR_DNS + - ACCELERATOR_WEBHOOK_URL + - ACCELERATOR_WEBHOOK_USERNAME + - ACCELERATOR_WEBHOOK_PASSWORD + cmds: + - task: common:datasetprovider + - task: create-aurora + # Using the CROSS_DC_MODE value to determine which aurora task to run, ASYNC --> global, SYNC --> single + # We expect to use ASYNC mode only in a cross-region, cross-site deployment and SYNC in a single-region multi-AZ deployment.
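+ # For example, CROSS_DC_MODE=ASYNC resolves the task below to fetch-global-aurora-endpoint, while any other value (e.g. SYNC) resolves it to fetch-single-aurora-endpoint.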
+ - task: fetch-{{ ternary "global" "single" (eq .CROSS_DC_MODE "ASYNC") }}-aurora-endpoint + - task: deploy-infinispan-crossdc + vars: + OC_NAMESPACE_1: "{{.KC_ISPN_NAMESPACE}}" + OC_NAMESPACE_2: "{{.KC_ISPN_NAMESPACE}}" + - task: deploy-keycloak + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + KC_HOSTNAME_OVERRIDE: "{{.ACCELERATOR_DNS}}" + - task: deploy-keycloak + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + KC_HOSTNAME_OVERRIDE: "{{.ACCELERATOR_DNS}}" + - task: wait-cryostat + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: wait-cryostat + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + - task: wait-keycloak + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: wait-keycloak + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + - task: create-env-configmap + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: create-env-configmap + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + default: - desc: "Deploys Infinispan, Aurora DB and Keycloak in a Cross-Site deployment using ROSA clusters" + desc: "Deploys Infinispan, Aurora DB and Keycloak in an Active/Passive deployment using ROSA clusters" deps: - common:split - common:env @@ -374,7 +424,7 @@ ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" undeploy: - desc: "Undeploy Infinispan and Keycloak in a Cross-Site deployment using ROSA clusters" + desc: "Undeploy Infinispan and Keycloak in an Active/Passive deployment" deps: - common:split - common:env @@ -415,6 +465,43 @@ vars: ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + undeploy-active-active: + desc: "Undeploy Infinispan and Keycloak in an Active/Active deployment" + deps: + - common:split + - common:env + requires: + vars: + - ROSA_CLUSTER_NAME_1 + - ROSA_CLUSTER_NAME_2 + cmds: + - task: ispn:rosa-oc-login + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + - task: ispn:rosa-oc-login + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: keycloak:uninstall-keycloak + vars: + KUBECONFIG: "{{.ISPN_DIR}}/.task/kubecfg/{{.ROSA_CLUSTER_NAME_2}}" + NAMESPACE: "{{.KC_NAMESPACE_PREFIX}}keycloak" + - task: keycloak:uninstall-keycloak + vars: + KUBECONFIG: "{{.ISPN_DIR}}/.task/kubecfg/{{.ROSA_CLUSTER_NAME_1}}" + NAMESPACE: "{{.KC_NAMESPACE_PREFIX}}keycloak" + - task: uninstall-infinispan + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + - task: uninstall-infinispan + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: delete-env-configmap + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_1}}" + - task: delete-env-configmap + vars: + ROSA_CLUSTER_NAME: "{{.ROSA_CLUSTER_NAME_2}}" + helm-add-repos: internal: true cmds: @@ -595,6 +682,11 @@ - ACCELERATOR_NAME cmd: KEYCLOAK_NAMESPACE="{{.KC_NAMESPACE_PREFIX}}keycloak" ACCELERATOR_NAME={{.ACCELERATOR_NAME}} CLUSTER_1={{.ROSA_CLUSTER_NAME_1}} CLUSTER_2={{.ROSA_CLUSTER_NAME_2}} ./accelerator_multi_az_delete.sh + global-accelerator-recover: + desc: "Recover from Global Accelerator split-brain" + cmds: + - task: global-accelerator-create + route53: desc: "Creates Route53 primary/backup DNS records" dir: "{{.ROUTE53_DIR}}" diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml index 6913d4ee..3fe8f582 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml @@ -17,6 +17,7 @@ <maven.compiler.release>17</maven.compiler.release> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <aws.java.sdk.version>2.20.43</aws.java.sdk.version> + <fabric8.version>6.13.0</fabric8.version> @@ -61,6 +62,33 @@ + <dependency> + <groupId>software.amazon.awssdk</groupId> + <artifactId>globalaccelerator</artifactId> + <version>${aws.java.sdk.version}</version> + <exclusions> + <exclusion> + <groupId>software.amazon.awssdk</groupId> + <artifactId>netty-nio-client</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>software.amazon.awssdk</groupId> + <artifactId>elasticloadbalancingv2</artifactId> + <version>${aws.java.sdk.version}</version> + <exclusions> + <exclusion> + <groupId>software.amazon.awssdk</groupId> + <artifactId>netty-nio-client</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>io.fabric8</groupId> + <artifactId>openshift-client</artifactId> + <version>${fabric8.version}</version> + </dependency> @@ -76,6 +104,23 @@ <activeByDefault>true</activeByDefault> + <profile> + <id>active-active</id> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <systemPropertyVariables> + <deployment.type>active-active</deployment.type> + </systemPropertyVariables> + <excludedGroups>active-passive</excludedGroups> + </configuration> + </plugin> + </plugins> + </build> + </profile> @@ -84,20 +129,16 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> + <excludedGroups>active-active</excludedGroups> <java.util.logging.manager>org.jboss.logmanager.LogManager</java.util.logging.manager> + <deployment.type>active-passive</deployment.type> - <load-balancer.url>${LOAD_BALANCER_URL}</load-balancer.url> - <keycloak.dc1.url>${KEYCLOAK_DC1_URL}</keycloak.dc1.url> - <keycloak.dc2.url>${KEYCLOAK_DC2_URL}</keycloak.dc2.url> - <infinispan.dc1.url>${ISPN_DC1_URL}</infinispan.dc1.url> + <deployment.namespace>${DEPLOYMENT_NAMESPACE}</deployment.namespace> - <infinispan.dc2.url>${ISPN_DC2_URL}</infinispan.dc2.url> + <kubernetes.1.context>${KUBERNETES_1_CONTEXT}</kubernetes.1.context> - <infinispan.password>${ISPN_PASSWORD}</infinispan.password> + <kubernetes.2.context>${KUBERNETES_2_CONTEXT}</kubernetes.2.context> diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/run-crossdc-tests.sh b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/run-crossdc-tests.sh index 84daf6a8..a9cf9a60 100755 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/run-crossdc-tests.sh +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/run-crossdc-tests.sh @@ -11,8 +11,15 @@ SECRET_MANAGER_REGION="eu-central-1" MAIN_PASSWORD=$(aws secretsmanager get-secret-value --region $SECRET_MANAGER_REGION --secret-id $KEYCLOAK_MASTER_PASSWORD_SECRET_NAME --query SecretString --output text --no-cli-pager) -./mvnw -B -f provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml clean install -DcrossDCTests \ --Dload-balancer.url=$LOAD_BALANCER_URL \ --Dinfinispan.dc1.url=$ISPN_DC1_URL -Dkeycloak.dc1.url=$KEYCLOAK_DC1_URL \ --Dinfinispan.dc2.url=$ISPN_DC2_URL -Dkeycloak.dc2.url=$KEYCLOAK_DC2_URL \ --Dmain.password=$MAIN_PASSWORD +MVN_CMD="./mvnw -B -f provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml clean install -DcrossDCTests \ + -Ddeployment.namespace=${DEPLOYMENT_NAMESPACE} \ + -Dkubernetes.1.context=${KUBERNETES_1_CONTEXT} \ + -Dkubernetes.2.context=${KUBERNETES_2_CONTEXT} \ + -Dmain.password=${MAIN_PASSWORD}" + +if [ "${ACTIVE_ACTIVE}" == "true" ]; then + MVN_CMD+=" -Pactive-active" +fi + +echo ${MVN_CMD} +${MVN_CMD} diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/AbstractCrossDCTest.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/AbstractCrossDCTest.java index e40745cd..ea857d5f 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/AbstractCrossDCTest.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/AbstractCrossDCTest.java @@ -1,54 +1,62 @@ package org.keycloak.benchmark.crossdc; -import jakarta.ws.rs.NotFoundException; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; +import static org.keycloak.benchmark.crossdc.util.HttpClientUtils.MOCK_COOKIE_MANAGER; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.CLIENT_SESSIONS; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.DISTRIBUTED_CACHES; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; +import
java.util.concurrent.locks.LockSupport; +import java.util.function.Supplier; + import org.jboss.logging.Logger; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestInstance; import org.keycloak.admin.client.Keycloak; import org.keycloak.admin.client.resource.RealmResource; -import org.keycloak.benchmark.crossdc.client.AWSClient; import org.keycloak.benchmark.crossdc.client.DatacenterInfo; import org.keycloak.benchmark.crossdc.client.KeycloakClient; +import org.keycloak.benchmark.crossdc.junit.tags.ActiveActive; import org.keycloak.benchmark.crossdc.util.HttpClientUtils; import org.keycloak.benchmark.crossdc.util.InfinispanUtils; +import org.keycloak.benchmark.crossdc.util.PropertyUtils; import org.keycloak.representations.idm.ClientRepresentation; import org.keycloak.representations.idm.CredentialRepresentation; import org.keycloak.representations.idm.RealmRepresentation; import org.keycloak.representations.idm.UserRepresentation; -import java.io.IOException; -import java.net.URISyntaxException; -import java.net.UnknownHostException; -import java.net.http.HttpClient; -import java.util.Collections; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.keycloak.benchmark.crossdc.util.HttpClientUtils.MOCK_COOKIE_MANAGER; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.CLIENT_SESSIONS; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.DISTRIBUTED_CACHES; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; +import jakarta.ws.rs.NotFoundException; +@TestInstance(TestInstance.Lifecycle.PER_CLASS) public abstract class AbstractCrossDCTest { private static final Logger LOG = Logger.getLogger(AbstractCrossDCTest.class); - protected static HttpClient HTTP_CLIENT = HttpClientUtils.newHttpClient(); - protected static final DatacenterInfo DC_1, DC_2; - protected static final KeycloakClient LOAD_BALANCER_KEYCLOAK; + protected final DatacenterInfo DC_1, DC_2; + protected final KeycloakClient LOAD_BALANCER_KEYCLOAK; + + protected final boolean activePassive; public static String ISPN_USERNAME = System.getProperty("infinispan.username", "developer"); public static final String REALM_NAME = "cross-dc-test-realm"; public static final String CLIENTID = "cross-dc-test-client"; public static final String CLIENT_SECRET = "cross-dc-test-client-secret"; public static final String USERNAME = "cross-dc-test-user"; - public static final String MAIN_PASSWORD = System.getProperty("main.password"); - - static { - assertNotNull(MAIN_PASSWORD, "Main password must be set"); - DC_1 = new DatacenterInfo(HTTP_CLIENT, System.getProperty("keycloak.dc1.url"), System.getProperty("infinispan.dc1.url")); - DC_2 = new DatacenterInfo(HTTP_CLIENT, System.getProperty("keycloak.dc2.url"), System.getProperty("infinispan.dc2.url")); - LOAD_BALANCER_KEYCLOAK = new KeycloakClient(HTTP_CLIENT, System.getProperty("load-balancer.url")); + public static final String MAIN_PASSWORD = PropertyUtils.getRequired("main.password"); + + public AbstractCrossDCTest() { + var httpClient = HttpClientUtils.newHttpClient(); + // Active/Passive by default; the surefire 'active-active' profile sets deployment.type=active-active + this.activePassive = !System.getProperty("deployment.type", "").equals(ActiveActive.TAG); + this.DC_1 = new DatacenterInfo(httpClient, 1, activePassive); + this.DC_2 = new DatacenterInfo(httpClient, 2,
activePassive); + this.LOAD_BALANCER_KEYCLOAK = new KeycloakClient(httpClient, DC_1.getLoadbalancerURL(), activePassive); } @BeforeEach @@ -148,20 +156,19 @@ public void tearDownTestEnvironment() throws URISyntaxException, IOException, InterruptedException { }); MOCK_COOKIE_MANAGER.getCookieStore().removeAll(); - failbackHealthChecks(); + failbackLoadBalancers(); } @AfterAll - public static void tearDown() throws URISyntaxException, IOException, InterruptedException { - failbackHealthChecks(); + public void tearDown() throws URISyntaxException, IOException, InterruptedException { + failbackLoadBalancers(); + DC_1.close(); + DC_2.close(); } - private static void failbackHealthChecks() throws URISyntaxException, IOException, InterruptedException { + protected void failbackLoadBalancers() throws URISyntaxException, IOException, InterruptedException { DC_1.kc().markLBCheckUp(); DC_2.kc().markLBCheckUp(); - String domain = DC_1.getKeycloakServerURL().substring("https://".length()); - AWSClient.updateRoute53HealthCheckPath(domain, "/lb-check"); - DC_1.kc().waitToBeActive(LOAD_BALANCER_KEYCLOAK); } protected void assertCacheSize(String cache, int size) { @@ -173,4 +180,32 @@ protected void assertCacheSize(String cache, int size) { assertEquals(size, DC_1.ispn().cache(cache).size(), () -> "External cache " + cache + " in DC1 has " + DC_1.ispn().cache(cache).size() + " entries"); assertEquals(size, DC_2.ispn().cache(cache).size(), () -> "External cache " + cache + " in DC2 has " + DC_2.ispn().cache(cache).size() + " entries"); } + + protected void eventually(Supplier<String> messageSupplier, Supplier<Boolean> condition) { + eventually(messageSupplier, condition, 30, TimeUnit.SECONDS); + } + + protected void eventually(Supplier<String> messageSupplier, Supplier<Boolean> condition, long timeout, TimeUnit timeUnit) { + try { + long timeoutNanos = timeUnit.toNanos(timeout); + // We want the sleep time to increase in arithmetic progression + // 30 loops with the default timeout of 30 seconds means the initial wait is ~ 65 millis + int loops = 30; + int progressionSum = loops * (loops + 1) / 2; + long initialSleepNanos = timeoutNanos / progressionSum; + long sleepNanos = initialSleepNanos; + long expectedEndTime = System.nanoTime() + timeoutNanos; + while (expectedEndTime - System.nanoTime() > 0) { + if (condition.get()) + return; + LockSupport.parkNanos(sleepNanos); + sleepNanos += initialSleepNanos; + } + if (!condition.get()) { + fail(messageSupplier.get()); + } + } catch (Exception e) { + throw new RuntimeException("Unexpected Exception during eventually!", e); + } + } } diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/EntityReplicationTest.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/EntityReplicationTest.java index db824fad..9f23bdde 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/EntityReplicationTest.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/EntityReplicationTest.java @@ -1,13 +1,13 @@ package org.keycloak.benchmark.crossdc; -import org.junit.jupiter.api.Test; -import org.keycloak.admin.client.resource.UsersResource; -import org.keycloak.representations.idm.UserRepresentation; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.keycloak.benchmark.crossdc.util.KeycloakUtils.getCreatedId; +import
org.junit.jupiter.api.Test; +import org.keycloak.admin.client.resource.UsersResource; +import org.keycloak.representations.idm.UserRepresentation; + public class EntityReplicationTest extends AbstractCrossDCTest { @Test diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/FailoverTest.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/FailoverTest.java index 4adadd30..97176f7b 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/FailoverTest.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/FailoverTest.java @@ -1,18 +1,48 @@ package org.keycloak.benchmark.crossdc; -import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; import java.io.IOException; import java.net.URISyntaxException; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; +import org.junit.jupiter.api.Test; +import org.keycloak.benchmark.crossdc.client.AWSClient; +import org.keycloak.benchmark.crossdc.client.DatacenterInfo; +import org.keycloak.benchmark.crossdc.junit.tags.ActiveActive; +import org.keycloak.benchmark.crossdc.junit.tags.ActivePassive; + +import io.fabric8.kubernetes.client.KubernetesClient; public class FailoverTest extends AbstractCrossDCTest { + static final String OPERATORS_NS = "openshift-operators"; + + @Override + protected void failbackLoadBalancers() throws URISyntaxException, IOException, InterruptedException { + if (activePassive) { + String domain = DC_1.getKeycloakServerURL().substring("https://".length()); + AWSClient.updateRoute53HealthCheckPath(domain, "/lb-check"); + DC_1.kc().waitToBeActive(LOAD_BALANCER_KEYCLOAK); + } else { + // Heal split-brain if previously initiated + scaleUpGossipRouter(DC_1); + scaleUpGossipRouter(DC_2); + // Wait for JGroups site view to contain both sites + AWSClient.acceleratorFallback(LOAD_BALANCER_KEYCLOAK.getKeycloakServerUrl()); + // Assert that both sites are part of the Accelerator EndpointGroup + waitForAcceleratorEndpointCount(2); + } + super.failbackLoadBalancers(); + } + @Test + @ActivePassive public void logoutUserWithFailoverTest() throws IOException, URISyntaxException, InterruptedException { // Login and exchange code in DC1 String code = LOAD_BALANCER_KEYCLOAK.usernamePasswordLogin( REALM_NAME, USERNAME, MAIN_PASSWORD, CLIENTID); @@ -31,4 +61,61 @@ public void logoutUserWithFailoverTest() throws IOException, URISyntaxException, LOAD_BALANCER_KEYCLOAK.refreshToken(REALM_NAME, (String) tokensMap.get("refresh_token"), CLIENTID, CLIENT_SECRET, 400); } + + @Test + @ActiveActive + public void ensureAcceleratorUpdatedOnSplitBrainTest() throws Exception { + // Assert that both sites are part of the Accelerator EndpointGroup + assertEquals(2, AWSClient.getAcceleratorEndpoints(DC_1.getLoadbalancerURL()).size()); + + // Trigger a split-brain by scaling down the GossipRouter in both sites + scaleDownGossipRouter(DC_1); + scaleDownGossipRouter(DC_2); + + // Wait for both sites to detect split-brain + waitForSitesViewCount(1); + + // Assert that the 
AWS Lambda was executed and that only one site LB remains in the Accelerator EndpointGroup + waitForAcceleratorEndpointCount(1); + } + + private void waitForAcceleratorEndpointCount(int count) { + eventually( + () -> String.format("Expected the Accelerator EndpointGroup size to be %d", count), + () -> AWSClient.getAcceleratorEndpoints(DC_1.getLoadbalancerURL()).size() == count, + 2, TimeUnit.MINUTES + ); + } + + private void waitForSitesViewCount(int count) { + Supplier<String> msg = () -> "Timed out waiting for cross-site view to reform"; + eventually(msg, () -> DC_1.ispn().getSiteView().size() == count); + eventually(msg, () -> DC_2.ispn().getSiteView().size() == count); + } + + private void scaleDownGossipRouter(DatacenterInfo datacenter) throws InterruptedException { + var oc = datacenter.oc(); + scaleDeployment(oc, "infinispan-operator-controller-manager", OPERATORS_NS, 0); + scaleDeployment(oc, "infinispan-router", datacenter.namespace(), 0); + } + + private void scaleUpGossipRouter(DatacenterInfo datacenter) throws InterruptedException { + var oc = datacenter.oc(); + scaleDeployment(oc, "infinispan-operator-controller-manager", OPERATORS_NS, 1); + scaleDeployment(oc, "infinispan-router", datacenter.namespace(), 1); + } + + private void scaleDeployment(KubernetesClient k8s, String name, String namespace, int replicas) throws InterruptedException { + k8s.apps() + .deployments() + .inNamespace(namespace) + .withName(name) + .scale(replicas); + + k8s.apps() + .deployments() + .inNamespace(namespace) + .withName(name) + .waitUntilReady(30, TimeUnit.SECONDS); + } } diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/LoginLogoutTest.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/LoginLogoutTest.java index 5c5c0f07..cf9f9f84 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/LoginLogoutTest.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/LoginLogoutTest.java @@ -1,19 +1,19 @@ package org.keycloak.benchmark.crossdc; -import org.jboss.logging.Logger; -import org.junit.jupiter.api.Test; -import org.keycloak.benchmark.crossdc.util.InfinispanUtils; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.CLIENT_SESSIONS; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; import java.io.IOException; import java.net.URISyntaxException; import java.net.http.HttpResponse; import java.util.Map; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.CLIENT_SESSIONS; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; +import org.jboss.logging.Logger; +import org.junit.jupiter.api.Test; +import org.keycloak.benchmark.crossdc.util.InfinispanUtils; public class LoginLogoutTest extends AbstractCrossDCTest { diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/SessionExpirationTest.java
b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/SessionExpirationTest.java index 1178848e..fe1e514c 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/SessionExpirationTest.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/SessionExpirationTest.java @@ -1,15 +1,15 @@ package org.keycloak.benchmark.crossdc; -import org.junit.jupiter.api.Test; -import org.keycloak.representations.idm.RealmRepresentation; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.CLIENT_SESSIONS; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; import java.io.IOException; import java.net.URISyntaxException; import java.util.Map; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.CLIENT_SESSIONS; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS; +import org.junit.jupiter.api.Test; +import org.keycloak.representations.idm.RealmRepresentation; public class SessionExpirationTest extends AbstractCrossDCTest { diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/AWSClient.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/AWSClient.java index 993fec2f..dc012fba 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/AWSClient.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/AWSClient.java @@ -1,9 +1,10 @@ package org.keycloak.benchmark.crossdc.client; import java.time.Duration; +import java.util.List; +import java.util.function.BiFunction; import org.jboss.logging.Logger; -import org.keycloak.benchmark.crossdc.AbstractCrossDCTest; import software.amazon.awssdk.core.waiters.WaiterOverrideConfiguration; import software.amazon.awssdk.http.SdkHttpClient; @@ -12,6 +13,16 @@ import software.amazon.awssdk.services.cloudwatch.CloudWatchClient; import software.amazon.awssdk.services.cloudwatch.model.DescribeAlarmsRequest; import software.amazon.awssdk.services.cloudwatch.model.StateValue; +import software.amazon.awssdk.services.elasticloadbalancingv2.ElasticLoadBalancingV2Client; +import software.amazon.awssdk.services.elasticloadbalancingv2.model.LoadBalancer; +import software.amazon.awssdk.services.elasticloadbalancingv2.model.LoadBalancerTypeEnum; +import software.amazon.awssdk.services.elasticloadbalancingv2.model.Tag; +import software.amazon.awssdk.services.elasticloadbalancingv2.model.TagDescription; +import software.amazon.awssdk.services.globalaccelerator.GlobalAcceleratorClient; +import software.amazon.awssdk.services.globalaccelerator.model.Accelerator; +import software.amazon.awssdk.services.globalaccelerator.model.EndpointConfiguration; +import software.amazon.awssdk.services.globalaccelerator.model.EndpointDescription; +import software.amazon.awssdk.services.globalaccelerator.model.EndpointGroup; import software.amazon.awssdk.services.route53.Route53Client; import software.amazon.awssdk.services.route53.model.HealthCheck; import software.amazon.awssdk.services.route53.model.UpdateHealthCheckRequest; @@ -54,4 +65,107 @@ public static void 
updateRoute53HealthCheckPath(String domainName, String path) ); } } + + public static void acceleratorFallback(String acceleratorDns) { + acceleratorClient((httpClient, gaClient) -> { + // Retrieve Accelerator instance based upon DNS + Accelerator accelerator = gaClient.listAccelerators().accelerators() + .stream() + .filter(a -> acceleratorDns.contains(a.dnsName())) + .findFirst() + .orElseThrow(); + + var endpointGroup = getEndpointGroup(gaClient, accelerator); + var region = endpointGroup.endpointGroupRegion(); + + List<String> endpoints; + try (ElasticLoadBalancingV2Client elbClient = + ElasticLoadBalancingV2Client.builder() + .region(Region.of(region)) + .httpClient(httpClient) + .build() + ) { + // Get all LBs associated with the Accelerator + var elbs = elbClient.describeLoadBalancers() + .loadBalancers() + .stream() + .filter(lb -> lb.type() == LoadBalancerTypeEnum.NETWORK) + .map(LoadBalancer::loadBalancerArn) + .toList(); + + endpoints = elbClient.describeTags(b -> b.resourceArns(elbs)) + .tagDescriptions() + .stream() + .filter(td -> td.tags() + .contains( + Tag.builder() + .key("accelerator") + .value(accelerator.name()) + .build() + ) + ) + .map(TagDescription::resourceArn) + .toList(); + } + + var endpointConfigs = endpoints.stream() + .map(elb -> EndpointConfiguration.builder() + .clientIPPreservationEnabled(false) + .endpointId(elb) + .weight(50) + .build() + ).toList(); + + // Add all LBs to the Accelerator EndpointGroup + return gaClient.updateEndpointGroup( + g -> g.endpointGroupArn(endpointGroup.endpointGroupArn()) + .endpointConfigurations(endpointConfigs) + ); + }); + } + + private static EndpointGroup getEndpointGroup(GlobalAcceleratorClient gaClient, Accelerator accelerator) { + var listenerArn = gaClient.listListeners(b -> b.acceleratorArn(accelerator.acceleratorArn())) + .listeners() + .stream() + .findFirst() + .orElseThrow() + .listenerArn(); + + return gaClient.listEndpointGroups(b -> b.listenerArn(listenerArn)) + .endpointGroups() + .stream() + .findFirst() + .orElseThrow(); + } + + public static List<String> getAcceleratorEndpoints(String acceleratorDns) { + return acceleratorClient((httpClient, gaClient) -> { + // Retrieve Accelerator instance based upon DNS + Accelerator accelerator = gaClient.listAccelerators().accelerators() + .stream() + .filter(a -> acceleratorDns.contains(a.dnsName())) + .findFirst() + .orElseThrow(); + + return getEndpointGroup(gaClient, accelerator) + .endpointDescriptions() + .stream() + .map(EndpointDescription::endpointId) + .toList(); + } + ); + } + + private static <T> T acceleratorClient(BiFunction<SdkHttpClient, GlobalAcceleratorClient, T> fn) { + try ( + SdkHttpClient httpClient = ApacheHttpClient.builder().build(); + GlobalAcceleratorClient gaClient = GlobalAcceleratorClient.builder() + .region(Region.US_WEST_2) + .httpClient(httpClient) + .build() + ) { + return fn.apply(httpClient, gaClient); + } + } } diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/DatacenterInfo.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/DatacenterInfo.java index f4cc623a..91fc56ca 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/DatacenterInfo.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/DatacenterInfo.java @@ -1,24 +1,78 @@ package org.keycloak.benchmark.crossdc.client; +import java.net.http.HttpClient; + import
org.keycloak.benchmark.crossdc.AbstractCrossDCTest; +import org.keycloak.benchmark.crossdc.util.PropertyUtils; -import java.net.http.HttpClient; +import io.fabric8.kubernetes.client.Config; +import io.fabric8.kubernetes.client.KubernetesClientBuilder; +import io.fabric8.openshift.client.OpenShiftClient; -public class DatacenterInfo { +public class DatacenterInfo implements AutoCloseable { + private final String namespace; private final String keycloakServerURL; private final String infinispanServerURL; + private final String loadbalancerURL; private final KeycloakClient keycloak; private final ExternalInfinispanClient infinispan; + private final OpenShiftClient oc; + + public DatacenterInfo(HttpClient httpClient, int index, boolean activePassive) { + oc = new KubernetesClientBuilder() + .withConfig( + Config.autoConfigure(PropertyUtils.getRequired(String.format("kubernetes.%d.context", index))) + ) + .build() + .adapt(OpenShiftClient.class); - public DatacenterInfo(HttpClient httpClient, String keycloakServerURL, String infinispanServerURL) { - this.keycloak = new KeycloakClient(httpClient, keycloakServerURL); + this.namespace = PropertyUtils.getRequired("deployment.namespace"); + this.infinispanServerURL = getRouteHost("infinispan-service-external"); + + if (activePassive) { + this.keycloakServerURL = "https://" + oc.routes() + .inNamespace(namespace) + .withName("aws-health-route") + .item() + .getSpec() + .getHost(); + } else { + this.keycloakServerURL = "https://" + oc.services() + .inNamespace(namespace) + .withName("accelerator-loadbalancer") + .get() + .getStatus() + .getLoadBalancer() + .getIngress() + .get(0) + .getHostname(); + } + this.loadbalancerURL = getRouteHost("keycloak"); + + this.keycloak = new KeycloakClient(httpClient, keycloakServerURL, activePassive); this.infinispan = new ExternalInfinispanClient(httpClient, infinispanServerURL, AbstractCrossDCTest.ISPN_USERNAME, AbstractCrossDCTest.MAIN_PASSWORD, keycloakServerURL); + } + + private String getRouteHost(String app) { + return "https://" + oc.routes() + .inNamespace(namespace) + .withLabel("app", app) + .list() + .getItems() + .stream() + .findFirst() + .orElseThrow() + .getSpec() + .getHost(); + } - this.keycloakServerURL = keycloakServerURL; - this.infinispanServerURL = infinispanServerURL; + @Override + public void close() { + this.oc.close(); } + public String getKeycloakServerURL() { return keycloakServerURL; } @@ -27,6 +81,10 @@ public String getInfinispanServerURL() { return infinispanServerURL; } + public String getLoadbalancerURL() { + return loadbalancerURL; + } + public KeycloakClient kc() { return keycloak; } @@ -34,4 +92,12 @@ public KeycloakClient kc() { public ExternalInfinispanClient ispn() { return infinispan; } + + public OpenShiftClient oc() { + return oc; + } + + public String namespace() { + return namespace; + } } diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/ExternalInfinispanClient.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/ExternalInfinispanClient.java index b5581618..9c553508 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/ExternalInfinispanClient.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/ExternalInfinispanClient.java @@ -1,8 +1,11 @@ package org.keycloak.benchmark.crossdc.client; -import 
org.apache.http.client.utils.URIBuilder; -import org.keycloak.benchmark.crossdc.util.InfinispanUtils; -import org.keycloak.util.JsonSerialization; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.keycloak.benchmark.crossdc.AbstractCrossDCTest.ISPN_USERNAME; +import static org.keycloak.benchmark.crossdc.AbstractCrossDCTest.MAIN_PASSWORD; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.getBasicAuthenticationHeader; +import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.getNestedValue; import java.io.IOException; import java.net.URI; @@ -11,18 +14,16 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Objects; import java.util.Set; import java.util.regex.Pattern; import java.util.stream.Collectors; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.keycloak.benchmark.crossdc.AbstractCrossDCTest.ISPN_USERNAME; -import static org.keycloak.benchmark.crossdc.AbstractCrossDCTest.MAIN_PASSWORD; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.getBasicAuthenticationHeader; -import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.getNestedValue; +import org.apache.http.client.utils.URIBuilder; +import org.keycloak.benchmark.crossdc.util.InfinispanUtils; +import org.keycloak.util.JsonSerialization; public class ExternalInfinispanClient implements InfinispanClient { private final HttpClient httpClient; @@ -42,7 +43,7 @@ public ExternalInfinispanClient(HttpClient httpClient, String infinispanUrl, String username, String password, String keycloakServerURL) { this.password = password; this.keycloakServerURL = keycloakServerURL; - HttpResponse<String> stringHttpResponse = sendRequestWithAction(infinispanUrl + "/rest/v2/cache-managers/default", "GET", null); + HttpResponse<String> stringHttpResponse = sendRequestWithAction(infinispanUrl + "/rest/v2/cache-managers/default", "GET"); assertEquals(200, stringHttpResponse.statusCode()); Map<String, Object> returnedValues; @@ -213,7 +214,7 @@ public void bringOnline(String backupSiteName) { @Override public boolean isBackupOnline(String backupSiteName) throws IOException { - String response = sendRequestWithAction(infinispanUrl + "/rest/v2/caches/" + cacheName + "/x-site/backups/", "GET", null).body(); + String response = sendRequestWithAction(infinispanUrl + "/rest/v2/caches/" + cacheName + "/x-site/backups/", "GET").body(); Map<String, Object> returnedValues = JsonSerialization.readValue(response, Map.class); String status = getNestedValue(returnedValues, backupSiteName, "status"); @@ -221,6 +222,10 @@ public boolean isBackupOnline(String backupSiteName) throws IOException { } } + private HttpResponse<String> sendRequestWithAction(String url, String method) { + return sendRequestWithAction(url, method, null); + } + private HttpResponse<String> sendRequestWithAction(String url, String method, String action) { URI uri = null; try { @@ -254,4 +259,16 @@ private HttpResponse<String> sendRequestWithAction(String url, String method, String action) { public ExternalCache cache(String name) { return new ExternalCache(name); } + + @SuppressWarnings("unchecked") + public List<String> getSiteView() { + HttpResponse<String> response = sendRequestWithAction(infinispanUrl + "/rest/v2/container", "GET"); + assertEquals(200, response.statusCode()); + try { + Map<String, Object> info = JsonSerialization.readValue(response.body(), Map.class); + return (List<String>) info.get("sites_view"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } } diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/KeycloakClient.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/KeycloakClient.java index 49ae4c53..89bc74a3 100644 --- a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/KeycloakClient.java +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/client/KeycloakClient.java @@ -39,14 +39,17 @@ public class KeycloakClient { private final String keycloakServerUrl; private final String keycloakDownURL; private final String keycloakUpURL; + + private final boolean activePassive; private static final Map<String, Keycloak> adminClients = new ConcurrentHashMap<>(); private static final Logger LOG = Logger.getLogger(KeycloakClient.class); - public KeycloakClient(HttpClient httpClient, String keycloakServerUrl) { + public KeycloakClient(HttpClient httpClient, String keycloakServerUrl, boolean activePassive) { assertNotNull(keycloakServerUrl, "Keycloak server URL must not be null."); this.httpClient = httpClient; this.keycloakServerUrl = keycloakServerUrl; + this.activePassive = activePassive; this.keycloakDownURL = keycloakServerUrl + "/realms/master/dataset/take-dc-down"; this.keycloakUpURL = keycloakServerUrl + "/realms/master/dataset/take-dc-up"; @@ -223,7 +226,10 @@ public void markLBCheckUp() throws URISyntaxException, IOException, InterruptedException { HttpClientUtils.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString()).body(); } + public boolean isActive(KeycloakClient loadBalancer) throws UnknownHostException { + if (!activePassive) return true; + String loadBalancerHost = URIToHostString(loadBalancer.getKeycloakServerUrl()); String thisKeycloakHost = URIToHostString(getKeycloakServerUrl()); @@ -231,6 +237,8 @@ } public void waitToBeActive(KeycloakClient loadBalancer) throws UnknownHostException, InterruptedException { + if (!activePassive) return; + int startTime = Time.currentTime(); int timeLimit = startTime + 600; // 10 minutes LOG.infof("Waiting for Keycloak %s to be active.", keycloakServerUrl); diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/junit/tags/ActiveActive.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/junit/tags/ActiveActive.java new file mode 100644 index 00000000..398724c9 --- /dev/null +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/junit/tags/ActiveActive.java @@ -0,0 +1,17 @@ +package org.keycloak.benchmark.crossdc.junit.tags; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +import org.junit.jupiter.api.Tag; + +@Target({ ElementType.TYPE, ElementType.METHOD }) +@Retention(RetentionPolicy.RUNTIME) +@Documented +@Tag(ActiveActive.TAG) +public @interface ActiveActive { + String TAG = "active-active"; +} diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/junit/tags/ActivePassive.java
b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/junit/tags/ActivePassive.java new file mode 100644 index 00000000..3d4628a1 --- /dev/null +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/junit/tags/ActivePassive.java @@ -0,0 +1,17 @@ +package org.keycloak.benchmark.crossdc.junit.tags; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +import org.junit.jupiter.api.Tag; + +@Target({ ElementType.TYPE, ElementType.METHOD }) +@Retention(RetentionPolicy.RUNTIME) +@Documented +@Tag(ActivePassive.TAG) +public @interface ActivePassive { + String TAG = "active-passive"; +} diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/util/PropertyUtils.java b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/util/PropertyUtils.java new file mode 100644 index 00000000..4e74dd00 --- /dev/null +++ b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/util/PropertyUtils.java @@ -0,0 +1,11 @@ +package org.keycloak.benchmark.crossdc.util; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class PropertyUtils { + public static String getRequired(String property) { + var prop = System.getProperty(property); + assertNotNull(prop, String.format("Property '%s' must be set", property)); + return prop; + } +}
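Note: the STONITH Lambda above reads only a handful of fields from the Alertmanager webhook request. Below is a minimal sketch of that event shape, useful for exercising handler() locally; the password and the accelerator DNS name are placeholders, not values taken from this change, and only the fields the handler actually consumes are included:

import json
from base64 import b64encode

# decode_basic_auth_header() expects "Basic base64(user:pass)";
# 'keycloak' matches the expected_user hard-coded in the handler
token = b64encode(b"keycloak:placeholder-password").decode()

event = {
    "headers": {"authorization": f"Basic {token}"},
    "body": json.dumps({
        "alerts": [{
            "labels": {
                "alertname": "SiteOffline",  # any other alertname is ignored
                "reporter": "site-a",        # site that raised the alert
                "site": "site-b",            # site to drop from the EndpointGroup
                "accelerator": "example.awsglobalaccelerator.com"  # Accelerator DNS name
            }
        }]
    })
}

# handler(event, None) returns 204 after pruning the offline site's endpoint,
# 401 if the Authorization header is missing, or 403 on a credential mismatch.
# A real invocation still needs AWS credentials for Secrets Manager, Global
# Accelerator and ELB lookups.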