Active/Active XSite fencing. Resolves keycloak/keycloak#29303
- User alert routing enabled on ROSA clusters

- PrometheusRule used to trigger an AWS Lambda webhook in the event of a
  split-brain, so that only a single site remains in the Global Accelerator
  endpoints (a sketch of the wiring follows this list)

- Global Accelerator scripts refactored to use OpenTofu when creating
  AWS resources

- Task created to deploy/undeploy Active/Active

- Task created to simulate split-brain scenarios

- 'active-active' flag added to GH actions to differentiate between
  active/passive and active/active deployments
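
A minimal sketch of how such fencing can be wired up with user-workload
monitoring: a PrometheusRule raises an alert when the remote site drops out
of the cross-site view, and an AlertmanagerConfig routes that alert to the
Lambda function URL. The resource names, metric, expression, and URL below
are illustrative assumptions, not the exact resources shipped in this commit.

[source,yaml]
----
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: split-brain-fencing          # hypothetical name
  namespace: runner-keycloak
spec:
  groups:
    - name: xsite
      rules:
        - alert: SiteOffline
          # Hypothetical expression: the remote site has been missing from
          # the JGroups cross-site view for one minute.
          expr: 'vendor_jgroups_site_view_status{site="site-b"} == 0'
          for: 1m
---
apiVersion: monitoring.coreos.com/v1beta1
kind: AlertmanagerConfig
metadata:
  name: split-brain-webhook          # hypothetical name
  namespace: runner-keycloak
spec:
  route:
    receiver: lambda
    matchers:
      - name: alertname
        value: SiteOffline
  receivers:
    - name: lambda
      webhookConfigs:
        # Placeholder for the ACCELERATOR_WEBHOOK Lambda function URL.
        - url: https://example.lambda-url.eu-west-1.on.aws/
----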

Signed-off-by: Ryan Emerson <remerson@redhat.com>
ryanemerson committed May 21, 2024
1 parent 2403d36 commit 587ceda
Showing 24 changed files with 682 additions and 162 deletions.
63 changes: 58 additions & 5 deletions .github/workflows/rosa-multi-az-cluster-create.yml
@@ -16,6 +16,10 @@ on:
keycloakRepository:
description: 'The repository to deploy Keycloak from. If not set nightly image is used'
type: string
activeActive:
description: 'When true deploy an Active/Active Keycloak deployment'
type: boolean
default: false
enablePersistentSessions:
description: 'To enable Persistent user and client sessions to the DB'
type: boolean
@@ -32,16 +36,20 @@ on:
description: 'The AWS region to create both clusters in. Defaults to "vars.AWS_DEFAULT_REGION" if omitted.'
type: string
createCluster:
description: 'Check to Create Cluster'
description: 'Check to Create Cluster.'
type: boolean
default: true
keycloakRepository:
description: 'The repository to deploy Keycloak from. If not set nightly image is used'
type: string
activeActive:
description: 'When true deploy an Active/Active Keycloak deployment'
type: boolean
default: false
enablePersistentSessions:
description: 'To enable Persistent user and client sessions to the DB'
type: boolean
default: false
keycloakRepository:
description: 'The repository to deploy Keycloak from. If not set nightly image is used'
type: string
keycloakBranch:
description: 'The branch to deploy Keycloak from. If not set nightly image is used'
type: string
@@ -109,6 +117,11 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4

- name: Setup OpenTofu
uses: opentofu/setup-opentofu@v1
with:
tofu_wrapper: false

- name: Setup ROSA CLI
uses: ./.github/actions/rosa-cli-setup
with:
@@ -140,6 +153,7 @@
ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b

- name: Create Route53 Loadbalancer
if: ${{ !inputs.activeActive }}
working-directory: provision/rosa-cross-dc
run: |
task route53 > route53
@@ -150,10 +164,49 @@
ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a
ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b

- name: Deploy
- name: Deploy Active/Passive
if: ${{ !inputs.activeActive }}
working-directory: provision/rosa-cross-dc
run: task
env:
AURORA_CLUSTER: ${{ env.CLUSTER_PREFIX }}
AURORA_REGION: ${{ env.REGION }}
ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a
ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b
KC_ACTIVE_ACTIVE: ${{ inputs.activeActive }}
KC_CPU_REQUESTS: 6
KC_INSTANCES: 3
KC_DISABLE_STICKY_SESSION: true
KC_PERSISTENT_SESSIONS: ${{ env.KC_PERSISTENT_SESSIONS }}
KC_MEMORY_REQUESTS_MB: 3000
KC_MEMORY_LIMITS_MB: 4000
KC_DB_POOL_INITIAL_SIZE: 30
KC_DB_POOL_MAX_SIZE: 30
KC_DB_POOL_MIN_SIZE: 30
KC_DATABASE: "aurora-postgres"
MULTI_AZ: "true"
KC_REPOSITORY: ${{ inputs.keycloakRepository }}
KC_BRANCH: ${{ inputs.keycloakBranch }}

- name: Create Accelerator Loadbalancer
if: ${{ inputs.activeActive }}
working-directory: provision/rosa-cross-dc
run: |
task global-accelerator-create 2>&1 | tee accelerator
echo "ACCELERATOR_DNS=$(grep -Po 'ACCELERATOR DNS: \K.*' accelerator)" >> $GITHUB_ENV
echo "ACCELERATOR_WEBHOOK=$(grep -Po 'ACCELERATOR WEBHOOK: \K.*' accelerator)" >> $GITHUB_ENV
env:
ACCELERATOR_NAME: ${{ env.CLUSTER_PREFIX }}
ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a
ROSA_CLUSTER_NAME_2: ${{ env.CLUSTER_PREFIX }}-b

- name: Deploy Active/Active
if: ${{ inputs.activeActive }}
working-directory: provision/rosa-cross-dc
run: task active-active
env:
ACCELERATOR_DNS: ${{ env.ACCELERATOR_DNS }}
ACCELERATOR_WEBHOOK_URL: ${{ env.ACCELERATOR_WEBHOOK }}
AURORA_CLUSTER: ${{ env.CLUSTER_PREFIX }}
AURORA_REGION: ${{ env.REGION }}
ROSA_CLUSTER_NAME_1: ${{ env.CLUSTER_PREFIX }}-a
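
The ACCELERATOR_DNS/ACCELERATOR_WEBHOOK extraction in the accelerator step
above relies on `grep -Po` with `\K`, which discards everything matched
before it so that only the value after the label is captured. A standalone
sketch, with made-up output lines:

[source,bash]
----
cat > accelerator <<'EOF'
ACCELERATOR DNS: a1b2c3.awsglobalaccelerator.com
ACCELERATOR WEBHOOK: https://example.lambda-url.eu-west-1.on.aws/
EOF
grep -Po 'ACCELERATOR DNS: \K.*' accelerator      # a1b2c3.awsglobalaccelerator.com
grep -Po 'ACCELERATOR WEBHOOK: \K.*' accelerator  # https://example.lambda-url.eu-west-1.on.aws/
----
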
19 changes: 16 additions & 3 deletions .github/workflows/rosa-multi-az-cluster-delete.yml
@@ -11,7 +11,7 @@ on:
type: string

jobs:
route53:
loadbalancer:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
@@ -40,19 +40,32 @@ jobs:
echo "SUBDOMAIN=$(echo $KEYCLOAK_URL | grep -oP '(?<=client.).*?(?=.keycloak-benchmark.com)')" >> $GITHUB_ENV
- name: Delete Route53 Records
run: |
./provision/aws/route53/route53_delete.sh
run: ./provision/aws/route53/route53_delete.sh
env:
SUBDOMAIN: ${{ env.SUBDOMAIN }}

- name: Set ACCELERATOR_DNS env variable for Global Accelerator processing
run: |
echo "ACCELERATOR_DNS=${KEYCLOAK_URL#"https://"}" >> $GITHUB_ENV
- name: Delete Global Accelerator
run: ./provision/aws/global-accelerator/accelerator_multi_az_delete.sh
env:
ACCELERATOR_DNS: ${{ env.ACCELERATOR_DNS }}
CLUSTER_1: ${{ inputs.clusterPrefix }}-a
CLUSTER_2: ${{ inputs.clusterPrefix }}-b
KEYCLOAK_NAMESPACE: runner-keycloak

cluster1:
needs: loadbalancer
uses: ./.github/workflows/rosa-cluster-delete.yml
with:
clusterName: ${{ inputs.clusterPrefix }}-a
deleteAll: no
secrets: inherit

cluster2:
needs: loadbalancer
uses: ./.github/workflows/rosa-cluster-delete.yml
with:
clusterName: ${{ inputs.clusterPrefix }}-b
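
The ACCELERATOR_DNS derivation in this workflow uses bash prefix removal,
`${VAR#pattern}`, which strips the shortest matching prefix. A quick sketch
with a hypothetical URL:

[source,bash]
----
KEYCLOAK_URL="https://a1b2c3.awsglobalaccelerator.com"
echo "${KEYCLOAK_URL#"https://"}"   # a1b2c3.awsglobalaccelerator.com
----
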
1 change: 1 addition & 0 deletions .gitignore
@@ -101,3 +101,4 @@ provision/environment_data.json
**/*.tfstate*
**/*.terraform*
!**/*.terraform.lock.hcl
provision/opentofu/modules/aws/accelerator/builds/*
@@ -76,9 +76,10 @@ oc login https://api.**<domain name>**:6443 -u **<username>**

NOTE: The session will expire approximately once a day, and you'll need to re-login.

== Enable user workload monitoring
== Enable alert routing for user-defined projects

By default, OpenShift HCP doesn't enable alert routing for user-defined projects.

By default, OpenShift doesn't monitor user workloads.
Apply the following ConfigMap link:{github-files}/provision/openshift/cluster-monitoring-config.yaml[cluster-monitoring-config.yaml] which is located in the `/provision/openshift` folder to OpenShift:

[source,bash]
@@ -93,14 +94,11 @@ After this has been deployed, several new pods spin up in the *openshift-user-wo
kubectl get pods -n openshift-user-workload-monitoring
----

The metrics and targets are then available in the menu entry *Observe* in the OpenShift console.

Additional steps are necessary to enable persistent volumes for the recorded metrics.
Alerts defined in `PrometheusRule` CR are then available to view in the menu entry *Observe->Alerting* in the OpenShift console.

Further reading:

* https://docs.openshift.com/container-platform/4.12/monitoring/configuring-the-monitoring-stack.html[Configure OpenShift monitoring stack]
* https://docs.openshift.com/container-platform/4.12/monitoring/enabling-monitoring-for-user-defined-projects.html[Enabling monitoring for user-defined projects]
* https://docs.openshift.com/rosa/observability/monitoring/enabling-alert-routing-for-user-defined-projects.html[Enabling alert routing for user-defined projects]

[#switching-between-different-kubernetes-clusters]
== Switching between different Kubernetes clusters
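
The cluster-monitoring-config.yaml referenced above turns on user-workload
monitoring, and, per the OpenShift documentation, alert routing for
user-defined projects hangs off the same ConfigMap. A minimal sketch; the
repository's actual file may set additional fields:

[source,yaml]
----
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    enableUserWorkload: true
    alertmanagerMain:
      # Allow the platform Alertmanager to pick up AlertmanagerConfig
      # resources from user-defined projects.
      enableUserAlertmanagerConfig: true
----
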
79 changes: 15 additions & 64 deletions provision/aws/global-accelerator/accelerator_multi_az_create.sh
@@ -27,9 +27,8 @@ function waitForHostname() {

function createLoadBalancer() {
export CLUSTER_NAME=$1
REGION=$2
SVC_NAME=$3
NAMESPACE=$4
SVC_NAME=$2
NAMESPACE=$3

bash ${SCRIPT_DIR}/../rosa_oc_login.sh > /dev/null
oc create namespace ${NAMESPACE} > /dev/null || true
@@ -39,6 +38,7 @@ function createLoadBalancer() {
metadata:
name: ${SVC_NAME}
annotations:
service.beta.kubernetes.io/aws-load-balancer-additional-resource-tags: site=${CLUSTER_NAME},namespace=${NAMESPACE}
service.beta.kubernetes.io/aws-load-balancer-type: "nlb"
service.beta.kubernetes.io/aws-load-balancer-healthcheck-path: "/lb-check"
service.beta.kubernetes.io/aws-load-balancer-healthcheck-protocol: "https"
@@ -59,25 +59,10 @@
type: LoadBalancer
EOF
LB_DNS=$(waitForHostname ${SVC_NAME} ${NAMESPACE})
LB_ARN=$(aws elbv2 describe-load-balancers \
--query "LoadBalancers[?DNSName=='${LB_DNS}'].LoadBalancerArn" \
--region ${REGION} \
--output text
)
echo ${LB_ARN}
}

requiredEnv ACCELERATOR_NAME CLUSTER_1 CLUSTER_2 KEYCLOAK_NAMESPACE

EXISTING_ACCELERATOR=$(aws globalaccelerator list-accelerators \
--query "Accelerators[?Name=='${ACCELERATOR_NAME}'].AcceleratorArn" \
--output text
)
if [ -n "${EXISTING_ACCELERATOR}" ]; then
echo "Global Accelerator already exists with name '${ACCELERATOR_NAME}'"
exit 1
fi

CLUSTER_1_REGION=$(rosa describe cluster -c ${CLUSTER_1} -o json | jq -r .region.id)
CLUSTER_2_REGION=$(rosa describe cluster -c ${CLUSTER_2} -o json | jq -r .region.id)

@@ -86,51 +71,17 @@ if [[ "${CLUSTER_1_REGION}" != "${CLUSTER_2_REGION}" ]]; then
exit 1
fi

ENDPOINT_GROUP_REGION=${CLUSTER_1_REGION}

CLUSTER_1_ENDPOINT_ARN=$(createLoadBalancer ${CLUSTER_1} ${CLUSTER_1_REGION} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE})
CLUSTER_2_ENDPOINT_ARN=$(createLoadBalancer ${CLUSTER_2} ${CLUSTER_2_REGION} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE})

ACCELERATOR=$(aws globalaccelerator create-accelerator \
--name ${ACCELERATOR_NAME} \
--query 'Accelerator' \
--ip-address-type DUAL_STACK \
--output json
)

ACCELERATOR_ARN=$(echo ${ACCELERATOR} | jq -r .AcceleratorArn)
ACCELERATOR_DNS=$(echo ${ACCELERATOR} | jq -r .DnsName)
ACCELERATOR_DUAL_STACK_DNS=$(echo ${ACCELERATOR} | jq -r .DualStackDnsName)

LISTENER_ARN=$(aws globalaccelerator create-listener \
--accelerator-arn ${ACCELERATOR_ARN} \
--port-ranges '[{"FromPort":443,"ToPort":443}]' \
--protocol TCP \
--query 'Listener.ListenerArn' \
--output text
)

ENDPOINTS=$(echo '
[
{
"EndpointId": "'${CLUSTER_1_ENDPOINT_ARN}'",
"Weight": 50,
"ClientIPPreservationEnabled": false
},
{
"EndpointId": "'${CLUSTER_2_ENDPOINT_ARN}'",
"Weight": 50,
"ClientIPPreservationEnabled": false
}
]' | jq -c .
)
createLoadBalancer ${CLUSTER_1} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE}
createLoadBalancer ${CLUSTER_2} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE}

ENDPOINT_GROUP_ARN=$(aws globalaccelerator create-endpoint-group \
--listener-arn ${LISTENER_ARN} \
--traffic-dial-percentage 100 \
--endpoint-configurations ${ENDPOINTS} \
--endpoint-group-region ${ENDPOINT_GROUP_REGION}
)
TOFU_CMD="tofu apply -auto-approve \
-var aws_region=${CLUSTER_1_REGION} \
-var lb_service_name="${KEYCLOAK_NAMESPACE}/${ACCELERATOR_LB_NAME}" \
-var name=${ACCELERATOR_NAME} \
-var site_a=${CLUSTER_1} \
-var site_b=${CLUSTER_2}"

echo "ACCELERATOR DNS: ${ACCELERATOR_DNS}"
echo "ACCELERATOR DUAL_STACK DNS: ${ACCELERATOR_DUAL_STACK_DNS}"
cd ${SCRIPT_DIR}/../../opentofu/modules/aws/accelerator
source ${SCRIPT_DIR}/../../opentofu/create.sh ${ACCELERATOR_NAME} "${TOFU_CMD}"
echo "ACCELERATOR DNS: $(tofu output -json | jq -r .dns_name.value)"
echo "ACCELERATOR WEBHOOK: $(tofu output -json | jq -r .webhook_url.value)"
65 changes: 17 additions & 48 deletions provision/aws/global-accelerator/accelerator_multi_az_delete.sh
@@ -17,7 +17,23 @@ function deleteLoadBalancer() {
oc delete -n ${NAMESPACE} svc ${SVC_NAME} || true
}

requiredEnv ACCELERATOR_NAME
if [ -z "${ACCELERATOR_NAME}" ]; then
if [ -z "${ACCELERATOR_DNS}" ]; then
echo "ACCELERATOR_NAME or ACCELERATOR_DNS must be set"
exit 1
fi
ACCELERATOR_NAME=$(aws globalaccelerator list-accelerators \
--query "Accelerators[?ends_with(DnsName, '${ACCELERATOR_DNS}')].Name" \
--output text
)
if [ -z "${ACCELERATOR_NAME}" ]; then
echo "Unable to find Global Accelerator with DnsName '${ACCELERATOR_DNS}'"
exit 1
fi
fi

cd ${SCRIPT_DIR}/../../opentofu/modules/aws/accelerator
bash ${SCRIPT_DIR}/../../opentofu/destroy.sh ${ACCELERATOR_NAME}

DELETE_LB=${DELETE_LB:=true}
if [ "${DELETE_LB}" = true ]; then
@@ -26,50 +42,3 @@ if [ "${DELETE_LB}" = true ]; then
deleteLoadBalancer ${CLUSTER_1} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE}
deleteLoadBalancer ${CLUSTER_2} ${ACCELERATOR_LB_NAME} ${KEYCLOAK_NAMESPACE}
fi

ACCELERATOR_ARN=$(aws globalaccelerator list-accelerators \
--query "Accelerators[?Name=='${ACCELERATOR_NAME}'].AcceleratorArn" \
--output text
)

if [ -z "${ACCELERATOR_ARN}" ]; then
echo "${ACCELERATOR_NAME} not found"
exit 0
fi

aws globalaccelerator update-accelerator \
--accelerator-arn ${ACCELERATOR_ARN} \
--no-enabled

LISTENER_ARN=$(aws globalaccelerator list-listeners \
--accelerator-arn ${ACCELERATOR_ARN} \
--query "Listeners[0].ListenerArn" \
--output text
)

if [[ "${LISTENER_ARN}" != "None" ]]; then
ENDPOINT_GROUP_ARN=$(aws globalaccelerator list-endpoint-groups \
--listener-arn ${LISTENER_ARN} \
--query 'EndpointGroups[].EndpointGroupArn' \
--output text
)

if [[ -n "${ENDPOINT_GROUP_ARN}" ]]; then
aws globalaccelerator delete-endpoint-group \
--endpoint-group-arn ${ENDPOINT_GROUP_ARN}
fi

aws globalaccelerator delete-listener \
--listener-arn ${LISTENER_ARN}
fi

count=0
until acceleratorDisabled ${ACCELERATOR_ARN} || (( count++ >= 300 )); do
sleep 1
done

if [ $count -gt 300 ]; then
echo "Timeout waiting for accelerator ${ACCELERATOR_ARN} to be removed"
exit 1
fi
aws globalaccelerator delete-accelerator --accelerator-arn ${ACCELERATOR_ARN}
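
The name lookup added above filters accelerators by DNS suffix with the
JMESPath ends_with function; the same query can be exercised standalone
(the DNS value is hypothetical):

[source,bash]
----
ACCELERATOR_DNS="a1b2c3.awsglobalaccelerator.com"
aws globalaccelerator list-accelerators \
  --query "Accelerators[?ends_with(DnsName, '${ACCELERATOR_DNS}')].Name" \
  --output text
----
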
24 changes: 24 additions & 0 deletions provision/aws/rosa_common.sh
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

function requiredEnv() {
for ENV in $@; do
if [ -z "${!ENV}" ]; then
echo "${ENV} variable must be set"
exit 1
fi
done
}

# Wait for k8s resource to exist. See: https://github.com/kubernetes/kubernetes/issues/83242
function waitFor() {
xtrace=$(set +o|grep xtrace); set +x
local ns=${1?namespace is required}; shift
local type=${1?type is required}; shift

echo "Waiting for $type $*"
until oc -n "$ns" get "$type" "$@" -o=jsonpath='{.items[0].metadata.name}' >/dev/null 2>&1; do
echo "Waiting for $type $*"
sleep 1
done
eval "$xtrace"
}
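
Both helpers are intended to be sourced by the other provisioning scripts. A
usage sketch, run from the repository root with hypothetical resource names:

[source,bash]
----
#!/usr/bin/env bash
source provision/aws/rosa_common.sh

# Abort early when the caller forgot to export required variables.
requiredEnv ACCELERATOR_NAME CLUSTER_1 CLUSTER_2

# Block until at least one matching StatefulSet exists in the namespace.
waitFor runner-keycloak statefulset -l app=keycloak
----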