From 6b05dfe105915e1db6e1352aeda406f588bb6677 Mon Sep 17 00:00:00 2001
From: Anil Prajapati
Date: Mon, 5 May 2025 13:42:36 +0530
Subject: [PATCH] Revert ocp 417 and cpd 51 changes

---
 docs/playbooks/ocp.md                              |   6 +-
 .../playbooks/ocp_convert_to_disconnected.yml      |   2 +-
 .../playbooks/ocp_fyre_provision.yml               |   2 +-
 .../playbooks/ocp_roks_provision.yml               |   2 +-
 .../cp4d_service/tasks/wait/wait-ccs.yml           | 151 +++--------------
 ibm/mas_devops/roles/mirror_ocp/README.md          |   8 +-
 .../roles/ocp_cluster_monitoring/README.md         |   4 +-
 .../roles/ocp_provision/defaults/main.yml          |   2 +-
 8 files changed, 38 insertions(+), 139 deletions(-)

diff --git a/docs/playbooks/ocp.md b/docs/playbooks/ocp.md
index 4c2da1ea0d..0efb9732e8 100644
--- a/docs/playbooks/ocp.md
+++ b/docs/playbooks/ocp.md
@@ -16,7 +16,7 @@ export AWS_SECRET_ACCESS_KEY=xxx
 export ROSA_TOKEN=xxx
 export CLUSTER_NAME=masonrosa
-export OCP_VERSION=4.17
+export OCP_VERSION=4.16
 export ROSA_COMPUTE_NODES=5
 export ROSA_CLUSTER_ADMIN_PASSWORD=xxx
 ansible-playbook ibm.mas_devops.ocp_rosa_provision
@@ -31,7 +31,7 @@ This also supports upgrading the storage volume used for the cluster's internal
 
 ```bash
 export CLUSTER_NAME=masinst1
-export OCP_VERSION=4.17_openshift
+export OCP_VERSION=4.16_openshift
 export IBMCLOUD_APIKEY=xxx
 export REBOOT_WORKER_NODES=true
 export CPD_ENTITLEMENT_KEY=xxx
@@ -44,7 +44,7 @@ This playbook will provision a QuickBurn OCP cluster in IBM DevIT Fyre service,
 
 ```bash
 export CLUSTER_NAME=masinst1
-export OCP_VERSION=4.17
+export OCP_VERSION=4.16
 export FYRE_USERNAME=xxx
 export FYRE_APIKEY=xxx
 export FYRE_PRODUCT_ID=xxx
diff --git a/ibm/mas_devops/playbooks/ocp_convert_to_disconnected.yml b/ibm/mas_devops/playbooks/ocp_convert_to_disconnected.yml
index 47176294c1..8f39fd92c3 100644
--- a/ibm/mas_devops/playbooks/ocp_convert_to_disconnected.yml
+++ b/ibm/mas_devops/playbooks/ocp_convert_to_disconnected.yml
@@ -5,7 +5,7 @@
   vars:
     ocp_operatorhub_disable_redhat_sources: true
-    ocp_release: "{{ lookup('env', 'OCP_RELEASE') | default('4.17', true) }}"
+    ocp_release: "{{ lookup('env', 'OCP_RELEASE') | default('4.16', true) }}"
 
     setup_redhat_release: true
     setup_redhat_catalogs: true
diff --git a/ibm/mas_devops/playbooks/ocp_fyre_provision.yml b/ibm/mas_devops/playbooks/ocp_fyre_provision.yml
index b8b4b577b9..9a351e6b7e 100644
--- a/ibm/mas_devops/playbooks/ocp_fyre_provision.yml
+++ b/ibm/mas_devops/playbooks/ocp_fyre_provision.yml
@@ -2,7 +2,7 @@
 - hosts: localhost
   vars:
     cluster_type: fyre
-    ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.17', True) }}"
+    ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.16', True) }}"
 
     # We update the cipher support on all installs, even though it's only technically
     # requires for FIPS clusters
diff --git a/ibm/mas_devops/playbooks/ocp_roks_provision.yml b/ibm/mas_devops/playbooks/ocp_roks_provision.yml
index ddd4c69627..c6d0480f84 100644
--- a/ibm/mas_devops/playbooks/ocp_roks_provision.yml
+++ b/ibm/mas_devops/playbooks/ocp_roks_provision.yml
@@ -3,7 +3,7 @@
   any_errors_fatal: true
   vars:
     cluster_type: roks
-    ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.17_openshift', True) }}"
+    ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.16_openshift', True) }}"
     prometheus_storage_class: ibmc-block-gold
     prometheus_alertmgr_storage_class: ibmc-file-gold-gid
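Every hunk above touches the same Ansible idiom: an environment lookup with an inline fallback. Because lookup('env', ...) returns an empty string for unset variables, the second argument to default() is what makes the fallback apply when OCP_VERSION is unset *or* empty. A minimal sketch of the behaviour, using a hypothetical playbook that is not part of this patch:

```yaml
---
# Hypothetical playbook, for illustration only
- hosts: localhost
  vars:
    # default('4.16', true): the `true` makes empty strings fall back too,
    # since lookup('env', ...) yields '' when the variable is unset
    ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.16', true) }}"
  tasks:
    - name: Show the resolved version
      ansible.builtin.debug:
        msg: "Provisioning OCP {{ ocp_version }}"
```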
diff --git a/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml b/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
index 6ceeea55b9..d0b0430b01 100644
--- a/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
+++ b/ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
@@ -23,15 +23,12 @@
     - "CCS CR already patched? .............. {{ is_ccs_already_patched }}"
     - "CCS Block Storage Class .............. {{ ccscr_output.resources[0].spec.blockStorageClass | default('', true) }}"
-
 
 # 2. Apply the patch per recommendation from CP4D team
 # https://github.ibm.com/NGP-TWC/ml-planning/issues/32683
 # https://medium.com/@dany.drouin/scaling-watson-knowledge-catalog-on-cloud-pak-for-data-11623f41f7df
 # -----------------------------------------------------------------------------
 # only run following block if is_ccs_already_patched == False
-- name: "wait-ccs : Apply CCS Scaling Patch"
-  when: not is_ccs_already_patched
-  block:
+- block:
     - name: "wait-ccs : Patch ccs-cr to increase resource limits"
       kubernetes.core.k8s:
        api_version: ccs.cpd.ibm.com/v1beta1
@@ -62,7 +59,8 @@
         field_manager: ansible
         force_conflicts: true
 
-    # Delete ccs-operator pod to force the reconcile from the beginning after ccs-cr is patched.
+    # 3. Delete ccs-operator pod to force the reconcile from the beginning after ccs-cr is patched.
+    # -----------------------------------------------------------------------------
     - name: "wait-ccs : Scale down ccs-operator"
       kubernetes.core.k8s:
         api_version: apps/v1
@@ -91,7 +89,8 @@
         field_manager: ansible
         force_conflicts: true
 
-    # Wait for ccs operator ...
+    # 4. Wait for ccs operator ...
+    # -----------------------------------------------------------------------------
     - name: "wait-ccs : Wait for ccs-operator to be ready again (60s delay)"
       kubernetes.core.k8s_info:
         api_version: apps/v1
@@ -103,141 +102,41 @@
       retries: 20 # Approximately 20 minutes before we give up
       delay: 60 # 1 minute
+  when: not is_ccs_already_patched
 
 - include_tasks: "tasks/wait/wait-elasticsearch.yml"
   when:
     - cpd_48_or_higher # elastic search operator was just introduced with cpd 4.8
     - not skip_ibm_entitlement_injection # eventually we hope to be able to skip patching the elastic search cr with image pull secret, but not for now
-
-# 3. Wait for CouchDB Stateful Set to be ready
+# 5. Wait for CouchDB Stateful Set to be ready
 # -----------------------------------------------------------------------------
 # There have been issues with CouchDB not starting due to Persistent Storage,
 # This task restarts any failing pods
 - include_tasks: "tasks/wait/wait-couchdb.yml"
-  when: cpd_48
-
+  when:
+    - cpd_48
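The ccs-cr patch and the two scale tasks in the hunks above all pass field_manager: ansible with force_conflicts: true, i.e. Kubernetes server-side apply: Ansible registers itself as a field manager and forcibly takes ownership of any fields the CP4D operator also manages, so the two writers do not clobber each other. A hedged, generic sketch of the same module options (the spec field and variable below are placeholders, not the role's actual patch):

```yaml
# Illustrative server-side apply with kubernetes.core.k8s;
# the patched spec field is a placeholder, not the actual CCS patch
- name: Patch a custom resource with server-side apply
  kubernetes.core.k8s:
    api_version: ccs.cpd.ibm.com/v1beta1
    kind: CCS
    name: ccs-cr
    namespace: "{{ cpd_instance_namespace }}"
    apply: true
    server_side_apply:
      field_manager: ansible
      force_conflicts: true  # take ownership of fields managed by the operator
    definition:
      spec:
        blockStorageClass: "{{ cpd_block_storage_class }}"  # placeholder variable
```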
 
-# 4. Wait for CCS CR to be ready
+# 6. Wait for CCS CR to be ready
 # -----------------------------------------------------------------------------
 # Note: We can't fail early when we see Failed status, as the operator will
 # report failed multiple times during initial reconcile.
-# We give it an hour to have made it past upgrading elasticsearch if still
-# failing check to see if elasticsearch needs rescuing by deleting and
-# recreating.
-# regardless of if the elasticsearch rescue is performed we wait another 4 hours
-# to let the process complete (in the always section)
-- block:
-    - name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)"
-      kubernetes.core.k8s_info:
-        api_version: "ccs.cpd.ibm.com/v1beta1"
-        kind: CCS
-        name: "ccs-cr"
-        namespace: "{{ cpd_instance_namespace }}"
-      register: ccs_cr_lookup
-      until:
-        - ccs_cr_lookup.resources is defined
-        - ccs_cr_lookup.resources | length == 1
-        - ccs_cr_lookup.resources[0].status is defined
-        - ccs_cr_lookup.resources[0].status.ccsStatus is defined
-        - ccs_cr_lookup.resources[0].status.ccsStatus == "Completed"
-      retries: 12 # 1 hour
-      delay: 300 # Every 5 minutes
-
-  rescue:
-    - name: "get the elasticsearch statefulset"
-      kubernetes.core.k8s_info:
-        api_version: apps/v1
-        namespace: "{{ cpd_instance_namespace }}"
-        kind: StatefulSet
-        label_selectors: ["ibm-es-master=True"]
-      register: statefulset_output
-
-    # We are aiming to catch this exception caused by a change in how ElasticSearch is managed by Cloud Pak for Data
-    # whereby in CPD 5.1 it starts to explicitly control the version of ElasticSearch, and sets it to an older version
-    # that the version it naturally would have been set to already, resulting in an unsupported downgrade.
-    #
-    # [2025-05-02T10:00:35,693][ERROR][o.o.b.OpenSearchUncaughtExceptionHandler] [elasticsea-0ac3-ib-6fb9-es-server-esnodes-0] uncaught exception in thread [main]
-    # org.opensearch.bootstrap.StartupException: java.lang.IllegalArgumentException: Could not load codec 'Lucene912'. Did you forget to add lucene-backward-codecs.jar?
-    #   at org.opensearch.bootstrap.OpenSearch.init(OpenSearch.java:185) ~[opensearch-2.17.0.jar:2.17.0]
-    #   at org.opensearch.bootstrap.OpenSearch.execute(OpenSearch.java:172) ~[opensearch-2.17.0.jar:2.17.0]
-    #   at org.opensearch.cli.EnvironmentAwareCommand.execute(EnvironmentAwareCommand.java:104) ~[opensearch-2.17.0.jar:2.17.0]
-    #   at org.opensearch.cli.Command.mainWithoutErrorHandling(Command.java:138) ~[opensearch-cli-2.17.0.jar:2.17.0]
-    #   at org.opensearch.cli.Command.main(Command.java:101) ~[opensearch-cli-2.17.0.jar:2.17.0]
-    #   at org.opensearch.bootstrap.OpenSearch.main(OpenSearch.java:138) ~[opensearch-2.17.0.jar:2.17.0]
-    #   at org.opensearch.bootstrap.OpenSearch.main(OpenSearch.java:104) ~[opensearch-2.17.0.jar:2.17.0]
-    # Caused by: java.lang.IllegalArgumentException: Could not load codec 'Lucene912'. Did you forget to add lucene-backward-codecs.jar?
-    #
-    # When this happens, the only solution we are aware of is to delete the ElasticSearch instance and allow it to be recreated fresh,
-    # the alternative is to not upgrade Cloud Pak for Data until after it catches up with the version of ElasticSearch already in use.
-    - name: "check if elasticsearch rescue is appropriate"
-      set_fact:
-        recover_elasticsearch: true
-      when:
-        - statefulset_output.resources is defined
-        - statefulset_output.resources | length == 1
-        - statefulset_output.resources[0].status is defined
-        - statefulset_output.resources[0].status.availableReplicas is defined
-        - statefulset_output.resources[0].status.availableReplicas == 0
-
-    - name: "Pause for 5 minutes before removing elasticsearchcluster and associated persistent volumes"
-      pause:
-        minutes: 5
-
-    - name: "reset rescue decision"
-      set_fact:
-        recover_elasticsearch: false
-
-    - name: "confirm elasticsearch rescue is still appropriate"
-      set_fact:
-        recover_elasticsearch: true
-      when:
-        - statefulset_output.resources is defined
-        - statefulset_output.resources | length == 1
-        - statefulset_output.resources[0].status is defined
-        - statefulset_output.resources[0].status.availableReplicas is defined
-        - statefulset_output.resources[0].status.availableReplicas == 0
-
-    - name: "delete the elasticsearch cluster"
-      k8s:
-        api_version: elasticsearch.opencontent.ibm.com/v1
-        kind: ElasticsearchCluster
-        state: absent
-        namespace: "{{ cpd_instance_namespace }}"
-        name: "{{ statefulset_output.resources[0].metadata.ownerReferences[0].name }}"
-      when:
-        - recover_elasticsearch == true
-
-    - name: "Delete the elasticsearch PVCs"
-      k8s:
-        api_version: v1
-        kind: PersistentVolumeClaim
-        namespace: "{{ cpd_instance_namespace }}"
-        label_selectors: ["app.kubernetes.io/component={{ statefulset_output.resources[0].metadata.name}}"]
-        state: absent
-      when:
-        - recover_elasticsearch == true
-
-  always:
-    - name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)"
-      kubernetes.core.k8s_info:
-        api_version: "ccs.cpd.ibm.com/v1beta1"
-        kind: CCS
-        name: "ccs-cr"
-        namespace: "{{ cpd_instance_namespace }}"
-      register: ccs_cr_lookup
-      until:
-        - ccs_cr_lookup.resources is defined
-        - ccs_cr_lookup.resources | length == 1
-        - ccs_cr_lookup.resources[0].status is defined
-        - ccs_cr_lookup.resources[0].status.ccsStatus is defined
-        - ccs_cr_lookup.resources[0].status.ccsStatus == "Completed"
-      retries: 50 # Just over 4 hours
-      delay: 300 # Every 5 minutes
-
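Both the deleted block/rescue/always structure above and the replacement task below rely on the same polling idiom: k8s_info re-runs until every until condition holds, and exhausting retries makes the task fail, which is exactly what fired the rescue path in the removed code. A generic hedged sketch of the pattern (the timings are illustrative):

```yaml
# Generic polling sketch; retries/delay values are illustrative
- name: Wait for a CR status field to reach a terminal value
  kubernetes.core.k8s_info:
    api_version: ccs.cpd.ibm.com/v1beta1
    kind: CCS
    name: ccs-cr
    namespace: "{{ cpd_instance_namespace }}"
  register: ccs_cr_lookup
  until:
    - ccs_cr_lookup.resources | length == 1
    - ccs_cr_lookup.resources[0].status.ccsStatus | default('') == "Completed"
  retries: 12  # 12 polls x 300s = 1 hour, then the task fails (triggering any rescue)
  delay: 300
```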
- - name: "check if elasticsearch rescue is appropriate" - set_fact: - recover_elasticsearch: true - when: - - statefulset_output.resources is defined - - statefulset_output.resources | length == 1 - - statefulset_output.resources[0].status is defined - - statefulset_output.resources[0].status.availableReplicas is defined - - statefulset_output.resources[0].status.availableReplicas == 0 - - - name: "Pause for 5 minutes before removing elasticsearchcluster and associated persistent volumes" - pause: - minutes: 5 - - - name: "reset rescue decision" - set_fact: - recover_elasticsearch: false - - - name: "confirm elasticsearch rescue is still appropriate" - set_fact: - recover_elasticsearch: true - when: - - statefulset_output.resources is defined - - statefulset_output.resources | length == 1 - - statefulset_output.resources[0].status is defined - - statefulset_output.resources[0].status.availableReplicas is defined - - statefulset_output.resources[0].status.availableReplicas == 0 - - - name: "delete the elasticsearch cluster" - k8s: - api_version: elasticsearch.opencontent.ibm.com/v1 - kind: ElasticsearchCluster - state: absent - namespace: "{{ cpd_instance_namespace }}" - name: "{{ statefulset_output.resources[0].metadata.ownerReferences[0].name }}" - when: - - recover_elasticsearch == true - - - name: "Delete the elasticsearch PVCs" - k8s: - api_version: v1 - kind: PersistentVolumeClaim - namespace: "{{ cpd_instance_namespace }}" - label_selectors: ["app.kubernetes.io/component={{ statefulset_output.resources[0].metadata.name}}"] - state: absent - when: - - recover_elasticsearch == true - - always: - - name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)" - kubernetes.core.k8s_info: - api_version: "ccs.cpd.ibm.com/v1beta1" - kind: CCS - name: "ccs-cr" - namespace: "{{ cpd_instance_namespace }}" - register: ccs_cr_lookup - until: - - ccs_cr_lookup.resources is defined - - ccs_cr_lookup.resources | length == 1 - - ccs_cr_lookup.resources[0].status is defined - - ccs_cr_lookup.resources[0].status.ccsStatus is defined - - ccs_cr_lookup.resources[0].status.ccsStatus == "Completed" - retries: 50 # Just over 4 hours - delay: 300 # Every 5 minutes - +- name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)" + kubernetes.core.k8s_info: + api_version: "ccs.cpd.ibm.com/v1beta1" + kind: CCS + name: "ccs-cr" + namespace: "{{ cpd_instance_namespace }}" + register: ccs_cr_lookup + until: + - ccs_cr_lookup.resources is defined + - ccs_cr_lookup.resources | length == 1 + - ccs_cr_lookup.resources[0].status is defined + - ccs_cr_lookup.resources[0].status.ccsStatus is defined + - ccs_cr_lookup.resources[0].status.ccsStatus == "Completed" # or ccs_cr_lookup.resources[0].status.wmlStatus == "Failed" + retries: 50 # Just over 4 hours + delay: 300 # Every 5 minutes -# 5. 
 
-# 5. Check that the final status is "Completed"
-# -----------------------------------------------------------------------------
 - name: "wait-ccs : Check that the CCS ccsStatus is 'Completed'"
   assert:
     that: ccs_cr_lookup.resources[0].status.ccsStatus == "Completed"
diff --git a/ibm/mas_devops/roles/mirror_ocp/README.md b/ibm/mas_devops/roles/mirror_ocp/README.md
index b5bb129dbc..bb52a92321 100644
--- a/ibm/mas_devops/roles/mirror_ocp/README.md
+++ b/ibm/mas_devops/roles/mirror_ocp/README.md
@@ -81,21 +81,21 @@ Path to your Red Hat pull secret, available from: [https://console.redhat.com/op
 Role Variables - OpenShift Version
 -------------------------------------------------------------------------------
 ### ocp_release
-The Red Hat release you are mirroring content for, e.g. `4.17`.
+The Red Hat release you are mirroring content for, e.g. `4.16`.
 
 - **Required**
 - Environment Variable: `OCP_RELEASE`
 - Default: None
 
 ### ocp_min_version
-The minimum version of the Red Hat release to mirror platform content for, e.g. `4.17.9`.
+The minimum version of the Red Hat release to mirror platform content for, e.g. `4.16.11`.
 
 - **Optional**
 - Environment Variable: `OCP_MIN_VERSION`
 - Default: None
 
 ### ocp_max_version
-The maximimum version of the Red Hat release to mirror platform content for, e.g. `4.17.9`.
+The maximimum version of the Red Hat release to mirror platform content for, e.g. `4.16.20`.
 
 - **Optional**
 - Environment Variable: `OCP_MAX_VERSION`
@@ -157,7 +157,7 @@ Example Playbook
     mirror_redhat_platform: false
     mirror_redhat_operators: true
 
-    ocp_release: 4.17
+    ocp_release: 4.16
     redhat_pullsecret: ~/pull-secret.json
 
   roles:
diff --git a/ibm/mas_devops/roles/ocp_cluster_monitoring/README.md b/ibm/mas_devops/roles/ocp_cluster_monitoring/README.md
index d6ee177753..472c1b9b1e 100644
--- a/ibm/mas_devops/roles/ocp_cluster_monitoring/README.md
+++ b/ibm/mas_devops/roles/ocp_cluster_monitoring/README.md
@@ -2,8 +2,8 @@ ocp_cluster_monitoring
 ===============================================================================
 Configures the OpenShift Container Platform Cluster Monitoring enabling two settings:
 
-- [OpenShift user defined project monitoring](hhttps://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/monitoring/configuring-user-workload-monitoring#preparing-to-configure-the-monitoring-stack-uwm) is enabled (`openshift-monitoring` namespace)
-- [OpenShift monitoring stack](https://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/monitoring/configuring-user-workload-monitoring#configuring-persistent-storage_storing-and-recording-data-uwm) is configured to use persistent storage (`openshift-monitoring` namespace)
+- [OpenShift user defined project monitoring](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.16/html/monitoring/enabling-monitoring-for-user-defined-projects) is enabled (`openshift-monitoring` namespace)
+- [OpenShift monitoring stack](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.16/html/monitoring/index) is configured to use persistent storage (`openshift-monitoring` namespace)
 
 
 Role Variables
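The three mirror_ocp variables reverted above work together: ocp_release selects the release to mirror, while ocp_min_version and ocp_max_version bound the range of platform builds that are pulled. A hypothetical invocation combining them, using only variables documented in this README (the values are the README's examples, not a recommendation):

```yaml
---
# Hypothetical playbook combining the reverted version variables
- hosts: localhost
  vars:
    mirror_redhat_platform: true
    mirror_redhat_operators: false
    ocp_release: "4.16"
    ocp_min_version: "4.16.11"   # oldest platform build to mirror
    ocp_max_version: "4.16.20"   # newest platform build to mirror
    redhat_pullsecret: ~/pull-secret.json
  roles:
    - ibm.mas_devops.mirror_ocp
```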
diff --git a/ibm/mas_devops/roles/ocp_provision/defaults/main.yml b/ibm/mas_devops/roles/ocp_provision/defaults/main.yml
index e035cfecdd..af9053f465 100644
--- a/ibm/mas_devops/roles/ocp_provision/defaults/main.yml
+++ b/ibm/mas_devops/roles/ocp_provision/defaults/main.yml
@@ -7,7 +7,7 @@ cluster_platform: "{{lookup('env', 'CLUSTER_PLATFORM') | default('x',true)}}"
 ocp_version: "{{ lookup('env', 'OCP_VERSION') }}"
 ocp_fips_enabled: "{{ lookup('env', 'OCP_FIPS_ENABLED') | default('false', true) | bool }}"
-default_ocp_version: 4.17
+default_ocp_version: 4.16
 
 supported_cluster_types:
   - fyre
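Note that in this defaults file ocp_version is read from the environment with no inline fallback; default_ocp_version (reverted to 4.16 here) is the role-level backstop, presumably applied elsewhere in the role's tasks. A hedged sketch of how such a fallback is commonly wired, not the role's actual implementation:

```yaml
# Illustrative fallback wiring, not the role's actual task
- name: Fall back to the collection default when OCP_VERSION is unset or empty
  ansible.builtin.set_fact:
    ocp_version: "{{ ocp_version | default(default_ocp_version, true) }}"
```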