Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/playbooks/ocp.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export AWS_SECRET_ACCESS_KEY=xxx
export ROSA_TOKEN=xxx

export CLUSTER_NAME=masonrosa
export OCP_VERSION=4.17
export OCP_VERSION=4.16
export ROSA_COMPUTE_NODES=5
export ROSA_CLUSTER_ADMIN_PASSWORD=xxx
ansible-playbook ibm.mas_devops.ocp_rosa_provision
Expand All @@ -31,7 +31,7 @@ This also supports upgrading the storage volume used for the cluster's internal

```bash
export CLUSTER_NAME=masinst1
export OCP_VERSION=4.17_openshift
export OCP_VERSION=4.16_openshift
export IBMCLOUD_APIKEY=xxx
export REBOOT_WORKER_NODES=true
export CPD_ENTITLEMENT_KEY=xxx
Expand All @@ -44,7 +44,7 @@ This playbook will provision a QuickBurn OCP cluster in IBM DevIT Fyre service,

```bash
export CLUSTER_NAME=masinst1
export OCP_VERSION=4.17
export OCP_VERSION=4.16
export FYRE_USERNAME=xxx
export FYRE_APIKEY=xxx
export FYRE_PRODUCT_ID=xxx
Expand Down
2 changes: 1 addition & 1 deletion ibm/mas_devops/playbooks/ocp_convert_to_disconnected.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
vars:
ocp_operatorhub_disable_redhat_sources: true

ocp_release: "{{ lookup('env', 'OCP_RELEASE') | default('4.17', true) }}"
ocp_release: "{{ lookup('env', 'OCP_RELEASE') | default('4.16', true) }}"
setup_redhat_release: true
setup_redhat_catalogs: true

Expand Down
2 changes: 1 addition & 1 deletion ibm/mas_devops/playbooks/ocp_fyre_provision.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
- hosts: localhost
vars:
cluster_type: fyre
ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.17', True) }}"
ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.16', True) }}"

# We update the cipher support on all installs, even though it's only technically
# required for FIPS clusters
Expand Down
2 changes: 1 addition & 1 deletion ibm/mas_devops/playbooks/ocp_roks_provision.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
any_errors_fatal: true
vars:
cluster_type: roks
ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.17_openshift', True) }}"
ocp_version: "{{ lookup('env', 'OCP_VERSION') | default('4.16_openshift', True) }}"
prometheus_storage_class: ibmc-block-gold
prometheus_alertmgr_storage_class: ibmc-file-gold-gid

Expand Down
151 changes: 25 additions & 126 deletions ibm/mas_devops/roles/cp4d_service/tasks/wait/wait-ccs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,12 @@
- "CCS CR already patched? .............. {{ is_ccs_already_patched }}"
- "CCS Block Storage Class .............. {{ ccscr_output.resources[0].spec.blockStorageClass | default('<undefined>', true) }}"


# 2. Apply the patch per recommendation from CP4D team
# https://github.ibm.com/NGP-TWC/ml-planning/issues/32683
# https://medium.com/@dany.drouin/scaling-watson-knowledge-catalog-on-cloud-pak-for-data-11623f41f7df
# -----------------------------------------------------------------------------
# only run following block if is_ccs_already_patched == False
- name: "wait-ccs : Apply CCS Scaling Patch"
when: not is_ccs_already_patched
block:
- block:
- name: "wait-ccs : Patch ccs-cr to increase resource limits"
kubernetes.core.k8s:
api_version: ccs.cpd.ibm.com/v1beta1
Expand Down Expand Up @@ -62,7 +59,8 @@
field_manager: ansible
force_conflicts: true

# Delete ccs-operator pod to force the reconcile from the beginning after ccs-cr is patched.
# 3. Delete ccs-operator pod to force the reconcile from the beginning after ccs-cr is patched.
# -----------------------------------------------------------------------------
- name: "wait-ccs : Scale down ccs-operator"
kubernetes.core.k8s:
api_version: apps/v1
Expand Down Expand Up @@ -91,7 +89,8 @@
field_manager: ansible
force_conflicts: true

# Wait for ccs operator ...
# 4. Wait for ccs operator ...
# -----------------------------------------------------------------------------
- name: "wait-ccs : Wait for ccs-operator to be ready again (60s delay)"
kubernetes.core.k8s_info:
api_version: apps/v1
Expand All @@ -103,141 +102,41 @@
retries: 20 # Approximately 20 minutes before we give up
delay: 60 # 1 minute

when: not is_ccs_already_patched

- include_tasks: "tasks/wait/wait-elasticsearch.yml"
when:
- cpd_48_or_higher # elastic search operator was just introduced with cpd 4.8
- not skip_ibm_entitlement_injection # eventually we hope to be able to skip patching the elastic search cr with image pull secret, but not for now


# 3. Wait for CouchDB Stateful Set to be ready
# 5. Wait for CouchDB Stateful Set to be ready
# -----------------------------------------------------------------------------
# There have been issues with CouchDB not starting due to Persistent Storage,
# This task restarts any failing pods
- include_tasks: "tasks/wait/wait-couchdb.yml"
when: cpd_48

when:
- cpd_48

# 4. Wait for CCS CR to be ready
# 6. Wait for CCS CR to be ready
# -----------------------------------------------------------------------------
# Note: We can't fail early when we see Failed status, as the operator will
# report failed multiple times during initial reconcile.
# We give it an hour to make it past upgrading elasticsearch; if it is still
# failing after that, we check whether elasticsearch needs rescuing by deleting
# and recreating it.
# Regardless of whether the elasticsearch rescue is performed, we wait another 4 hours
# to let the process complete (in the always section)
- block:
- name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)"
kubernetes.core.k8s_info:
api_version: "ccs.cpd.ibm.com/v1beta1"
kind: CCS
name: "ccs-cr"
namespace: "{{ cpd_instance_namespace }}"
register: ccs_cr_lookup
until:
- ccs_cr_lookup.resources is defined
- ccs_cr_lookup.resources | length == 1
- ccs_cr_lookup.resources[0].status is defined
- ccs_cr_lookup.resources[0].status.ccsStatus is defined
- ccs_cr_lookup.resources[0].status.ccsStatus == "Completed"
retries: 12 # 1 hour
delay: 300 # Every 5 minutes

rescue:
- name: "get the elasticsearch statefulset"
kubernetes.core.k8s_info:
api_version: apps/v1
namespace: "{{ cpd_instance_namespace }}"
kind: StatefulSet
label_selectors: ["ibm-es-master=True"]
register: statefulset_output

# We are aiming to catch this exception caused by a change in how ElasticSearch is managed by Cloud Pak for Data
# whereby in CPD 5.1 it starts to explicitly control the version of ElasticSearch, and sets it to an older version
# than the version it naturally would have been set to already, resulting in an unsupported downgrade.
#
# [2025-05-02T10:00:35,693][ERROR][o.o.b.OpenSearchUncaughtExceptionHandler] [elasticsea-0ac3-ib-6fb9-es-server-esnodes-0] uncaught exception in thread [main]
# org.opensearch.bootstrap.StartupException: java.lang.IllegalArgumentException: Could not load codec 'Lucene912'. Did you forget to add lucene-backward-codecs.jar?
# at org.opensearch.bootstrap.OpenSearch.init(OpenSearch.java:185) ~[opensearch-2.17.0.jar:2.17.0]
# at org.opensearch.bootstrap.OpenSearch.execute(OpenSearch.java:172) ~[opensearch-2.17.0.jar:2.17.0]
# at org.opensearch.cli.EnvironmentAwareCommand.execute(EnvironmentAwareCommand.java:104) ~[opensearch-2.17.0.jar:2.17.0]
# at org.opensearch.cli.Command.mainWithoutErrorHandling(Command.java:138) ~[opensearch-cli-2.17.0.jar:2.17.0]
# at org.opensearch.cli.Command.main(Command.java:101) ~[opensearch-cli-2.17.0.jar:2.17.0]
# at org.opensearch.bootstrap.OpenSearch.main(OpenSearch.java:138) ~[opensearch-2.17.0.jar:2.17.0]
# at org.opensearch.bootstrap.OpenSearch.main(OpenSearch.java:104) ~[opensearch-2.17.0.jar:2.17.0]
# Caused by: java.lang.IllegalArgumentException: Could not load codec 'Lucene912'. Did you forget to add lucene-backward-codecs.jar?
#
# When this happens, the only solution we are aware of is to delete the ElasticSearch instance and allow it to be recreated fresh,
# the alternative is to not upgrade Cloud Pak for Data until after it catches up with the version of ElasticSearch already in use.
- name: "check if elasticsearch rescue is appropriate"
set_fact:
recover_elasticsearch: true
when:
- statefulset_output.resources is defined
- statefulset_output.resources | length == 1
- statefulset_output.resources[0].status is defined
- statefulset_output.resources[0].status.availableReplicas is defined
- statefulset_output.resources[0].status.availableReplicas == 0

- name: "Pause for 5 minutes before removing elasticsearchcluster and associated persistent volumes"
pause:
minutes: 5

- name: "reset rescue decision"
set_fact:
recover_elasticsearch: false

- name: "confirm elasticsearch rescue is still appropriate"
set_fact:
recover_elasticsearch: true
when:
- statefulset_output.resources is defined
- statefulset_output.resources | length == 1
- statefulset_output.resources[0].status is defined
- statefulset_output.resources[0].status.availableReplicas is defined
- statefulset_output.resources[0].status.availableReplicas == 0

- name: "delete the elasticsearch cluster"
k8s:
api_version: elasticsearch.opencontent.ibm.com/v1
kind: ElasticsearchCluster
state: absent
namespace: "{{ cpd_instance_namespace }}"
name: "{{ statefulset_output.resources[0].metadata.ownerReferences[0].name }}"
when:
- recover_elasticsearch == true

- name: "Delete the elasticsearch PVCs"
k8s:
api_version: v1
kind: PersistentVolumeClaim
namespace: "{{ cpd_instance_namespace }}"
label_selectors: ["app.kubernetes.io/component={{ statefulset_output.resources[0].metadata.name}}"]
state: absent
when:
- recover_elasticsearch == true

always:
- name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)"
kubernetes.core.k8s_info:
api_version: "ccs.cpd.ibm.com/v1beta1"
kind: CCS
name: "ccs-cr"
namespace: "{{ cpd_instance_namespace }}"
register: ccs_cr_lookup
until:
- ccs_cr_lookup.resources is defined
- ccs_cr_lookup.resources | length == 1
- ccs_cr_lookup.resources[0].status is defined
- ccs_cr_lookup.resources[0].status.ccsStatus is defined
- ccs_cr_lookup.resources[0].status.ccsStatus == "Completed"
retries: 50 # Just over 4 hours
delay: 300 # Every 5 minutes

- name: "wait-ccs : Wait for ccsStatus 'Completed' (5m interval)"
kubernetes.core.k8s_info:
api_version: "ccs.cpd.ibm.com/v1beta1"
kind: CCS
name: "ccs-cr"
namespace: "{{ cpd_instance_namespace }}"
register: ccs_cr_lookup
until:
- ccs_cr_lookup.resources is defined
- ccs_cr_lookup.resources | length == 1
- ccs_cr_lookup.resources[0].status is defined
- ccs_cr_lookup.resources[0].status.ccsStatus is defined
- ccs_cr_lookup.resources[0].status.ccsStatus == "Completed" # or ccs_cr_lookup.resources[0].status.wmlStatus == "Failed"
retries: 50 # Just over 4 hours
delay: 300 # Every 5 minutes

# 5. Check that the final status is "Completed"
# -----------------------------------------------------------------------------
- name: "wait-ccs : Check that the CCS ccsStatus is 'Completed'"
assert:
that: ccs_cr_lookup.resources[0].status.ccsStatus == "Completed"
Expand Down
8 changes: 4 additions & 4 deletions ibm/mas_devops/roles/mirror_ocp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,21 +81,21 @@ Path to your Red Hat pull secret, available from: [https://console.redhat.com/op
Role Variables - OpenShift Version
-------------------------------------------------------------------------------
### ocp_release
The Red Hat release you are mirroring content for, e.g. `4.17`.
The Red Hat release you are mirroring content for, e.g. `4.16`.

- **Required**
- Environment Variable: `OCP_RELEASE`
- Default: None

### ocp_min_version
The minimum version of the Red Hat release to mirror platform content for, e.g. `4.17.9`.
The minimum version of the Red Hat release to mirror platform content for, e.g. `4.16.11`.

- **Optional**
- Environment Variable: `OCP_MIN_VERSION`
- Default: None

### ocp_max_version
The maximum version of the Red Hat release to mirror platform content for, e.g. `4.17.9`.
The maximum version of the Red Hat release to mirror platform content for, e.g. `4.16.20`.

- **Optional**
- Environment Variable: `OCP_MAX_VERSION`
Expand Down Expand Up @@ -157,7 +157,7 @@ Example Playbook
mirror_redhat_platform: false
mirror_redhat_operators: true

ocp_release: 4.17
ocp_release: 4.16
redhat_pullsecret: ~/pull-secret.json

roles:
Expand Down
4 changes: 2 additions & 2 deletions ibm/mas_devops/roles/ocp_cluster_monitoring/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ ocp_cluster_monitoring
===============================================================================
Configures the OpenShift Container Platform Cluster Monitoring enabling two settings:

- [OpenShift user defined project monitoring](https://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/monitoring/configuring-user-workload-monitoring#preparing-to-configure-the-monitoring-stack-uwm) is enabled (`openshift-monitoring` namespace)
- [OpenShift monitoring stack](https://docs.redhat.com/en/documentation/openshift_container_platform/4.17/html/monitoring/configuring-user-workload-monitoring#configuring-persistent-storage_storing-and-recording-data-uwm) is configured to use persistent storage (`openshift-monitoring` namespace)
- [OpenShift user defined project monitoring](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.16/html/monitoring/enabling-monitoring-for-user-defined-projects) is enabled (`openshift-monitoring` namespace)
- [OpenShift monitoring stack](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.16/html/monitoring/index) is configured to use persistent storage (`openshift-monitoring` namespace)


Role Variables
Expand Down
2 changes: 1 addition & 1 deletion ibm/mas_devops/roles/ocp_provision/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ cluster_platform: "{{lookup('env', 'CLUSTER_PLATFORM') | default('x',true)}}"

ocp_version: "{{ lookup('env', 'OCP_VERSION') }}"
ocp_fips_enabled: "{{ lookup('env', 'OCP_FIPS_ENABLED') | default('false', true) | bool }}"
default_ocp_version: 4.17
default_ocp_version: 4.16

supported_cluster_types:
- fyre
Expand Down
Loading