From 54daa31ce0a3c23d4d74def5e54436a39a899ed4 Mon Sep 17 00:00:00 2001 From: TerrenceMcGuinness-NOAA Date: Thu, 8 Feb 2024 15:48:38 -0500 Subject: [PATCH] Jenkins Declartive Pipeline for CI with gfs/gefs multibuilds (#2246) Adding top level Jenkins file for CI tests running on Jenkins Controller: - Declarative Multi-branch Pipeline (has enhanced restart capabilities on a per section bases) - Starts Pipeline from Label PR same as BASH system (for now) - Progress and restarts can me managed with CAC Login at [EPIC OAR Jenkins](https://jenkins.epic.oarcloud.noaa.gov) - Has logic for multi **gfs/gefs** system builds (arguments based on a configuration file `ci/casts/yamls/build.yaml`) - Any number of **systems** may be added by manual adding an ele- ment to the matrix in the Jenkinsfile - _It may be possible to dynamic add matrix values with a specialty plug-in_ - Currently only runs on **Orion** and **Hera** using `mterry` account Resolves #2119 Resolves #2118 --- Jenkinsfile | 188 +++++++++++++++++++++++++++ ci/cases/yamls/build.yaml | 3 + ci/scripts/run-check_ci.sh | 4 +- ci/scripts/utils/ci_utils.sh | 124 ++++++++++++++++-- ci/scripts/utils/ci_utils_wrapper.sh | 9 ++ modulefiles/module_gwsetup.hera.lua | 2 + 6 files changed, 315 insertions(+), 15 deletions(-) create mode 100644 Jenkinsfile create mode 100644 ci/cases/yamls/build.yaml create mode 100755 ci/scripts/utils/ci_utils_wrapper.sh diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000..c591aae70f --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,188 @@ +def Machine = 'none' +def machine = 'none' +def HOME = 'none' +def localworkspace = 'none' +def commonworkspace = 'none' + +pipeline { + agent { label 'built-in' } + + options { + skipDefaultCheckout() + buildDiscarder(logRotator(numToKeepStr: '2')) + } + + stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR + // which is used to designate the Nodes in the Jenkins Controler by the agent label + // Each Jenknis Node is connected to said machine via an JAVA agent via an ssh tunnel + + stage('Get Machine') { + agent { label 'built-in' } + steps { + script { + localworkspace = env.WORKSPACE + machine = 'none' + for (label in pullRequest.labels) { + echo "Label: ${label}" + if ((label.matches("CI-Hera-Ready"))) { + machine = 'hera' + } else if ((label.matches("CI-Orion-Ready"))) { + machine = 'orion' + } else if ((label.matches("CI-Hercules-Ready"))) { + machine = 'hercules' + } + } // createing a second machine varible with first letter capital + // because the first letter of the machine name is captitalized in the GitHub labels + Machine = machine[0].toUpperCase() + machine.substring(1) + } + } + } + + stage('Get Common Workspace') { + agent { label "${machine}-emc" } + steps { + script { + properties([parameters([[$class: 'NodeParameterDefinition', allowedSlaves: ['built-in','Hera-EMC','Orion-EMC'], defaultSlaves: ['built-in'], name: '', nodeEligibility: [$class: 'AllNodeEligibility'], triggerIfResult: 'allCases']])]) + HOME = "${WORKSPACE}/TESTDIR" + commonworkspace = "${WORKSPACE}" + sh( script: "mkdir -p ${HOME}/RUNTESTS", returnStatus: true) + pullRequest.addLabel("CI-${Machine}-Building") + if ( pullRequest.labels.any{ value -> value.matches("CI-${Machine}-Ready") } ) { + pullRequest.removeLabel("CI-${Machine}-Ready") + } + } + } + } + + stage('Build System') { + matrix { + agent { label "${machine}-emc" } + //options { + // throttle(['global_matrix_build']) + //} + axes { + axis { + name "system" + values "gfs", "gefs" + } + } + stages { + stage("build system") { + steps { + script { + def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to build the system on per system basis under the common workspace HOME + sh( script: "mkdir -p ${HOMEgfs}", returnStatus: true) + ws(HOMEgfs) { + env.MACHINE_ID = machine // MACHINE_ID is used in the build scripts to determine the machine and is added to the shell environment + if (fileExists("${HOMEgfs}/sorc/BUILT_semaphor")) { // if the system is already built, skip the build in the case of re-runs + sh( script: "cat ${HOMEgfs}/sorc/BUILT_semaphor", returnStdout: true).trim() // TODO: and user configurable control to manage build semphore + ws(commonworkspace) { pullRequest.comment("Cloned PR already built (or build skipped) on ${machine} in directory ${HOMEgfs}") } + } else { + checkout scm + sh( script: "source workflow/gw_setup.sh;which git;git --version;git submodule update --init --recursive", returnStatus: true) + def builds_file = readYaml file: "ci/cases/yamls/build.yaml" + def build_args_list = builds_file['builds'] + def build_args = build_args_list[system].join(" ").trim().replaceAll("null", "") + dir("${HOMEgfs}/sorc") { + sh( script: "${build_args}", returnStatus: true) + sh( script: "./link_workflow.sh", returnStatus: true) + sh( script: "echo ${HOMEgfs} > BUILT_semaphor", returnStatus: true) + } + } + if ( pullRequest.labels.any{ value -> value.matches("CI-${Machine}-Building") } ) { + pullRequest.removeLabel("CI-${Machine}-Building") + } + pullRequest.addLabel("CI-${Machine}-Running") + } + } + } + } + } + } + } + + stage('Run Tests') { + matrix { + agent { label "${machine}-emc" } + axes { + axis { + name "Case" + values "C48_ATM", "C48_S2SWA_gefs", "C48_S2SW", "C96_atm3DVar" // TODO add dynamic list of cases from env vars (needs addtional plugins) + } + } + stages { + stage('Create Experiment') { + steps { + script { + sh( script: "sed -n '/{.*}/!p' ${HOME}/gfs/ci/cases/pr/${Case}.yaml > ${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp", returnStatus: true) + def yaml_case = readYaml file: "${HOME}/gfs/ci/cases/pr/${Case}.yaml.tmp" + system = yaml_case.experiment.system + def HOMEgfs = "${HOME}/${system}" // local HOMEgfs is used to populate the XML on per system basis + env.RUNTESTS = "${HOME}/RUNTESTS" + sh( script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml", returnStatus: true) + } + } + } + stage('Run Experiments') { + steps { + script { + HOMEgfs = "${HOME}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments + ws(HOMEgfs) { + pslot = sh( script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh get_pslot ${HOME}/RUNTESTS ${Case}", returnStdout: true ).trim() + pullRequest.comment("**Running experiments: ${Case} on ${Machine}**
Built against system **${system}** in directory:
`${HOMEgfs}`
With the experiment in directory:
`${HOME}/RUNTESTS/${pslot}`") + try { + sh( script: "${HOMEgfs}/ci/scripts/run-check_ci.sh ${HOME} ${pslot}", returnStatus: true) + } catch (Exception e) { + pullRequest.comment("**FAILURE** running experiments: ${Case} on ${Machine}") + error("Failed to run experiments ${Case} on ${Machine}") + } + pullRequest.comment("**SUCCESS** running experiments: ${Case} on ${Machine}") + } + } + } + post { + always { + script { + ws (HOMEgfs) { + for (label in pullRequest.labels) { + if (label.contains("${Machine}")) { + pullRequest.removeLabel(label) + } + } + } + } + } + success { + script { + ws (HOMEgfs) { + pullRequest.addLabel("CI-${Machine}-Passed") + def timestamp = new Date().format("MM dd HH:mm:ss", TimeZone.getTimeZone('America/New_York')) + pullRequest.comment("**CI SUCCESS** ${Machine} at ${timestamp}\n\nBuilt and ran in directory `${HOME}`") + } + } + } + failure { + script { + ws (HOMEgfs) { + pullRequest.addLabel("CI-${Machine}-Failed") + def timestamp = new Date().format("MM dd HH:mm:ss", TimeZone.getTimeZone('America/New_York')) + pullRequest.comment("**CI FAILED** ${Machine} at ${timestamp}
Built and ran in directory `${HOME}`") + if (fileExists('${HOME}/RUNTESTS/ci.log')) { + def fileContent = readFile '${HOME}/RUNTESTS/ci.log' + fileContent.eachLine { line -> + if( line.contains(".log")) { + archiveArtifacts artifacts: "${line}", fingerprint: true + } + } + } + } + } + } + } + } + } + } + } + } + +} diff --git a/ci/cases/yamls/build.yaml b/ci/cases/yamls/build.yaml new file mode 100644 index 0000000000..5398fa1889 --- /dev/null +++ b/ci/cases/yamls/build.yaml @@ -0,0 +1,3 @@ +builds: + - gefs: './build_all.sh' + - gfs: './build_all.sh -gu' \ No newline at end of file diff --git a/ci/scripts/run-check_ci.sh b/ci/scripts/run-check_ci.sh index 5a909c1c64..f98f434462 100755 --- a/ci/scripts/run-check_ci.sh +++ b/ci/scripts/run-check_ci.sh @@ -21,7 +21,9 @@ pslot=${2:-${pslot:-?}} # Name of the experiment being tested by this scr # │   └── ${pslot} # └── EXPDIR # └── ${pslot} -HOMEgfs="${TEST_DIR}/HOMEgfs" +# Two system build directories created at build time gfs, and gdas +# TODO: Make this configurable (for now all scripts run from gfs for CI at runtime) +HOMEgfs="${TEST_DIR}/gfs" RUNTESTS="${TEST_DIR}/RUNTESTS" # Source modules and setup logging diff --git a/ci/scripts/utils/ci_utils.sh b/ci/scripts/utils/ci_utils.sh index 737a3e5a86..6f2426c388 100755 --- a/ci/scripts/utils/ci_utils.sh +++ b/ci/scripts/utils/ci_utils.sh @@ -1,24 +1,120 @@ #!/bin/env bash -function cancel_slurm_jobs() { +function determine_scheduler() { + if command -v sbatch &> /dev/null; then + echo "slurm"; + elif command -v qsub &> /dev/null; then + echo "torque"; + else + echo "unknown" + fi +} - # Usage: cancel_slurm_jobs - # Example: cancel_slurm_jobs "C48_ATM_3c4e7f74" +function cancel_batch_jobs() { + # Usage: cancel_batch_jobs + # Example: cancel_batch_jobs "C48_ATM_3c4e7f74" # - # Cancel all Slurm jobs that have the given substring in their name + # Cancel all batch jobs that have the given substring in their name # So like in the example all jobs with "C48_ATM_3c4e7f74" # in their name will be canceled local substring=$1 local job_ids - job_ids=$(squeue -u "${USER}" -h -o "%i") - - for job_id in ${job_ids}; do - job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true - if [[ "${job_name}" =~ ${substring} ]]; then - echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" - scancel "${job_id}" - continue - fi - done + + scheduler=$(determine_scheduler) + + if [[ "${schduler}" == "torque" ]]; then + job_ids=$(qstat -u "${USER}" | awk '{print $1}') || true + + for job_id in ${job_ids}; do + job_name=$(qstat -f "${job_id}" | grep Job_Name | awk '{print $3}') || true + if [[ "${job_name}" =~ ${substring} ]]; then + echo "Canceling PBS Job ${job_name} with: qdel ${job_id}" + qdel "${job_id}" + continue + fi + done + + elif [[ "${scheduler}" == "slurm" ]]; then + + job_ids=$(squeue -u "${USER}" -h -o "%i") + + for job_id in ${job_ids}; do + job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true + if [[ "${job_name}" =~ ${substring} ]]; then + echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}" + scancel "${job_id}" + continue + fi + done + + else + echo "FATAL: Unknown/unsupported job scheduler" + exit 1 + fi +} + + +function get_pr_case_list () { + + ############################################################# + # loop over every yaml file in the PR's ci/cases + # and create an run directory for each one for this PR loop + ############################################################# + for yaml_config in "${HOMEgfs}/ci/cases/pr/"*.yaml; do + case=$(basename "${yaml_config}" .yaml) || true + echo "${case}" + done +} + +function get_pslot_list () { + + local RUNTESTS="${1}" + + ############################################################# + # loop over expdir directories in RUNTESTS + # and create list of the directory names (pslot) with the hash tag + ############################################################# + for pslot_dir in "${RUNTESTS}/EXPDIR/"*; do + pslot=$(basename "${pslot_dir}") || true + echo "${pslot}" + done + +} + +function get_pslot () { + + local RUNTESTS="${1}" + local case="${2}" + + ############################################################# + # loop over expdir directories in RUNTESTS + # and return the name of the pslot with its tag that matches the case + ############################################################# + for pslot_dir in "${RUNTESTS}/EXPDIR/"*; do + pslot=$(basename "${pslot_dir}") + check_case=$(echo "${pslot}" | rev | cut -d"_" -f2- | rev) || true + if [[ "${check_case}" == "${case}" ]]; then + echo "${pslot}" + break + fi + done + +} + +function create_experiment () { + + local yaml_config="${1}" + cd "${HOMEgfs}" || exit 1 + pr_sha=$(git rev-parse --short HEAD) + case=$(basename "${yaml_config}" .yaml) || true + export pslot=${case}_${pr_sha} + + source "${HOMEgfs}/ci/platforms/config.${MACHINE_ID}" + source "${HOMEgfs}/workflow/gw_setup.sh" + + # system=$(grep "system:" "${yaml_config}" | cut -d":" -f2 | tr -d " ") || true + + "${HOMEgfs}/${system}/workflow/create_experiment.py" --overwrite --yaml "${yaml_config}" + } diff --git a/ci/scripts/utils/ci_utils_wrapper.sh b/ci/scripts/utils/ci_utils_wrapper.sh new file mode 100755 index 0000000000..51c392fb99 --- /dev/null +++ b/ci/scripts/utils/ci_utils_wrapper.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +HOMEgfs="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." >/dev/null 2>&1 && pwd )" +source "${HOMEgfs}/ush/detect_machine.sh" + +utitilty_function="${1}" + +source "${HOMEgfs}/ci/scripts/utils/ci_utils.sh" +${utitilty_function} "${@:2}" diff --git a/modulefiles/module_gwsetup.hera.lua b/modulefiles/module_gwsetup.hera.lua index c86cac7b02..961403e1a2 100644 --- a/modulefiles/module_gwsetup.hera.lua +++ b/modulefiles/module_gwsetup.hera.lua @@ -14,5 +14,7 @@ load(pathJoin("python", python_ver)) load("py-jinja2") load("py-pyyaml") load("py-numpy") +local git_ver=os.getenv("git_ver") or "2.40.0" +load(pathJoin("git", git_ver)) whatis("Description: GFS run setup environment")