Port cycling to Hercules (NOAA-EMC#2196)
Adds cycled support for Hercules (excluding gsi-monitor).
Partially resolves NOAA-EMC#1588.
GSI monitoring is disabled on Hercules because the required Perl modules are missing; it will be enabled in a later PR.
DavidHuber-NOAA committed Jan 8, 2024
1 parent ef6827d commit c15875b
Showing 13 changed files with 300 additions and 37 deletions.
3 changes: 0 additions & 3 deletions ci/cases/pr/C96C48_hybatmDA.yaml
@@ -16,6 +16,3 @@ arguments:
gfs_cyc: 1
start: cold
yaml: {{ HOMEgfs }}/ci/platforms/gfs_defaults_ci.yaml

-skip_ci_on_hosts:
-  - hercules
3 changes: 0 additions & 3 deletions ci/cases/pr/C96_atm3DVar.yaml
@@ -15,6 +15,3 @@ arguments:
gfs_cyc: 1
start: cold
yaml: {{ HOMEgfs }}/ci/platforms/gfs_defaults_ci.yaml

-skip_ci_on_hosts:
-  - hercules
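
Removing the skip_ci_on_hosts stanza from both cases re-enables them in Hercules CI now that cycling is supported there. A hypothetical sketch of the driver-side check these deletions disarm (the real CI driver lives under ci/ and is not part of this commit):

host="hercules"                                  # e.g., derived from the CI node's hostname
case_yaml="ci/cases/pr/C96_atm3DVar.yaml"
# Crude YAML probe for illustration only; the actual driver presumably parses the file properly.
if grep -A5 '^skip_ci_on_hosts:' "${case_yaml}" | grep -qx " *- *${host}"; then
  echo "skipping ${case_yaml} on ${host}"
fi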
256 changes: 247 additions & 9 deletions env/HERCULES.env
@@ -12,7 +12,7 @@ fi

step=$1

-export npe_node_max=40
+export npe_node_max=80
export launcher="srun -l --export=ALL"
export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out"

@@ -26,19 +26,164 @@ export KMP_AFFINITY=scatter
export OMP_STACKSIZE=2048000
export NTHSTACK=1024000000
#export LD_BIND_NOW=1
export I_MPI_EXTRA_FILESYSTEM=1
export I_MPI_EXTRA_FILESYSTEM_LIST=lustre

ulimit -s unlimited
ulimit -a

if [[ "${step}" = "waveinit" ]] || [[ "${step}" = "waveprep" ]] || [[ "${step}" = "wavepostsbs" ]] || \
[[ "${step}" = "wavepostbndpnt" ]] || [[ "${step}" = "wavepostpnt" ]] || [[ "${step}" == "wavepostbndpntbll" ]]; then
case ${step} in
"prep" | "prepbufr")

nth_max=$((npe_node_max / npe_node_prep))

export POE="NO"
export BACK=${BACK:-"YES"}
export sys_tp="HERCULES"
export launcher_PREP="srun"
;;
"preplandobs")

export APRUN_CALCFIMS="${launcher} -n 1"
;;
"waveinit" | "waveprep" | "wavepostsbs" | "wavepostbndpnt" | "wavepostpnt" | "wavepostbndpntbll")

export CFP_MP="YES"
if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi
[[ "${step}" = "waveprep" ]] && export MP_PULSE=0
export wavempexec=${launcher}
export wave_mpmd=${mpmd_opt}

elif [[ "${step}" = "fcst" ]]; then
;;
"atmanlrun")

nth_max=$((npe_node_max / npe_node_atmanlrun))

export NTHREADS_ATMANL=${nth_atmanlrun:-${nth_max}}
[[ ${NTHREADS_ATMANL} -gt ${nth_max} ]] && export NTHREADS_ATMANL=${nth_max}
export APRUN_ATMANL="${launcher} -n ${npe_atmanlrun} --cpus-per-task=${NTHREADS_ATMANL}"
;;
"atmensanlrun")

nth_max=$((npe_node_max / npe_node_atmensanlrun))

export NTHREADS_ATMENSANL=${nth_atmensanlrun:-${nth_max}}
[[ ${NTHREADS_ATMENSANL} -gt ${nth_max} ]] && export NTHREADS_ATMENSANL=${nth_max}
export APRUN_ATMENSANL="${launcher} -n ${npe_atmensanlrun} --cpus-per-task=${NTHREADS_ATMENSANL}"
;;
"aeroanlrun")

export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_aeroanlrun))

export NTHREADS_AEROANL=${nth_aeroanlrun:-${nth_max}}
[[ ${NTHREADS_AEROANL} -gt ${nth_max} ]] && export NTHREADS_AEROANL=${nth_max}
export APRUN_AEROANL="${launcher} -n ${npe_aeroanlrun} --cpus-per-task=${NTHREADS_AEROANL}"
;;
"landanl")

nth_max=$((npe_node_max / npe_node_landanl))

export NTHREADS_LANDANL=${nth_landanl:-${nth_max}}
[[ ${NTHREADS_LANDANL} -gt ${nth_max} ]] && export NTHREADS_LANDANL=${nth_max}
export APRUN_LANDANL="${launcher} -n ${npe_landanl} --cpus-per-task=${NTHREADS_LANDANL}"

export APRUN_APPLY_INCR="${launcher} -n 6"
;;
"ocnanalbmat")

export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_ocnanalbmat))

export NTHREADS_OCNANAL=${nth_ocnanalbmat:-${nth_max}}
[[ ${NTHREADS_OCNANAL} -gt ${nth_max} ]] && export NTHREADS_OCNANAL=${nth_max}
export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalbmat} --cpus-per-task=${NTHREADS_OCNANAL}"
;;
"ocnanalrun")

export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_ocnanalrun))

export NTHREADS_OCNANAL=${nth_ocnanalrun:-${nth_max}}
[[ ${NTHREADS_OCNANAL} -gt ${nth_max} ]] && export NTHREADS_OCNANAL=${nth_max}
export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalrun} --cpus-per-task=${NTHREADS_OCNANAL}"
;;
"ocnanalchkpt")

export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_ocnanalchkpt))

export NTHREADS_OCNANAL=${nth_ocnanalchkpt:-${nth_max}}
[[ ${NTHREADS_OCNANAL} -gt ${nth_max} ]] && export NTHREADS_OCNANAL=${nth_max}
export APRUN_OCNANAL="${launcher} -n ${npe_ocnanalchkpt} --cpus-per-task=${NTHREADS_OCNANAL}"
;;
"anal" | "analcalc")

export MKL_NUM_THREADS=4
export MKL_CBWR=AUTO

export CFP_MP=${CFP_MP:-"YES"}
export USE_CFP=${USE_CFP:-"YES"}
export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_anal))

export NTHREADS_GSI=${nth_anal:-${nth_max}}
[[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max}
export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task=${NTHREADS_GSI}"

export NTHREADS_CALCINC=${nth_calcinc:-1}
[[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max}
export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task=${NTHREADS_CALCINC}"

export NTHREADS_CYCLE=${nth_cycle:-12}
[[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max}
npe_cycle=${ntiles:-6}
export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task=${NTHREADS_CYCLE}"

export NTHREADS_GAUSFCANL=1
npe_gausfcanl=${npe_gausfcanl:-1}
export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task=${NTHREADS_GAUSFCANL}"
;;
"sfcanl")
nth_max=$((npe_node_max / npe_node_sfcanl))

export NTHREADS_CYCLE=${nth_sfcanl:-14}
[[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max}
npe_sfcanl=${ntiles:-6}
export APRUN_CYCLE="${launcher} -n ${npe_sfcanl} --cpus-per-task=${NTHREADS_CYCLE}"
;;
"eobs")

export MKL_NUM_THREADS=4
export MKL_CBWR=AUTO

export CFP_MP=${CFP_MP:-"YES"}
export USE_CFP=${USE_CFP:-"YES"}
export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_eobs))

export NTHREADS_GSI=${nth_eobs:-${nth_max}}
[[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max}
export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_eobs}} --cpus-per-task=${NTHREADS_GSI}"
;;
"eupd")

export CFP_MP=${CFP_MP:-"YES"}
export USE_CFP=${USE_CFP:-"YES"}
export APRUNCFP="${launcher} -n \$ncmd ${mpmd_opt}"

nth_max=$((npe_node_max / npe_node_eupd))

export NTHREADS_ENKF=${nth_eupd:-${nth_max}}
[[ ${NTHREADS_ENKF} -gt ${nth_max} ]] && export NTHREADS_ENKF=${nth_max}
export APRUN_ENKF="${launcher} -n ${npe_enkf:-${npe_eupd}} --cpus-per-task=${NTHREADS_ENKF}"
;;
"fcst" | "efcs")

export OMP_STACKSIZE=512M
if [[ "${CDUMP}" =~ "gfs" ]]; then
@@ -53,17 +53,198 @@ elif [[ "${step}" = "fcst" ]]; then
# With ESMF threading, the model wants to use the full node
export APRUN_UFS="${launcher} -n ${ntasks}"
unset nprocs ppn nnodes ntasks
;;

elif [[ "${step}" = "upp" ]]; then
"upp")

nth_max=$((npe_node_max / npe_node_upp))

export NTHREADS_UPP=${nth_upp:-1}
[[ ${NTHREADS_UPP} -gt ${nth_max} ]] && export NTHREADS_UPP=${nth_max}
export APRUN_UPP="${launcher} -n ${npe_upp} --cpus-per-task=${NTHREADS_UPP}"

elif [[ "${step}" = "atmos_products" ]]; then
;;
"atmos_products")

export USE_CFP="YES" # Use MPMD for downstream product generation
;;
"ecen")

-fi
nth_max=$((npe_node_max / npe_node_ecen))

export NTHREADS_ECEN=${nth_ecen:-${nth_max}}
[[ ${NTHREADS_ECEN} -gt ${nth_max} ]] && export NTHREADS_ECEN=${nth_max}
export APRUN_ECEN="${launcher} -n ${npe_ecen} --cpus-per-task=${NTHREADS_ECEN}"

export NTHREADS_CHGRES=${nth_chgres:-12}
[[ ${NTHREADS_CHGRES} -gt ${npe_node_max} ]] && export NTHREADS_CHGRES=${npe_node_max}
export APRUN_CHGRES="time"

export NTHREADS_CALCINC=${nth_calcinc:-1}
[[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max}
export APRUN_CALCINC="${launcher} -n ${npe_ecen} --cpus-per-task=${NTHREADS_CALCINC}"

;;
"esfc")

nth_max=$((npe_node_max / npe_node_esfc))

export NTHREADS_ESFC=${nth_esfc:-${nth_max}}
[[ ${NTHREADS_ESFC} -gt ${nth_max} ]] && export NTHREADS_ESFC=${nth_max}
export APRUN_ESFC="${launcher} -n ${npe_esfc} --cpus-per-task=${NTHREADS_ESFC}"

export NTHREADS_CYCLE=${nth_cycle:-14}
[[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max}
export APRUN_CYCLE="${launcher} -n ${npe_esfc} --cpus-per-task=${NTHREADS_CYCLE}"

;;
"epos")

nth_max=$((npe_node_max / npe_node_epos))

export NTHREADS_EPOS=${nth_epos:-${nth_max}}
[[ ${NTHREADS_EPOS} -gt ${nth_max} ]] && export NTHREADS_EPOS=${nth_max}
export APRUN_EPOS="${launcher} -n ${npe_epos} --cpus-per-task=${NTHREADS_EPOS}"

;;
"postsnd")

export CFP_MP="YES"

nth_max=$((npe_node_max / npe_node_postsnd))

export NTHREADS_POSTSND=${nth_postsnd:-1}
[[ ${NTHREADS_POSTSND} -gt ${nth_max} ]] && export NTHREADS_POSTSND=${nth_max}
export APRUN_POSTSND="${launcher} -n ${npe_postsnd} --cpus-per-task=${NTHREADS_POSTSND}"

export NTHREADS_POSTSNDCFP=${nth_postsndcfp:-1}
[[ ${NTHREADS_POSTSNDCFP} -gt ${nth_max} ]] && export NTHREADS_POSTSNDCFP=${nth_max}
export APRUN_POSTSNDCFP="${launcher} -n ${npe_postsndcfp} ${mpmd_opt}"

;;
"awips")

nth_max=$((npe_node_max / npe_node_awips))

export NTHREADS_AWIPS=${nth_awips:-2}
[[ ${NTHREADS_AWIPS} -gt ${nth_max} ]] && export NTHREADS_AWIPS=${nth_max}
export APRUN_AWIPSCFP="${launcher} -n ${npe_awips} ${mpmd_opt}"

;;
"gempak")

export CFP_MP="YES"

if [[ ${CDUMP} == "gfs" ]]; then
npe_gempak=${npe_gempak_gfs}
npe_node_gempak=${npe_node_gempak_gfs}
fi

nth_max=$((npe_node_max / npe_node_gempak))

export NTHREADS_GEMPAK=${nth_gempak:-1}
[[ ${NTHREADS_GEMPAK} -gt ${nth_max} ]] && export NTHREADS_GEMPAK=${nth_max}
export APRUN="${launcher} -n ${npe_gempak} ${mpmd_opt}"

;;
"fit2obs")

nth_max=$((npe_node_max / npe_node_fit2obs))

export NTHREADS_FIT2OBS=${nth_fit2obs:-1}
[[ ${NTHREADS_FIT2OBS} -gt ${nth_max} ]] && export NTHREADS_FIT2OBS=${nth_max}
export MPIRUN="${launcher} -n ${npe_fit2obs} --cpus-per-task=${NTHREADS_FIT2OBS}"

;;
*)
# Some other job not yet defined here
echo "WARNING: The job step ${step} does not specify Hercules-specific resources"
;;
esac
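
Every step block above instantiates the same resource pattern. A minimal standalone sketch of that pattern (the step name and counts are illustrative, not from this commit):

launcher="srun -l --export=ALL"                   # as exported at the top of HERCULES.env
npe_node_max=80                                   # cores per Hercules node (this PR)
npe_node_mystep=20                                # hypothetical tasks per node
npe_mystep=40                                     # hypothetical total MPI tasks

nth_max=$((npe_node_max / npe_node_mystep))       # 80 / 20 = 4 threads available per task

export NTHREADS_MYSTEP=${nth_mystep:-${nth_max}}  # honor a per-step override, else use the max
[[ ${NTHREADS_MYSTEP} -gt ${nth_max} ]] && export NTHREADS_MYSTEP=${nth_max}  # never oversubscribe a node
export APRUN_MYSTEP="${launcher} -n ${npe_mystep} --cpus-per-task=${NTHREADS_MYSTEP}"
echo "${APRUN_MYSTEP}"                            # -> srun -l --export=ALL -n 40 --cpus-per-task=4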
2 changes: 2 additions & 0 deletions modulefiles/module_base.hercules.lua
@@ -8,7 +8,9 @@ prepend_path("MODULEPATH", "/work/noaa/epic/role-epic/spack-stack/hercules/spack

load(pathJoin("stack-intel", os.getenv("stack_intel_ver")))
load(pathJoin("stack-intel-oneapi-mpi", os.getenv("stack_impi_ver")))
load(pathJoin("intel-oneapi-mkl", os.getenv("intel_mkl_ver")))
load(pathJoin("python", os.getenv("python_ver")))
load(pathJoin("perl", os.getenv("perl_ver")))

-- TODO load NCL once the SAs remove the 'depends_on' statements within it
-- NCL is a static installation and does not depend on any libraries
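
With the two added load() calls, a quick interactive sanity check on Hercules might look like this (a hedged sketch: the *_ver variables mirror the os.getenv() names in the Lua above and are assumed to be set by the workflow's version files, which are not shown in this commit):

module load "stack-intel/${stack_intel_ver}" "stack-intel-oneapi-mpi/${stack_impi_ver}"
module load "intel-oneapi-mkl/${intel_mkl_ver}" "perl/${perl_ver}"
module list 2>&1 | grep -iE 'mkl|perl'    # confirm both newly added modules resolve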
15 changes: 11 additions & 4 deletions parm/config/gfs/config.base.emc.dyn
@@ -63,13 +63,20 @@ export DO_GOES="NO" # GOES products
export DO_BUFRSND="NO" # BUFR sounding products
export DO_GEMPAK="NO" # GEMPAK products
export DO_AWIPS="NO" # AWIPS products
-export DO_NPOESS="NO" # NPOESS products
+export DO_NPOESS="NO"        # NPOESS products
export DO_TRACKER="YES" # Hurricane track verification
export DO_GENESIS="YES" # Cyclone genesis verification
export DO_GENESIS_FSU="NO" # Cyclone genesis verification (FSU)
export DO_VERFOZN="YES" # Ozone data assimilation monitoring
export DO_VERFRAD="YES" # Radiance data assimilation monitoring
export DO_VMINMON="YES" # GSI minimization monitoring
# The monitor is not yet supported on Hercules
if [[ "${machine}" == "HERCULES" ]]; then
export DO_VERFOZN="NO" # Ozone data assimilation monitoring
export DO_VERFRAD="NO" # Radiance data assimilation monitoring
export DO_VMINMON="NO" # GSI minimization monitoring
else
export DO_VERFOZN="YES" # Ozone data assimilation monitoring
export DO_VERFRAD="YES" # Radiance data assimilation monitoring
export DO_VMINMON="YES" # GSI minimization monitoring
fi
export DO_MOS="NO" # GFS Model Output Statistics - Only supported on WCOSS2

# NO for retrospective parallel; YES for real-time parallel
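
A sketch of how downstream code is assumed to consume these switches — flipping them to "NO" removes the monitor jobs on Hercules without touching any job script (the consuming pattern is assumed from the rest of global-workflow; it is not part of this diff):

for sw in DO_VERFOZN DO_VERFRAD DO_VMINMON; do
  if [[ "${!sw}" == "YES" ]]; then      # bash indirect expansion: read the switch by name
    echo "${sw}=YES: monitor task will be scheduled"
  else
    echo "${sw}=NO: monitor task skipped (e.g., on Hercules)"
  fi
done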
14 changes: 10 additions & 4 deletions parm/config/gfs/config.resources
@@ -56,7 +56,7 @@ elif [[ "${machine}" = "AWSPW" ]]; then
elif [[ ${machine} = "ORION" ]]; then
export npe_node_max=40
elif [[ ${machine} = "HERCULES" ]]; then
-export npe_node_max=40
+export npe_node_max=80
fi

if [[ ${step} = "prep" ]]; then
@@ -905,13 +905,19 @@ elif [[ ${step} = "eobs" || ${step} = "eomg" ]]; then
export nth_eomg=${nth_eobs}
npe_node_eobs=$(echo "${npe_node_max} / ${nth_eobs}" | bc)
export npe_node_eobs
-export npe_node_eomg=${npe_node_eobs}
export is_exclusive=True
-#The number of tasks and cores used must be the same for eobs
-#For S4, this is accomplished by running 10 tasks/node
+# The number of tasks and cores used must be the same for eobs
+# See https://github.com/NOAA-EMC/global-workflow/issues/2092 for details
+# For S4, this is accomplished by running 10 tasks/node
if [[ ${machine} = "S4" ]]; then
export npe_node_eobs=10
elif [[ ${machine} = "HERCULES" ]]; then
# For Hercules, this is only an issue at C384; use 20 tasks/node
if [[ ${CASE} = "C384" ]]; then
export npe_node_eobs=20
fi
fi
+export npe_node_eomg=${npe_node_eobs}

elif [[ ${step} = "ediag" ]]; then

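
A worked example of the eobs packing above (the thread count is illustrative, not from the diff): with Hercules' 80 cores per node and 4 threads per task, 20 tasks fit on a node — the same value the PR pins explicitly for C384, where the tasks-versus-cores issue (NOAA-EMC#2092) bites:

npe_node_max=80                                             # Hercules, per this PR
nth_eobs=4                                                  # illustrative threads per task
npe_node_eobs=$(echo "${npe_node_max} / ${nth_eobs}" | bc)  # same bc idiom as the hunk above
echo "${npe_node_eobs}"                                     # -> 20 tasks/node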
5 changes: 4 additions & 1 deletion parm/config/gfs/config.ufs
@@ -72,9 +72,12 @@ case "${machine}" in
"WCOSS2")
npe_node_max=128
;;
"HERA" | "ORION" | "HERCULES")
"HERA" | "ORION" )
npe_node_max=40
;;
"HERCULES" )
npe_node_max=80
;;
"JET")
case "${PARTITION_BATCH}" in
"xjet")
4 changes: 3 additions & 1 deletion scripts/exglobal_archive.sh
@@ -33,7 +33,9 @@ source "${HOMEgfs}/ush/file_utils.sh"

[[ ! -d ${ARCDIR} ]] && mkdir -p "${ARCDIR}"
nb_copy "${COM_ATMOS_ANALYSIS}/${APREFIX}gsistat" "${ARCDIR}/gsistat.${RUN}.${PDY}${cyc}"
nb_copy "${COM_CHEM_ANALYSIS}/${APREFIX}aerostat" "${ARCDIR}/aerostat.${RUN}.${PDY}${cyc}"
if [[ ${DO_AERO} = "YES" ]]; then
nb_copy "${COM_CHEM_ANALYSIS}/${APREFIX}aerostat" "${ARCDIR}/aerostat.${RUN}.${PDY}${cyc}"
fi
nb_copy "${COM_ATMOS_GRIB_1p00}/${APREFIX}pgrb2.1p00.anl" "${ARCDIR}/pgbanl.${RUN}.${PDY}${cyc}.grib2"

# Archive 1 degree forecast GRIB2 files for verification
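
The new DO_AERO guard keeps cycled runs without aerosol DA from attempting to archive an aerostat file that was never produced. A minimal sketch of the assumed behavior (nb_copy comes from ush/file_utils.sh, sourced above; it is assumed here to warn on a missing source rather than abort):

# Hypothetical stand-in for nb_copy's assumed semantics:
nb_copy_sketch() {
  local src=$1 dst=$2
  if [[ -f "${src}" ]]; then
    cp "${src}" "${dst}"
  else
    echo "WARNING: ${src} does not exist; skipping copy"
  fi
}
# Before this PR, every cycle with DO_AERO="NO" would hit the warning branch.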
(The remaining five changed files are not shown on this page.)
