Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for new JobRouter syntax (HTCONDOR-243) #435

Merged
merged 10 commits into from
Mar 29, 2021
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ install_manifest.txt
cmake_install.cmake
CMakeFiles
CMakeCache.txt
config/01-ce-router.conf
config/01-ce-router-defaults.conf
src/htcondorce/*.pyc
tmp
7 changes: 1 addition & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ include(FindPythonInterp)
execute_process ( COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib().replace('/usr', '${CMAKE_INSTALL_PREFIX}', 1))" OUTPUT_VARIABLE DETECTED_PYTHON_SITELIB OUTPUT_STRIP_TRAILING_WHITESPACE )
set(PYTHON_SITELIB "${DETECTED_PYTHON_SITELIB}" CACHE PATH "Base directory for python libraries")

configure_file (
"${PROJECT_SOURCE_DIR}/config/01-ce-router.conf.in"
"${CMAKE_CURRENT_BINARY_DIR}/config/01-ce-router.conf"
)

configure_file (
"${PROJECT_SOURCE_DIR}/config/01-ce-router-defaults.conf.in"
"${CMAKE_CURRENT_BINARY_DIR}/config/01-ce-router-defaults.conf"
Expand Down Expand Up @@ -115,7 +110,7 @@ install(FILES
install(FILES
config/01-ce-auth.conf
config/01-ce-collector.conf
${CMAKE_CURRENT_BINARY_DIR}/config/01-ce-router.conf
config/01-ce-router.conf
config/01-pilot-env.conf
config/02-ce-condor.conf
config/02-ce-pbs.conf
Expand Down
198 changes: 198 additions & 0 deletions config/01-ce-router-defaults.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,201 @@ SCHEDD_ATTRS = $(SCHEDD_ATTRS) HTCondorCEVersion grid_resource
#
MERGE_JOB_ROUTER_DEFAULT_ADS=True
JOB_ROUTER_DEFAULTS = $(JOB_ROUTER_DEFAULTS_GENERATED)

# Use JOB_ROUTER_DEFAULTS + JOB_ROUTER_ENTRIES by default instead of
# the new-style job router transforms (HTCONDOR-243)
JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = True


#################################
# Default Job Router Transforms #
#################################

JOB_ROUTER_PRE_ROUTE_TRANSFORMS = Base Cleanup OrigRequests
JOB_ROUTER_POST_ROUTE_TRANSFORMS = Cpus Gpus Memory Queue BatchRuntime CERequirements OnExitHold

# Pre-route transform applied to every incoming job (first entry in
# JOB_ROUTER_PRE_ROUTE_TRANSFORMS above).
JOB_ROUTER_TRANSFORM_Base @=jrt
# Job Router special values
# NOTE: MaxIdleJobs/MaxJobs are JobRouter throttles, not job ad attributes;
# MaxJobs follows the CE-wide CONDORCE_MAX_JOBS knob.
MaxIdleJobs = 2000
MaxJobs = $(CONDORCE_MAX_JOBS)
# Always set the following routed job attributes
# Tag the routed copy so it can be distinguished from the source CE job
SET RoutedJob True
# Reset Requirements on the routed job by setting the expression to True
SET Requirements True

@jrt


# Pre-route transform: strip source-job attributes that would misbehave on the
# routed copy.
JOB_ROUTER_TRANSFORM_Cleanup @=jrt
# If the batch condor job is removed then the CE will resubmit the
# job and we want to avoid that
DELETE PeriodicRemove
# TotalSubmitProcs is submit-side bookkeeping; drop it from the routed job
DELETE TotalSubmitProcs
@jrt


# Pre-route transform: snapshot incoming job attributes under 'orig_*' names so
# the post-route transforms (Cpus, Gpus, Memory, BatchRuntime, OnExitHold) can
# consult the source job's values after routes have modified the ad.
JOB_ROUTER_TRANSFORM_OrigRequests @=jrt
# Copy original, incoming job attributes for use in post transforms
COPY /^Request.+$/ orig_\0
COPY /^OnExitHold.*$/ orig_\0
COPY BatchRuntime orig_BatchRuntime
COPY environment orig_environment

# Support whole node job requests against HTCondor pools if the source job specifies 'WantWholeNode = True'
# 'if' can't handle complex expressions yet so we evaluate it here for use in post-transforms
EVALMACRO test_want_whole_node $(MY.WantWholeNode : False)
@jrt


# Post-route transform: derive the routed job's CPU request.
JOB_ROUTER_TRANSFORM_Cpus @=jrt
# Outside of the HTCondor WholeNodeJobs case, set RequestCpus to one of the following, in order:
# 1. 'xcount' from the source job
# 2. RequestCpus from the source job
# 3. default_xcount from the job route
# 4. 1
# Whole-node case: claim all CPUs of the matched machine via a $$() substitution
# that is resolved at match time.
if $(test_want_whole_node)
SET JOB_GLIDEIN_Cpus "$$(MY.TotalCpus ?: JobCpus)"
# MATCH_EXP_JOB_GLIDEIN_Cpus is based on the value of JOB_GLIDEIN_Cpus once the routed job is matched to an
# HTCondor slot
SET GlideinCpusIsGood int(MATCH_EXP_JOB_GLIDEIN_Cpus ?: "0") isnt error
# Also used by JobGpus and JobMemory
SET JobIsRunning (JobStatus =!= 1) && (JobStatus =!= 5) && GlideinCpusIsGood
SET JobCpus JobIsRunning ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus
SET RequestCpus TARGET.TotalCpus ?: JobCpus
endif

# NOTE(review): the '> 1' tests mean a source value of exactly 1 falls through
# to the next fallback; harmless here since the final default is also 1 — confirm intended.
EVALMACRO test_xcount $(MY.xcount : 0) > 1
if $(test_xcount)
cpus = $(MY.xcount)
else
EVALMACRO test_orig_RequestCpus $(MY.orig_RequestCpus : 0) > 1
if $(test_orig_RequestCpus)
cpus = $(MY.orig_RequestCpus)
else
cpus = $(default_xcount)
endif
endif

# 'cpus' may be empty if default_xcount is unset; ': 1' supplies the final default
EVALSET OriginalCpus $(cpus : 1)
# remote_* attributes feed the batch (blahp) submission for non-HTCondor systems
EVALSET remote_SMPGranularity $(cpus : 1)
EVALSET remote_NodeNumber $(cpus : 1)

# DEFAULT only sets these when a route/transform has not already done so
DEFAULT JobIsRunning (JobStatus =!= 1) && (JobStatus =!= 5)
DEFAULT JobCpus OriginalCpus
DEFAULT RequestCpus OriginalCpus
@jrt


# Post-route transform: derive the routed job's GPU request.
JOB_ROUTER_TRANSFORM_Gpus @=jrt
# Request GPUs for whole node jobs (HTCONDOR-103)
# If a whole node job requests GPUs and is matched to a machine with GPUs then set the job's RequestGPUs to all the
# GPUs on that machine
if $(test_want_whole_node)
SET JOB_GLIDEIN_GPUs "$$(MY.TotalGPUs ?: JobGPUs)"
# MATCH_EXP_JOB_GLIDEIN_GPUs is based on the value of JOB_GLIDEIN_GPUs once the routed job is matched to an
# HTCondor slot
SET GlideinGPUsIsGood int(MATCH_EXP_JOB_GLIDEIN_GPUs ?: "0") isnt error
SET JobGPUs JobIsRunning ? int(MATCH_EXP_JOB_GLIDEIN_GPUs) : OriginalGPUs
SET RequestGPUs (TARGET.TotalGPUs > 0) ? TotalGPUs: JobGPUs
endif

# NOTE(review): unlike the Cpus/Memory transforms there is no ': <default>'
# fallback here — if the source job set no RequestGPUs, orig_RequestGPUs is
# undefined and this macro expands empty; verify EVALSET tolerates that.
EVALSET OriginalGPUs $(MY.orig_RequestGPUs)
DEFAULT RequestGPUs OriginalGPUs
@jrt


# Post-route transform: derive the routed job's memory request (MB).
JOB_ROUTER_TRANSFORM_Memory @=jrt
# Outside of the HTCondor WholeNodeJobs case, set RequestMemory to one of the following, in order:
# 1. 'maxMemory' from the source job if it's positive
# 2. RequestMemory from the source job
# 3. default_maxMemory from the job route
# 4. 2000
# Whole-node case: request 95% of the matched machine's memory, leaving
# headroom for the system.
if $(test_want_whole_node)
SET JOB_GLIDEIN_Memory "$$(MY.TotalMemory ?: 0)"
SET JobMemory JobIsRunning ? int(MATCH_EXP_JOB_GLIDEIN_Memory)*95/100 : OriginalMemory
# NOTE(review): this uses '?' with TotalMemory as the condition (truthy when
# non-zero) where the Cpus transform uses '?:' — confirm the asymmetry is intended.
SET RequestMemory TARGET.TotalMemory ? TotalMemory*95/100 : JobMemory
endif

EVALMACRO test_maxmemory $(MY.maxMemory : 0) > 1
if $(test_maxmemory)
mem = $(MY.maxMemory)
else
EVALMACRO test_orig_RequestMemory $(MY.orig_RequestMemory : 0) > 1
if $(test_orig_RequestMemory)
mem = $(MY.orig_RequestMemory)
else
mem = $(default_maxMemory)
endif
endif

# 'mem' may be empty if default_maxMemory is unset; ': 2000' supplies the final default
EVALSET OriginalMemory $(mem : 2000)
# remote_OriginalMemory feeds the batch (blahp) submission for non-HTCondor systems
EVALSET remote_OriginalMemory $(mem : 2000)

DEFAULT JobMemory OriginalMemory
DEFAULT RequestMemory OriginalMemory
@jrt


# Post-route transform: pick the remote batch-system queue for the routed job.
JOB_ROUTER_TRANSFORM_Queue @=jrt
# Set the remote batch queue to one of the following, in order:
# 1. 'batch_queue' from the source job
# 2. 'queue' from the source job
# 3. 'default_queue' from the route
# 4. Empty string
# '?:' falls through on undefined; only the route-level default is quoted
# because it is a config macro, not a job attribute.
EVALSET remote_queue $(MY.batch_queue) ?: \
$(MY.queue) ?: \
"$(default_queue)"
@jrt


# Post-route transform: compute the routed job's maximum runtime.
JOB_ROUTER_TRANSFORM_BatchRuntime @=jrt
# 'BatchRuntime' is in seconds but admins configure 'default_maxWallTime' and 'ROUTED_JOB_MAX_TIME' and remote
# submitters set 'maxWallTime' in minutes. Remote submitters set 'BatchRuntime' in seconds.
# Set the remote batch runtime used by non-HTCondor batch systems to one of the following, in order:
# 1. 'maxWallTime' (minutes) from the source job
# 2. 'BatchRuntime' (seconds) from the source job (snapshotted as orig_BatchRuntime)
# 3. 'default_maxWallTime' (minutes) from the route
# 4. 'ROUTED_JOB_MAX_TIME' (minutes) from the config
if defined MY.maxWallTime
def_walltime = 60*$(MY.maxWallTime)
elif defined MY.orig_BatchRuntime
def_walltime = $(MY.orig_BatchRuntime)
elif defined default_maxWallTime
def_walltime = 60*$(default_maxWallTime)
else
def_walltime = 60*$(ROUTED_JOB_MAX_TIME)
endif

# Final fallback of 259200 s (72 h) if every source above was undefined/empty
SET BatchRuntime $(def_walltime) ?: 259200
@jrt


# Post-route transform: build the CERequirements attribute forwarded to the
# batch system (names of job-ad attributes to pass through).
JOB_ROUTER_TRANSFORM_CERequirements @=jrt
# Marker attribute; always forwarded via CERequirements below
SET CondorCE 1

ce_reqs = "CondorCE"
# Routes may prepend their own attribute list via default_CERequirements
if defined default_CERequirements
ce_reqs = $(default_CERequirements),$(ce_reqs)
endif

# NOTE(review): with default_CERequirements set, the evaluated expression mixes
# unquoted names with the quoted "CondorCE" literal — verify the consumer
# accepts that form.
EVALSET CERequirements $(ce_reqs)
@jrt


# Post-route transform: hold jobs that exit before their declared minimum
# walltime, and preserve any on_exit_hold the source job requested.
JOB_ROUTER_TRANSFORM_OnExitHold @=jrt
# Hold if the job declared minWalltime (minutes) and finished in less wall time
SET CondorCE_OnExitHold ((MY.minWalltime isnt undefined && MY.RemoteWallClockTime isnt undefined) ? \
(MY.RemoteWallClockTime < 60*MY.minWallTime) : \
False)

EVALMACRO test_orig_OnExitHold $(MY.orig_OnExitHold : False)

# If the source job carried its own on_exit_hold, prefer its reason/subcode
if $(test_orig_OnExitHold)
# NOTE(review): $(orig_OnExitHold) is a config-macro expansion, but the copied
# value lives in the job ad, not the config — this may expand empty; verify.
SET CondorCE_OnExitHoldReason "The on_exit_hold expression ($(orig_OnExitHold)) evaluated to TRUE."
SET OnExitHoldSubcode MY.orig_OnExitHoldSubcode ?: 1
SET OnExitHoldReason MY.orig_OnExitHoldReason ?: MY.CondorCE_OnExitHoldReason
endif

# Combined hold: the source job's expression OR the CE's minimum-walltime check
SET OnExitHold (MY.orig_OnExitHold ?: False) || MY.CondorCE_OnExitHold
DEFAULT CondorCE_OnExitHoldReason "The job's wall clock time $(MY.RemoteWallClockTime/60) min, is less than the minimum specified by the job ($(minWalltime))"
DEFAULT OnExitHoldSubcode 42
DEFAULT OnExitHoldReason MY.CondorCE_OnExitHold ? MY.CondorCE_OnExitHoldReason : "Job held for unknown reason."
@jrt
12 changes: 12 additions & 0 deletions config/01-ce-router.conf.in → config/01-ce-router.conf
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,30 @@
# Set the maximum number of jobs the CE is willing to run.
CONDORCE_MAX_JOBS = 10000


# Set the max time, in minutes, of routed jobs (default 72 hours)
ROUTED_JOB_MAX_TIME = 4320


# The amount of time, in seconds, that the job router will wait before considering
# new candidate jobs for routing. If you are seeing heavy CPU usage from the job
# router, increase the polling period to a few hundred seconds.
# JOB_ROUTER_POLLING_PERIOD = 10


# Set this to True to allow HTCondor-CE jobs to run more than once
# (default: False)
# ENABLE_JOB_RETRIES = False


# Some pilot systems leave completed jobs in the HTCondor-CE queue for
# VO operators to retrieve later to troubleshoot specific pilot jobs.
# Set this to the maximum number of days that completed jobs may
# remain in the queue after completion.
# (default: 30)
# COMPLETED_JOB_EXPIRATION = 30


# Use the defaults generated by the condor_ce_router_defaults script. To add
# additional defaults, add additional lines of the form:
#
Expand All @@ -37,3 +42,10 @@ ROUTED_JOB_MAX_TIME = 4320
MERGE_JOB_ROUTER_DEFAULT_ADS=True
JOB_ROUTER_DEFAULTS = $(JOB_ROUTER_DEFAULTS_GENERATED)


# Set the JobRouter configuration syntax, defaults to True
# (i.e. JOB_ROUTER_DEFAULTS combined with JOB_ROUTER_ENTRIES).
# Set to 'False' to use the new 'JOB_ROUTER_TRANSFORM_*' and 'JOB_ROUTER_ROUTE_*'
# style configuration (requires HTCondor 8.9)
#
# JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = True
6 changes: 6 additions & 0 deletions config/02-ce-bosco-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ JOB_ROUTER_ENTRIES = \
TargetUniverse = 9; \
name = "Local_BOSCO"; \
]


# New-style (HTCondor 8.9+) equivalent of the Local_BOSCO JOB_ROUTER_ENTRIES
# above; used when JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False.
JOB_ROUTER_ROUTE_Local_BOSCO @=jrt
# Universe 9 = grid; submit over SSH via the configured BOSCO resource
TargetUniverse = 9
GridResource = "batch $(BOSCO_RMS) $(BOSCO_ENDPOINT)"
@jrt
12 changes: 12 additions & 0 deletions config/02-ce-bosco.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,15 @@ JOB_ROUTER_ENTRIES = \
TargetUniverse = 9; \
name = "Local_BOSCO"; \
]


# New-style job route configuration (requires HTCondor 8.9). To use
# this instead of JOB_ROUTER_ENTRIES, set "JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False"
# in /etc/condor-ce/config.d/01-ce-router.conf and uncomment the following:
#
# JOB_ROUTER_ROUTE_Local_BOSCO @=jrt
# TargetUniverse = 9
# GridResource = "batch $(BOSCO_RMS) $(BOSCO_ENDPOINT)"
# @jrt
#
# JOB_ROUTER_ROUTE_NAMES = Local_BOSCO
7 changes: 6 additions & 1 deletion config/02-ce-condor-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ JOB_ROUTER_ENTRIES @=jre
]
@jre


# New-style (HTCondor 8.9+) equivalent of the Local_Condor JOB_ROUTER_ENTRIES
# above; used when JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False.
JOB_ROUTER_ROUTE_Local_Condor @=jrt
# Universe 5 = vanilla; route into the local HTCondor pool
TargetUniverse = 5
@jrt


JOB_ROUTER_SCHEDD2_SPOOL=/var/lib/condor/spool
JOB_ROUTER_SCHEDD2_NAME=$(FULL_HOSTNAME)
JOB_ROUTER_SCHEDD2_POOL=$(FULL_HOSTNAME):9618

11 changes: 11 additions & 0 deletions config/02-ce-condor.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@ JOB_ROUTER_ENTRIES @=jre
]
@jre


JOB_ROUTER_SCHEDD2_SPOOL=/var/lib/condor/spool
JOB_ROUTER_SCHEDD2_NAME=$(FULL_HOSTNAME)
JOB_ROUTER_SCHEDD2_POOL=$(FULL_HOSTNAME):9618


# New-style job route configuration (requires HTCondor 8.9). To use
# this instead of JOB_ROUTER_ENTRIES, set "JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False"
# in /etc/condor-ce/config.d/01-ce-router.conf and uncomment the following:
#
# JOB_ROUTER_ROUTE_Local_Condor @=jrt
# TargetUniverse = 5
# @jrt
#
# JOB_ROUTER_ROUTE_NAMES = Local_Condor
6 changes: 6 additions & 0 deletions config/02-ce-lsf-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ JOB_ROUTER_ENTRIES @=jre
name = "Local_LSF";
]
@jre


# New-style (HTCondor 8.9+) equivalent of the Local_LSF JOB_ROUTER_ENTRIES
# above; used when JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False.
JOB_ROUTER_ROUTE_Local_LSF @=jrt
# Universe 9 = grid; submit to LSF through the blahp
TargetUniverse = 9
GridResource = "batch lsf"
@jrt
12 changes: 12 additions & 0 deletions config/02-ce-lsf.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,15 @@ JOB_ROUTER_ENTRIES @=jre
name = "Local_LSF";
]
@jre


# New-style job route configuration (requires HTCondor 8.9). To use
# this instead of JOB_ROUTER_ENTRIES, set "JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False"
# in /etc/condor-ce/config.d/01-ce-router.conf and uncomment the following:
#
# JOB_ROUTER_ROUTE_Local_LSF @=jrt
# TargetUniverse = 9
# GridResource = "batch lsf"
# @jrt
#
# JOB_ROUTER_ROUTE_NAMES = Local_LSF
23 changes: 5 additions & 18 deletions config/02-ce-pbs-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,8 @@ JOB_ROUTER_ENTRIES @=jre
]
@jre

# A more complex route, sending CMS jobs to the "cms" queue and everyone else
# to the "grid" queue
# JOB_ROUTER_ENTRIES @=jre
# [
# GridResource = "batch pbs";
# TargetUniverse = 9;
# name = "Local_PBS_cms";
# set_default_queue = "cms";
# Requirements = target.x509UserProxyVOName =?= "cms";
# ]
# [
# GridResource = "batch pbs";
# TargetUniverse = 9;
# name = "Local_PBS_other";
# set_default_queue = "other";
# Requirements = target.x509UserProxyVOName =!= "cms";
# ]
# @jre

# New-style (HTCondor 8.9+) equivalent of the Local_PBS JOB_ROUTER_ENTRIES
# above; used when JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False.
JOB_ROUTER_ROUTE_Local_PBS @=jrt
# Universe 9 = grid; submit to PBS through the blahp
TargetUniverse = 9
GridResource = "batch pbs"
@jrt