Skip to content

Commit

Permalink
Merge pull request #435 from brianhlin/HTCONDOR-243.new-jr-syntax
Browse files Browse the repository at this point in the history
Add support for new JobRouter syntax (HTCONDOR-243)
  • Loading branch information
brianhlin committed Mar 29, 2021
2 parents a896a1f + 7993b98 commit 8bca412
Show file tree
Hide file tree
Showing 17 changed files with 361 additions and 56 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ install_manifest.txt
cmake_install.cmake
CMakeFiles
CMakeCache.txt
config/01-ce-router.conf
config/01-ce-router-defaults.conf
src/htcondorce/*.pyc
tmp
7 changes: 1 addition & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ include(FindPythonInterp)
execute_process ( COMMAND ${PYTHON_EXECUTABLE} -c "from distutils.sysconfig import get_python_lib; print(get_python_lib().replace('/usr', '${CMAKE_INSTALL_PREFIX}', 1))" OUTPUT_VARIABLE DETECTED_PYTHON_SITELIB OUTPUT_STRIP_TRAILING_WHITESPACE )
set(PYTHON_SITELIB "${DETECTED_PYTHON_SITELIB}" CACHE PATH "Base directory for python libraries")

configure_file (
"${PROJECT_SOURCE_DIR}/config/01-ce-router.conf.in"
"${CMAKE_CURRENT_BINARY_DIR}/config/01-ce-router.conf"
)

configure_file (
"${PROJECT_SOURCE_DIR}/config/01-ce-router-defaults.conf.in"
"${CMAKE_CURRENT_BINARY_DIR}/config/01-ce-router-defaults.conf"
Expand Down Expand Up @@ -123,7 +118,7 @@ install(FILES
install(FILES
config/01-ce-auth.conf
config/01-ce-collector.conf
${CMAKE_CURRENT_BINARY_DIR}/config/01-ce-router.conf
config/01-ce-router.conf
config/01-pilot-env.conf
config/02-ce-condor.conf
config/02-ce-pbs.conf
Expand Down
198 changes: 198 additions & 0 deletions config/01-ce-router-defaults.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,201 @@ SCHEDD_ATTRS = $(SCHEDD_ATTRS) HTCondorCEVersion grid_resource
#
MERGE_JOB_ROUTER_DEFAULT_ADS=True
JOB_ROUTER_DEFAULTS = $(JOB_ROUTER_DEFAULTS_GENERATED)

# Use JOB_ROUTER_DEFAULTS + JOB_ROUTER_ENTRIES by default instead of
# the new-style job router transforms (HTCONDOR-243)
JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = True


#################################
# Default Job Router Transforms #
#################################

JOB_ROUTER_PRE_ROUTE_TRANSFORMS = Base Cleanup OrigRequests
JOB_ROUTER_POST_ROUTE_TRANSFORMS = Cpus Gpus Memory Queue BatchRuntime CERequirements OnExitHold

# Pre-route transform 'Base' (first entry in JOB_ROUTER_PRE_ROUTE_TRANSFORMS):
# sets the job router limits and the attributes that are applied to every routed job.
JOB_ROUTER_TRANSFORM_Base @=jrt
# Job Router special values
MaxIdleJobs = 2000
# CONDORCE_MAX_JOBS is the admin-configured maximum number of jobs the CE is willing to run
MaxJobs = $(CONDORCE_MAX_JOBS)
# Always set the following routed job attributes
SET RoutedJob True
SET Requirements True

@jrt


# Pre-route transform 'Cleanup': removes the PeriodicRemove and TotalSubmitProcs
# attributes from the job being routed.
JOB_ROUTER_TRANSFORM_Cleanup @=jrt
# If the batch condor job is removed then the CE will resubmit the
# job and we want to avoid that
DELETE PeriodicRemove
DELETE TotalSubmitProcs
@jrt


# Pre-route transform 'OrigRequests': saves the incoming job's request-related
# attributes under an 'orig_' prefix so that the post-route transforms can refer
# to the values the submitter originally asked for.
JOB_ROUTER_TRANSFORM_OrigRequests @=jrt
# Copy original, incoming job attributes for use in post transforms
# (the post-route transforms read e.g. orig_RequestCpus, orig_RequestGPUs, orig_RequestMemory,
# orig_OnExitHold and orig_BatchRuntime)
COPY /^Request.+$/ orig_\0
COPY /^OnExitHold.*$/ orig_\0
COPY BatchRuntime orig_BatchRuntime
COPY environment orig_environment

# Support whole node job requests against HTCondor pools if the source job specifies 'WantWholeNode = True'
# 'if' can't handle complex expressions yet so we evaluate it here for use in post-transforms
# (test_want_whole_node is consumed by the Cpus, Gpus and Memory transforms; it falls back to False
# when the job does not set WantWholeNode)
EVALMACRO test_want_whole_node $(MY.WantWholeNode : False)
@jrt


# Post-route transform 'Cpus': derives the routed job's CPU request
# (OriginalCpus, JobCpus, RequestCpus, remote_SMPGranularity, remote_NodeNumber).
JOB_ROUTER_TRANSFORM_Cpus @=jrt
# Outside of the HTCondor WholeNodeJobs case, set RequestCpus to one of the following, in order:
# 1. 'xcount' from the source job
# 2. RequestCpus from the source job
# 3. default_xcount from the job route
# 4. 1
# test_want_whole_node is computed by the OrigRequests pre-route transform
if $(test_want_whole_node)
SET JOB_GLIDEIN_Cpus "$$(MY.TotalCpus ?: JobCpus)"
# MATCH_EXP_JOB_GLIDEIN_Cpus is based on the value of JOB_GLIDEIN_Cpus once the routed job is matched to an
# HTCondor slot
SET GlideinCpusIsGood int(MATCH_EXP_JOB_GLIDEIN_Cpus ?: "0") isnt error
# Also used by JobGpus and JobMemory
SET JobIsRunning (JobStatus =!= 1) && (JobStatus =!= 5) && GlideinCpusIsGood
SET JobCpus JobIsRunning ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus
SET RequestCpus TARGET.TotalCpus ?: JobCpus
endif

# Pick the CPU count into the temporary macro 'cpus'.
# NOTE(review): both tests use '> 1', so an xcount or RequestCpus of exactly 1 falls through to
# default_xcount when the route defines one -- confirm that this is intended.
EVALMACRO test_xcount $(MY.xcount : 0) > 1
if $(test_xcount)
cpus = $(MY.xcount)
else
EVALMACRO test_orig_RequestCpus $(MY.orig_RequestCpus : 0) > 1
if $(test_orig_RequestCpus)
cpus = $(MY.orig_RequestCpus)
else
cpus = $(default_xcount)
endif
endif

# All three attributes receive the selected count, or 1 when 'cpus' ended up empty
EVALSET OriginalCpus $(cpus : 1)
EVALSET remote_SMPGranularity $(cpus : 1)
EVALSET remote_NodeNumber $(cpus : 1)

# NOTE(review): presumably DEFAULT only assigns attributes that the whole-node branch above did not
# already SET -- confirm against the transform language documentation.
DEFAULT JobIsRunning (JobStatus =!= 1) && (JobStatus =!= 5)
DEFAULT JobCpus OriginalCpus
DEFAULT RequestCpus OriginalCpus
@jrt


# Post-route transform 'Gpus': derives the routed job's GPU request
# (OriginalGPUs, JobGPUs, RequestGPUs).
JOB_ROUTER_TRANSFORM_Gpus @=jrt
# Request GPUs for whole node jobs (HTCONDOR-103)
# If a whole node job requests GPUs and is matched to a machine with GPUs then set the job's RequestGPUs to all the
# GPUs on that machine
# test_want_whole_node is computed by the OrigRequests pre-route transform; JobIsRunning is set by the
# Cpus transform, which is listed before this one in JOB_ROUTER_POST_ROUTE_TRANSFORMS
if $(test_want_whole_node)
SET JOB_GLIDEIN_GPUs "$$(MY.TotalGPUs ?: JobGPUs)"
# MATCH_EXP_JOB_GLIDEIN_GPUs is based on the value of JOB_GLIDEIN_GPUs once the routed job is matched to an
# HTCondor slot
SET GlideinGPUsIsGood int(MATCH_EXP_JOB_GLIDEIN_GPUs ?: "0") isnt error
SET JobGPUs JobIsRunning ? int(MATCH_EXP_JOB_GLIDEIN_GPUs) : OriginalGPUs
SET RequestGPUs (TARGET.TotalGPUs > 0) ? TotalGPUs: JobGPUs
endif

# orig_RequestGPUs is the source job's RequestGPUs as saved by the OrigRequests pre-route transform
EVALSET OriginalGPUs $(MY.orig_RequestGPUs)
DEFAULT RequestGPUs OriginalGPUs
@jrt


# Post-route transform 'Memory': derives the routed job's memory request
# (OriginalMemory, remote_OriginalMemory, JobMemory, RequestMemory).
JOB_ROUTER_TRANSFORM_Memory @=jrt
# Outside of the HTCondor WholeNodeJobs case, set RequestMemory to one of the following, in order:
# 1. 'maxMemory' from the source job if it's positive
# 2. RequestMemory from the source job
# 3. default_MaxMemory from the job route
# 4. 2000
# test_want_whole_node is computed by the OrigRequests pre-route transform; JobIsRunning is set by the
# Cpus transform. Whole node jobs use 95% of the matched slot's TotalMemory.
if $(test_want_whole_node)
SET JOB_GLIDEIN_Memory "$$(MY.TotalMemory ?: 0)"
SET JobMemory JobIsRunning ? int(MATCH_EXP_JOB_GLIDEIN_Memory)*95/100 : OriginalMemory
SET RequestMemory TARGET.TotalMemory ? TotalMemory*95/100 : JobMemory
endif

# Pick the memory value into the temporary macro 'mem'.
# NOTE(review): the tests use '> 1' rather than '> 0', so a value of exactly 1 is skipped -- confirm intent.
EVALMACRO test_maxmemory $(MY.maxMemory : 0) > 1
if $(test_maxmemory)
mem = $(MY.maxMemory)
else
EVALMACRO test_orig_RequestMemory $(MY.orig_RequestMemory : 0) > 1
if $(test_orig_RequestMemory)
mem = $(MY.orig_RequestMemory)
else
mem = $(default_maxMemory)
endif
endif

# Both attributes receive the selected value, or 2000 when 'mem' ended up empty
EVALSET OriginalMemory $(mem : 2000)
EVALSET remote_OriginalMemory $(mem : 2000)

DEFAULT JobMemory OriginalMemory
DEFAULT RequestMemory OriginalMemory
@jrt


# Post-route transform 'Queue': sets the routed job's 'remote_queue' attribute.
JOB_ROUTER_TRANSFORM_Queue @=jrt
# Set the remote batch queue to one of the following, in order:
# 1. 'batch_queue' from the source job
# 2. 'queue' from the source job
# 3. 'default_queue' from the route
# 4. Empty string
EVALSET remote_queue $(MY.batch_queue) ?: \
$(MY.queue) ?: \
"$(default_queue)"
@jrt


# Post-route transform 'BatchRuntime': sets the routed job's 'BatchRuntime' attribute (seconds).
JOB_ROUTER_TRANSFORM_BatchRuntime @=jrt
# 'BatchRuntime' is in seconds but admins configure 'default_maxWallTime' and 'ROUTED_JOB_MAX_TIME' and remote
# submitters set 'maxWallTime' in minutes. Remote submitters set 'BatchRuntime' in seconds.
# Set the remote batch runtime used by non-HTCondor batch systems to one of the following, in order:
# 1. 'maxWalltime' (minutes) from the source job
# 2. 'BatchRuntime' (seconds) from the source job
# 3. 'default_maxWallTime' (minutes) from the route
# 4. 'ROUTED_JOB_MAX_TIME' (minutes) from the config
# Minute-based values are multiplied by 60 to convert them to seconds. The source job's 'BatchRuntime'
# is read from 'orig_BatchRuntime', the copy saved by the OrigRequests pre-route transform, and is
# already in seconds so it is used as-is.
if defined MY.maxWallTime
def_walltime = 60*$(MY.maxWallTime)
elif defined MY.orig_BatchRuntime
def_walltime = orig_BatchRuntime
elif defined default_maxWallTime
def_walltime = 60*$(default_maxWallTime)
else
def_walltime = 60*$(ROUTED_JOB_MAX_TIME)
endif

# Last-resort fallback: 259200 seconds = 72 hours, matching the default ROUTED_JOB_MAX_TIME of 4320 minutes
SET BatchRuntime $(def_walltime) ?: 259200
@jrt


# Post-route transform 'CERequirements': sets the 'CondorCE' attribute to 1 and builds the
# 'CERequirements' attribute, a comma-separated list of names that always includes "CondorCE".
JOB_ROUTER_TRANSFORM_CERequirements @=jrt
SET CondorCE 1

ce_reqs = "CondorCE"
# If the route defines default_CERequirements, put its value in front of the "CondorCE" entry
if defined default_CERequirements
ce_reqs = $(default_CERequirements),$(ce_reqs)
endif

EVALSET CERequirements $(ce_reqs)
@jrt


# Post-route transform 'OnExitHold': combines the source job's OnExitHold expression (saved as
# 'orig_OnExitHold' by the OrigRequests pre-route transform) with a CE-side check that holds jobs
# whose wall clock time is below the 'minWalltime' (minutes) requested by the job.
JOB_ROUTER_TRANSFORM_OnExitHold @=jrt
# True only when both minWalltime and RemoteWallClockTime are defined and the job ran for
# fewer than minWalltime minutes (RemoteWallClockTime is compared against 60*minWallTime)
SET CondorCE_OnExitHold ((MY.minWalltime isnt undefined && MY.RemoteWallClockTime isnt undefined) ? \
(MY.RemoteWallClockTime < 60*MY.minWallTime) : \
False)

EVALMACRO test_orig_OnExitHold $(MY.orig_OnExitHold : False)

# When the source job's own OnExitHold applies, prefer its subcode/reason and fall back to
# subcode 1 and a generic reason quoting the original expression
if $(test_orig_OnExitHold)
SET CondorCE_OnExitHoldReason "The on_exit_hold expression ($(orig_OnExitHold)) evaluated to TRUE."
SET OnExitHoldSubcode MY.orig_OnExitHoldSubcode ?: 1
SET OnExitHoldReason MY.orig_OnExitHoldReason ?: MY.CondorCE_OnExitHoldReason
endif

# The routed job is held on exit if either the original expression or the CE-side check is true
SET OnExitHold (MY.orig_OnExitHold ?: False) || MY.CondorCE_OnExitHold
# NOTE(review): presumably these DEFAULT statements only take effect when the branch above did not
# already SET the attribute -- confirm against the transform language documentation.
DEFAULT CondorCE_OnExitHoldReason "The job's wall clock time $(MY.RemoteWallClockTime/60) min, is less than the minimum specified by the job ($(minWalltime))"
DEFAULT OnExitHoldSubcode 42
DEFAULT OnExitHoldReason MY.CondorCE_OnExitHold ? MY.CondorCE_OnExitHoldReason : "Job held for unknown reason."
@jrt
12 changes: 12 additions & 0 deletions config/01-ce-router.conf.in → config/01-ce-router.conf
Original file line number Diff line number Diff line change
Expand Up @@ -10,25 +10,30 @@
# Set the maximum number of jobs the CE is willing to run.
CONDORCE_MAX_JOBS = 10000


# Set the max time, in minutes, of routed jobs (default 72 hours)
ROUTED_JOB_MAX_TIME = 4320


# The amount of time, in seconds, that the job router will wait before considering
# new candidate jobs for routing. If you are seeing heavy CPU usage from the job
# router, increase the polling period to a few hundred seconds.
# JOB_ROUTER_POLLING_PERIOD = 10


# Set this to True to allow HTCondor-CE jobs to run more than once
# (default: False)
# ENABLE_JOB_RETRIES = False


# Some pilot systems leave completed jobs in the HTCondor-CE queue for
# VO operators to retrieve later to troubleshoot specific pilot jobs.
# Set this to the maximum number of days that completed jobs may
# remain in the queue after completion.
# (default: 30)
# COMPLETED_JOB_EXPIRATION = 30


# Use the defaults generated by the condor_ce_router_defaults script. To add
# additional defaults, add additional lines of the form:
#
Expand All @@ -37,3 +42,10 @@ ROUTED_JOB_MAX_TIME = 4320
MERGE_JOB_ROUTER_DEFAULT_ADS=True
JOB_ROUTER_DEFAULTS = $(JOB_ROUTER_DEFAULTS_GENERATED)


# Set the JobRouter configuration syntax, defaults to True
# (i.e. JOB_ROUTER_DEFAULTS combined with JOB_ROUTER_ENTRIES).
# Set to 'False' to use the new 'JOB_ROUTER_TRANSFORM_*' and 'JOB_ROUTER_ROUTE_*'
# style configuration (requires HTCondor 8.9)
#
# JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = True
6 changes: 6 additions & 0 deletions config/02-ce-bosco-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ JOB_ROUTER_ENTRIES = \
TargetUniverse = 9; \
name = "Local_BOSCO"; \
]


# New-style (JOB_ROUTER_ROUTE_*) definition of the "Local_BOSCO" route, mirroring the
# JOB_ROUTER_ENTRIES route of the same name. New-style routes are used when
# JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES is set to False.
JOB_ROUTER_ROUTE_Local_BOSCO @=jrt
TargetUniverse = 9
GridResource = "batch $(BOSCO_RMS) $(BOSCO_ENDPOINT)"
@jrt
12 changes: 12 additions & 0 deletions config/02-ce-bosco.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,15 @@ JOB_ROUTER_ENTRIES = \
TargetUniverse = 9; \
name = "Local_BOSCO"; \
]


# New-style job route configuration (requires HTCondor 8.9). To use
# this instead of JOB_ROUTER_ENTRIES, set "JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False"
# in /etc/condor-ce/config.d/01-ce-router.conf and uncomment the following:
#
# JOB_ROUTER_ROUTE_Local_BOSCO @=jrt
# TargetUniverse = 9
# GridResource = "batch $(BOSCO_RMS) $(BOSCO_ENDPOINT)"
# @jrt
#
# JOB_ROUTER_ROUTE_NAMES = Local_BOSCO
7 changes: 6 additions & 1 deletion config/02-ce-condor-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ JOB_ROUTER_ENTRIES @=jre
]
@jre


# New-style (JOB_ROUTER_ROUTE_*) definition of the "Local_Condor" route. New-style routes
# are used when JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES is set to False.
JOB_ROUTER_ROUTE_Local_Condor @=jrt
TargetUniverse = 5
@jrt


JOB_ROUTER_SCHEDD2_SPOOL=/var/lib/condor/spool
JOB_ROUTER_SCHEDD2_NAME=$(FULL_HOSTNAME)
JOB_ROUTER_SCHEDD2_POOL=$(FULL_HOSTNAME):9618

11 changes: 11 additions & 0 deletions config/02-ce-condor.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,18 @@ JOB_ROUTER_ENTRIES @=jre
]
@jre


JOB_ROUTER_SCHEDD2_SPOOL=/var/lib/condor/spool
JOB_ROUTER_SCHEDD2_NAME=$(FULL_HOSTNAME)
JOB_ROUTER_SCHEDD2_POOL=$(FULL_HOSTNAME):9618


# New-style job route configuration (requires HTCondor 8.9). To use
# this instead of JOB_ROUTER_ENTRIES, set "JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False"
# in /etc/condor-ce/config.d/01-ce-router.conf and uncomment the following:
#
# JOB_ROUTER_ROUTE_Local_Condor @=jrt
# TargetUniverse = 5
# @jrt
#
# JOB_ROUTER_ROUTE_NAMES = Local_Condor
6 changes: 6 additions & 0 deletions config/02-ce-lsf-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,9 @@ JOB_ROUTER_ENTRIES @=jre
name = "Local_LSF";
]
@jre


# New-style (JOB_ROUTER_ROUTE_*) definition of the "Local_LSF" route, mirroring the
# JOB_ROUTER_ENTRIES route of the same name. New-style routes are used when
# JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES is set to False.
JOB_ROUTER_ROUTE_Local_LSF @=jrt
TargetUniverse = 9
GridResource = "batch lsf"
@jrt
12 changes: 12 additions & 0 deletions config/02-ce-lsf.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,15 @@ JOB_ROUTER_ENTRIES @=jre
name = "Local_LSF";
]
@jre


# New-style job route configuration (requires HTCondor 8.9). To use
# this instead of JOB_ROUTER_ENTRIES, set "JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES = False"
# in /etc/condor-ce/config.d/01-ce-router.conf and uncomment the following:
#
# JOB_ROUTER_ROUTE_Local_LSF @=jrt
# TargetUniverse = 9
# GridResource = "batch lsf"
# @jrt
#
# JOB_ROUTER_ROUTE_NAMES = Local_LSF
23 changes: 5 additions & 18 deletions config/02-ce-pbs-defaults.conf
Original file line number Diff line number Diff line change
Expand Up @@ -17,21 +17,8 @@ JOB_ROUTER_ENTRIES @=jre
]
@jre

# A more complex route, sending CMS jobs to the "cms" queue and everyone else
# to the "grid" queue
# JOB_ROUTER_ENTRIES @=jre
# [
# GridResource = "batch pbs";
# TargetUniverse = 9;
# name = "Local_PBS_cms";
# set_default_queue = "cms";
# Requirements = target.x509UserProxyVOName =?= "cms";
# ]
# [
# GridResource = "batch pbs";
# TargetUniverse = 9;
# name = "Local_PBS_other";
# set_default_queue = "other";
# Requirements = target.x509UserProxyVOName =!= "cms";
# ]
# @jre

# New-style (JOB_ROUTER_ROUTE_*) definition of the "Local_PBS" route. New-style routes
# are used when JOB_ROUTER_USE_DEPRECATED_ROUTER_ENTRIES is set to False.
JOB_ROUTER_ROUTE_Local_PBS @=jrt
TargetUniverse = 9
GridResource = "batch pbs"
@jrt

0 comments on commit 8bca412

Please sign in to comment.