Skip to content

Commit

Permalink
Gittrac #5480: Initial version of submitting NOOP node jobs works
Browse files Browse the repository at this point in the history
(it works if there's a real submit file for the node, but not
otherwise).  Commit includes temporary code.
  • Loading branch information
Kent Wenger committed Mar 28, 2016
1 parent cef408b commit d8333ab
Show file tree
Hide file tree
Showing 9 changed files with 52 additions and 12 deletions.
21 changes: 12 additions & 9 deletions src/condor_dagman/dag.cpp
Expand Up @@ -91,8 +91,8 @@ Dag::Dag( /* const */ StringList &dagFiles,
const CondorID *DAGManJobID,
bool prohibitMultiJobs, bool submitDepthFirst,
const char *defaultNodeLog, bool generateSubdagSubmits,
SubmitDagDeepOptions *submitDagDeepOpts, bool isSplice,
const MyString &spliceScope ) :
SubmitDagDeepOptions *submitDagDeepOpts, bool submitNoopJobs,
bool isSplice, const MyString &spliceScope ) :
_maxPreScripts (maxPreScripts),
_maxPostScripts (maxPostScripts),
MAX_SIGNAL (64),
Expand Down Expand Up @@ -134,6 +134,7 @@ Dag::Dag( /* const */ StringList &dagFiles,
_reject (false),
_alwaysRunPost (true),
_defaultPriority (0),
_submitNoopJobs (submitNoopJobs),
_metrics (NULL)
{

Expand Down Expand Up @@ -1337,7 +1338,7 @@ Job * Dag::FindNodeByEventID ( const CondorID condorID ) const {
check_warning_strictness( DAG_STRICT_3 );
}
}
ASSERT( isNoop == node->GetNoop() );
//TEMPTEMP? ASSERT( isNoop == node->GetNoop() );
}

return node;
Expand Down Expand Up @@ -3673,7 +3674,7 @@ Dag::LogEventNodeLookup( const ULogEvent* event,
// inserted it when we did the condor_submit.)
Job *tmpNode = NULL;
bool isNoop = JobIsNoop( condorID );
ASSERT( isNoop == node->GetNoop() );
//TEMPTEMP ASSERT( isNoop == node->GetNoop() );
int id = GetIndexID( condorID );
HashTable<int, Job *> *ht =
GetEventIDHash( isNoop );
Expand Down Expand Up @@ -3859,9 +3860,9 @@ Dag::SanityCheckSubmitEvent( const CondorID condorID, const Job* node )

//---------------------------------------------------------------------------
HashTable<int, Job *> *
Dag::GetEventIDHash(bool isNoop)
Dag::GetEventIDHash( bool isNoop )
{
if ( isNoop ) {
if ( isNoop && !_submitNoopJobs ) {
return &_noopIDHash;
}

Expand Down Expand Up @@ -3961,7 +3962,8 @@ Dag::SubmitNodeJob( const Dagman &dm, Job *node, CondorID &condorID )
bool submit_success = false;

node->_submitTries++;
if ( node->GetNoop() ) {
//TEMPTEMP? if ( node->GetNoop() ) {
if ( node->GetNoop() && !_submitNoopJobs ) {//TEMPTEMP?
submit_success = fake_condor_submit( condorID, 0,
node->GetJobName(), node->GetDirectory(),
_defaultNodeLog );
Expand All @@ -3975,7 +3977,8 @@ Dag::SubmitNodeJob( const Dagman &dm, Job *node, CondorID &condorID )
node->GetJobName(), parents,
node->varsFromDag, node->GetRetries(),
node->GetDirectory(), _defaultNodeLog,
node->NumChildren() > 0 && dm._claim_hold_time > 0 );
node->NumChildren() > 0 && dm._claim_hold_time > 0,
node->GetNoop() );
}

result = submit_success ? SUBMIT_RESULT_OK : SUBMIT_RESULT_FAILED;
Expand Down Expand Up @@ -4012,7 +4015,7 @@ Dag::ProcessSuccessfulSubmit( Job *node, const CondorID &condorID )
// since we won't have seen the submit command stdout...)

node->SetCondorID( condorID );
ASSERT( JobIsNoop( node->GetID() ) == node->GetNoop() );
//TEMPTEMP ASSERT( JobIsNoop( node->GetID() ) == node->GetNoop() );
int id = GetIndexID( node->GetID() );
int insertResult = GetEventIDHash( node->GetNoop() )->insert( id, node );
ASSERT( insertResult == 0 );
Expand Down
5 changes: 5 additions & 0 deletions src/condor_dagman/dag.h
Expand Up @@ -135,6 +135,7 @@ class Dag {
bool prohibitMultiJobs, bool submitDepthFirst,
const char *defaultNodeLog, bool generateSubdagSubmits,
SubmitDagDeepOptions *submitDagDeepOpts,
bool submitNoopJobs,
bool isSplice = false, const MyString &spliceScope = "root" );

///
Expand Down Expand Up @@ -1193,6 +1194,10 @@ class Dag {
// The default priority for nodes in this DAG. (defaults to 0)
int _defaultPriority;

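// Whether NOOP nodes are really submitted (true) instead of being
// handled by fake_condor_submit() inside DAGMan (false).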
//TEMPTEMP -- hmm -- should this be in submitDagDeepOpts or something?
bool _submitNoopJobs;

// Whether the DAG is currently halted.
bool _dagIsHalted;

Expand Down
8 changes: 8 additions & 0 deletions src/condor_dagman/dagman_main.cpp
Expand Up @@ -142,6 +142,7 @@ Dagman::Dagman() :
_claim_hold_time(20),
_doRecovery(false),
_suppressJobLogs(false),
_submitNoopJobs(true),
_dagmanClassad(NULL)
{
debug_level = DEBUG_VERBOSE; // Default debug level is verbose output
Expand Down Expand Up @@ -451,6 +452,12 @@ Dagman::Config()
debug_printf( DEBUG_NORMAL, "DAGMAN_SUPPRESS_JOB_LOGS setting: %s\n",
_suppressJobLogs ? "True" : "False" );

_submitNoopJobs =
param_boolean( "DAGMAN_SUBMIT_NOOP_JOBS",
_submitNoopJobs );
debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_NOOP_JOBS setting: %s\n",
_submitNoopJobs ? "True" : "False" );

// set up the debug cache if needed
if (debug_cache_enabled) {
debug_cache_set_size(debug_cache_size);
Expand Down Expand Up @@ -1093,6 +1100,7 @@ void main_init (int argc, char ** const argv) {
dagman._defaultNodeLog.Value(),
dagman._generateSubdagSubmits,
&dagman._submitDagDeepOpts,
dagman._submitNoopJobs,
false ); /* toplevel dag! */

if( dagman.dag == NULL ) {
Expand Down
3 changes: 3 additions & 0 deletions src/condor_dagman/dagman_main.h
Expand Up @@ -226,6 +226,9 @@ class Dagman {
// log files specified in their submit files (see gittrac #4353).
bool _suppressJobLogs;

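// Whether to really submit NOOP node jobs instead of faking the
// submit; set from the DAGMAN_SUBMIT_NOOP_JOBS configuration
// parameter (defaults to true).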
bool _submitNoopJobs;

DagmanClassad *_dagmanClassad;
};

Expand Down
11 changes: 10 additions & 1 deletion src/condor_dagman/dagman_submit.cpp
Expand Up @@ -201,12 +201,14 @@ do_submit( ArgList &args, CondorID &condorID, bool prohibitMultiJobs )
}

//-------------------------------------------------------------------------
//TEMPTEMP -- probably start by just appending "noop_job = true" here if job is a noop; then add stuff to cons up the whole submit file...
//TEMPTEMP -- for a noop job, do we want to skip a bunch of the stuff like appending the parent node list? or are we better off not creating two different code paths so much?
bool
condor_submit( const Dagman &dm, const char* cmdFile, CondorID& condorID,
const char* DAGNodeName, MyString &DAGParentNodeNames,
List<Job::NodeVar> *vars, int retry,
const char* directory, const char *workflowLogFile,
bool hold_claim )
bool hold_claim, bool noopNode )
{
TmpDir tmpDir;
MyString errMsg;
Expand Down Expand Up @@ -234,6 +236,11 @@ condor_submit( const Dagman &dm, const char* cmdFile, CondorID& condorID,

args.AppendArg( dm.condorSubmitExe );

if ( noopNode ) {
args.AppendArg( "-a" );
args.AppendArg( "noop_job=true" );
}

args.AppendArg( "-a" );
MyString nodeName = MyString(ATTR_DAG_NODE_NAME_ALT) + " = " + DAGNodeName;
args.AppendArg( nodeName.Value() );
Expand Down Expand Up @@ -392,6 +399,8 @@ get_fake_condorID()
{
return _subprocID;
}

//TEMPTEMP -- hmm -- we probably don't want to call this at all if we are submitting a noop node; do we want to call the "real" submit method with a noop flag, or make a new method for submitting noop nodes?
//-------------------------------------------------------------------------
bool
fake_condor_submit( CondorID& condorID, Job* job, const char* DAGNodeName,
Expand Down
1 change: 1 addition & 0 deletions src/condor_dagman/parse.cpp
Expand Up @@ -1728,6 +1728,7 @@ parse_splice(
dag->DefaultNodeLog(),
dag->GenerateSubdagSubmits(),
NULL, // this Dag will never submit a job
false, //TEMPTEMP -- submit noops
true, /* we are a splice! */
current_splice_scope() );

Expand Down
3 changes: 2 additions & 1 deletion src/condor_dagman/submit.h
Expand Up @@ -47,14 +47,15 @@
@param log file to force this job to use (should be null if submit
file specifies log file)
@param hold_claim is true if DAGMAN_HOLD_CLAIM_IDLE is positive
@param noopNode is true if the node is a NOOP node (adds
"noop_job=true" to the condor_submit arguments)
@return true on success, false on failure
*/

bool condor_submit( const Dagman &dm, const char* cmdFile, CondorID& condorID,
const char* DAGNodeName, MyString &DAGParentNodeNames,
List<Job::NodeVar> *vars, int retry,
const char* directory, const char *worflowLogFile,
bool hold_claim );
bool hold_claim, bool noopNode );

void set_fake_condorID( int subprocID );

Expand Down
3 changes: 2 additions & 1 deletion src/condor_tests/job_dagman_basic.dag
@@ -1,3 +1,4 @@
Job NODERET job_dagman_basic-node.cmd
#TEMPTEMP Job NODERET job_dagman_basic-node.cmd
Job NODERET job_dagman_basic-node.cmd NOOP
Script PRE NODERET ./x_dagman_premonitor.pl
Script POST NODERET ./job_dagman_monitor.pl $RETURN
9 changes: 9 additions & 0 deletions src/condor_utils/param_info.in
Expand Up @@ -1193,6 +1193,15 @@ friendly_name=Dagman Write Partial Rescue
review=?
tags=dagman,dagman_main

[DAGMAN_SUBMIT_NOOP_JOBS]
default=true
type=bool
reconfig=false
customization=expert
friendly_name=Dagman Submit Noop Jobs
review=?
tags=dagman,dagman_main

[CKPT_SERVER_DIR]
default=
type=string
Expand Down

0 comments on commit d8333ab

Please sign in to comment.