Skip to content

Commit

Permalink
workaround for gpu mem allocation time problem
Browse files Browse the repository at this point in the history
  • Loading branch information
justanhduc committed Nov 11, 2020
1 parent 4494eb8 commit b30ab07
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 6 deletions.
43 changes: 38 additions & 5 deletions jobs.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,23 @@ void s_count_running_jobs(int s)
send_msg(s, &m);
}

/* Return how many jobs in the job list are currently in the
 * ALLOCATING state. */
int s_count_allocating_jobs()
{
    int allocating = 0;
    struct Job *job;

    /* Walk the list from firstjob, following the next links,
     * and tally every job whose state is ALLOCATING */
    for (job = firstjob; job != 0; job = job->next)
    {
        if (job->state == ALLOCATING)
            allocating++;
    }

    return allocating;
}

void s_get_label(int s, int jobid)
{
struct Job *p = 0;
Expand Down Expand Up @@ -255,7 +272,7 @@ int wake_hold_client()
p = findjob_holding_client();
if (p)
{
p->state = QUEUED;
p->state = (p->gpus) ? ALLOCATING : QUEUED;
return p->jobid;
}
return -1;
Expand All @@ -269,6 +286,9 @@ const char * jstate2string(enum Jobstate s)
case QUEUED:
jobstate = "queued";
break;
case ALLOCATING:
jobstate = "allocating";
break;
case RUNNING:
jobstate = "running";
break;
Expand Down Expand Up @@ -389,11 +409,11 @@ int s_newjob(int s, struct msg *m)
p = newjobptr();

p->jobid = jobids++;
p->gpus = m->u.newjob.gpus;
if (count_not_finished_jobs() < max_jobs)
p->state = QUEUED;
p->state = (p->gpus) ? ALLOCATING : QUEUED;
else
p->state = HOLDING_CLIENT;
p->gpus = m->u.newjob.gpus;
p->num_slots = m->u.newjob.num_slots;
p->store_output = m->u.newjob.store_output;
p->should_keep_finished = m->u.newjob.should_keep_finished;
Expand Down Expand Up @@ -591,12 +611,24 @@ int next_run_job()
p = firstjob;
while(p != 0)
{
if (p->state == QUEUED)
if (p->state == QUEUED || p->state == ALLOCATING)
{
if (p->gpus) {
int numFree;
/* get number of free GPUs at the moment */
getFreeGpuList(&numFree);

if (numFree > 0) {
/* GPU memory takes some time to be allocated, so
 * if there are many processes in the queue they
 * could end up using the same GPU; wait first.
 * TODO: this is ugly */
sleep(60);
} else {
p = p->next;
continue;
}

if (numFree < p->gpus) {
/* if fewer GPUs than required then next */
p = p->next;
Expand All @@ -610,7 +642,8 @@ int next_run_job()
/* We won't try to run any job do_depending on an unfinished
* job */
if (do_depend_job != NULL &&
(do_depend_job->state == QUEUED || do_depend_job->state == RUNNING))
(do_depend_job->state == QUEUED || do_depend_job->state == RUNNING ||
do_depend_job->state == ALLOCATING))
{
/* Next try */
p = p->next;
Expand Down
2 changes: 1 addition & 1 deletion list.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ static const char * ofilename_shown(const struct Job *p)
output_filename = "(no output)";
} else if (p->store_output)
{
if (p->state == QUEUED)
if (p->state == QUEUED || p->state == ALLOCATING)
{
output_filename = "(file)";
} else
Expand Down
1 change: 1 addition & 0 deletions main.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ struct msg;
enum Jobstate
{
QUEUED,
ALLOCATING,
RUNNING,
FINISHED,
SKIPPED,
Expand Down

0 comments on commit b30ab07

Please sign in to comment.