Skip to content

Commit

Permalink
a fix for launching consecutive GPU jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
justanhduc committed Nov 20, 2020
1 parent 250f733 commit 6ebe392
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 2 deletions.
10 changes: 10 additions & 0 deletions client.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <sys/socket.h>
#include <sys/time.h>
#include <signal.h>

#include "main.h"

static void c_end_of_job(const struct Result *res);
Expand Down Expand Up @@ -160,6 +161,9 @@ int c_wait_server_commands() {
run_job(&result);
c_end_of_job(&result);
return result.errorlevel;
} else if (m.type == REMINDER) {
sleep(m.u.gpu_wait_time);
c_send_reminder();
}
}
return -1;
Expand Down Expand Up @@ -767,3 +771,9 @@ void c_set_gpu_wait_time() {
m.u.gpu_wait_time = command_line.gpu_wait_time;
send_msg(server_socket, &m);
}

void c_send_reminder() {
struct Msg m;
m.type = REMINDER;
send_msg(server_socket, &m);
}
4 changes: 3 additions & 1 deletion jobs.c
Original file line number Diff line number Diff line change
Expand Up @@ -581,9 +581,11 @@ int next_run_job() {
* be executed as `select` blocks. fortunately, GPU jobs
* usually last much longer (hours) than
* time_between_gpu_runs (tens of seconds).
* TODO: fix this*/
* So each time this happens, the server asks the client to
* send a reminder once the waiting time has elapsed. */
if ((time(NULL) - last_gpu_run_time) < time_between_gpu_runs) {
/* there was one GPU task just run, next */
s_request_reminder_after(time_between_gpu_runs, p->jobid);
p = p->next;
continue;
}
Expand Down
7 changes: 6 additions & 1 deletion main.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ enum msg_types {
LAST_ID,
KILL_ALL,
SET_GPU_WAIT_TIME,
GET_GPU_WAIT_TIME
GET_GPU_WAIT_TIME,
REMINDER
};

enum Request {
Expand Down Expand Up @@ -269,6 +270,8 @@ void c_set_gpu_wait_time();

void c_get_gpu_wait_time();

/* Send a REMINDER message (type only, no payload) to the server. */
void c_send_reminder();

/* jobs.c */
void s_list(int s);

Expand Down Expand Up @@ -343,6 +346,8 @@ void server_main(int notify_fd, char *_path);

void dump_conns_struct(FILE *out);

/* Send a REMINDER message to the client connection that holds job `jobid`,
 * with `time` stored as the message's gpu_wait_time; the client sleeps for
 * that many seconds and then sends a REMINDER back. */
void s_request_reminder_after(int time, int jobid);

/* server_start.c */
int try_connect(int s);

Expand Down
14 changes: 14 additions & 0 deletions server.c
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,8 @@ client_read(int index) {
case GET_GPU_WAIT_TIME:
s_send_time_between_gpu_runs(s);
break;
case REMINDER:
break;
case GET_VERSION:
s_send_version(s);
break;
Expand Down Expand Up @@ -523,6 +525,18 @@ static void s_newjob_nok(int index) {
send_msg(s, &m);
}

void s_request_reminder_after(int time, int jobid) {
struct Msg m;
int idx;

m.type = REMINDER;
m.u.gpu_wait_time = time;
idx = get_conn_of_jobid(jobid);
if (idx == -1)
error("Cannot find the client holding job %d", jobid);
send_msg(client_cs[idx].socket, &m);
}

static void dump_conn_struct(FILE *out, const struct Client_conn *p) {
fprintf(out, " new_conn\n");
fprintf(out, " socket %i\n", p->socket);
Expand Down

0 comments on commit 6ebe392

Please sign in to comment.