-
Notifications
You must be signed in to change notification settings - Fork 238
/
job.py
618 lines (513 loc) · 20.8 KB
/
job.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
import asyncio
import base64
import collections
import json
import logging
import os
import traceback
from typing import TYPE_CHECKING, Dict, List
import aiohttp
from gear import CommonAiohttpAppKeys, Database, K8sCache
from hailtop import httpx
from hailtop.aiotools import BackgroundTaskManager
from hailtop.batch_client.globals import ROOT_JOB_GROUP_ID
from hailtop.utils import Notice, retry_transient_errors, time_msecs
from ..batch import batch_record_to_dict, job_group_record_to_dict
from ..batch_configuration import KUBERNETES_SERVER_URL
from ..batch_format_version import BatchFormatVersion
from ..file_store import FileStore
from ..globals import STATUS_FORMAT_VERSION, complete_states, tasks
from ..instance_config import QuantifiedResource
from ..spec_writer import SpecWriter
from .instance import Instance
if TYPE_CHECKING:
from .instance_collection import InstanceCollectionManager # pylint: disable=cyclic-import
log = logging.getLogger('job')
async def notify_batch_job_complete(db: Database, client_session: httpx.ClientSession, batch_id: int):
    """POST a completed batch's record to its callback URL, if it has one.

    The query only yields a row when the batch is not deleted, has a
    non-NULL callback and is in the ``complete`` state; in every other case
    this coroutine returns without doing anything.

    Delivery is attempted exactly once. Cancellation propagates; any other
    failure is logged at INFO level and dropped.

    Args:
        db: database handle used to load the batch row with its cost and
            job-count aggregates.
        client_session: the driver's own HTTP session; only used when the
            batch belongs to the ``ci`` user.
        batch_id: id of the batch whose callback should be invoked.
    """
    batch_record = await db.select_and_fetchone(
        """
SELECT batches.*,
cost_t.cost,
cost_t.cost_breakdown,
job_groups_cancelled.id IS NOT NULL AS cancelled,
job_groups_n_jobs_in_complete_states.n_completed,
job_groups_n_jobs_in_complete_states.n_succeeded,
job_groups_n_jobs_in_complete_states.n_failed,
job_groups_n_jobs_in_complete_states.n_cancelled
FROM batches
LEFT JOIN job_groups_n_jobs_in_complete_states
ON batches.id = job_groups_n_jobs_in_complete_states.id
LEFT JOIN LATERAL (
SELECT COALESCE(SUM(`usage` * rate), 0) AS cost, JSON_OBJECTAGG(resources.resource, COALESCE(`usage` * rate, 0)) AS cost_breakdown
FROM (
SELECT batch_id, resource_id, CAST(COALESCE(SUM(`usage`), 0) AS SIGNED) AS `usage`
FROM aggregated_job_group_resources_v3
WHERE batches.id = aggregated_job_group_resources_v3.batch_id
GROUP BY batch_id, resource_id
) AS usage_t
LEFT JOIN resources ON usage_t.resource_id = resources.resource_id
GROUP BY batch_id
) AS cost_t ON TRUE
LEFT JOIN job_groups_cancelled
ON batches.id = job_groups_cancelled.id
WHERE batches.id = %s AND NOT deleted AND callback IS NOT NULL AND
batches.`state` = 'complete';
""",
        (batch_id,),
        'notify_batch_job_complete',
    )
    if not batch_record:
        # Nothing to notify: batch missing, deleted, incomplete or without a callback.
        return

    callback = batch_record['callback']
    log.info(f'making callback for batch {batch_id}: {callback}')

    async def post_callback(session):
        # The payload is built inside the guarded region so that a
        # serialization failure is handled like a delivery failure.
        await session.post(callback, json=batch_record_to_dict(batch_record))
        log.info(f'callback for batch {batch_id} successful')

    try:
        if batch_record['user'] != 'ci':
            # Non-CI users get a separate session from httpx.client_session().
            async with httpx.client_session() as session:
                await post_callback(session)
        else:
            # only jobs from CI may use batch's TLS identity
            await post_callback(client_session)
    except asyncio.CancelledError:
        raise
    except Exception:
        log.info(f'callback for batch {batch_id} failed, will not retry.')
async def notify_job_group_on_job_complete(
    db: Database, client_session: httpx.ClientSession, batch_id: int, job_group_id: int
):
    """POST job-group records to the callbacks of completed job groups.

    Rows are selected from ``job_group_self_and_ancestors`` for the given
    ``(batch_id, job_group_id)``, skipping the root job group. Only groups
    that are not deleted, have a callback and are in the ``complete`` state
    are returned; each returned row triggers one callback attempt.

    A failed callback is logged at INFO level and does not stop the
    remaining callbacks. Cancellation propagates.

    Args:
        db: database handle used to stream the matching job-group rows.
        client_session: the driver's own HTTP session; only used for rows
            owned by the ``ci`` user.
        batch_id: id of the batch the job group belongs to.
        job_group_id: id of the job group whose job just completed.
    """

    async def post_callback(session, record, callback, ancestor_job_group_id):
        # All per-row values arrive as arguments, so the helper can be
        # defined once instead of once per loop iteration.
        await session.post(callback, json=job_group_record_to_dict(record))
        log.info(f'callback for batch {batch_id} job group {ancestor_job_group_id} successful')

    job_group_records = db.select_and_fetchall(
        """
SELECT job_groups.*,
ancestor_id,
cost_t.cost,
cost_t.cost_breakdown,
t.cancelled IS NOT NULL AS cancelled,
job_groups_n_jobs_in_complete_states.n_completed,
job_groups_n_jobs_in_complete_states.n_succeeded,
job_groups_n_jobs_in_complete_states.n_failed,
job_groups_n_jobs_in_complete_states.n_cancelled
FROM job_group_self_and_ancestors
LEFT JOIN job_groups ON job_groups.batch_id = job_group_self_and_ancestors.batch_id AND
job_groups.job_group_id = job_group_self_and_ancestors.ancestor_id
LEFT JOIN batches ON job_group_self_and_ancestors.batch_id = batches.id
LEFT JOIN job_groups_n_jobs_in_complete_states
ON job_group_self_and_ancestors.batch_id = job_groups_n_jobs_in_complete_states.id AND
job_group_self_and_ancestors.ancestor_id = job_groups_n_jobs_in_complete_states.job_group_id
LEFT JOIN LATERAL (
SELECT COALESCE(SUM(`usage` * rate), 0) AS cost, JSON_OBJECTAGG(resources.resource, COALESCE(`usage` * rate, 0)) AS cost_breakdown
FROM (
SELECT resource_id, CAST(COALESCE(SUM(`usage`), 0) AS SIGNED) AS `usage`
FROM aggregated_job_group_resources_v3
WHERE job_group_self_and_ancestors.batch_id = aggregated_job_group_resources_v3.batch_id AND
job_group_self_and_ancestors.ancestor_id = aggregated_job_group_resources_v3.job_group_id
GROUP BY resource_id
) AS usage_t
LEFT JOIN resources ON usage_t.resource_id = resources.resource_id
) AS cost_t ON TRUE
LEFT JOIN LATERAL (
SELECT 1 AS cancelled
FROM job_group_self_and_ancestors AS self_and_ancestors
INNER JOIN job_groups_cancelled
ON self_and_ancestors.batch_id = job_groups_cancelled.id AND
self_and_ancestors.ancestor_id = job_groups_cancelled.job_group_id
WHERE self_and_ancestors.batch_id = job_group_self_and_ancestors.batch_id AND
self_and_ancestors.job_group_id = job_group_self_and_ancestors.ancestor_id
) AS t ON TRUE
WHERE job_group_self_and_ancestors.batch_id = %s AND
job_group_self_and_ancestors.job_group_id = %s AND
job_group_self_and_ancestors.ancestor_id != %s AND
NOT deleted AND
job_groups.callback IS NOT NULL AND
job_groups.`state` = 'complete';
""",
        (batch_id, job_group_id, ROOT_JOB_GROUP_ID),
        'notify_job_group_on_job_complete',
    )

    async for record in job_group_records:
        ancestor_job_group_id = record['ancestor_id']
        callback = record['callback']
        log.info(f'making callback for batch {batch_id} job group {ancestor_job_group_id}: {callback}')

        try:
            if record['user'] != 'ci':
                # Non-CI users get a separate session from httpx.client_session().
                async with httpx.client_session() as session:
                    await post_callback(session, record, callback, ancestor_job_group_id)
            else:
                # only jobs from CI may use batch's TLS identity
                await post_callback(client_session, record, callback, ancestor_job_group_id)
        except asyncio.CancelledError:
            raise
        except Exception:
            log.info(f'callback for batch {batch_id} job group {ancestor_job_group_id} failed, will not retry.')
async def add_attempt_resources(app, db, batch_id, job_id, attempt_id, resources: List[QuantifiedResource]):
    """Insert the per-resource quantities used by one job attempt.

    Quantities of resources sharing a name are summed, and one row per
    resource name is inserted into ``attempt_resources``. Nothing is written
    when ``attempt_id`` is falsy or ``resources`` is empty.

    Args:
        app: application mapping; ``app['resource_name_to_id']`` maps a
            resource name to an object exposing ``resource_id`` and
            ``deduped_resource_id``.
        db: database handle providing ``execute_many``.
        batch_id: batch the job belongs to.
        job_id: job the attempt belongs to.
        attempt_id: identifier of the attempt.
        resources: mappings with ``'name'`` and ``'quantity'`` entries.

    Raises:
        Exception: any error raised while aggregating or inserting is logged
            with its traceback and re-raised.
    """
    resource_name_to_id = app['resource_name_to_id']
    if not attempt_id or len(resources) == 0:
        return

    try:
        quantity_by_name: Dict[str, int] = collections.defaultdict(int)
        for resource in resources:
            quantity_by_name[resource['name']] += resource['quantity']

        resource_args = []
        # This must be sorted in order to match the order of values in the actual SQL table!
        for name in sorted(quantity_by_name):
            ids = resource_name_to_id[name]
            resource_args.append((
                batch_id,
                job_id,
                attempt_id,
                ids.resource_id,
                ids.deduped_resource_id,
                quantity_by_name[name],
            ))

        await db.execute_many(
            """
INSERT INTO `attempt_resources` (batch_id, job_id, attempt_id, resource_id, deduped_resource_id, quantity)
VALUES (%s, %s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE quantity = quantity;
""",
            resource_args,
            'add_attempt_resources',
        )
    except Exception:
        log.exception(f'error while inserting resources for job {job_id}, attempt {attempt_id}')
        raise
async def mark_job_complete(
    app,
    batch_id,
    job_id,
    attempt_id,
    job_group_id,
    instance_name,
    new_state,
    status,
    start_time,
    end_time,
    reason,
    resources: List[QuantifiedResource],
    *,
    marked_job_started=False,
):
    """Record a job's completion and run the follow-up bookkeeping.

    Steps, in order:

    1. Call the ``mark_job_complete`` stored procedure.
    2. Wake the scheduler and the cancel-ready loop.
    3. If the instance is known and active, apply the returned core delta to
       its in-memory free-core count.
    4. Unless ``marked_job_started`` is true, insert the attempt's resources.
    5. If the procedure succeeded (``rc == 0``) and the job was not already
       in a complete state, fire the batch and job-group callbacks.
    6. If the instance is active and does not belong to a pool, schedule
       ``instance.kill()`` on the background task manager.

    Args:
        app: application mapping holding ``'db'``, ``'driver'``,
            ``'task_manager'``, ``'scheduler_state_changed'``,
            ``'cancel_ready_state_changed'`` and the shared client session.
        batch_id: batch the job belongs to.
        job_id: id of the job within the batch.
        attempt_id: attempt being completed.
        job_group_id: job group used for the job-group callbacks.
        instance_name: name of the instance that ran the attempt, or a falsy
            value when there is none.
        new_state: state passed to the stored procedure.
        status: JSON-serializable status object, or ``None``.
        start_time: attempt start time passed to the procedure.
        end_time: attempt end time passed to the procedure.
        reason: completion reason passed to the procedure.
        resources: resources consumed by the attempt.
        marked_job_started: when true, skip inserting attempt resources here.

    Raises:
        Exception: any failure of the stored procedure call is logged with
            its traceback and re-raised.
    """
    scheduler_state_changed: Notice = app['scheduler_state_changed']
    cancel_ready_state_changed: asyncio.Event = app['cancel_ready_state_changed']
    db: Database = app['db']
    client_session = app[CommonAiohttpAppKeys.CLIENT_SESSION]
    inst_coll_manager: 'InstanceCollectionManager' = app['driver'].inst_coll_manager
    task_manager: BackgroundTaskManager = app['task_manager']

    # Named job_key rather than `id` to avoid shadowing the builtin.
    job_key = (batch_id, job_id)

    log.info(f'marking job {job_key} complete new_state {new_state}')

    now = time_msecs()

    try:
        rv = await db.execute_and_fetchone(
            'CALL mark_job_complete(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);',
            (
                batch_id,
                job_id,
                attempt_id,
                instance_name,
                new_state,
                json.dumps(status) if status is not None else None,
                start_time,
                end_time,
                reason,
                now,
            ),
            'mark_job_complete',
        )
    except Exception:
        log.exception(f'error while marking job {job_key} complete on instance {instance_name}')
        raise

    scheduler_state_changed.notify()
    cancel_ready_state_changed.set()

    instance = None

    if instance_name:
        instance = inst_coll_manager.get_instance(instance_name)

        if instance:
            if rv['delta_cores_mcpu'] != 0 and instance.state == 'active':
                # may also create scheduling opportunities, set above
                instance.adjust_free_cores_in_memory(rv['delta_cores_mcpu'])
        else:
            # `instance` is falsy in this branch, so report the name that
            # was looked up; interpolating `instance` would just log "None".
            log.warning(f'mark_complete for job {job_key} from unknown {instance_name}')

    if not marked_job_started:
        await add_attempt_resources(app, db, batch_id, job_id, attempt_id, resources)

    if rv['rc'] != 0:
        log.info(f'mark_job_complete returned {rv} for job {job_key}')
        return

    old_state = rv['old_state']
    if old_state in complete_states:
        log.info(f'old_state {old_state} complete for job {job_key}, doing nothing')
        # already complete, do nothing
        return

    await notify_batch_job_complete(db, client_session, batch_id)
    await notify_job_group_on_job_complete(db, client_session, batch_id, job_group_id)

    if instance and not instance.inst_coll.is_pool and instance.state == 'active':
        task_manager.ensure_future(instance.kill())
async def mark_job_started(app, batch_id, job_id, attempt_id, instance, start_time, resources):
    """Record that an attempt has started running on ``instance``.

    Calls the ``mark_job_started`` stored procedure, applies the returned
    core delta to the instance's in-memory free-core count when the instance
    is active, and then inserts the attempt's resources.

    Raises:
        Exception: a failure of the stored procedure call is logged at INFO
            level and re-raised.
    """
    db: Database = app['db']
    job_key = (batch_id, job_id)
    log.info(f'mark job {job_key} started')

    try:
        rv = await db.execute_and_fetchone(
            """
CALL mark_job_started(%s, %s, %s, %s, %s);
""",
            (batch_id, job_id, attempt_id, instance.name, start_time),
            'mark_job_started',
        )
    except Exception:
        log.info(f'error while marking job {job_key} started on {instance}')
        raise

    delta_cores_mcpu = rv['delta_cores_mcpu']
    if delta_cores_mcpu != 0 and instance.state == 'active':
        instance.adjust_free_cores_in_memory(delta_cores_mcpu)

    await add_attempt_resources(app, db, batch_id, job_id, attempt_id, resources)
async def mark_job_creating(
    app,
    batch_id: int,
    job_id: int,
    attempt_id: str,
    instance: Instance,
    start_time: int,
    resources: List[QuantifiedResource],
):
    """Record that an attempt is being created on ``instance``.

    Calls the ``mark_job_creating`` stored procedure and logs its result.
    The returned core delta is applied to the instance's in-memory free-core
    count only while the instance is in the ``pending`` state. The attempt's
    resources are inserted afterwards.

    Raises:
        Exception: a failure of the stored procedure call is logged at INFO
            level and re-raised.
    """
    db: Database = app['db']
    job_key = (batch_id, job_id)
    log.info(f'mark job {job_key} creating')

    try:
        rv = await db.execute_and_fetchone(
            """
CALL mark_job_creating(%s, %s, %s, %s, %s);
""",
            (batch_id, job_id, attempt_id, instance.name, start_time),
            'mark_job_creating',
        )
    except Exception:
        log.info(f'error while marking job {job_key} creating on {instance}')
        raise

    log.info(rv)

    delta_cores_mcpu = rv['delta_cores_mcpu']
    if delta_cores_mcpu != 0 and instance.state == 'pending':
        instance.adjust_free_cores_in_memory(delta_cores_mcpu)

    await add_attempt_resources(app, db, batch_id, job_id, attempt_id, resources)
async def unschedule_job(app, record):
    """Remove a job attempt from its instance and ask the worker to delete it.

    Order of operations:

    1. Call the ``unschedule_job`` stored procedure with reason
       ``'cancelled'`` and the current time as the end time.
    2. Signal ``cancel_ready_state_changed``.
    3. If the instance is still known and active, apply the returned core
       delta in memory and notify the scheduler.
    4. Send an HTTP DELETE for the job to the worker on port 5000, retried
       through ``retry_transient_errors``.
    5. Kill the instance if it does not belong to a pool.

    Args:
        app: application mapping holding ``'db'``, ``'driver'``, the shared
            client session and the two state-change notifiers.
        record: mapping with ``'batch_id'``, ``'job_id'``, ``'attempt_id'``
            and a non-None ``'instance_name'``.

    Raises:
        Exception: a failure of the stored procedure call is logged with its
            traceback and re-raised. Errors from the worker request other
            than a timeout or a 404 response are re-raised by
            ``make_request``.
    """
    cancel_ready_state_changed: asyncio.Event = app['cancel_ready_state_changed']
    scheduler_state_changed: Notice = app['scheduler_state_changed']
    db: Database = app['db']
    client_session = app[CommonAiohttpAppKeys.CLIENT_SESSION]
    inst_coll_manager = app['driver'].inst_coll_manager

    batch_id = record['batch_id']
    job_id = record['job_id']
    attempt_id = record['attempt_id']
    id = (batch_id, job_id)

    instance_name = record['instance_name']
    assert instance_name is not None

    log.info(f'unscheduling job {id}, attempt {attempt_id} from instance {instance_name}')

    end_time = time_msecs()

    try:
        rv = await db.execute_and_fetchone(
            'CALL unschedule_job(%s, %s, %s, %s, %s, %s);',
            (batch_id, job_id, attempt_id, instance_name, end_time, 'cancelled'),
        )
    except Exception:
        log.exception(f'error while unscheduling job {id} on instance {instance_name}')
        raise

    log.info(f'unschedule job {id}: updated database {rv}')

    # job that was running is now ready to be cancelled
    cancel_ready_state_changed.set()

    instance = inst_coll_manager.get_instance(instance_name)
    if not instance:
        # The database was already updated above; without an instance object
        # there is nothing to adjust in memory and no worker to contact.
        log.warning(f'unschedule job {id}, attempt {attempt_id}: unknown instance {instance_name}')
        return

    if rv['delta_cores_mcpu'] and instance.state == 'active':
        instance.adjust_free_cores_in_memory(rv['delta_cores_mcpu'])
        scheduler_state_changed.notify()
        log.info(f'unschedule job {id}, attempt {attempt_id}: updated {instance} free cores')

    url = f'http://{instance.ip_address}:5000/api/v1alpha/batches/{batch_id}/jobs/{job_id}/delete'

    async def make_request():
        # Instance state is re-checked on every (re)try, so retries stop once
        # the instance has gone inactive or been deleted.
        if instance.state in ('inactive', 'deleted'):
            return
        try:
            await client_session.delete(url)
            await instance.mark_healthy()
        except asyncio.TimeoutError:
            # A timeout counts against the instance and is not retried.
            await instance.incr_failed_request_count()
            return
        except aiohttp.ClientResponseError as err:
            if err.status == 404:
                # The worker answered, so it is healthy; a 404 is not treated as a failure.
                await instance.mark_healthy()
                return
            await instance.incr_failed_request_count()
            raise

    await retry_transient_errors(make_request)

    if not instance.inst_coll.is_pool:
        await instance.kill()

    log.info(f'unschedule job {id}, attempt {attempt_id}: called delete job')
async def job_config(app, record):
    """Assemble the configuration dict describing one job attempt.

    The job spec comes from ``record['spec']``. When the batch format keeps
    the full spec in cloud storage, only the secrets and service account are
    taken from the database copy. Every secret named in the spec is read
    through the Kubernetes cache and its data is attached to the spec. If
    the spec names a service account, a ``kube-config`` secret and a
    ``KUBECONFIG`` environment variable are added to the spec.

    Args:
        app: application mapping holding ``'k8s_cache'`` and ``'db'``.
        record: job row with ``'format_version'``, ``'batch_id'``,
            ``'job_id'``, ``'attempt_id'``, ``'job_group_id'``, ``'spec'``
            (JSON), ``'userdata'`` (JSON), ``'user'`` and ``'time_ready'``.

    Returns:
        A dict with ``batch_id``, ``job_id``, ``format_version``, ``token``,
        ``start_job_id``, ``user``, ``gsa_key``, ``job_spec`` and
        ``queue_time``. ``queue_time`` is ``None`` when ``time_ready`` is
        falsy.

    Raises:
        AssertionError: if a credentials secret is found while ``HAIL_TERRA``
            is set or is missing while it is not set, or if a service
            account lists a number of secrets other than one.
        KeyError: if ``userdata`` carries neither credentials key.
    """
    k8s_cache: K8sCache = app['k8s_cache']
    db: Database = app['db']

    format_version = BatchFormatVersion(record['format_version'])
    batch_id = record['batch_id']
    job_id = record['job_id']
    attempt_id = record['attempt_id']
    job_group_id = record['job_group_id']

    db_spec = json.loads(record['spec'])

    if format_version.has_full_spec_in_cloud():
        job_spec = {
            'secrets': format_version.get_spec_secrets(db_spec),
            'service_account': format_version.get_spec_service_account(db_spec),
        }
    else:
        job_spec = db_spec

    job_spec['attempt_id'] = attempt_id
    job_spec['job_group_id'] = job_group_id

    userdata = json.loads(record['userdata'])

    gsa_key = None

    secrets = job_spec.get('secrets', [])
    # Read all secrets concurrently; gather returns results in argument
    # order, so they can be zipped back onto `secrets` below.
    k8s_secrets = await asyncio.gather(*[
        k8s_cache.read_secret(secret['name'], secret['namespace']) for secret in secrets
    ])

    # backwards compatibility
    gsa_key_secret_name = userdata.get('hail_credentials_secret_name') or userdata['gsa_key_secret_name']

    for secret, k8s_secret in zip(secrets, k8s_secrets):
        if secret['name'] == gsa_key_secret_name:
            gsa_key = k8s_secret.data
        # Mutates the spec's secret entries in place so the returned job
        # spec carries the secret payloads.
        secret['data'] = k8s_secret.data

    if os.environ.get('HAIL_TERRA'):
        assert not gsa_key
    else:
        assert gsa_key

    service_account = job_spec.get('service_account')
    if service_account:
        namespace = service_account['namespace']
        name = service_account['name']

        sa = await k8s_cache.read_service_account(name, namespace)

        if sa.secrets is not None:
            # ServiceAccounts created prior to Kubernetes 1.24 have autogenerated secrets
            assert len(sa.secrets) == 1
            token_secret_name = sa.secrets[0].name
        else:
            # ServiceAccounts post v1.24 don't have autogenerated secrets and we make those ourselves
            token_secret_name = f'{name}-token'

        secret = await k8s_cache.read_secret(token_secret_name, namespace)

        user_token = base64.b64decode(secret.data['token']).decode()
        cert = secret.data['ca.crt']

        # The nesting below is significant YAML: `cluster`, `context` and
        # `user` are mappings whose children must be indented beneath them,
        # and each `name` is a sibling key of those mappings within the same
        # list item. The literal starts at column 0 so that no source
        # indentation leaks into the document.
        kube_config = f"""
apiVersion: v1
clusters:
- cluster:
    certificate-authority: /.kube/ca.crt
    server: {KUBERNETES_SERVER_URL}
  name: default-cluster
contexts:
- context:
    cluster: default-cluster
    user: {namespace}-{name}
    namespace: {namespace}
  name: default-context
current-context: default-context
kind: Config
preferences: {{}}
users:
- name: {namespace}-{name}
  user:
    token: {user_token}
"""

        job_spec['secrets'].append({
            'name': 'kube-config',
            'mount_path': '/.kube',
            'data': {'config': base64.b64encode(kube_config.encode()).decode(), 'ca.crt': cert},
        })

        env = job_spec.get('env')
        if not env:
            env = []
            job_spec['env'] = env
        env.append({'name': 'KUBECONFIG', 'value': '/.kube/config'})

    if format_version.has_full_spec_in_cloud():
        spec_token, start_job_id = await SpecWriter.get_token_start_id(db, batch_id, job_id)
    else:
        spec_token = None
        start_job_id = None

    return {
        'batch_id': batch_id,
        'job_id': job_id,
        'format_version': format_version.format_version,
        'token': spec_token,
        'start_job_id': start_job_id,
        'user': record['user'],
        'gsa_key': gsa_key,
        'job_spec': job_spec,
        'queue_time': time_msecs() - record['time_ready'] if record['time_ready'] else None,
    }
async def mark_job_errored(app, batch_id, job_group_id, job_id, attempt_id, user, format_version, error_msg):
    """Complete a job in the ``Error`` state with a driver-generated status.

    Builds a status document with no worker and one ``None`` container
    status per task. The full status is written to the file store when the
    batch format keeps full statuses there. The job is then marked complete
    with the database form of that status, no instance, no start or end
    time, and an empty resource list.

    Args:
        app: application mapping holding ``'file_store'``.
        batch_id: batch the job belongs to.
        job_group_id: job group the job belongs to.
        job_id: id of the job within the batch.
        attempt_id: attempt that failed.
        user: owner recorded in the status document.
        format_version: batch format version object of the batch.
        error_msg: error text stored under ``'error'`` in the status.
    """
    file_store: FileStore = app['file_store']

    container_statuses = dict.fromkeys(tasks)
    status = {
        'version': STATUS_FORMAT_VERSION,
        'worker': None,
        'batch_id': batch_id,
        'job_id': job_id,
        'job_group_id': job_group_id,
        'attempt_id': attempt_id,
        'user': user,
        'state': 'error',
        'error': error_msg,
        'container_statuses': container_statuses,
    }

    if format_version.has_full_status_in_gcs():
        serialized_status = json.dumps(status)
        await file_store.write_status_file(batch_id, job_id, attempt_id, serialized_status)

    db_status = format_version.db_status(status)

    await mark_job_complete(
        app,
        batch_id,
        job_id,
        attempt_id,
        job_group_id,
        instance_name=None,
        new_state='Error',
        status=db_status,
        start_time=None,
        end_time=None,
        reason='error',
        resources=[],
    )
async def schedule_job(app, record, instance):
    """Send a job attempt to a worker instance and record the placement.

    Order of operations:

    1. Build the job configuration with ``job_config``. If that fails, the
       job is marked errored with the traceback and the exception is
       re-raised.
    2. POST the configuration to the worker on port 5000 with a 2-second
       total timeout.
    3. Call the ``schedule_job`` stored procedure and apply the returned
       core delta to the instance's in-memory free-core count.

    Args:
        app: application mapping holding ``'db'`` and the shared client
            session.
        record: job row with ``'batch_id'``, ``'job_id'``, ``'attempt_id'``,
            ``'job_group_id'``, ``'format_version'`` and ``'user'``, plus
            whatever ``job_config`` reads.
        instance: target instance; must be in the ``active`` state.

    Raises:
        AssertionError: if ``instance`` is not active on entry.
        aiohttp.ClientResponseError: if the worker answers with an error
            status; the instance is still marked healthy.
        Exception: any other failure from building the config, the worker
            request or the stored procedure is re-raised.
    """
    assert instance.state == 'active'

    db: Database = app['db']
    client_session = app[CommonAiohttpAppKeys.CLIENT_SESSION]

    batch_id = record['batch_id']
    job_id = record['job_id']
    attempt_id = record['attempt_id']
    job_group_id = record['job_group_id']
    format_version = BatchFormatVersion(record['format_version'])

    id = (batch_id, job_id)

    try:
        body = await job_config(app, record)
    except Exception:
        log.exception(f'while making job config for job {id} with attempt id {attempt_id}')

        # A job whose config cannot be built is completed in the Error state
        # with the traceback as its error message.
        await mark_job_errored(
            app, batch_id, job_group_id, job_id, attempt_id, record['user'], format_version, traceback.format_exc()
        )
        raise

    try:
        await client_session.post(
            f'http://{instance.ip_address}:5000/api/v1alpha/batches/jobs/create',
            json=body,
            timeout=aiohttp.ClientTimeout(total=2),
        )
        await instance.mark_healthy()
    except aiohttp.ClientResponseError as e:
        # The worker produced an HTTP response, so it is reachable; mark it
        # healthy even though the request itself was rejected.
        await instance.mark_healthy()
        if e.status == 403:
            log.info(f'attempt {attempt_id} already exists for job {id} on {instance}, aborting')
        if e.status == 503:
            log.info(f'job {id} attempt {attempt_id} cannot be scheduled because {instance} is shutting down, aborting')
        raise e
    except Exception:
        # Any failure that is not an HTTP error response counts against the instance.
        await instance.incr_failed_request_count()
        raise

    try:
        rv = await db.execute_and_fetchone(
            """
CALL schedule_job(%s, %s, %s, %s);
""",
            (batch_id, job_id, attempt_id, instance.name),
            'schedule_job',
        )
    except Exception:
        log.exception(f'Error while running schedule_job procedure for job {id} attempt {attempt_id}')
        raise

    # The core delta is applied before checking rc, i.e. even when the
    # procedure reports that the job could not be scheduled.
    if rv['delta_cores_mcpu'] != 0 and instance.state == 'active':
        instance.adjust_free_cores_in_memory(rv['delta_cores_mcpu'])

    if rv['rc'] != 0:
        log.info(f'could not schedule job {id}, attempt {attempt_id} on {instance} in the db, {rv}')
        return

    log.info(f'success scheduling job {id} on {instance}')