-
Notifications
You must be signed in to change notification settings - Fork 17
/
chainerjob-reference.yaml
77 lines (66 loc) · 2.69 KB
/
chainerjob-reference.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
---
# Reference manifest for a ChainerJob.
apiVersion: kubeflow.org/v1alpha1
kind: ChainerJob
metadata:
  name: example-job-mn

# A ChainerJob in distributed mode consists of a Master and multiple
# WorkerSets.
# Master is the pod (technically a Job) that boots your entire distributed job.
# A WorkerSet is a set of homogeneous pods (technically a StatefulSet).
# You can define multiple WorkerSets to build a heterogeneous cluster.
spec:
  # 'backend' defines the protocol used to initiate process groups and
  # exchange tensor data among the processes. The currently supported
  # backend is "mpi".
  backend: mpi

  # master is responsible for spawning/exiting the entire distributed
  # learning process. You can configure 'activeDeadlineSeconds' and
  # 'backoffLimit' to customize retry behavior on failure.
  master:
    # The number of slots in the autogenerated hostfile is configurable.
    # The default is 1, or the number of GPUs you requested on the
    # 'chainer' container.
    mpiConfig:
      slots: 1

    # Retry behavior on failure of the master.
    activeDeadlineSeconds: 6000
    backoffLimit: 60

    # You can put any pod template here, with these exceptions:
    # - A container named "chainer" must exist. This is where your MPI
    #   processes run.
    # - Only a `restartPolicy` of `Never` or `OnFailure` is allowed.
    #   Default is `Never`.
    template:
      spec:
        containers:
          - name: chainer
            image: everpeace/chainermn:1.3.0
            # NOTE(review): `-n 3 -N 1` presumably corresponds to one
            # process on the master plus one on each of the 2 'ws0'
            # replicas below - confirm when changing `replicas`.
            command:
              - sh
              - -c
              - |
                mpiexec -n 3 -N 1 --allow-run-as-root --display-map --mca mpi_cuda_support 0 \
                python3 /train_mnist.py -e 2 -b 1000 -u 100

  # You can define multiple WorkerSets to have heterogeneous workers.
  # When the master completes (success or failure), all WorkerSets are
  # scaled down to 0.
  workerSets:
    # 'ws0' is the name of this WorkerSet.
    ws0:
      # The number of worker replicas that make up this WorkerSet.
      replicas: 2

      # The number of slots in the autogenerated hostfile is configurable.
      # The default is 1, or the number of GPUs you requested on the
      # 'chainer' container.
      mpiConfig:
        slots: 1

      # You can put any pod template here, with these exceptions:
      # - A container named "chainer" must exist. This is where your MPI
      #   processes run.
      # - Only a `restartPolicy` of `Never` or `OnFailure` is allowed.
      #   Default is `Never`.
      #
      # In the typical use case, 'chainer' containers in WorkerSets just
      # wait forever.
      template:
        spec:
          containers:
            - name: chainer
              image: everpeace/chainermn:1.3.0
              command:
                - sh
                - -c
                - |
                  while true; do sleep 1 & wait; done