Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
78 lines (66 sloc) 2.69 KB
# Reference example of a ChainerJob manifest.
apiVersion: kubeflow.org/v1alpha1
kind: ChainerJob
metadata:
  # Name of this ChainerJob resource.
  name: example-job-mn
# A ChainerJob in distributed mode consists of a Master and multiple
# WorkerSets.
# Master is the pod (technically a job) that boots your entire distributed
# job.
# A WorkerSet is a set of homogeneous pods (technically a statefulset).
# You can define multiple WorkerSets to build a heterogeneous set of workers.
spec:
  # 'backend' defines the protocol used to initiate process groups and
  # exchange tensor data among the processes. The currently supported
  # backend is "mpi".
  backend: mpi
  # master is responsible for spawning/exiting the entire distributed
  # learning process. You can configure
  # 'activeDeadlineSeconds'/'backoffLimit' to customize retry behavior on
  # failure.
  master:
    # The number of slots in the autogenerated hostfile is configurable.
    # The default is 1, or the number of GPUs you requested on the
    # 'chainer' container.
    mpiConfig:
      slots: 1
    # Retry behavior on failure of the master.
    activeDeadlineSeconds: 6000
    backoffLimit: 60
    # You can put any pod template here. There are several exceptions:
    #   - a container named "chainer" must exist; this is where your MPI
    #     processes run.
    #   - only a `restartPolicy` of `Never` or `OnFailure` is allowed.
    #     The default is `Never`.
    template:
      spec:
        containers:
          - name: chainer
            image: everpeace/chainermn:1.3.0
            # NOTE(review): '-n 3' presumably matches 1 master + 2 'ws0'
            # replicas at 1 slot each -- confirm when changing replicas or
            # slots.
            command:
              - sh
              - -c
              - |
                mpiexec -n 3 -N 1 --allow-run-as-root --display-map --mca mpi_cuda_support 0 \
                python3 /train_mnist.py -e 2 -b 1000 -u 100
  # You can define multiple WorkerSets to have heterogeneous workers.
  # When the master completes (success or failure), all WorkerSets are
  # scaled down to 0.
  workerSets:
    # 'ws0' is the name of this WorkerSet.
    ws0:
      # The number of worker replicas that make up this WorkerSet.
      replicas: 2
      # The number of slots in the autogenerated hostfile is configurable.
      # The default is 1, or the number of GPUs you requested on the
      # 'chainer' container.
      mpiConfig:
        slots: 1
      # You can put any pod template here. There are several exceptions:
      #   - a container named "chainer" must exist; this is where your MPI
      #     processes run.
      #   - only a `restartPolicy` of `Never` or `OnFailure` is allowed.
      #     The default is `Never`.
      #
      # In the typical use case, 'chainer' containers in WorkerSets just
      # wait forever.
      template:
        spec:
          containers:
            - name: chainer
              image: everpeace/chainermn:1.3.0
              command:
                - sh
                - -c
                - |
                  while true; do sleep 1 & wait; done
You can’t perform that action at this time.