examples/pytorch/cnn-mnist/mnist.yaml

# Distributed training of a traditional CNN model to do image classification 
# using the MNIST dataset and PyTorch.
apiVersion: jobset.x-k8s.io/v1alpha2
kind: JobSet
metadata:
  name: pytorch
spec:
  replicatedJobs:
  - name: workers
    template:
      spec:
        parallelism: 4
        completions: 4
        backoffLimit: 0
        template:
          spec:
            containers:
            - name: pytorch
              image: gcr.io/k8s-staging-jobset/pytorch-mnist:latest
              ports:
              - containerPort: 3389
              env:
              - name: MASTER_ADDR
                value: "pytorch-workers-0-0.pytorch"
              - name: MASTER_PORT
                value: "3389"
              - name: RANK
                valueFrom:
                  fieldRef:
                    fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index']
              # Force python to not buffer output and write directly to stdout, so we can view training logs via `kubectl logs`.
              - name: PYTHONUNBUFFERED
                value: "0"
              command:
              - bash
              - -xc
              - |
                torchrun --rdzv_id=123 --nnodes=4 --nproc_per_node=1 --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT --node_rank=$RANK mnist.py --epochs=1 --log-interval=1