Skip to content

Commit

Permalink
openmpi: master/workers sync mechanism is replaced with k8s api from …
Browse files Browse the repository at this point in the history
…redis.
  • Loading branch information
everpeace committed Apr 20, 2018
1 parent b52b391 commit f5fc019
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 156 deletions.
3 changes: 1 addition & 2 deletions kubeflow/openmpi/all.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ local redis = import "kubeflow/openmpi/redis.libsonnet";
},

all::
redis.all(updatedParams)
+ assets.all(updatedParams)
assets.all(updatedParams)
+ workloads.all(updatedParams)
+ service.all(updatedParams)
},
Expand Down
107 changes: 61 additions & 46 deletions kubeflow/openmpi/assets/init.sh
Original file line number Diff line number Diff line change
@@ -1,42 +1,14 @@
set -exv

probe_redis() {
local cmd="$1"
local status=$2
local max_retries=$3
local retries=0

until [ "$(${cmd})" = "${status}" ]; do
sleep 10

retries=$(expr ${retries} + 1)
if [ -n "${max_retries}" ] && [ ${retries} -ge ${max_retries} ]; then
exit 124
fi
done
}

ping_redis() {
local timeout=$1

probe_redis "redis-cli -h openmpi-redis ping" PONG ${timeout}
}

wait_cond() {
local cond=$1
local status=$2
local timeout=$3

probe_redis "redis-cli -h openmpi-redis get ${cond}" ${status} ${timeout}
}
SCRIPT_DIR=$(cd $(dirname $0);pwd)
K8S_PODS_ENDPOINT="$KUBERNETES_SERVICE_HOST:$KUBERNETES_SERVICE_PORT/api/v1/namespaces/$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)/pods"

signal_cond() {
local cond=$1
# TOKEN should not be printed.
set +x
TOKEN_HEADER="Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
set -e

redis-cli -h openmpi-redis incr ${cond}
}

if [ $# -ne 4 ]; then
if [ $# -ne 5 ]; then
echo "illegal number of parameters"
exit 1
fi
Expand All @@ -45,6 +17,56 @@ role="$1"
workers="$2"
hostname="$3"
exec="$4"
master_pod="$5"

phase_of(){
local pod_name=$1
# TOKEN should not be printed.
set +x
curl -sL --insecure --header "$TOKEN_HEADER" \
https://${K8S_PODS_ENDPOINT}/${pod_name} \
| jq -r '.status.phase' 2>/dev/null
set -x
}

wait_workers_running() {
local max_retries=$1
local retries=0
local num_runnning_worker=0

until [ ${num_runnning_worker} -eq ${workers} ]; do

local num_runnning_worker=0
for worker in $(cat ${SCRIPT_DIR}/hostfile | cut -f 1 -d' '); do
local worker_pod=${worker%%.*}
echo -n "worker pod ${worker_pod}: "
phase=$(phase_of ${worker_pod})
echo $phase
if [ "$phase" = "Running" ]; then
num_runnning_worker=$((${num_runnning_worker} + 1))
fi
done
echo the number of running worker: ${num_runnning_worker}/${workers}

if [ -n "${max_retries}" ] && [ ${retries} -ge ${max_retries} ]; then
exit 124
else
sleep 1
fi
done
}

wait_master_done() {
local max_retries=$1
local retries=0
until [ $(phase_of ${master_pod}) = "Succeeded" -o $(phase_of ${master_pod}) = "Failed" ]; do
sleep 10;
retries=$(expr ${retries} + 1)
if [ -n "${max_retries}" ] && [ ${retries} -ge ${max_retries} ]; then
exit 124
fi
done
}

# Set up openmpi
mkdir -p /root/.openmpi
Expand All @@ -57,29 +79,22 @@ cp /kubeflow/openmpi/secrets/id_rsa.pub /root/.ssh
cp /kubeflow/openmpi/secrets/authorized_keys /root/.ssh
cp /kubeflow/openmpi/assets/ssh_config /root/.ssh/config

# Install redis-cli
apt-get update && apt-get install -y redis-tools
# Install curl and jq
apt-get update && apt-get install -y curl jq

# Start sshd in daemon mode
/usr/sbin/sshd -e -f /kubeflow/openmpi/assets/sshd_config
sleep 10

# Wait until redis is up
ping_redis 30

# Start running the workloads.
echo running ${hostname}

exit_code=0
ready_cond="openmpi:ready"
done_cond="openmpi:done"
if [ "${role}" = "master" ]; then
wait_cond ${ready_cond} ${workers} 30
wait_workers_running 30
sh -c "${exec}" || exit_code=$?
signal_cond ${done_cond}
else
signal_cond ${ready_cond}
wait_cond ${done_cond} 1
wait_master_done
fi

echo shutting down ${hostname}
Expand Down
108 changes: 0 additions & 108 deletions kubeflow/openmpi/redis.libsonnet

This file was deleted.

1 change: 1 addition & 0 deletions kubeflow/openmpi/workloads.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
std.toString(params.workers),
hostname,
params.exec,
"openmpi-master"
],
ports: [
{
Expand Down

0 comments on commit f5fc019

Please sign in to comment.