File tree Expand file tree Collapse file tree 4 files changed +4
-4
lines changed
Expand file tree Collapse file tree 4 files changed +4
-4
lines changed Original file line number Diff line number Diff line change @@ -66,7 +66,7 @@ references:
6666 while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
6767 echo "Done waiting. Job status code: $status_code" && \
6868 # Allow time for logs to flush.
69- sleep 10 && \
69+ sleep 30 && \
7070 echo "JOB_NAME: $job_name" && \
7171 gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
7272 if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
Original file line number Diff line number Diff line change 9494 while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
9595 echo "Done waiting. Job status code: $status_code" && \
9696 # Allow time for logs to flush.
97- sleep 10 && \
97+ sleep 30 && \
9898 echo "JOB_NAME: $job_name" && \
9999 echo "GKE_CLUSTER: $GKE_CLUSTER" && \
100100 echo "GKE_ZONE: $GKE_ZONE" && \
Original file line number Diff line number Diff line change 11# the default package dependencies
22
33numpy>=1.16.4
4- torch>=1.3
4+ torch>=1.3, <1.6 # TODO: temporary freeze for Horovod incompatibility with 1.6
55tensorboard>=1.14
66future>=0.17.1 # required for builtins in setup.py
77# pyyaml>=3.13
Original file line number Diff line number Diff line change @@ -11,4 +11,4 @@ horovod>=0.19.1
1111omegaconf>=2.0.0
1212# scipy>=0.13.3
1313scikit-learn>=0.20.0
14- torchtext>=0.3.1
14+ torchtext>=0.3.1, <0.7 # TODO: temporary fix for compatibility
You can’t perform that action at this time.
0 commit comments