Skip to content

Commit 40337cc

Browse files
Borda and yukw777 authored
freeze PT 1.5 for Horovod issue (Lightning-AI#2744)
* freeze pt 1.5
* torchtext
* Apply suggestions from code review

Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com>

* timeout

Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com>
1 parent bc9348f commit 40337cc

File tree

4 files changed

+4
-4
lines changed

4 files changed

+4
-4
lines changed

.circleci/config.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ references:
6666
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
6767
echo "Done waiting. Job status code: $status_code" && \
6868
# Allow time for logs to flush.
69-
sleep 10 && \
69+
sleep 30 && \
7070
echo "JOB_NAME: $job_name" && \
7171
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
7272
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \

.github/workflows/tpu-testing.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ jobs:
9494
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
9595
echo "Done waiting. Job status code: $status_code" && \
9696
# Allow time for logs to flush.
97-
sleep 10 && \
97+
sleep 30 && \
9898
echo "JOB_NAME: $job_name" && \
9999
echo "GKE_CLUSTER: $GKE_CLUSTER" && \
100100
echo "GKE_ZONE: $GKE_ZONE" && \

requirements/base.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# the default package dependencies
22

33
numpy>=1.16.4
4-
torch>=1.3
4+
torch>=1.3, <1.6 # TODO: temporary freeze for Horovod incompatibility with 1.6
55
tensorboard>=1.14
66
future>=0.17.1 # required for builtins in setup.py
77
# pyyaml>=3.13

requirements/extra.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,4 +11,4 @@ horovod>=0.19.1
1111
omegaconf>=2.0.0
1212
# scipy>=0.13.3
1313
scikit-learn>=0.20.0
14-
torchtext>=0.3.1
14+
torchtext>=0.3.1, <0.7 # TODO: temporary fix for compatibility

0 commit comments

Comments
 (0)