Commit 2940450
fix: Address comments
Signed-off-by: Ce Gao <ce.gao@outlook.com>
gaocegege committed Nov 26, 2021
1 parent 877110d commit 2940450
Showing 9 changed files with 36 additions and 41 deletions.
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/echo/Dockerfile
@@ -1,6 +1,6 @@
FROM python:3.8-buster
WORKDIR /workspace
-RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple torch==1.10.0 numpy
+RUN pip install torch==1.10.0 numpy
# TODO Replace this with the PIP version when available
ADD echo.py echo.py
ENV PYTHONPATH /workspace
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/echo/echo.yaml
@@ -15,7 +15,7 @@ spec:
spec:
containers:
- name: pytorch
-image: gaocegege/pytorch-elastic-example-echo:1.0.0
+image: kubeflow/pytorch-elastic-example-echo:1.0.0
imagePullPolicy: IfNotPresent
env:
- name: LOGLEVEL
2 changes: 1 addition & 1 deletion examples/pytorch/elastic/imagenet/Dockerfile
@@ -2,7 +2,7 @@ ARG BASE_IMAGE=pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime
FROM $BASE_IMAGE

# install utilities and dependencies
-RUN pip install -i https://mirror.sjtu.edu.cn/pypi/web/simple classy-vision
+RUN pip install classy-vision

WORKDIR /workspace

2 changes: 1 addition & 1 deletion examples/pytorch/elastic/imagenet/imagenet.yaml
@@ -16,7 +16,7 @@ spec:
spec:
containers:
- name: pytorch
-image: gaocegege/pytorch-elastic-example-imagenet:1.0.0-sigterm
+image: kubeflow/pytorch-elastic-example-imagenet:1.0.0-sigterm
imagePullPolicy: IfNotPresent
env:
- name: LOGLEVEL
1 change: 0 additions & 1 deletion hack/update-codegen.sh
@@ -113,7 +113,6 @@ cd - >/dev/null
# Notice: The code in kube-openapi does not generate defaulter by default.
# We need to build binary from pkg cmd folder.
echo "Building openapi-gen"
-echo ${OPENAPI_PKG}
go build -o openapi-gen ${OPENAPI_PKG}/cmd/openapi-gen

echo "Generating OpenAPI specification for tensorflow/v1"
10 changes: 5 additions & 5 deletions manifests/base/crds/kubeflow.org_pytorchjobs.yaml
@@ -55,15 +55,15 @@ spec:
format: int32
type: integer
metrics:
-description: metrics contains the specifications for which to
-use to calculate the desired replica count (the maximum replica
+description: Metrics contains the specifications which are used
+to calculate the desired replica count (the maximum replica
count across all metrics will be used). The desired replica
-count is calculated multiplying the ratio between the target
-value and the current value by the current number of pods. Ergo,
+count is calculated with multiplying the ratio between the target
+value and the current value by the current number of pods. Ergo,
metrics used must decrease as the pod count is increased, and
vice-versa. See the individual metric source types for more
information about how each type of metric must respond. If not
-set, the default metric will be set to 80% average CPU utilization.
+set, the HPA will not be created.
items:
description: MetricSpec specifies how to scale based on a single
metric (only `type` and one other matching field should be
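For orientation, the description above states the standard HPA scaling rule: desiredReplicas = ceil(currentReplicas * currentMetricValue / targetMetricValue). As an illustrative example (numbers are not from this commit), four worker pods averaging 120% CPU against an 80% utilization target give ceil(4 * 120 / 80) = 6 desired replicas; with the metrics field unset, the controller now creates no HPA at all instead of falling back to an 80% CPU default.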
14 changes: 7 additions & 7 deletions pkg/apis/pytorch/v1/types.go
@@ -88,14 +88,14 @@ type ElasticPolicy struct {

MaxRestarts *int32 `json:"maxRestarts,omitempty"`

-// metrics contains the specifications for which to use to calculate the
+// Metrics contains the specifications which are used to calculate the
// desired replica count (the maximum replica count across all metrics will
-// be used). The desired replica count is calculated multiplying the
+// be used). The desired replica count is calculated with multiplying the
// ratio between the target value and the current value by the current
-// number of pods. Ergo, metrics used must decrease as the pod count is
+// number of pods. Ergo, metrics used must decrease as the pod count is
// increased, and vice-versa. See the individual metric source types for
// more information about how each type of metric must respond.
-// If not set, the default metric will be set to 80% average CPU utilization.
+// If not set, the HPA will not be created.
// +optional
Metrics []autoscalingv2beta2.MetricSpec `json:"metrics,omitempty"`
}
@@ -111,14 +111,14 @@ const (
// BackendC10D is the rendezvous backend type for C10d.
BackendC10D RDZVBackend = "c10d"
// BackendETCD is the rendezvous backend type for ETCD.
-BackendETCD = "etcd"
+BackendETCD RDZVBackend = "etcd"
// BackendETCDV2 is the rendezvous backend type for ETCD v2.
-BackendETCDV2 = "etcd-v2"
+BackendETCDV2 RDZVBackend = "etcd-v2"

// PyTorchReplicaTypeMaster is the type of Master of distributed PyTorch
PyTorchReplicaTypeMaster common.ReplicaType = "Master"
// PyTorchReplicaTypeWorker is the type for workers of distributed PyTorch.
-PyTorchReplicaTypeWorker = "Worker"
+PyTorchReplicaTypeWorker common.ReplicaType = "Worker"
)

// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
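To make the new contract concrete, here is a minimal sketch (not code from the repository) of the metrics slice an ElasticPolicy would carry in order to get an HPA. It assumes only the k8s.io/api autoscaling/v2beta2 and core/v1 packages; the 80% CPU target is an illustrative value, not a default.

package main

import (
	"fmt"

	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
	corev1 "k8s.io/api/core/v1"
)

func main() {
	// Illustrative target: 80% average CPU utilization across worker pods.
	target := int32(80)

	// A slice of this shape is what ElasticPolicy.Metrics expects; leaving
	// the field nil now means the controller skips HPA creation entirely.
	metrics := []autoscalingv2beta2.MetricSpec{
		{
			Type: autoscalingv2beta2.ResourceMetricSourceType,
			Resource: &autoscalingv2beta2.ResourceMetricSource{
				Name: corev1.ResourceCPU,
				Target: autoscalingv2beta2.MetricTarget{
					Type:               autoscalingv2beta2.UtilizationMetricType,
					AverageUtilization: &target,
				},
			},
		},
	}

	fmt.Printf("%d metric(s) configured, first type: %s\n", len(metrics), metrics[0].Type)
}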
38 changes: 19 additions & 19 deletions pkg/controller.v1/pytorch/hpa.go
@@ -31,40 +31,40 @@ import (
func (r *PyTorchJobReconciler) ReconcileHPA(pytorhcjob *pytorchv1.PyTorchJob) error {
logger := r.Log.WithValues(pytorchv1.Singular, pytorhcjob.Name)

-if pytorhcjob.Spec.ElasticPolicy == nil {
-logger.V(1).Info("No ElasicPolicy is specified, skipping HPA reconciling process")
+if pytorhcjob.Spec.ElasticPolicy == nil || pytorhcjob.Spec.ElasticPolicy.Metrics == nil {
+logger.V(1).Info(
+"No ElasicPolicy or Metric is specified, skipping HPA reconciling process")
return nil
}

-// Create or update HPA
-hpa := &autoscalingv2beta2.HorizontalPodAutoscaler{}
-err := r.Get(context.TODO(), types.NamespacedName{Name: pytorhcjob.Name, Namespace: pytorhcjob.Namespace}, hpa)
+current := &autoscalingv2beta2.HorizontalPodAutoscaler{}
+
+// Get the exepected HPA.
+expected, err := desiredHPA(pytorhcjob, r.Scheme)
if err != nil {
+return err
+}
+
+if err := r.Get(context.TODO(), types.NamespacedName{
+Name: pytorhcjob.Name,
+Namespace: pytorhcjob.Namespace,
+}, current); err != nil {
if !errors.IsNotFound(err) {
return err
}

// Create the new HPA.
-hpa, err = desiredHPA(pytorhcjob, r.Scheme)
-if err != nil {
-return err
-}
-logger.V(1).Info("Creating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
-err = r.Create(context.TODO(), hpa)
+logger.V(1).Info("Creating HPA", "namespace", expected.Namespace, "name", expected.Name)
+err = r.Create(context.TODO(), expected)
if err != nil {
return err
}
return nil
}

-// Update HPA
-expected, err := desiredHPA(pytorhcjob, r.Scheme)
-if err != nil {
-return err
-}
-if !equality.Semantic.DeepEqual(expected.Spec, hpa.Spec) {
-logger.V(1).Info("Updating HPA", "HPA.Namespace", hpa.Namespace, "HPA.Name", hpa.Name)
-expected.ResourceVersion = hpa.ResourceVersion
+if !equality.Semantic.DeepEqual(expected.Spec, current.Spec) {
+logger.V(1).Info("Updating HPA", "namespace", current.Namespace, "name", current.Name)
+expected.ResourceVersion = current.ResourceVersion
err = r.Update(context.TODO(), expected)
if err != nil {
return err
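The reconciler above creates or diffs against whatever desiredHPA returns; that helper is unchanged by this commit and not shown here. Purely as orientation, a hypothetical stand-in with a similar shape could look like the sketch below; all names, fields, and the scale-target reference are assumptions, not the repository's code.

package hpa

import (
	autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// buildHPA sketches the object the reconciler would Create or compare against:
// an HPA named after the job that scales it between min and max replicas using
// the ElasticPolicy metrics. The real desiredHPA also receives the scheme,
// presumably to set an owner reference on the HPA.
func buildHPA(name, namespace string, min, max int32, metrics []autoscalingv2beta2.MetricSpec) *autoscalingv2beta2.HorizontalPodAutoscaler {
	return &autoscalingv2beta2.HorizontalPodAutoscaler{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		Spec: autoscalingv2beta2.HorizontalPodAutoscalerSpec{
			ScaleTargetRef: autoscalingv2beta2.CrossVersionObjectReference{
				APIVersion: "kubeflow.org/v1",
				Kind:       "PyTorchJob",
				Name:       name,
			},
			MinReplicas: &min,
			MaxReplicas: max,
			Metrics:     metrics,
		},
	}
}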
6 changes: 1 addition & 5 deletions pkg/controller.v1/pytorch/pytorch.go
@@ -53,11 +53,7 @@ func SetPodEnv(obj interface{}, podTemplateSpec *corev1.PodTemplateSpec, rtype,
if err != nil {
return err
}
-if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeMaster)) {
-if rank != 0 {
-return fmt.Errorf("invalid config: There should be only a single master with index=0")
-}
-} else {
+if rtype == strings.ToLower(string(pytorchv1.PyTorchReplicaTypeWorker)) {
rank = rank + 1
}

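With the master-only index check removed, the remaining branch simply shifts worker ranks past the single master. A small sketch of the resulting rank layout (illustrative code, not from the repository; it assumes the replica index is what arrives as rank):

package main

import "fmt"

// rankFor mirrors the simplified branch: the master keeps rank 0 (its replica
// index), while worker i becomes rank i+1, keeping ranks unique across the job.
func rankFor(replicaType string, index int) int {
	if replicaType == "worker" {
		return index + 1
	}
	return index
}

func main() {
	fmt.Println(rankFor("master", 0)) // 0
	fmt.Println(rankFor("worker", 0)) // 1
	fmt.Println(rankFor("worker", 1)) // 2
}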
