From 0759f7ae5142ed2e78a6971e9703fdc86b7307cd Mon Sep 17 00:00:00 2001 From: Ce Gao Date: Wed, 28 Feb 2018 04:38:56 +0800 Subject: [PATCH] Remove TensorBoard related code in operator (#391) Ref #347 Blocked until CI is running again. PS: Dashboard code is not changed. Signed-off-by: Ce Gao ce.gao@outlook.com --- README.md | 107 --------- dashboard/backend/handler/api_handler.go | 27 +-- .../frontend/src/components/CreateJob.js | 26 --- .../src/components/CreateTensorBoard.js | 86 -------- dashboard/frontend/src/components/Job.js | 4 - .../frontend/src/components/TensorBoard.js | 41 ---- developer_guide.md | 3 - examples/charts/tensorboard/.helmignore | 21 -- examples/charts/tensorboard/Chart.yaml | 4 - .../charts/tensorboard/templates/_helpers.tpl | 16 -- .../tensorboard/templates/deployment.yaml | 38 ---- examples/charts/tensorboard/values.yaml | 6 - examples/tf_job_tensorboard_azure.yaml | 44 ---- pkg/apis/tensorflow/v1alpha1/types.go | 14 +- .../v1alpha1/zz_generated.deepcopy.go | 43 ---- pkg/apis/tensorflow/validation/validation.go | 3 - pkg/trainer/tensorboard.go | 206 ------------------ pkg/trainer/tensorboard_test.go | 157 ------------- pkg/trainer/training.go | 33 +-- pkg/trainer/training_test.go | 9 +- py/test_runner.py | 3 +- test/e2e/main.go | 37 +--- tf_job_design_doc.md | 12 +- 23 files changed, 12 insertions(+), 928 deletions(-) delete mode 100644 dashboard/frontend/src/components/CreateTensorBoard.js delete mode 100644 dashboard/frontend/src/components/TensorBoard.js delete mode 100644 examples/charts/tensorboard/.helmignore delete mode 100644 examples/charts/tensorboard/Chart.yaml delete mode 100644 examples/charts/tensorboard/templates/_helpers.tpl delete mode 100644 examples/charts/tensorboard/templates/deployment.yaml delete mode 100644 examples/charts/tensorboard/values.yaml delete mode 100644 examples/tf_job_tensorboard_azure.yaml delete mode 100644 pkg/trainer/tensorboard.go delete mode 100644 pkg/trainer/tensorboard_test.go diff --git a/README.md b/README.md index cfaf3c77b1..b019a2c1b7 100644 --- a/README.md +++ b/README.md @@ -158,108 +158,6 @@ spec: Follow TensorFlow's [instructions](https://www.kubeflow.org/tutorials/using_gpu) for using GPUs. -### Requesting a TensorBoard instance - -You can also ask the `TFJob` operator to create a TensorBoard instance -by including a [TensorBoardSpec](https://github.com/kubeflow/tf-operator/blob/master/pkg/apis/tensorflow/v1alpha1/types.go#L95) -in your job. The table below describes the important fields in -[TensorBoardSpec](https://github.com/kubeflow/tf-operator/blob/master/pkg/apis/tensorflow/v1alpha1/types.go#L95). - -| Name | Description | Required | Default | -|---|---|---|---| -| `logDir` | Specifies the directory where TensorBoard will look to find TensorFlow event files that it can display | Yes | `None` | -| `volumes` | `Volumes` information that will be passed to the TensorBoard `deployment` | No | [] | -| `volumeMounts` | `VolumeMounts` information that will be passed to the TensorBoard `deployment` | No | [] | -| `serviceType` | `ServiceType` information that will be passed to the TensorBoard `service`| No | `ClusterIP` | - -#### TensorBoard on Azure - -On Azure you can store your event files on an Azure Files and use -volumes to make them available to TensorBoard. - -``` -apiVersion: "kubeflow.org/v1alpha1" -kind: "TFJob" -metadata: - name: "tf-smoke-gpu" -spec: - replica_specs: - - replicas: 1 - tfReplicaType: MASTER - template: - spec: - containers: - - image: gcr.io/tf-on-k8s-dogfood/tf_sample_gpu:latest - name: tensorflow - resources: - limits: - alpha.kubernetes.io/nvidia-gpu: 1 - restartPolicy: OnFailure - tensorboard: - logDir: /tmp/tensorflow - volumes: - - name: azurefile - azureFile: - secretName: azure-secret - shareName: data - readOnly: false - volumeMounts: - - mountPath: /tmp/tensorflow - name: azurefile - -``` - -#### TensorBoard on GKE - -On GKE you can store your event files on GCS and TensorBoard/TensorFlow -can read/write directly to GCS. - -``` -apiVersion: "kubeflow.org/v1alpha1" -kind: "TFJob" -metadata: - name: "tf-smoke-gpu" -spec: - replica_specs: - - replicas: 1 - tfPort: 2222 - tfReplicaType: MASTER - template: - spec: - containers: - - image: gcr.io/tf-on-k8s-dogfood/tf_sample_gpu:latest - name: tensorflow - args: - - --log_dir=gs://my-bucket/logdir - resources: - limits: - alpha.kubernetes.io/nvidia-gpu: 1 - restartPolicy: OnFailure - tensorboard: - logDir: gs://my-bucket/logdir - -``` - -#### Connecting to TensorBoard - -The TFJob operator will create a service named -**tensorboard-$RUNTIME_ID** for your job. You can connect to it -using the Kubernetes API Server proxy as follows - -Start the K8s proxy -``` -kubectl proxy -``` - -In a web-browser open up - -``` -http://${PROXY}:8001/api/v1/proxy/namespaces/default/services/tensorboard-${RUNTIME_ID}:80/ -``` - -Depending on how you configure the service for TensorBoard and cluster -you can make TensorBoard available without using the K8s proxy. - ## Monitoring your job To get the status of your job @@ -324,11 +222,6 @@ spec: restartPolicy: OnFailure tfPort: 2222 tfReplicaType: PS - tensorboard: - logDir: /tmp/tensorflow - serviceType: "" - volumeMounts: null - volumes: null tfImage: tensorflow/tensorflow:1.3.0 status: conditions: null diff --git a/dashboard/backend/handler/api_handler.go b/dashboard/backend/handler/api_handler.go index 5268c742ff..235262c2fb 100644 --- a/dashboard/backend/handler/api_handler.go +++ b/dashboard/backend/handler/api_handler.go @@ -24,11 +24,10 @@ type APIHandler struct { } // TFJobDetail describe the specification of a TFJob -// as well as related TensorBoard service if any and related pods +// if any and related pods type TFJobDetail struct { - TFJob *v1alpha1.TFJob `json:"tfJob"` - TbService *v1.Service `json:"tbService"` - Pods []v1.Pod `json:"pods"` + TFJob *v1alpha1.TFJob `json:"tfJob"` + Pods []v1.Pod `json:"pods"` } // TFJobList is a list of TFJobs @@ -140,26 +139,6 @@ func (apiHandler *APIHandler) handleGetTFJobDetail(request *restful.Request, res TFJob: job, } - if job.Spec.TensorBoard != nil { - tbSpec, err := apiHandler.cManager.ClientSet.CoreV1().Services(namespace).List(metav1.ListOptions{ - LabelSelector: fmt.Sprintf("kubeflow.org=,app=tensorboard,runtime_id=%s", job.Spec.RuntimeId), - }) - if err != nil { - log.Warningf("failed to list TensorBoard for TFJob %v under namespace %v, error: %v", job.Name, job.Namespace, err) - // TODO maybe partial result? - response.WriteError(http.StatusNotFound, err) - return - } else if len(tbSpec.Items) > 0 { - // Should never be more than 1 service that matched, handle error - // Handle case where no TensorBoard is found - tfJobDetail.TbService = &tbSpec.Items[0] - log.Warningf("more than one TensorBoards found for TFJob %v under namespace %v, this should be impossible", - job.Name, job.Namespace) - } else { - log.Warningf("Couldn't find a TensorBoard service for TFJob %v under namespace %v", job.Name, job.Namespace) - } - } - // Get associated pods pods, err := apiHandler.cManager.ClientSet.CoreV1().Pods(namespace).List(metav1.ListOptions{ LabelSelector: fmt.Sprintf("kubeflow.org=,runtime_id=%s", job.Spec.RuntimeId), diff --git a/dashboard/frontend/src/components/CreateJob.js b/dashboard/frontend/src/components/CreateJob.js index 0a507c54f8..4902dbf79d 100644 --- a/dashboard/frontend/src/components/CreateJob.js +++ b/dashboard/frontend/src/components/CreateJob.js @@ -8,7 +8,6 @@ import RaisedButton from "material-ui/RaisedButton"; import { withRouter } from "react-router-dom"; import { createTFJobService } from "../services"; -import CreateTensorBoard from "./CreateTensorBoard"; import RequiredTextField from "./RequiredTextField"; import VolumeCreator from "./VolumeCreator"; import EnvVarCreator from "./EnvVarCreator"; @@ -33,8 +32,6 @@ class CreateJob extends Component { psImage: "", psCommand: "", psArgs: "", - tbIsPresent: false, - tbSpec: {}, masterVolumeSpec: {}, workerVolumeSpec: {}, psVolumeSpec: {}, @@ -44,7 +41,6 @@ class CreateJob extends Component { }; this.handleInputChange = this.handleInputChange.bind(this); - this.setTensorboardSpec = this.setTensorboardSpec.bind(this); this.cancel = this.cancel.bind(this); this.deploy = this.deploy.bind(this); this.setMasterVolumesSpec = this.setMasterVolumesSpec.bind(this); @@ -55,10 +51,6 @@ class CreateJob extends Component { this.setPSEnvVars = this.setPSEnvVars.bind(this); } - setTensorboardSpec(tbSpec) { - this.setState({ tbSpec }); - } - handleInputChange(event) { const target = event.target; const value = target.type === "checkbox" ? target.checked : target.value; @@ -230,20 +222,6 @@ class CreateJob extends Component { )} - - {/* TENSORBOARD */} - -

TensorBoard

- - {this.state.tbIsPresent && ( - - )} @@ -302,10 +280,6 @@ class CreateJob extends Component { } }; - if (this.state.tbIsPresent) { - spec.spec.tensorboard = this.state.tbSpec; - } - createTFJobService(spec) .then(() => this.props.history.push("/")) .catch(console.error); diff --git a/dashboard/frontend/src/components/CreateTensorBoard.js b/dashboard/frontend/src/components/CreateTensorBoard.js deleted file mode 100644 index 9f3b8d0d1c..0000000000 --- a/dashboard/frontend/src/components/CreateTensorBoard.js +++ /dev/null @@ -1,86 +0,0 @@ -import React, { Component } from "react"; -import PropTypes from "prop-types"; -import TextField from "material-ui/TextField"; -import SelectField from "material-ui/SelectField"; -import MenuItem from "material-ui/MenuItem"; -import VolumeCreator from "./VolumeCreator"; - -class CreateTensorBoard extends Component { - constructor(props) { - super(props); - this.state = { - serviceType: 0, - logDir: "/tmp/tensorflow", - volumes: [], - volumeMounts: [] - }; - this.handleInputChange = this.handleInputChange.bind(this); - this.setVolumesSpec = this.setVolumesSpec.bind(this); - } - - handleInputChange(event) { - const target = event.target; - const value = target.type === "checkbox" ? target.checked : target.value; - const name = target.name; - this.setState({ - [name]: value - }); - this.bubbleSpec({ ...this.state, [name]: value }); - } - - styles = { - root: { - display: "flex", - flexDirection: "column" - }, - field: { - width: "80%" - } - }; - render() { - return ( -
- { - this.setState({ serviceType: v }); - this.bubbleSpec({ ...this.state, serviceType: v }); - }} - > - - - - - -
- ); - } - - setVolumesSpec(volumeSpecs) { - this.setState(Object.assign(this.state, volumeSpecs)); - this.bubbleSpec(Object.assign(this.state, volumeSpecs)); - } - - bubbleSpec(state) { - this.props.setTensorBoardSpec(this.buildTensorBoardSpec(state)); - } - - buildTensorBoardSpec(state) { - let st = state.serviceType === 0 ? "ClusterIP" : "LoadBalancer"; - return { ...state, serviceType: st }; - } -} - -CreateTensorBoard.propTypes = { - setTensorBoardSpec: PropTypes.func.isRequired -}; - -export default CreateTensorBoard; diff --git a/dashboard/frontend/src/components/Job.js b/dashboard/frontend/src/components/Job.js index 2ef825ca85..10b2f8e37a 100644 --- a/dashboard/frontend/src/components/Job.js +++ b/dashboard/frontend/src/components/Job.js @@ -1,7 +1,6 @@ import React, { Component } from "react"; import JobDetail from "./JobDetail.js"; import ReplicaSpec from "./ReplicaSpec.js"; -import TensorBoard from "./TensorBoard.js"; import { Card, CardText } from "material-ui/Card"; import { getTFJobService } from "../services"; @@ -35,9 +34,6 @@ class Job extends Component {
-
- -
{replicaSpecs} ); diff --git a/dashboard/frontend/src/components/TensorBoard.js b/dashboard/frontend/src/components/TensorBoard.js deleted file mode 100644 index 04f6f80a4b..0000000000 --- a/dashboard/frontend/src/components/TensorBoard.js +++ /dev/null @@ -1,41 +0,0 @@ -import React from "react"; -import PropTypes from "prop-types"; -import InfoEntry from "./InfoEntry"; -import { Card, CardHeader, CardText } from "material-ui/Card"; - -const TensorBoard = ({ service }) => { - return ( - - - - {service ? ( -
- - {service.status.loadBalancer.ingress && ( - - )} -
- ) : ( - "TensorBoard was not configured for this TFJob or the service was deleted." - )} -
-
- ); -}; - -TensorBoard.propTypes = { - service: PropTypes.shape({ - spec: PropTypes.object.isRequired, - status: PropTypes.object.isRequired - }).isRequired -}; - -export default TensorBoard; diff --git a/developer_guide.md b/developer_guide.md index b397e990c4..0da1682c45 100644 --- a/developer_guide.md +++ b/developer_guide.md @@ -22,9 +22,6 @@ $ tree -d -I 'vendor|bin|.git' ├── docs │   └── diagrams ├── examples -│   ├── charts -│   │   └── tensorboard -│   │   └── templates │   ├── crd │   ├── gke │   │   └── notebook_image diff --git a/examples/charts/tensorboard/.helmignore b/examples/charts/tensorboard/.helmignore deleted file mode 100644 index f0c1319444..0000000000 --- a/examples/charts/tensorboard/.helmignore +++ /dev/null @@ -1,21 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj diff --git a/examples/charts/tensorboard/Chart.yaml b/examples/charts/tensorboard/Chart.yaml deleted file mode 100644 index 50f5672155..0000000000 --- a/examples/charts/tensorboard/Chart.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -description: A Helm chart for running tensorboard -name: tensorboard -version: 0.1.0 diff --git a/examples/charts/tensorboard/templates/_helpers.tpl b/examples/charts/tensorboard/templates/_helpers.tpl deleted file mode 100644 index f0d83d2edb..0000000000 --- a/examples/charts/tensorboard/templates/_helpers.tpl +++ /dev/null @@ -1,16 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "fullname" -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} diff --git a/examples/charts/tensorboard/templates/deployment.yaml b/examples/charts/tensorboard/templates/deployment.yaml deleted file mode 100644 index 0757a4f5de..0000000000 --- a/examples/charts/tensorboard/templates/deployment.yaml +++ /dev/null @@ -1,38 +0,0 @@ -kind: Service -apiVersion: v1 -metadata: - name: {{ .Release.Name }} -spec: - ports: - # Accept traffic sent to port 80 - - name: http - port: 80 - targetPort: 80 - selector: - # Loadbalance traffic across Pods matching - # this label selector - app: {{ .Release.Name }} - ---- - -apiVersion: apps/v1beta1 -kind: Deployment -metadata: - name: {{ .Release.Name }} -spec: - replicas: 1 - template: - metadata: - name: {{ .Release.Name }} - labels: - app: {{ .Release.Name }} - spec: - containers: - - name: tensorboard - image: gcr.io/tensorflow/tensorflow:latest - command: - - /usr/local/bin/tensorboard - - --logdir={{ .Values.logDir }} - - --port=80 - ports: - - containerPort: 80 diff --git a/examples/charts/tensorboard/values.yaml b/examples/charts/tensorboard/values.yaml deleted file mode 100644 index d46c02a10c..0000000000 --- a/examples/charts/tensorboard/values.yaml +++ /dev/null @@ -1,6 +0,0 @@ -# Default values for tensorboard. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. -# -# The log directory to use with TensorBoard. -logDir: \ No newline at end of file diff --git a/examples/tf_job_tensorboard_azure.yaml b/examples/tf_job_tensorboard_azure.yaml deleted file mode 100644 index 9f2d5edcd1..0000000000 --- a/examples/tf_job_tensorboard_azure.yaml +++ /dev/null @@ -1,44 +0,0 @@ -apiVersion: "kubeflow.org/v1alpha1" -kind: "TFJob" -metadata: - name: "example-job-tb-azure" -spec: - tensorboard: - logDir: /tmp/tensorflow - serviceType: LoadBalancer - volumes: - - name: azurefile - azureFile: - secretName: azure-secret - shareName: data - readOnly: false - volumeMounts: - - mountPath: /tmp/tensorflow - name: azurefile - replicaSpecs: - - replicas: 1 - tfPort: 2222 - tfReplicaType: MASTER - template: - spec: - volumes: - - name: azurefile - azureFile: - secretName: azure-secret - shareName: data - readOnly: false - containers: - - image: wbuchwalter/helm-tf:hyper-param-sweep-cpu - name: tensorflow - command: - - python - - main.py - - --log-dir - - /tmp/tensorflow/ - env: - - name: LC_ALL - value: C.UTF-8 - volumeMounts: - - mountPath: /tmp/tensorflow - name: azurefile - restartPolicy: OnFailure diff --git a/pkg/apis/tensorflow/v1alpha1/types.go b/pkg/apis/tensorflow/v1alpha1/types.go index dce90bf906..802e1a549a 100644 --- a/pkg/apis/tensorflow/v1alpha1/types.go +++ b/pkg/apis/tensorflow/v1alpha1/types.go @@ -48,14 +48,10 @@ type TFJobSpec struct { // TODO(jlewi): Can we we get rid of this and use some value from Kubernetes or a random ide. RuntimeId string - //TensorBoardSpec specifies the configuration to start a TensorBoard deployment - TensorBoard *TensorBoardSpec `json:"tensorboard"` - // ReplicaSpecs specifies the TF replicas to run. ReplicaSpecs []*TFReplicaSpec `json:"replicaSpecs"` - // TFImage defines the tensorflow docker image that should be used for Tensorboard - // and the default parameter server + // TFImage defines the tensorflow docker image that should be used for default parameter server TFImage string `json:"tfImage,omitempty"` // TerminationPolicy specifies the condition that the tfjob should be considered finished. @@ -104,14 +100,6 @@ type TFReplicaSpec struct { TFReplicaType `json:"tfReplicaType"` } -type TensorBoardSpec struct { - //Location of TensorFlow event files - LogDir string `json:"logDir"` - Volumes []v1.Volume `json:"volumes"` - VolumeMounts []v1.VolumeMount `json:"volumeMounts"` - ServiceType v1.ServiceType `json:"serviceType"` -} - type TFJobPhase string const ( diff --git a/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go index 37ddf1345f..7b7817e9d9 100644 --- a/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/tensorflow/v1alpha1/zz_generated.deepcopy.go @@ -81,10 +81,6 @@ func RegisterDeepCopies(scheme *runtime.Scheme) error { in.(*TFReplicaStatus).DeepCopyInto(out.(*TFReplicaStatus)) return nil }, InType: reflect.TypeOf(&TFReplicaStatus{})}, - conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { - in.(*TensorBoardSpec).DeepCopyInto(out.(*TensorBoardSpec)) - return nil - }, InType: reflect.TypeOf(&TensorBoardSpec{})}, conversion.GeneratedDeepCopyFunc{Fn: func(in interface{}, out interface{}, c *conversion.Cloner) error { in.(*TerminationPolicySpec).DeepCopyInto(out.(*TerminationPolicySpec)) return nil @@ -257,15 +253,6 @@ func (in *TFJobList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TFJobSpec) DeepCopyInto(out *TFJobSpec) { *out = *in - if in.TensorBoard != nil { - in, out := &in.TensorBoard, &out.TensorBoard - if *in == nil { - *out = nil - } else { - *out = new(TensorBoardSpec) - (*in).DeepCopyInto(*out) - } - } if in.ReplicaSpecs != nil { in, out := &in.ReplicaSpecs, &out.ReplicaSpecs *out = make([]*TFReplicaSpec, len(*in)) @@ -394,36 +381,6 @@ func (in *TFReplicaStatus) DeepCopy() *TFReplicaStatus { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TensorBoardSpec) DeepCopyInto(out *TensorBoardSpec) { - *out = *in - if in.Volumes != nil { - in, out := &in.Volumes, &out.Volumes - *out = make([]v1.Volume, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } - if in.VolumeMounts != nil { - in, out := &in.VolumeMounts, &out.VolumeMounts - *out = make([]v1.VolumeMount, len(*in)) - for i := range *in { - (*in)[i].DeepCopyInto(&(*out)[i]) - } - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TensorBoardSpec. -func (in *TensorBoardSpec) DeepCopy() *TensorBoardSpec { - if in == nil { - return nil - } - out := new(TensorBoardSpec) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TerminationPolicySpec) DeepCopyInto(out *TerminationPolicySpec) { *out = *in diff --git a/pkg/apis/tensorflow/validation/validation.go b/pkg/apis/tensorflow/validation/validation.go index 5c165478d4..bd0219924c 100644 --- a/pkg/apis/tensorflow/validation/validation.go +++ b/pkg/apis/tensorflow/validation/validation.go @@ -75,8 +75,5 @@ func ValidateTFJobSpec(c *tfv1.TFJobSpec) error { return fmt.Errorf("Missing ReplicaSpec for chief: %v", c.TerminationPolicy.Chief.ReplicaName) } - if c.TensorBoard != nil && c.TensorBoard.LogDir == "" { - return fmt.Errorf("tbReplicaSpec.LogDir must be specified") - } return nil } diff --git a/pkg/trainer/tensorboard.go b/pkg/trainer/tensorboard.go deleted file mode 100644 index 65b3ff61f4..0000000000 --- a/pkg/trainer/tensorboard.go +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "errors" - "fmt" - "strings" - - log "github.com/golang/glog" - "github.com/golang/protobuf/proto" - "k8s.io/api/core/v1" - "k8s.io/api/extensions/v1beta1" - k8s_errors "k8s.io/apimachinery/pkg/api/errors" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/intstr" - "k8s.io/client-go/kubernetes" - - "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/helper" - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" -) - -const TbPort = 6006 - -// TBReplicaSet represent the RS for the TensorBoard instance -type TBReplicaSet struct { - ClientSet kubernetes.Interface - Job *TrainingJob - Spec tfv1alpha1.TensorBoardSpec -} - -func NewTBReplicaSet(clientSet kubernetes.Interface, s tfv1alpha1.TensorBoardSpec, job *TrainingJob) (*TBReplicaSet, error) { - if s.LogDir == "" { - return nil, errors.New("tbReplicaSpec.LogDir must be specified") - } - - return &TBReplicaSet{ - ClientSet: clientSet, - Job: job, - Spec: s, - }, nil -} - -func (s *TBReplicaSet) Create() error { - // By default we assume TensorBoard's service will be a ClusterIP - // unless specified otherwise by the user - st := v1.ServiceType("ClusterIP") - if s.Spec.ServiceType != "" { - st = s.Spec.ServiceType - } - - // create the service exposing TensorBoard - service := &v1.Service{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.jobName(), - Labels: s.Labels(), - OwnerReferences: []meta_v1.OwnerReference{ - helper.AsOwner(s.Job.job), - }, - }, - Spec: v1.ServiceSpec{ - Type: st, - Selector: s.Labels(), - Ports: []v1.ServicePort{ - { - Name: "tb-port", - Port: 80, - TargetPort: intstr.IntOrString{ - IntVal: TbPort, - }, - }, - }, - }, - } - - log.Infof("Creating Service: %v", service.ObjectMeta.Name) - _, err := s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Create(service) - - // If the job already exists do nothing. - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("Service %v already exists.", s.jobName()) - } else { - return err - } - } - - newD := &v1beta1.Deployment{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.jobName(), - Labels: s.Labels(), - OwnerReferences: []meta_v1.OwnerReference{ - helper.AsOwner(s.Job.job), - }, - }, - Spec: v1beta1.DeploymentSpec{ - Selector: &meta_v1.LabelSelector{ - MatchLabels: s.Labels(), - }, - Replicas: proto.Int32(1), - Template: s.getDeploymentSpecTemplate(s.Job.job.Spec.TFImage), - }, - } - - log.Infof("Creating Deployment: %v", newD.ObjectMeta.Name) - _, err = s.ClientSet.ExtensionsV1beta1().Deployments(s.Job.job.ObjectMeta.Namespace).Create(newD) - - if err != nil { - if k8s_errors.IsAlreadyExists(err) { - log.Infof("%v already exists.", s.jobName()) - } else { - return err - } - } - return nil -} - -func (s *TBReplicaSet) Delete() error { - failures := false - - delProp := meta_v1.DeletePropagationForeground - log.V(1).Infof("Deleting deployment %v:%v", s.Job.job.ObjectMeta.Namespace, s.jobName()) - err := s.ClientSet.ExtensionsV1beta1().Deployments(s.Job.job.ObjectMeta.Namespace).Delete(s.jobName(), &meta_v1.DeleteOptions{ - PropagationPolicy: &delProp, - }) - if err != nil { - log.Errorf("There was a problem deleting TensorBoard's deployment %v; %v", s.jobName(), err) - failures = true - } - - log.V(1).Infof("Deleting service %v:%v", s.Job.job.ObjectMeta.Namespace, s.jobName()) - err = s.ClientSet.CoreV1().Services(s.Job.job.ObjectMeta.Namespace).Delete(s.jobName(), &meta_v1.DeleteOptions{}) - if err != nil { - log.Errorf("Error deleting service: %v; %v", s.jobName(), err) - failures = true - } - - if failures { - return errors.New("There was an issue deleting TensorBoard's resources") - } - return nil -} - -func (s *TBReplicaSet) getDeploymentSpecTemplate(image string) v1.PodTemplateSpec { - // TODO: make the TensorFlow image a parameter of the job operator. - c := &v1.Container{ - Name: s.jobName(), - Image: image, - Command: []string{ - "tensorboard", "--logdir", s.Spec.LogDir, "--host", "0.0.0.0", - }, - Ports: []v1.ContainerPort{ - { - ContainerPort: TbPort, - }, - }, - VolumeMounts: make([]v1.VolumeMount, 0), - } - - c.VolumeMounts = append(c.VolumeMounts, s.Spec.VolumeMounts...) - - ps := &v1.PodSpec{ - Containers: []v1.Container{*c}, - Volumes: make([]v1.Volume, 0), - } - - ps.Volumes = append(ps.Volumes, s.Spec.Volumes...) - - return v1.PodTemplateSpec{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: s.jobName(), - Labels: s.Labels(), - }, - Spec: *ps, - } - -} - -func (s *TBReplicaSet) Labels() KubernetesLabels { - return KubernetesLabels(map[string]string{ - "kubeflow.org": "", - "runtime_id": s.Job.job.Spec.RuntimeId, - "app": "tensorboard", - "tf_job_name": s.Job.job.ObjectMeta.Name, - }) -} - -func (s *TBReplicaSet) jobName() string { - // Truncate tfjob name to 40 characters - // The whole job name should be compliant with the DNS_LABEL spec, up to a max length of 63 characters - // Thus jobname(40 chars)-tensorboard(11 chars)-runtimeId(4 chars), also leaving some spaces - // See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture/identifiers.md - return fmt.Sprintf("%v-tensorboard-%v", fmt.Sprintf("%.40s", s.Job.job.ObjectMeta.Name), strings.ToLower(s.Job.job.Spec.RuntimeId)) -} diff --git a/pkg/trainer/tensorboard_test.go b/pkg/trainer/tensorboard_test.go deleted file mode 100644 index f02834056f..0000000000 --- a/pkg/trainer/tensorboard_test.go +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright 2018 The Kubeflow Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package trainer - -import ( - "reflect" - "testing" - - "github.com/golang/protobuf/proto" - "k8s.io/api/core/v1" - meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/fake" - "k8s.io/client-go/tools/record" - - tfv1alpha1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1alpha1" - tfJobFake "github.com/kubeflow/tf-operator/pkg/client/clientset/versioned/fake" - "github.com/kubeflow/tf-operator/pkg/util" -) - -func TestTBReplicaSet(t *testing.T) { - clientSet := fake.NewSimpleClientset() - - jobSpec := &tfv1alpha1.TFJob{ - ObjectMeta: meta_v1.ObjectMeta{ - Name: "some-job", - UID: "some-uid", - }, - Spec: tfv1alpha1.TFJobSpec{ - RuntimeId: "some-runtime", - ReplicaSpecs: []*tfv1alpha1.TFReplicaSpec{ - { - Replicas: proto.Int32(1), - TFPort: proto.Int32(10), - Template: &v1.PodTemplateSpec{}, - TFReplicaType: tfv1alpha1.MASTER, - }, - }, - TensorBoard: &tfv1alpha1.TensorBoardSpec{ - LogDir: "/tmp/tensorflow", - }, - }, - } - - recorder := record.NewFakeRecorder(100) - job, err := initJob(clientSet, &tfJobFake.Clientset{}, recorder, jobSpec) - - if err != nil { - t.Fatalf("initJob failed: %v", err) - } - - replica, err := NewTBReplicaSet(clientSet, *jobSpec.Spec.TensorBoard, job) - - if err != nil { - t.Fatalf("NewTBReplicaSet failed: %v", err) - } - - if err := replica.Create(); err != nil { - t.Fatalf("TBReplicaSet.Create() error; %v", err) - } - - // Expected labels - expectedLabels := map[string]string{ - "kubeflow.org": "", - "app": "tensorboard", - "runtime_id": "some-runtime", - "tf_job_name": "some-job", - } - - trueVal := true - expectedOwnerReference := meta_v1.OwnerReference{ - APIVersion: groupVersionKind.GroupVersion().String(), - Kind: groupVersionKind.Kind, - Name: "some-job", - UID: "some-uid", - Controller: &trueVal, - BlockOwnerDeletion: &trueVal, - } - - // Check that a service was created. - // TODO: Change this List for a Get for clarity - sList, err := clientSet.CoreV1().Services(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) - if err != nil { - t.Fatalf("List services error; %v", err) - } - - if len(sList.Items) != 1 { - t.Fatalf("Expected 1 service got %v", len(sList.Items)) - } - - s := sList.Items[0] - - if !reflect.DeepEqual(expectedLabels, s.ObjectMeta.Labels) { - t.Fatalf("Service Labels; Got %v Want: %v", s.ObjectMeta.Labels, expectedLabels) - } - - name := "some-job-tensorboard-some-runtime" - if s.ObjectMeta.Name != name { - t.Fatalf("Job.ObjectMeta.Name = %v; want %v", s.ObjectMeta.Name, name) - } - - if len(s.ObjectMeta.OwnerReferences) != 1 { - t.Fatalf("Expected 1 owner reference got %v", len(s.ObjectMeta.OwnerReferences)) - } - - if !reflect.DeepEqual(s.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { - t.Fatalf("Service.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(s.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) - } - - // Check that a deployment was created. - l, err := clientSet.ExtensionsV1beta1().Deployments(replica.Job.job.ObjectMeta.Namespace).List(meta_v1.ListOptions{}) - if err != nil { - t.Fatalf("List deployments error; %v", err) - } - - if len(l.Items) != 1 { - t.Fatalf("Expected 1 deployment got %v", len(l.Items)) - } - - d := l.Items[0] - - if !reflect.DeepEqual(expectedLabels, d.ObjectMeta.Labels) { - t.Fatalf("Deployment Labels; Got %v Want: %v", expectedLabels, d.ObjectMeta.Labels) - } - - if d.ObjectMeta.Name != name { - t.Fatalf("Deployment.ObjectMeta.Name = %v; want %v", d.ObjectMeta.Name, name) - } - - if len(d.ObjectMeta.OwnerReferences) != 1 { - t.Fatalf("Expected 1 owner reference got %v", len(d.ObjectMeta.OwnerReferences)) - } - - if !reflect.DeepEqual(d.ObjectMeta.OwnerReferences[0], expectedOwnerReference) { - t.Fatalf("Service.Metadata.OwnerReferences; Got %v; want %v", util.Pformat(s.ObjectMeta.OwnerReferences[0]), util.Pformat(expectedOwnerReference)) - } - - // Delete the job. - // N.B it doesn't look like the Fake clientset is sophisticated enough to delete jobs in response to a - // DeleteCollection request (deleting individual jobs does appear to work with the Fake). So if we were to list - // the jobs after calling Delete we'd still see the job. So we will rely on E2E tests to verify Delete works - // correctly. - if err := replica.Delete(); err != nil { - t.Fatalf("TBReplicaSet.Delete() error; %v", err) - } -} diff --git a/pkg/trainer/training.go b/pkg/trainer/training.go index 5f90f2384d..abf5e42df6 100644 --- a/pkg/trainer/training.go +++ b/pkg/trainer/training.go @@ -48,8 +48,6 @@ type TrainingJob struct { Replicas []*TFReplicaSet - TensorBoard *TBReplicaSet - tfJobClient tfjobclient.Interface // in memory state of the job. @@ -76,7 +74,6 @@ func initJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, re tfJobClient: tfJobClient, recorder: recorder, Replicas: make([]*TFReplicaSet, 0), - TensorBoard: nil, job: job, status: *job.Status.DeepCopy(), } @@ -84,13 +81,6 @@ func initJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, re return j, nil } -func initTensorBoard(clientSet kubernetes.Interface, tj *TrainingJob) (*TBReplicaSet, error) { - if tj.job.Spec.TensorBoard != nil { - return NewTBReplicaSet(clientSet, *tj.job.Spec.TensorBoard, tj) - } - return nil, nil -} - func NewJob(kubeCli kubernetes.Interface, tfJobClient tfjobclient.Interface, recorder record.EventRecorder, job *tfv1alpha1.TFJob, config *tfv1alpha1.ControllerConfig) (*TrainingJob, error) { j, err := initJob(kubeCli, tfJobClient, recorder, job) if err != nil { @@ -120,7 +110,7 @@ func (j *TrainingJob) ClusterSpec() ClusterSpec { return clusterSpec } -// createResources creates all the replicas and TensorBoard if requested +// createResources creates all the replicas if requested func (j *TrainingJob) createResources(config *tfv1alpha1.ControllerConfig) error { for _, r := range j.Replicas { if err := r.Create(config); err != nil { @@ -128,16 +118,10 @@ func (j *TrainingJob) createResources(config *tfv1alpha1.ControllerConfig) error } } - if j.TensorBoard != nil { - if err := j.TensorBoard.Create(); err != nil { - return err - } - } - return nil } -// deleteResources deletes the replicas and TensorBoard it it was created +// deleteResources deletes the replicas it it was created func (j *TrainingJob) deleteResources() error { for _, r := range j.Replicas { if err := r.Delete(); err != nil { @@ -145,11 +129,6 @@ func (j *TrainingJob) deleteResources() error { } } - if j.TensorBoard != nil { - if err := j.TensorBoard.Delete(); err != nil { - return err - } - } return nil } @@ -290,14 +269,6 @@ func (j *TrainingJob) setupReplicas() error { } } - if j.TensorBoard == nil { - tb, err := initTensorBoard(j.KubeCli, j) - if err != nil { - return err - } - j.TensorBoard = tb - } - return nil } diff --git a/pkg/trainer/training_test.go b/pkg/trainer/training_test.go index 836084148b..75d639afc9 100644 --- a/pkg/trainer/training_test.go +++ b/pkg/trainer/training_test.go @@ -282,19 +282,12 @@ func TestJobSetup(t *testing.T) { TFReplicaType: tfv1alpha1.WORKER, }, }, - TensorBoard: &tfv1alpha1.TensorBoardSpec{}, - TerminationPolicy: &tfv1alpha1.TerminationPolicySpec{ - Chief: &tfv1alpha1.ChiefSpec{ - ReplicaName: string(tfv1alpha1.WORKER), - ReplicaIndex: 0, - }, - }, }, }, expectMounts: 0, expectPhase: tfv1alpha1.TFJobPhaseFailed, expectState: tfv1alpha1.StateFailed, - expectReason: "invalid job spec: tbReplicaSpec.LogDir must be specified", + expectReason: "invalid job spec: Missing ReplicaSpec for chief: MASTER", }, } diff --git a/py/test_runner.py b/py/test_runner.py index 236020f544..531a61085a 100644 --- a/py/test_runner.py +++ b/py/test_runner.py @@ -217,8 +217,7 @@ def run_test(args): # pylint: disable=too-many-branches,too-many-statements # TODO(jlewi): # Here are some validation checks to run: - # 1. Check tensorboard is created if its part of the job spec. - # 2. Check that all resources are garbage collected. + # 1. Check that all resources are garbage collected. # TODO(jlewi): Add an option to add chaos and randomly kill various resources? # TODO(jlewi): Are there other generic validation checks we should # run. diff --git a/test/e2e/main.go b/test/e2e/main.go index edc065f79c..dc07b0e617 100644 --- a/test/e2e/main.go +++ b/test/e2e/main.go @@ -92,9 +92,6 @@ func run() (string, error) { tfReplicaType(tfv1alpha1.PS).toSpec(), tfReplicaType(tfv1alpha1.WORKER).toSpec(), }, - TensorBoard: &tfv1alpha1.TensorBoardSpec{ - LogDir: "/tmp/tensorflow", - }, }, } @@ -149,21 +146,6 @@ func run() (string, error) { } } - // Check that the TensorBoard deployment is present - tbDeployName := fmt.Sprintf("%v-tensorboard-%v", fmt.Sprintf("%.40s", original.ObjectMeta.Name), tfJob.Spec.RuntimeId) - _, err = kubeCli.ExtensionsV1beta1().Deployments(*namespace).Get(tbDeployName, metav1.GetOptions{}) - - if err != nil { - return *name, fmt.Errorf("TFJob %v did not create Deployment %v for TensorBoard", *name, tbDeployName) - } - - // Check that the TensorBoard service is present - _, err = kubeCli.CoreV1().Services(*namespace).Get(tbDeployName, metav1.GetOptions{}) - - if err != nil { - return *name, fmt.Errorf("TFJob %v did not create Service %v for TensorBoard", *name, tbDeployName) - } - // Delete the job and make sure all subresources are properly garbage collected. if err := tfJobClient.KubeflowV1alpha1().TFJobs(*namespace).Delete(*name, &metav1.DeleteOptions{}); err != nil { log.Fatalf("Failed to delete TFJob %v; error %v", *name, err) @@ -172,7 +154,6 @@ func run() (string, error) { // Define sets to keep track of Job controllers corresponding to Replicas // that still exist. jobs := make(map[string]bool) - isTBDeployDeleted := false // Loop over each replica and make sure the expected resources are being deleted. for _, r := range original.Spec.ReplicaSpecs { @@ -186,7 +167,7 @@ func run() (string, error) { } // Wait for all jobs and deployment to be deleted. - for endTime := time.Now().Add(*timeout); time.Now().Before(endTime) && (len(jobs) > 0 || !isTBDeployDeleted); { + for endTime := time.Now().Add(*timeout); time.Now().Before(endTime) && len(jobs) > 0; { for k := range jobs { _, err := kubeCli.BatchV1().Jobs(*namespace).Get(k, metav1.GetOptions{}) if k8s_errors.IsNotFound(err) { @@ -198,17 +179,7 @@ func run() (string, error) { } } - if !isTBDeployDeleted { - // Check that TensorBoard deployment is being deleted - _, err = kubeCli.ExtensionsV1beta1().Deployments(*namespace).Get(tbDeployName, metav1.GetOptions{}) - if k8s_errors.IsNotFound(err) { - isTBDeployDeleted = true - } else { - log.Infof("TensorBoard deployment %v still exists for TFJob %v", tbDeployName, *name) - } - } - - if len(jobs) > 0 || !isTBDeployDeleted { + if len(jobs) > 0 { time.Sleep(5 * time.Second) } } @@ -217,10 +188,6 @@ func run() (string, error) { return *name, fmt.Errorf("Not all Job controllers were successfully deleted for TFJob %v.", *name) } - if !isTBDeployDeleted { - return *name, fmt.Errorf("TensorBoard deployment %v was not successfully deleted for TFJob %v.", tbDeployName, *name) - } - return *name, nil } diff --git a/tf_job_design_doc.md b/tf_job_design_doc.md index 285b413a59..4566007c07 100644 --- a/tf_job_design_doc.md +++ b/tf_job_design_doc.md @@ -44,8 +44,6 @@ kind: "TFJob" metadata: name: "example-job" spec: - tensorboard: - logDir: gs://my-job/log-dir replicaSpecs: - replicas: 1 tfReplicaType: MASTER @@ -70,7 +68,7 @@ spec: - replicas: 1 tfReplicaType: PS ``` -**Fig 1.** An example job spec for a distributed Training job with 1 master, 2 workers and 1 PS. TensorBoard is configured by specifying the location of the event files. +**Fig 1.** An example job spec for a distributed Training job with 1 master, 2 workers and 1 PS. As illustrated by Fig 1, I made an explicit decision not to try to hide or replace K8s abstractions. For example, each TfReplica contains a standard K8s [PodTemplate](https://kubernetes.io/docs/api-reference/v1.7/#podtemplate-v1-core) to specify the processes (including TF) to run in each replica. I did this because K8s already provides a widely adopted and understood API. So introducing new concepts in place of K8s concepts is just confusing. Furthermore, exposing the [PodTemplate](https://kubernetes.io/docs/api-reference/v1.7/#podtemplate-v1-core) makes it easy for TFJob users to leverage K8s features. For example, TFJob users can use K8s to attach volumes to their TF processes. This makes it easy to use TF in conjunction with any storage system supported by K8s (e.g. PDs, NFS, etc...) @@ -78,7 +76,7 @@ As illustrated by Fig 1, I made an explicit decision not to try to hide or repla The controller can be used to configure defaults for TFJob to create a simpler user experience. The most common use for this right now is supporting GPUs. To use GPUs, the NVIDIA drivers and libraries need to be mounted from the host into the container. This step should become unnecessary with Kubernetes 1.8. The TFJob controller will automatically add these volume mounts based on configuration specified when the controller is started. This prevents users from having to specify them for each job. Instead, only the cluster administrator who deploys the TFJob controller needs to know how the volumes should be configured. -Another use case is minimizing the boilerplate users have to write to run standard processes (e.g. [Parameter Servers](https://github.com/kubeflow/tf-operator/pull/36#discussion_r141135711) or TensorBoard) using official TF Docker images. +Another use case is minimizing the boilerplate users have to write to run standard processes (e.g. [Parameter Servers](https://github.com/kubeflow/tf-operator/pull/36#discussion_r141135711)) using official TF Docker images. ## Controller @@ -90,11 +88,6 @@ When the master exits successfully or with a permanent error the job is consider ![Resources for TFJob](docs/diagrams/tfjob_k8s_resources.svg) -## TensorBoard - -If a user specifies the log dir as part of the TFJob spec, then the TFJob controller will launch TensorBoard as part of the job. TensorBoard will run until the job is deleted. A service is also created to give the TensorBoard process a stable DNS name. The DNS name is a deterministic function of the job name; the port is always the same e.g. 80. The user can list services to find the relevant one. Nonetheless, the experience today is still rough, especially when a user has lots of jobs in the cluster. I expect that we will make various usability improvements; for example we might publish the relevant URI for accessing TensorBoard in the TFJob status. - - ## Non-distributed training A TFJob can handle non-distributed training; the TFJob spec would consist of a single replica of type master. @@ -122,4 +115,3 @@ Rather than use a CRD, we could use a tool like Helm or Ksonnet to create templa One disadvantage of templates is that they do not provide a mechanism to add custom control logic. None of the K8s builtin controllers provide a mechanism for distinguishing between retryable and permanent errors. Furthermore, the built in controllers don't propagate errors; if worker i fails with a permanent error this error won't cause the parameter servers and master controllers to be terminated. Another major disadvantage is that the templating approach forces users to manually manage multiple K8s resources. -