Make TfJob CRD prototype functionally equivalent to the helm chart (#93)
* Configure the volume mounts correctly for Azure.

* Add a parameter to set the DefaultTfImage.

* Include service and deployment for dashboard

* Update the default image to the latest CRD docker image.

* Fixes #61
jlewi committed Jan 16, 2018
1 parent c844135 commit ba78f90
Showing 2 changed files with 110 additions and 8 deletions.
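
For readers applying this change: cloud, tfDefaultImage, and tfJobUiServiceType are ordinary ksonnet prototype parameters, so they can be set per environment like any other component parameter. A rough sketch of how they might end up looking in a ksonnet app's params.libsonnet is shown below; the component name "kubeflow-core" and all of the values are hypothetical examples, not part of this commit:

// Hypothetical params.libsonnet entry; names and values are examples only.
{
  global: {},
  components: {
    // "kubeflow-core" is an assumed component name; substitute your own.
    "kubeflow-core": {
      namespace: "kubeflow",
      cloud: "azure",                                 // turns on the Azure GPU volume mounts added below
      tfDefaultImage: "tensorflow/tensorflow:1.4.1",  // example default TensorFlow image
      tfJobUiServiceType: "LoadBalancer",             // overrides the ClusterIP default for the dashboard service
    },
  },
}
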
14 changes: 12 additions & 2 deletions kubeflow/core/prototypes/all.jsonnet
@@ -6,7 +6,9 @@
 // @optionalParam namespace string default Namespace
 // @optionalParam disks string null Comma separated list of Google persistent disks to attach to jupyter environments.
 // @optionalParam cloud string null String identifying the cloud to customize the deployment for.
-// @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20171214-0bd02ac The image for the TfJob controller.
+// @optionalParam tfJobImage string gcr.io/tf-on-k8s-dogfood/tf_operator:v20171223-37af20d The image for the TfJob controller.
+// @optionalParam tfDefaultImage string null The default image to use for TensorFlow.
+// @optionalParam tfJobUiServiceType string ClusterIP The service type for the UI.
 
 // TODO(https://github.com/ksonnet/ksonnet/issues/222): We have to add namespace as an explicit parameter
 // because ksonnet doesn't support inheriting it from the environment yet.
@@ -19,6 +21,8 @@ local nfs = import "kubeflow/core/nfs.libsonnet";
 local name = import 'param://name';
 local namespace = import 'param://namespace';
 
+local cloud = import 'param://cloud';
+
 // TODO(jlewi): Make this a parameter
 local jupyterHubImage = 'gcr.io/kubeflow/jupyterhub:1.0';
 local diskParam = import 'param://disks';
@@ -32,6 +36,8 @@ local jupyterConfigMap = if std.length(diskNames) == 0 then
   else jupyter.parts(namespace).jupyterHubConfigMapWithVolumes(diskNames);
 
 local tfJobImage = import 'param://tfJobImage';
+local tfDefaultImage = import 'param://tfDefaultImage';
+local tfJobUiServiceType = import 'param://tfJobUiServiceType';
 
 // Create a list of the resources needed for a particular disk
 local diskToList = function(diskName) [
@@ -63,7 +69,11 @@ std.prune(k.core.v1.list.new([
 
   // TfJob controller
   tfjob.parts(namespace).tfJobDeploy(tfJobImage),
-  tfjob.parts(namespace).configMap,
+  tfjob.parts(namespace).configMap(cloud, tfDefaultImage),
   tfjob.parts(namespace).serviceAccount,
+
+  // TfJob controll ui
+  tfjob.parts(namespace).ui(tfJobImage),
+  tfjob.parts(namespace).uiService(tfJobUiServiceType),
 ] + nfsComponents))
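
A side note on the pattern in this hunk: the prototype wraps its components in std.prune(k.core.v1.list.new([...])), so any list entry that evaluates to null (for example, an optional component that is only generated when a feature such as disks is configured) is dropped from the final manifest list. A minimal, self-contained sketch of that behaviour, runnable with the jsonnet CLI — the placeholder objects are illustrative, not the real components:

// Sketch only: placeholder resources standing in for the real kubeflow components.
local components = [
  { kind: "Deployment", metadata: { name: "tf-job-operator" } },
  null,  // e.g. an optional component that was not generated for this configuration
  { kind: "Service", metadata: { name: "tf-job-dashboard" } },
];

{
  apiVersion: "v1",
  kind: "List",
  // std.prune drops null (and empty) entries, so only the two real objects remain.
  items: std.prune(components),
}
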

104 changes: 98 additions & 6 deletions kubeflow/core/tf-job.libsonnet
@@ -2,8 +2,7 @@
 {
   // TODO(https://github.com/ksonnet/ksonnet/issues/222): Taking namespace as an argument is a work around for the fact that ksonnet
   // doesn't support automatically piping in the namespace from the environment to prototypes.
-  parts(namespace):: {
-    // TODO(jlewi): We should add options to configure it based on there being a config file or not.
+  parts(namespace):: {
     tfJobDeploy(image): {
       "apiVersion": "extensions/v1beta1",
       "kind": "Deployment",
@@ -70,11 +69,49 @@
       }
     }, // tfJobDeploy
 
-    configMap: {
+    // Default value for
+    defaultControllerConfig(tfDefaultImage):: {
+      grpcServerFilePath: "/opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py",
+    }
+    + if tfDefaultImage != "" && tfDefaultImage != "null" then
+      {
+        tfImage: tfDefaultImage,
+      }
+    else
+      {},
+
+    azureAccelerators:: {
+      accelerators: {
+        "alpha.kubernetes.io/nvidia-gpu": {
+          volumes: [
+            { name: "lib",
+              mountPath: "/usr/local/nvidia/lib64",
+              hostPath: "/usr/lib/nvidia-384",
+            },
+            {
+              name: "bin",
+              mountPath: "/usr/local/nvidia/bin",
+              hostPath: "/usr/lib/nvidia-384/bin",
+            },
+            { name: "libcuda",
+              mountPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
+              hostPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.1",
+            },
+          ]
+        }
+      }
+    },
+
+    configData(cloud, tfDefaultImage):: self.defaultControllerConfig(tfDefaultImage) +
+      if cloud == "azure" then
+        self.azureAccelerators
+      else
+        {},
+
+    configMap(cloud, tfDefaultImage): {
       "apiVersion": "v1",
-      "data": {
-        # TODO(jlewi): We should customize the file based on the Cloud.
-        "controller_config_file.yaml": @"grpcServerFilePath: /opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py",
+      "data": {
+        "controller_config_file.yaml": std.manifestJson($.parts(namespace).configData(cloud, tfDefaultImage)),
       },
       "kind": "ConfigMap",
       "metadata": {
@@ -95,5 +132,60 @@
       }
     },
 
+    uiService(serviceType):: {
+      "apiVersion": "v1",
+      "kind": "Service",
+      "metadata": {
+        "name": "tf-job-dashboard",
+        "namespace": namespace,
+      },
+      "spec": {
+        "ports": [
+          {
+            "port": 80,
+            "targetPort": 8080
+          }
+        ],
+        "selector": {
+          "name": "tf-job-dashboard"
+        },
+        "type": serviceType,
+      }
+    }, // uiService
+
+    ui(image):: {
+      "apiVersion": "extensions/v1beta1",
+      "kind": "Deployment",
+      "metadata": {
+        "name": "tf-job-dashboard",
+        "namespace": namespace,
+      },
+      "spec": {
+        "template": {
+          "metadata": {
+            "labels": {
+              "name": "tf-job-dashboard"
+            }
+          },
+          "spec": {
+            "containers": [
+              {
+                "command": [
+                  "/opt/tensorflow_k8s/dashboard/backend"
+                ],
+                "image": image,
+                "name": "tf-job-dashboard",
+                "ports": [
+                  {
+                    "containerPort": 8080
+                  }
+                ]
+              }
+            ]
+          }
+        }
+      },
+    }, // ui
+
   },
 }
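
To make the new config plumbing concrete: configData merges defaultControllerConfig with the Azure accelerator volume mounts only when cloud == "azure", and configMap serializes the merged object into controller_config_file.yaml with std.manifestJson. Note that the prototype's optional parameters default to the literal string "null" (see the @optionalParam lines above), which is why the library checks tfDefaultImage != "null". Below is a standalone, simplified sketch of the same merge-then-serialize pattern, runnable with the jsonnet CLI; it is an illustration (with an arbitrary example image tag), not the library file itself:

// Simplified sketch (not the actual library): one Azure volume mount shown for brevity.
local defaultControllerConfig(tfDefaultImage) = {
  grpcServerFilePath: "/opt/mlkube/grpc_tensorflow_server/grpc_tensorflow_server.py",
} + (if tfDefaultImage != "" && tfDefaultImage != "null" then { tfImage: tfDefaultImage } else {});

local azureAccelerators = {
  accelerators: {
    "alpha.kubernetes.io/nvidia-gpu": {
      volumes: [
        { name: "lib", mountPath: "/usr/local/nvidia/lib64", hostPath: "/usr/lib/nvidia-384" },
      ],
    },
  },
};

local configData(cloud, tfDefaultImage) =
  defaultControllerConfig(tfDefaultImage) +
  (if cloud == "azure" then azureAccelerators else {});

{
  // The merged object is stored as a JSON string; JSON is also valid YAML for the consumer.
  "controller_config_file.yaml": std.manifestJson(configData("azure", "tensorflow/tensorflow:1.4.1")),
}
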
