Skip to content

Commit

Permalink
Move TF Operator e2e tests to AWS Prow (#1204)
Browse files Browse the repository at this point in the history
  • Loading branch information
ChanYiLin committed Feb 9, 2021
1 parent 62f0e0a commit 047d6af
Show file tree
Hide file tree
Showing 23 changed files with 457 additions and 156 deletions.
47 changes: 47 additions & 0 deletions manifests/crd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# CustomResourceDefinition that registers the namespaced `TFJob` kind
# (group kubeflow.org, version v1) and validates replica counts for the
# known replica types.
#
# NOTE(review): apiextensions.k8s.io/v1beta1 is no longer served as of
# Kubernetes 1.22. Migrating to apiextensions.k8s.io/v1 requires a structural
# per-version schema, so it is intentionally not done here.
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: tfjobs.kubeflow.org
spec:
  group: kubeflow.org
  scope: Namespaced
  names:
    kind: TFJob
    singular: tfjob
    plural: tfjobs
  versions:
    - name: v1
      served: true
      storage: true
  # Enables the /status subresource for TFJob objects.
  subresources:
    status: {}
  # In v1beta1 `validation` is a spec-level field shared by all versions.
  validation:
    openAPIV3Schema:
      properties:
        spec:
          properties:
            tfReplicaSpecs:
              properties:
                # The validation works when the configuration contains
                # `Worker`, `PS`, `Chief` or `Evaluator`.
                # Otherwise it will not be validated.
                Worker:
                  properties:
                    replicas:
                      type: integer
                      minimum: 1
                PS:
                  properties:
                    replicas:
                      type: integer
                      minimum: 1
                # Chief is pinned to exactly one replica (minimum == maximum).
                Chief:
                  properties:
                    replicas:
                      type: integer
                      minimum: 1
                      maximum: 1
                # Evaluator is the only replica type allowed to be zero.
                Evaluator:
                  properties:
                    replicas:
                      type: integer
                      minimum: 0
30 changes: 30 additions & 0 deletions manifests/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Deployment running a single replica of the TFJob operator in the
# `kubeflow` namespace under the `tf-job-operator` service account.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: tf-job-operator
  namespace: kubeflow
spec:
  replicas: 1
  # The selector must match the pod template labels below.
  selector:
    matchLabels:
      name: tf-job-operator
  template:
    metadata:
      labels:
        name: tf-job-operator
    spec:
      serviceAccountName: tf-job-operator
      containers:
        - name: tf-job-operator
          # kustomization.yaml rewrites this image name and tag via its
          # `images` entry.
          image: gcr.io/kubeflow-images-public/tf-operator:v0.6.0
          args:
            # Same port the tf-job-operator Service exposes (8443).
            - --monitoring-port=8443
          env:
            # Both values are injected from the pod's own metadata
            # through fieldRef.
            - name: MY_POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: MY_POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
15 changes: 15 additions & 0 deletions manifests/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Kustomization that assembles the TF operator manifests for the `kubeflow`
# namespace, labels them, and swaps the operator image for the test build.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kubeflow
# NOTE(review): podgroup.yaml is not listed here - presumably it is applied
# separately when gang scheduling is needed; confirm.
resources:
  - crd.yaml
  - namespace.yaml
  - rbac.yaml
  - deployment.yaml
  - service.yaml
commonLabels:
  kustomize.component: tf-job-operator
images:
  # Replaces the public gcr.io image referenced in deployment.yaml.
  - name: gcr.io/kubeflow-images-public/tf-operator
    newName: 809251082950.dkr.ecr.us-west-2.amazonaws.com/tf-operator
    # Quoted so the tag stays the string "0.1" rather than a float.
    newTag: "0.1"
4 changes: 4 additions & 0 deletions manifests/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Namespace that the operator's namespaced resources are deployed into.
apiVersion: v1
kind: Namespace
metadata:
  name: kubeflow
39 changes: 39 additions & 0 deletions manifests/podgroup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# CustomResourceDefinition that registers the namespaced `PodGroup` kind
# (group scheduling.incubator.k8s.io, version v1alpha1).
#
# NOTE(review): apiextensions.k8s.io/v1beta1 is no longer served as of
# Kubernetes 1.22; moving to apiextensions.k8s.io/v1 also means replacing the
# single `version` field with a `versions` list.
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: podgroups.scheduling.incubator.k8s.io
spec:
  group: scheduling.incubator.k8s.io
  names:
    kind: PodGroup
    plural: podgroups
  scope: Namespaced
  validation:
    openAPIV3Schema:
      type: object
      properties:
        apiVersion:
          type: string
        kind:
          type: string
        metadata:
          type: object
        spec:
          type: object
          properties:
            minMember:
              format: int32
              type: integer
        # Per-state int32 counters reported in the PodGroup status.
        status:
          type: object
          properties:
            succeeded:
              format: int32
              type: integer
            failed:
              format: int32
              type: integer
            running:
              format: int32
              type: integer
  version: v1alpha1
61 changes: 61 additions & 0 deletions manifests/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# ServiceAccount the operator Deployment runs as; the ClusterRoleBinding in
# this file grants it the tf-job-operator ClusterRole.
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app: tf-job-operator
  name: tf-job-operator
  namespace: kubeflow
---
# ClusterRole listing every permission granted to the TFJob operator.
# Uses rbac.authorization.k8s.io/v1: the v1beta1 version is no longer served
# as of Kubernetes 1.22, and v1 has the same schema.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app: tf-job-operator
  name: tf-job-operator
rules:
  # Full control over TFJob objects plus their status and finalizers.
  - apiGroups:
      - kubeflow.org
    resources:
      - tfjobs
      - tfjobs/status
      - tfjobs/finalizers
    verbs:
      - '*'
  - apiGroups:
      - apiextensions.k8s.io
    resources:
      - customresourcedefinitions
    verbs:
      - '*'
  # "" is the core API group.
  - apiGroups:
      - ""
    resources:
      - pods
      - services
      - endpoints
      - events
    verbs:
      - '*'
  - apiGroups:
      - apps
      - extensions
    resources:
      - deployments
    verbs:
      - '*'
---
# Binds the tf-job-operator ClusterRole to the tf-job-operator ServiceAccount
# in the `kubeflow` namespace.
# Uses rbac.authorization.k8s.io/v1: the v1beta1 version is no longer served
# as of Kubernetes 1.22, and v1 has the same schema.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app: tf-job-operator
  name: tf-job-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: tf-job-operator
subjects:
  - kind: ServiceAccount
    name: tf-job-operator
    namespace: kubeflow
---
19 changes: 19 additions & 0 deletions manifests/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# ClusterIP Service exposing the operator's monitoring port (8443).
apiVersion: v1
kind: Service
metadata:
  annotations:
    # Scrape hints for Prometheus setups that honor prometheus.io/*
    # annotations; values are quoted because annotations must be strings.
    prometheus.io/path: /metrics
    prometheus.io/scrape: "true"
    prometheus.io/port: "8443"
  labels:
    app: tf-job-operator
  name: tf-job-operator
  namespace: kubeflow
spec:
  type: ClusterIP
  ports:
    - name: monitoring-port
      port: 8443
      targetPort: 8443
  # Matches the `name` label on the operator Deployment's pod template.
  selector:
    name: tf-job-operator
4 changes: 3 additions & 1 deletion py/kubeflow/tf_operator/cleanpod_policy_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from kubeflow.testing import ks_util, test_util, util
from kubeflow.tf_operator import k8s_util, test_runner, tf_job_client
from kubeflow.tf_operator import util as tf_operator_util
from kubernetes import client as k8s_client

CLEANPOD_ALL_COMPONENT_NAME = "clean_pod_all"
Expand All @@ -23,11 +24,12 @@ def __init__(self, args):
class_name="CleanPodPolicyTests", name=name)

def run_tfjob_with_cleanpod_policy(self, component, clean_pod_policy):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()

# Setup the ksonnet app
ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)

# Create the TF job
Expand Down
2 changes: 1 addition & 1 deletion py/kubeflow/tf_operator/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def ks_deploy(app_dir, component, params, env=None, account=None):
if not re.search(".*environment.*already exists.*", e.output):
raise

for k, v in params.iteritems():
for k, v in params.items():
util.run([ks_cmd, "param", "set", "--env=" + env, component, k, v],
cwd=app_dir)

Expand Down
4 changes: 3 additions & 1 deletion py/kubeflow/tf_operator/distributed_training_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from kubeflow.testing import ks_util, test_util, util
from kubeflow.tf_operator import test_runner, tf_job_client
from kubeflow.tf_operator import util as tf_operator_util
from kubernetes import client as k8s_client

TFJOB_COMPONENT_NAME = "distributed_training"
Expand All @@ -23,10 +24,11 @@ def __init__(self, args):
# Run a distributed training TFJob, wait for it to complete, and check for pod/service
# creation errors.
def run_distributed_training_job(self, component):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()

# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)

# Create the TF job
Expand Down
3 changes: 2 additions & 1 deletion py/kubeflow/tf_operator/estimator_runconfig_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,13 @@ def __init__(self, args):

# Run a TFJob, verify that the TensorFlow runconfig specs are set correctly.
def test_tfjob_and_verify_runconfig(self):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()
masterHost = api_client.configuration.host
component = COMPONENT_NAME + "_" + self.tfjob_version

# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)

# Create the TF job
Expand Down
4 changes: 3 additions & 1 deletion py/kubeflow/tf_operator/invalid_tfjob_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from kubeflow.testing import ks_util, test_util, util
from kubeflow.tf_operator import test_runner, tf_job_client
from kubeflow.tf_operator import util as tf_operator_util
from kubernetes import client as k8s_client

INVALID_TFJOB_COMPONENT_NAME = "invalid_tfjob"
Expand All @@ -22,11 +23,12 @@ def __init__(self, args):
class_name="InvalidTfJobTests", name=name)

def test_invalid_tfjob_spec(self):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()
component = INVALID_TFJOB_COMPONENT_NAME + "_" + self.tfjob_version

# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)

# Create the TF job
Expand Down
6 changes: 4 additions & 2 deletions py/kubeflow/tf_operator/pod_names_validation_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from kubeflow.testing import ks_util, test_util, util
from kubeflow.tf_operator import test_runner, tf_job_client
from kubeflow.tf_operator import util as tf_operator_util
from kubernetes import client as k8s_client

COMPONENT_NAME = "pod_names_validation"
Expand All @@ -22,7 +23,7 @@ def extract_job_specs(replica_specs):
"""
specs = dict()
for job_type in replica_specs:
specs[job_type.encode("ascii").lower()] = int(
specs[job_type.lower()] = int(
replica_specs.get(job_type, {}).get("replicas", 0))
return specs

Expand All @@ -44,10 +45,11 @@ def __init__(self, args):
class_name="PodNamesValidationTest", name=name)

def test_pod_names(self):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()
component = COMPONENT_NAME + "_" + self.tfjob_version
ks_cmd = ks_util.get_ksonnet_cmd(self.app_dir)
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)
util.run([ks_cmd, "apply", self.env, "-c", component], cwd=self.app_dir)
logging.info("Created job %s in namespaces %s", self.name, self.namespace)
Expand Down
4 changes: 3 additions & 1 deletion py/kubeflow/tf_operator/replica_restart_policy_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from kubeflow.testing import ks_util, test_util, util
from kubeflow.tf_operator import test_runner, tf_job_client
from kubeflow.tf_operator import util as tf_operator_util
from kubernetes import client as k8s_client

REPLICA_RESTART_POLICY_ALWAYS_COMPONENT_NAME = "replica_restart_policy_always"
Expand All @@ -25,10 +26,11 @@ def __init__(self, args):

def run_tfjob_with_replica_restart_policy(self, component,
replica_restart_policy, exit_code):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()

# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)

# Create the TF job
Expand Down
4 changes: 3 additions & 1 deletion py/kubeflow/tf_operator/shutdown_policy_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from kubeflow.testing import ks_util, test_util, util
from kubeflow.tf_operator import test_runner, tf_job_client
from kubeflow.tf_operator import util as tf_operator_util
from kubernetes import client as k8s_client

MASTER_IS_CHIEF_COMPONENT_NAME = "master_is_chief"
Expand All @@ -22,10 +23,11 @@ def __init__(self, args):
class_name="ShutdownPolicyTests", name=name)

def run_tfjob_with_shutdown_policy(self, component, shutdown_policy):
tf_operator_util.load_kube_config()
api_client = k8s_client.ApiClient()

# Setup the ksonnet app
ks_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
tf_operator_util.setup_ks_app(self.app_dir, self.env, self.namespace, component,
self.params)

# Create the TF job
Expand Down
Loading

0 comments on commit 047d6af

Please sign in to comment.