Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Katib Doesn't make Trials #926

Closed
wjddyd66 opened this issue Nov 22, 2019 · 17 comments
Closed

Katib Doesn't make Trials #926

wjddyd66 opened this issue Nov 22, 2019 · 17 comments
Labels

Comments

@wjddyd66
Copy link

wjddyd66 commented Nov 22, 2019

/kind bug
Please check if I missed anything.

  1. Make Python File
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
from tensorflow.keras.datasets.cifar10 import load_data
import os
import sys
import tensorflow as tf

#Dropout & LearningLate 받을 변수 선언
parser = argparse.ArgumentParser()
parser.add_argument('--num_epoch', type=int, default=1000, help='Number of steps to run trainer.')
parser.add_argument('--learning_rate', type=float, default=0.001, help='Initial learning rate')

FLAGS, unparsed = parser.parse_known_args()

num_epoch = FLAGS.num_epoch
learning_rate = FLAGS.learning_rate

#next_batch 선언
def next_batch(num, data, labels):
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[i] for i in idx]
    labels_shuffle = [labels[i] for i in idx]
    
    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

#CNN Model Definition
def build_CNN_Classifier(x):
    x_image = x
    
    # 1_Convolution_Layer
    # RGB(Color) image를 64개의 feature으로 mapping 과정
    W_conv1 = tf.Variable(tf.truncated_normal(shape=[5,5,3,64],stddev=5e-2))
    #truncated: 정규 분포로서 출력
    b_conv1 = tf.Variable(tf.constant(0.1, shape=[64]))
    h_conv1 = tf.nn.relu(tf.nn.conv2d(x_image, W_conv1, strides=[1,1,1,1],padding='SAME')+b_conv1)
    
    # 1_Pooling Layer
    h_pool1 = tf.nn.max_pool(h_conv1,ksize=[1,3,3,1],strides=[1,2,2,1],padding='SAME')
    
    # 2_Convolution_Layer
    # 64개의 feature을 다시 64개의 feature으로 mapping 하는 과정
    W_conv2 = tf.Variable(tf.truncated_normal(shape=[5,5,64,64],stddev=5e-2))
    #truncated: 정규 분포로서 출력
    b_conv2 = tf.Variable(tf.constant(0.1, shape=[64]))
    h_conv2 = tf.nn.relu(tf.nn.conv2d(h_pool1, W_conv2,strides=[1,1,1,1],padding='SAME')+b_conv2)
    
    # 2_Pooling Layer
    h_pool2 = tf.nn.max_pool(h_conv2,ksize=[1,3,3,1],strides=[1,2,2,1],padding='SAME')
    
    # 3_Convolution_Layer
    # 64개의 feature을 다시 128개의 feature으로 mapping 하는 과정
    W_conv3 = tf.Variable(tf.truncated_normal(shape=[3,3,64,128],stddev=5e-2))
    #truncated: 정규 분포로서 출력
    b_conv3 = tf.Variable(tf.constant(0.1, shape=[128]))
    h_conv3 = tf.nn.relu(tf.nn.conv2d(h_pool2, W_conv3,strides=[1,1,1,1],padding='SAME')+b_conv3)
    
    
    # 4_Convolution_Layer
    # 128개의 feature을 다시 128개의 feature으로 mapping 하는 과정
    W_conv4 = tf.Variable(tf.truncated_normal(shape=[3,3,128,128],stddev=5e-2))
    #truncated: 정규 분포로서 출력
    b_conv4 = tf.Variable(tf.constant(0.1, shape=[128]))
    h_conv4 = tf.nn.relu(tf.nn.conv2d(h_conv3, W_conv4,strides=[1,1,1,1],padding='SAME')+b_conv4)
    
    
    # 5_Convolution_Layer
    # 128개의 feature을 다시 128개의 feature으로 mapping 하는 과정
    W_conv5 = tf.Variable(tf.truncated_normal(shape=[3,3,128,128],stddev=5e-2))
    #truncated: 정규 분포로서 출력
    b_conv5 = tf.Variable(tf.constant(0.1, shape=[128]))
    h_conv5 = tf.nn.relu(tf.nn.conv2d(h_conv4, W_conv5,strides=[1,1,1,1],padding='SAME')+b_conv5)
    
    # 완전 연결층
    # 2번의 downsampling 이후에, 32 x 32 이미지는 8 x 8 x 128의 Feature map으로 변환
    W_fc1 = tf.Variable(tf.truncated_normal(shape=[8*8*128, 384],stddev=5e-2))
    b_fc1 = tf.Variable(tf.constant(0.1,shape=[384]))
    h_conv5_flat = tf.reshape(h_conv5,[-1,8*8*128])
    h_fc1 = tf.nn.relu(tf.matmul(h_conv5_flat,W_fc1)+b_fc1)
    
    # Dropout - 모델의 복잡도를 컨트롤
    # 특징들의 co-adaptation을 방지
    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
    
    # 완전 연결층2
    # 384개의 feature를 10개의 class로 Mapping
    W_fc2 = tf.Variable(tf.truncated_normal(shape=[384, 10],stddev=5e-2))
    b_fc2 = tf.Variable(tf.constant(0.1,shape=[10]))
    logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
    y_pred = tf.nn.softmax(logits)
    
    return y_pred, logits

#Parameter Definition
x = tf.placeholder(tf.float32, shape = [None, 32, 32, 3])
y = tf.placeholder(tf.float32, shape = [None, 10])
keep_prob = tf.placeholder(tf.float32)


#Transfer to one-hot Encoding
(x_train, y_train), (x_test, y_test) = load_data()
#Scalar 현태의 0 ~ 9 형태의 0 ~ 9 을 One - hot Encoding 형태로 변환
y_train_one_hot = tf.squeeze(tf.one_hot(y_train,10),axis=1)
y_test_one_hot = tf.squeeze(tf.one_hot(y_test,10),axis=1)

#Model 생성 및 optimization & Loss Function 선언
y_pred, logits = build_CNN_Classifier(x)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=logits))
train_step = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

#정확도를 계산하는 연산 추가
correct_prediction = tf.equal(tf.argmax(y_pred,1),tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
tf.summary.scalar('accuracy', accuracy)
tf.summary.scalar('test_input', 1)

merged = tf.summary.merge_all()

#세션을 열어서 학습을 실행
with tf.Session() as sess:
    train_writer = tf.summary.FileWriter('/train', sess.graph)
    #모든 변수를 초기화
    sess.run(tf.global_variables_initializer())
    
    for i in range(num_epoch):
        batch = next_batch(128, x_train, y_train_one_hot.eval())
        
        if i % 100 == 0:
            train_accuracy = accuracy.eval(feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
            loss_print = loss.eval(feed_dict={x: batch[0], y: batch[1], keep_prob: 1.0})
            
            print('반복(Epoch): %d, 정확도: %f, 손실함수: %f'%(i, train_accuracy, loss_print))
            
            #20 % Dropout을 활용하여 학습을 진행
        summary,_ = sess.run([merged,train_step], feed_dict={x: batch[0], y:batch[1], keep_prob:0.8})
        train_writer.add_summary(summary, i)

    #학습이 끝나면 테스트 데이터에 대한 정확도를 출력
    train_writer.close()

    test_accuracy = 0.0
    for i in range(10):
        test_batch = next_batch(1000,x_test,y_test_one_hot.eval())
        test_accuracy = test_accuracy + accuracy.eval(feed_dict={x: test_batch[0], y:test_batch[1], keep_prob: 1.0})
            
    test_accuracy = test_accuracy/10
    print('테스트 데이터 정확도: %f'%(test_accuracy))

It's Work in Jupyter Notebook in Kubeflow
image

And It Makes tfevents File well in /train Folder
image

  1. Build Image for Docker file & Push Image
    Dockerfile
FROM tensorflow/tensorflow:1.14.0

ADD . /var/tf_cnn
ENTRYPOINT ["python", "/var/tf_cnn/cnn-example.py"]
  1. Create .yaml File
apiVersion: "kubeflow.org/v1alpha3"
kind: Experiment
metadata:
  namespace: kubeflow
  name: cnn-example-accuracy2
spec:
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  objective:
    type: minimize
    goal: 1
    objectiveMetricName: accuracy
  algorithm:
    algorithmName: random
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /train
        kind: Directory
    collector:
      kind: TensorFlowEvent
  parameters:
    - name: --learning_rate
      parameterType: float
      feasibleSpace:
        min: "0.01"
        max: "0.05"
    - name: --num_epoch
      parameterType: int
      feasibleSpace:
        min: "100"
        max: "1000"
  trialTemplate:
    goTemplate:
        rawTemplate: |-
          apiVersion: "kubeflow.org/v1"
          kind: TFJob
          metadata:
            name: {{.Trial}}
            namespace: {{.NameSpace}}
          spec:
           tfReplicaSpecs:
            Worker:
              replicas: 1 
              restartPolicy: OnFailure
              template:
                spec:
                  containers:
                    - name: tensorflow 
                      image: wjddyd66/cnn-example:11.0
                      imagePullPolicy: Always
                      command:
                        - "python"
                        - "/var/tf_cnn/cnn-example.py"
                        {{- with .HyperParameters}}
                        {{- range .}}
                        - "{{.Name}}={{.Value}}"
                        {{- end}}
                        {{- end}}
  1. Check the Events
$ kubectl get experiments -n kubeflow

NAME                    STATUS    AGE
cnn-example-accuracy    Created   19m
cnn-example-accuracy2   Created   18m
cnn-example10           Created   21m

image

But there is no Trials

$ kubectl get trials -n kubeflow

No resources found.

I missed anything in this Step??? or Wrong .yaml file or python code?

@gaocegege
Copy link
Member

Can you show us kubectl -s kubeflow describe experiments

@wjddyd66
Copy link
Author

@gaocegege

$ kubectl -s kubeflow describe experiments
Unable to connect to the server: dial tcp: lookup kubeflow on 127.0.0.53:53: server misbehaving

Here is my out

I don't know What's Missing

In my Eviroment, ttfob-example.yaml is Work!...

What is Different?

@gaocegege
Copy link
Member

Sorry, my mistake.

The command should be kubectl -n kubeflow describe experiments

@wjddyd66
Copy link
Author

@gaocegege

$ kubectl -n kubeflow describe experiments

Name:         cnn-example-accuracy
Namespace:    kubeflow
Labels:       <none>
Annotations:  <none>
API Version:  kubeflow.org/v1alpha3
Kind:         Experiment
Metadata:
  Creation Timestamp:  2019-11-22T06:28:58Z
  Generation:          1
  Resource Version:    84566
  Self Link:           /apis/kubeflow.org/v1alpha3/namespaces/kubeflow/experiments/cnn-example-accuracy
  UID:                 8f101fcb-b80b-47ba-8757-c97eb93660ee
Spec:
  Algorithm:
    Algorithm Name:        random
  Max Failed Trial Count:  3
  Max Trial Count:         12
  Metrics Collector Spec:
    Collector:
      Kind:  TensorFlowEvent
    Source:
      File System Path:
        Kind:  Directory
        Path:  /train
  Objective:
    Goal:                   1
    Objective Metric Name:  accuracy
    Type:                   minimize
  Parallel Trial Count:     3
  Parameters:
    Feasible Space:
      Max:           0.05
      Min:           0.01
    Name:            --learning_rate
    Parameter Type:  float
    Feasible Space:
      Max:           1000
      Min:           100
    Name:            --num_epoch
    Parameter Type:  int
  Trial Template:
    Go Template:
      Raw Template:  apiVersion: "kubeflow.org/v1"
kind: TFJob
metadata:
  name: {{.Trial}}
  namespace: {{.NameSpace}}
spec:
 tfReplicaSpecs:
  Worker:
    replicas: 1 
    restartPolicy: OnFailure
    template:
      spec:
        containers:
          - name: tensorflow 
            image: wjddyd66/cnn-example:11.0
            imagePullPolicy: Always
            command:
              - "python"
              - "/var/tf_cnn/cnn-example.py"
              {{- with .HyperParameters}}
              {{- range .}}
              - "{{.Name}}={{.Value}}"
              {{- end}}
              {{- end}}
Status:
  Completion Time:  <nil>
  Conditions:
    Last Transition Time:  2019-11-22T06:28:58Z
    Last Update Time:      2019-11-22T06:28:58Z
    Message:               Experiment is created
    Reason:                ExperimentCreated
    Status:                True
    Type:                  Created
  Current Optimal Trial:
    Observation:
      Metrics:              <nil>
    Parameter Assignments:  <nil>
  Start Time:               2019-11-22T06:28:58Z
Events:                     <none>

@gaocegege
Copy link
Member

The experiment is created. Can you test

kubectl -n kubeflow describe suggestions

@wjddyd66
Copy link
Author

root@jyhwang-XPS-15-9570:/home/jyhwang/kubeflow# kubectl -n kubeflow describe suggestions
Name:         cnn-example-accuracy
Namespace:    kubeflow
Labels:       <none>
Annotations:  <none>
API Version:  kubeflow.org/v1alpha3
Kind:         Suggestion
Metadata:
  Creation Timestamp:  2019-11-22T06:28:58Z
  Generation:          1
  Owner References:
    API Version:           kubeflow.org/v1alpha3
    Block Owner Deletion:  true
    Controller:            true
    Kind:                  Experiment
    Name:                  cnn-example-accuracy
    UID:                   8f101fcb-b80b-47ba-8757-c97eb93660ee
  Resource Version:        84955
  Self Link:               /apis/kubeflow.org/v1alpha3/namespaces/kubeflow/suggestions/cnn-example-accuracy
  UID:                     c58d1f09-6b39-4e88-8718-904654934b1a
Spec:
  Algorithm Name:  random
  Requests:        3
Status:
  Conditions:
    Last Transition Time:  2019-11-22T06:28:58Z
    Last Update Time:      2019-11-22T06:28:58Z
    Message:               Suggestion is created
    Reason:                SuggestionCreated
    Status:                True
    Type:                  Created
    Last Transition Time:  2019-11-22T06:29:36Z
    Last Update Time:      2019-11-22T06:29:36Z
    Message:               Deployment is ready
    Reason:                DeploymentReady
    Status:                True
    Type:                  DeploymentReady
    Last Transition Time:  2019-11-22T06:30:36Z
    Last Update Time:      2019-11-22T06:30:36Z
    Message:               Suggestion is running
    Reason:                SuggestionRunning
    Status:                True
    Type:                  Running
  Start Time:              2019-11-22T06:28:58Z
Events:
  Type     Reason          Age                   From                   Message
  ----     ------          ----                  ----                   -------
  Warning  ReconcileError  76s (x17 over 6m51s)  suggestion-controller  rpc error: code = Unknown desc = Exception calling application: 'NoneType' object has no attribute 'type'

Is this not possible Parameter Type float?

@gaocegege
Copy link
Member

Yeah, we do not support float, we support double

@wjddyd66
Copy link
Author

wjddyd66 commented Nov 22, 2019

@gaocegege
Ok Thank you It Works!
image
But The Pod is Error
Back-off restarting failed container....

root@jyhwang-XPS-15-9570:/home/jyhwang/kubeflow# kubectl describe po -n kubeflow cnn-example-accuracy2-7l2hgl7n-worker-0
Name:           cnn-example-accuracy2-7l2hgl7n-worker-0
Namespace:      kubeflow
Priority:       0
Node:           minikube/192.168.0.153
Start Time:     Fri, 22 Nov 2019 15:47:26 +0900
Labels:         controller-name=tf-operator
                group-name=kubeflow.org
                job-name=cnn-example-accuracy2-7l2hgl7n
                job-role=master
                tf-job-name=cnn-example-accuracy2-7l2hgl7n
                tf-replica-index=0
                tf-replica-type=worker
Annotations:    <none>
Status:         Running
IP:             172.17.0.59
Controlled By:  TFJob/cnn-example-accuracy2-7l2hgl7n
Containers:
  tensorflow:
    Container ID:  docker://6eec75d271552a63ca95f12b2140cc261e99d8b6b6bfd442551c23dde85c9014
    Image:         wjddyd66/cnn-example:11.0
    Image ID:      docker-pullable://wjddyd66/cnn-example@sha256:f683f668b4cd9e287f8d6e3ec1140ef56f69cdc0de22ab35429f66eb9d9c2be6
    Port:          2222/TCP
    Host Port:     0/TCP
    Command:
      python
      /var/tf_cnn/cnn-example.py
      --learning_rate=0.018118971083617
      --num_epoch=766
    State:          Waiting
      Reason:       CrashLoopBackOff
    Last State:     Terminated
      Reason:       Error
      Exit Code:    1
      Started:      Fri, 22 Nov 2019 15:48:37 +0900
      Finished:     Fri, 22 Nov 2019 15:48:37 +0900
    Ready:          False
    Restart Count:  3
    Environment:    <none>
    Mounts:
      /train from metrics-volume (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from default-token-cnwl6 (ro)
  metrics-collector:
    Container ID:  docker://48b1575c8026b11eed0a366533aa84c228121ce5869903b28d2e89419dc96f4d
    Image:         gcr.io/kubeflow-images-public/katib/v1alpha3/tfevent-metrics-collector
    Image ID:      docker-pullable://gcr.io/kubeflow-images-public/katib/v1alpha3/tfevent-metrics-collector@sha256:55efeaad18e6f3ceda83edb5ecb9e658f6819afa31756593a5778697d7b5553e
    Port:          <none>
    Host Port:     <none>
    Args:
      -t
      cnn-example-accuracy2-7l2hgl7n
      -m
      accuracy
      -s
      katib-manager.kubeflow:6789
      -path
      /train
    State:          Terminated
      Reason:       Completed
      Exit Code:    0
      Started:      Fri, 22 Nov 2019 15:47:37 +0900
      Finished:     Fri, 22 Nov 2019 15:47:37 +0900
    Ready:          False
    Restart Count:  0
    Environment:    <none>
    Mounts:
      /train from metrics-volume (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from default-token-cnwl6 (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             False 
  ContainersReady   False 
  PodScheduled      True 
Volumes:
  default-token-cnwl6:
    Type:        Secret (a volume populated by a Secret)
    SecretName:  default-token-cnwl6
    Optional:    false
  metrics-volume:
    Type:        EmptyDir (a temporary directory that shares a pod's lifetime)
    Medium:      
    SizeLimit:   <unset>
QoS Class:       BestEffort
Node-Selectors:  <none>
Tolerations:     node.kubernetes.io/not-ready:NoExecute for 300s
                 node.kubernetes.io/unreachable:NoExecute for 300s
Events:
  Type     Reason     Age                From               Message
  ----     ------     ----               ----               -------
  Normal   Scheduled  89s                default-scheduler  Successfully assigned kubeflow/cnn-example-accuracy2-7l2hgl7n-worker-0 to minikube
  Normal   Started    78s                kubelet, minikube  Started container metrics-collector
  Normal   Pulled     78s                kubelet, minikube  Container image "gcr.io/kubeflow-images-public/katib/v1alpha3/tfevent-metrics-collector" already present on machine
  Normal   Created    78s                kubelet, minikube  Created container metrics-collector
  Normal   Pulling    24s (x4 over 88s)  kubelet, minikube  Pulling image "wjddyd66/cnn-example:11.0"
  Normal   Pulled     18s (x4 over 79s)  kubelet, minikube  Successfully pulled image "wjddyd66/cnn-example:11.0"
  Normal   Created    18s (x4 over 79s)  kubelet, minikube  Created container tensorflow
  Normal   Started    18s (x4 over 78s)  kubelet, minikube  Started container tensorflow
  Warning  BackOff    2s (x6 over 68s)   kubelet, minikube  Back-off restarting failed container

@gaocegege
Copy link
Member

gaocegege commented Nov 22, 2019

Can you run

python
      /var/tf_cnn/cnn-example.py
      --learning_rate=0.018118971083617
      --num_epoch=766

To see if it works? It seems that the command returns exit code 1

BTW, can you show us the logs of the pod?

@wjddyd66
Copy link
Author

root@jyhwang-XPS-15-9570:/home/jyhwang/kubeflow/kubeflow-docker# kubectl logs  -n kubeflow cnn-example-accuracy2-7l2hgl7n-worker-0 -c tensorflow
  File "/var/tf_cnn/cnn-example.py", line 12
SyntaxError: Non-ASCII character '\xeb' in file /var/tf_cnn/cnn-example.py on line 12, but no encoding declared; see http://python.org/dev/peps/pep-0263/ for details
root@jyhwang-XPS-15-9570:/home/jyhwang/kubeflow/kubeflow-docker# kubectl logs  -n kubeflow cnn-example-accuracy2-7l2hgl7n-worker-0 -c metrics-collector
In cnn-example-accuracy2-7l2hgl7n 0 metrics will be reported.

I think It has problem in Using Hangul......

@gaocegege
Copy link
Member

@wjddyd66 Yeah I think so. Prefer English here.

@wjddyd66
Copy link
Author

@gaocegege
It Works... Thank you

My Last Question is:

Yout Eaxmplt .yaml File Here

apiVersion: "kubeflow.org/v1alpha3"
kind: Experiment
metadata:
  namespace: kubeflow
  name: tfjob-example
spec:
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy_1
  algorithm:
    algorithmName: random
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /train
        kind: Directory
    collector:
      kind: TensorFlowEvent
  parameters:
    - name: --learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
    - name: --batch_size
      parameterType: int
      feasibleSpace:
        min: "100"
        max: "200"
  trialTemplate:
    goTemplate:
        rawTemplate: |-
          apiVersion: "kubeflow.org/v1"
          kind: TFJob
          metadata:
            name: {{.Trial}}
            namespace: {{.NameSpace}}
          spec:
           tfReplicaSpecs:
            Worker:
              replicas: 1 
              restartPolicy: OnFailure
              template:
                spec:
                  containers:
                    - name: tensorflow 
                      image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
                      imagePullPolicy: Always
                      command:
                        - "python"
                        - "/var/tf_mnist/mnist_with_summaries.py"
                        - "--log_dir=/train/metrics"
                        {{- with .HyperParameters}}
                        {{- range .}}
                        - "{{.Name}}={{.Value}}"
                        {{- end}}
                        {{- end}}

And I Think Original Python Code Link is Here: https://github.com/kubeflow/tf-operator/blob/master/examples/v1/mnist_with_summaries/mnist_with_summaries.py
But there is no objectiveMetricName: accuracy_1
only accuracy

What's mean accuracy_1????

@gaocegege
Copy link
Member

It is the metric name in tfevent, I think. You can change it if there is another name in your TFEvent.

@johnugeorge
Copy link
Member

@wjddyd66 It is the metric name which you can find when tf event file is parsed

@wjddyd66
Copy link
Author

@johnugeorge Thank you I resolved that! It's very amazing

@gaocegege
Copy link
Member

/close

@k8s-ci-robot
Copy link

@gaocegege: Closing this issue.

In response to this:

/close

Instructions for interacting with me using PR comments are available here. If you have questions or suggestions related to my behavior, please file an issue against the kubernetes/test-infra repository.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

4 participants