From 00831647bba7aaadd8532aeab7d1df3654e5c905 Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 11:01:08 +0800 Subject: [PATCH 01/11] yan-test --- .../run-example-test-ray-integration.sh | 403 +++++++++++++----- .../learn/bigdl/attention/transformer.py | 17 +- .../bigdl/imageInference/imageInference.py | 10 +- .../learn/horovod/pytorch_estimator.py | 11 +- .../learn/horovod/simple_horovod_pytorch.py | 6 + .../orca/example/learn/mxnet/lenet_mnist.py | 1 + python/orca/example/learn/openvino/predict.py | 3 +- .../async_parameter_server.py | 9 +- .../parameter_server/sync_parameter_server.py | 6 +- .../example/ray_on_spark/rl_pong/rl_pong.py | 3 +- .../rllib/multiagent_two_trainers.py | 32 +- .../tfpark/estimator/estimator_dataset.py | 25 +- .../tfpark/estimator/estimator_inception.py | 21 +- .../tfpark/gan/gan_train_and_evaluate.py | 27 +- .../example/tfpark/keras/keras_dataset.py | 40 +- .../example/tfpark/keras/keras_ndarray.py | 33 +- .../example/tfpark/tf_optimizer/evaluate.py | 53 ++- .../orca/example/tfpark/tf_optimizer/train.py | 39 +- 18 files changed, 564 insertions(+), 175 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 94c12d1233b..41106e3c11e 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -42,110 +42,313 @@ python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostRegressor now=$(date "+%s") time3=$((now-start)) -ray stop -f - -#echo "#4 Start rl_pong example" -#start=$(date "+%s") -#python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rl_pong/rl_pong.py --iterations 10 -#now=$(date "+%s") -#time4=$((now-start)) -# -#echo "#5 Start multiagent example" -#start=$(date "+%s") -#python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py --iterations 5 -#now=$(date "+%s") -#time5=$((now-start)) -# -#echo "#6 Start async_parameter example" -#start=$(date "+%s") -#python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py --iterations 10 -#now=$(date "+%s") -#time6=$((now-start)) -# -#echo "#7 Start sync_parameter example" -#start=$(date "+%s") -#python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py --iterations 10 -#now=$(date "+%s") -#time7=$((now-start)) -# -#echo "#8 Start mxnet lenet example" -#start=$(date "+%s") -# -## get_mnist_iterator in MXNet requires the data to be placed in the `data` folder of the running directory. -## The running directory of integration test is ${ANALYTICS_ZOO_ROOT}. 
-#if [ -f data/mnist.zip ] -#then -# echo "mnist.zip already exists" -#else -# wget -nv $FTP_URI/analytics-zoo-data/mnist.zip -P data -#fi -#unzip -q data/mnist.zip -d data -# -#python ${BIGDL_ROOT}/python/orca/example/learn/mxnet/lenet_mnist.py -e 1 -b 256 -#now=$(date "+%s") -#time8=$((now-start)) -# -#echo "#9 Start fashion_mnist example with Tensorboard visualization" -#start=$(date "+%s") -# -#if [ -d ${BIGDL_ROOT}/python/orca/example/learn/pytorch/fashion_mnist/data ] -#then -# echo "fashion-mnist already exists" -#else -# wget -nv $FTP_URI/analytics-zoo-data/data/fashion-mnist.zip -P ${BIGDL_ROOT}/python/orca/example/learn/pytorch/fashion_mnist/ -# unzip ${BIGDL_ROOT}/python/orca/example/learn/pytorch/fashion_mnist/fashion-mnist.zip -#fi -# -#sed "s/epochs=5/epochs=1/g;s/batch_size=4/batch_size=256/g" \ -# ${BIGDL_ROOT}/python/orca/example/learn/pytorch/fashion_mnist/fashion_mnist.py \ -# > ${BIGDL_ROOT}/python/orca/example/learn/pytorch/fashion_mnist/fashion_mnist_tmp.py -# -#python ${BIGDL_ROOT}/python/orca/example/learn/pytorch/fashion_mnist/fashion_mnist_tmp.py --backend torch_distributed -#now=$(date "+%s") -#time9=$((now-start)) -# -# -#echo "#10 start example for orca super-resolution" -#start=$(date "+%s") -# -#if [ ! -f BSDS300-images.tgz ]; then -# wget -nv $FTP_URI/analytics-zoo-data/BSDS300-images.tgz -#fi -#if [ ! -d dataset/BSDS300/images ]; then -# mkdir dataset -# tar -xzf BSDS300-images.tgz -C dataset -#fi -# -#python ${BIGDL_ROOT}/python/orca/example/learn/pytorch/super_resolution/super_resolution.py --backend torch_distributed -# -#now=$(date "+%s") -#time10=$((now-start)) -# -# -#echo "#11 start example for orca cifar10" -#start=$(date "+%s") -# -#if [ -d ${BIGDL_ROOT}/python/orca/example/learn/pytorch/cifar10/data ]; then -# echo "Cifar10 already exists" -#else -# wget -nv $FTP_URI/analytics-zoo-data/cifar10.zip -P ${BIGDL_ROOT}/python/orca/example/learn/pytorch/cifar10 -# unzip ${BIGDL_ROOT}/python/orca/example/learn/pytorch/cifar10/cifar10.zip -#fi -# -#python ${BIGDL_ROOT}/python/orca/example/learn/pytorch/cifar10/cifar10.py --backend torch_distributed -# -#now=$(date "+%s") -#time11=$((now-start)) +set -e + +echo "#4 start test for orca bigdl transformer" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ + --cluster_mode yarn-client +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca transformer failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#4 Total time cost ${time} seconds" + + +echo "#5 start test for orca bigdl imageInference" +#timer +start=$(date "+%s") +if [ -f models/bigdl_inception-v1_imagenet_0.4.0.model ]; then + echo "analytics-zoo-models/bigdl_inception-v1_imagenet_0.4.0.model already exists." +else + wget -nv $FTP_URI/analytics-zoo-models/image-classification/bigdl_inception-v1_imagenet_0.4.0.model \ + -P models +fi +run the example +python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/imageInference/imageInference.py \ + -m models/bigdl_inception-v1_imagenet_0.4.0.model \ + -f ${HDFS_URI}/kaggle/train_100 --cluster_mode yarn-client +exit_status=$? 
+if [ $exit_status -ne 0 ]; then + clear_up + echo "orca imageInference failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#5 Total time cost ${time} seconds" + +echo "#6 start test for orca pytorch_estimator" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/learn/horovod/pytorch_estimator.py --cluster_mode yarn-client +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca pytorch_estimator failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#6 Total time cost ${time} seconds" + +# echo "#7 start test for orca simple_pytorch" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/learn/horovod/simple_horovod_pytorch.py --cluster_mode yarn-client +# exit_status=$? +# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca simple_pytorch failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#7 Total time cost ${time} seconds" + +# echo "#8 start test for orca mxnet" +# #timer +# start=$(date "+%s") + +# # if [ -f ${BIGDL_ROOT}/data/mnist.zip ] +# # then +# # echo "mnist.zip already exists" +# # else +# # wget -nv $FTP_URI/analytics-zoo-data/mnist.zip -P ${BIGDL_ROOT}/data +# # fi +# # unzip -q ${BIGDL_ROOT}/data/mnist.zip -d ${BIGDL_ROOT}/data + +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/learn/mxnet/lenet_mnist.py #--cluster_mode yarn-client +# exit_status=$? +# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca mxnet failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#8 Total time cost ${time} seconds" + +echo "#prepare dataset for ray_on_spark" +wget -nv $FTP_URI/analytics-zoo-data/mnist/train-labels-idx1-ubyte.gz +wget -nv $FTP_URI/analytics-zoo-data/mnist/train-images-idx3-ubyte.gz +wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-labels-idx1-ubyte.gz +wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-images-idx3-ubyte.gz +zip ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/MNIST_data.zip train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz + +echo "#9 start test for orca ros async" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py \ + --iterations 20 --num_workers 2 --cluster_mode yarn +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca ros async failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#9 Total time cost ${time} seconds" + +echo "#10 start test for orca ros sync" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py \ + --iterations 20 --num_workers 2 --cluster_mode yarn +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca ros sync failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#10 Total time cost ${time} seconds" + +echo "#11 start test for orca rllib" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py \ + --iterations 5 \ + --cluster_mode yarn-client +exit_status=$? 
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca ros rllib failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#11 Total time cost ${time} seconds"
+
+echo "#12 start test for orca rl_pong"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rl_pong/rl_pong.py \
+    --iterations 5 \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca ros rl_pong failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#12 Total time cost ${time} seconds"
+
+echo "#13 start test for orca tfpark keras_dataset"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/tfpark/keras/keras_dataset.py \
+    --data_path ${HDFS_URI}/mnist \
+    --max_epoch 5 \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca tfpark keras_dataset failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#13 Total time cost ${time} seconds"
+
+echo "#14 start test for orca tfpark keras_ndarray"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/tfpark/keras/keras_ndarray.py \
+    --max_epoch 5 \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca tfpark keras_ndarray failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#14 Total time cost ${time} seconds"
+
+# echo "#15 start test for orca tfpark gan"
+# #timer
+# start=$(date "+%s")
+# #run the example
+# python ${BIGDL_ROOT}/python/orca/example/tfpark/gan/gan_train_and_evaluate.py \
+#     --cluster_mode yarn-client
+# exit_status=$?
+# if [ $exit_status -ne 0 ]; then
+#     clear_up
+#     echo "orca tfpark gan failed"
+#     exit $exit_status
+# fi
+# now=$(date "+%s")
+# time=$((now - start))
+# echo "#15 Total time cost ${time} seconds"
+
+echo "#16 start test for orca tfpark estimator_dataset"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/tfpark/estimator/estimator_dataset.py \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca tfpark estimator_dataset failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#16 Total time cost ${time} seconds"
+
+echo "#17 start test for orca tfpark estimator_inception"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/tfpark/estimator/estimator_inception.py \
+    --image-path ${HDFS_URI}/dogs_cats \
+    --num-classes 2 \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca tfpark estimator_inception failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#17 Total time cost ${time} seconds"
+
+echo "#18 start test for orca tfpark optimizer train"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/tfpark/tf_optimizer/train.py \
+    --max_epoch 1 \
+    --data_num 1000 \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca tfpark optimizer train failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#18 Total time cost ${time} seconds"
+
+echo "#19 start test for orca tfpark optimizer evaluate"
+#timer
+start=$(date "+%s")
+#run the example
+python ${BIGDL_ROOT}/python/orca/example/tfpark/tf_optimizer/evaluate.py \
+    --data_num 1000 \
+    --cluster_mode yarn-client
+exit_status=$?
+if [ $exit_status -ne 0 ]; then
+    clear_up
+    echo "orca tfpark optimizer evaluate failed"
+    exit $exit_status
+fi
+now=$(date "+%s")
+time=$((now - start))
+echo "#19 Total time cost ${time} seconds"
+
 echo "Ray example tests finished"
 echo "#1 auto-estimator-pytorch time used:$time1 seconds"
 echo "#2 auto-xgboost-classifier time used:$time2 seconds"
 echo "#3 auto-xgboost-regressor time used:$time3 seconds"
-#echo "#4 orca rl_pong time used:$time4 seconds"
-#echo "#5 orca async_parameter_server time used:$time5 seconds"
-#echo "#6 orca sync_parameter_server time used:$time6 seconds"
+echo "#4 bigdl transformer time used:$time4 seconds"
+echo "#5 bigdl imageInference time used:$time5 seconds"
+echo "#6 horovod pytorch_estimator time used:$time6 seconds"
 #echo "#7 orca multiagent_two_trainers time used:$time7 seconds"
 #echo "#8 mxnet_lenet time used:$time8 seconds"
-#echo "#9 fashion-mnist time used:$time9 seconds"
-#echo "#10 orca super-resolution example time used:$time10 seconds"
-#echo "#11 orca cifar10 example time used:$time11 seconds"
+echo "#9 parameter_server async time used:$time9 seconds"
+echo "#10 parameter_server sync example time used:$time10 seconds"
+echo "#11 rllib example time used:$time11 seconds"
+echo "#12 rl_pong example time used:$time12 seconds"
+echo "#13 tfpark keras_dataset example time used:$time13 seconds"
+echo "#14 tfpark keras_ndarray example time used:$time14 seconds"
+#echo "#15 tfpark gan_train_and_evaluate example time used:$time15 seconds"
+echo "#16 tfpark estimator_dataset example time used:$time16 seconds"
+echo "#17 tfpark estimator_inception example time used:$time17 seconds"
+echo "#18 tfpark opt_train example time used:$time18 seconds"
+echo "#19 tfpark opt_evaluate example time used:$time19 seconds"
diff --git a/python/orca/example/learn/bigdl/attention/transformer.py b/python/orca/example/learn/bigdl/attention/transformer.py
index dc5a3cf4f56..737860acd8e 100644
--- a/python/orca/example/learn/bigdl/attention/transformer.py
+++ b/python/orca/example/learn/bigdl/attention/transformer.py
@@ -14,7 +14,6 @@
 # limitations under the License.
# - import argparse import numpy as np from tensorflow.python.keras.datasets import imdb @@ -35,7 +34,7 @@ cluster_mode = args.cluster_mode conf = {"spark.executor.extraJavaOptions": "-Xss512m", "spark.driver.extraJavaOptions": "-Xss512m"} -max_features = 20000 +max_features = 2000 max_len = 200 if cluster_mode == "local": @@ -44,12 +43,19 @@ driver_memory="20g", conf=conf ) -elif cluster_mode == "yarn": - sc = init_orca_context(cluster_mode="yarn-client", num_nodes=8, cores=8, +elif cluster_mode.startswith("yarn"): + if cluster_mode == "yarn_client": + sc = init_orca_context(cluster_mode="yarn-client", num_nodes=8, cores=8, memory="100g", driver_memory="20g", conf=conf - ) + ) + else: + sc = init_orca_context(cluster_mode="yarn-cluster", num_nodes=8, cores=8, + memory="100g", + driver_memory="20g", + conf=conf + ) elif cluster_mode == "spark-submit": sc = init_orca_context(cluster_mode="spark-submit") else: @@ -106,4 +112,3 @@ print("finished...") stop_orca_context() - diff --git a/python/orca/example/learn/bigdl/imageInference/imageInference.py b/python/orca/example/learn/bigdl/imageInference/imageInference.py index 6db83b5c5d2..4f4d747ee60 100644 --- a/python/orca/example/learn/bigdl/imageInference/imageInference.py +++ b/python/orca/example/learn/bigdl/imageInference/imageInference.py @@ -20,6 +20,7 @@ from pyspark.sql.functions import col, udf from pyspark.sql.types import StringType, DoubleType +from bigdl.dllib.nncontext import * from bigdl.dllib.feature.image import * from bigdl.dllib.nnframes import * from bigdl.orca.learn.bigdl.estimator import Estimator @@ -53,7 +54,7 @@ def inference(image_path, model_path, batch_size, sc): help="training data path.") parser.add_option("--b", "--batch_size", type=int, dest="batch_size", default="56", help="The number of samples per gradient update. Default is 56.") - parser.add_option('--cluster_mode', type=str, default="local", + parser.add_option('--cluster_mode', type=str, dest="clusterMode", default="local", help='The mode for the Spark cluster. 
local, yarn or spark-submit.') (options, args) = parser.parse_args(sys.argv) @@ -69,8 +70,11 @@ def inference(image_path, model_path, batch_size, sc): cluster_mode = options.cluster_mode if cluster_mode == "local": sc = init_orca_context(memory="3g") - elif cluster_mode == "yarn": - sc = init_orca_context(cluster_mode="yarn-client", num_nodes=2, memory="3g") + elif cluster_mode.startswith("yarn"): + if cluster_mode == "yarn-client": + sc = init_orca_context(cluster_mode="yarn-client", num_nodes=2, memory="3g") + else: + sc = init_orca_context(cluster_mode="yarn-cluster", num_nodes=2, memory="3g") elif cluster_mode == "spark-submit": sc = init_orca_context(cluster_mode="spark-submit") else: diff --git a/python/orca/example/learn/horovod/pytorch_estimator.py b/python/orca/example/learn/horovod/pytorch_estimator.py index a0d39e1389c..b826524c160 100644 --- a/python/orca/example/learn/horovod/pytorch_estimator.py +++ b/python/orca/example/learn/horovod/pytorch_estimator.py @@ -132,9 +132,13 @@ def train_example(workers_per_node): if args.cluster_mode == "local": init_orca_context(cluster_mode="local", cores=args.cores, num_nodes=args.num_nodes, memory=args.memory) - elif args.cluster_mode == "yarn": - init_orca_context(cluster_mode="yarn-client", cores=args.cores, - num_nodes=args.num_nodes, memory=args.memory) + elif args.cluster_mode.startswith("yarn"): + if args.cluster_mode == "yarn-client": + init_orca_context(cluster_mode="yarn-client", cores=args.cores, + num_nodes=args.num_nodes, memory=args.memory) + else: + init_orca_context(cluster_mode="yarn-cluster", cores=args.cores, + num_nodes=args.num_nodes, memory=args.memory) elif args.cluster_mode == "k8s": if not args.k8s_master or not args.container_image \ or not args.k8s_driver_host or not args.k8s_driver_port: @@ -150,4 +154,3 @@ def train_example(workers_per_node): init_orca_context(cluster_mode="spark-submit") train_example(workers_per_node=args.workers_per_node) stop_orca_context() - diff --git a/python/orca/example/learn/horovod/simple_horovod_pytorch.py b/python/orca/example/learn/horovod/simple_horovod_pytorch.py index f2dcf12961c..9c043586c44 100644 --- a/python/orca/example/learn/horovod/simple_horovod_pytorch.py +++ b/python/orca/example/learn/horovod/simple_horovod_pytorch.py @@ -68,6 +68,12 @@ class AppURLopener(urllib.FancyURLopener): # Horovod: limit # of CPU threads to be used per worker. 
torch.set_num_threads(4) + # new_mirror = 'https://ossci-datasets.s3.amazonaws.com/mnist' + # datasets.MNIST.resources = [ + # ('/'.join([new_mirror, url.split('/')[-1]]), md5) + # for url, md5 in datasets.MNIST.resources + # ] + kwargs = {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, diff --git a/python/orca/example/learn/mxnet/lenet_mnist.py b/python/orca/example/learn/mxnet/lenet_mnist.py index 41cd5968683..9ffca119f3e 100644 --- a/python/orca/example/learn/mxnet/lenet_mnist.py +++ b/python/orca/example/learn/mxnet/lenet_mnist.py @@ -121,3 +121,4 @@ def get_metrics(config): epochs=opt.epochs, batch_size=opt.batch_size) estimator.shutdown() stop_orca_context() + \ No newline at end of file diff --git a/python/orca/example/learn/openvino/predict.py b/python/orca/example/learn/openvino/predict.py index c385f6de8d9..58e06178af0 100644 --- a/python/orca/example/learn/openvino/predict.py +++ b/python/orca/example/learn/openvino/predict.py @@ -55,7 +55,7 @@ def crop(img, w, h): if args.cluster_mode == "local": init_orca_context(cores=args.core_num, memory=args.memory) - elif args.cluster_mode == "yarn": + elif args.cluster_mode.startswith("yarn"): init_orca_context(cluster_mode=args.cluster_mode, cores=args.core_num, num_nodes=args.executor_num, memory=args.memory) elif args.cluster_mode == "spark-submit": @@ -78,4 +78,3 @@ def crop(img, w, h): assert result[1].shape == (args.data_num, 255, 26, 26) assert result[2].shape == (args.data_num, 255, 52, 52) stop_orca_context() - diff --git a/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py b/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py index 92e135d41c3..71dc245f891 100644 --- a/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py +++ b/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py @@ -24,8 +24,9 @@ import os import time +from python.orca.example.ray_on_spark.parameter_server import model import ray -import model +#import model from bigdl.orca import init_orca_context, stop_orca_context from bigdl.orca import OrcaContext @@ -97,7 +98,7 @@ def worker_task(ps, worker_index, batch_size=50): if __name__ == "__main__": args = parser.parse_args() cluster_mode = args.cluster_mode - if cluster_mode == "yarn": + if cluster_mode.startswith("yarn"): sc = init_orca_context(cluster_mode=cluster_mode, cores=args.executor_cores, memory=args.executor_memory, @@ -106,8 +107,7 @@ def worker_task(ps, worker_index, batch_size=50): driver_memory=args.driver_memory, driver_cores=args.driver_cores, extra_executor_memory_for_ray=args.extra_executor_memory_for_ray, - object_store_memory=args.object_store_memory, - additional_archive="MNIST_data.zip#MNIST_data") + object_store_memory=args.object_store_memory) ray_ctx = OrcaContext.get_ray_context() elif cluster_mode == "local": sc = init_orca_context(cores=args.driver_cores) @@ -143,4 +143,3 @@ def worker_task(ps, worker_index, batch_size=50): time.sleep(1) ray_ctx.stop() stop_orca_context() - diff --git a/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py b/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py index dab0fda1a89..6a22d3aba7f 100644 --- a/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py +++ b/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py @@ -22,9 +22,10 @@ import argparse import os +from python.orca.example.ray_on_spark.parameter_server import model import numpy 
as np import ray -import model +#import model from bigdl.orca import init_orca_context, stop_orca_context from bigdl.orca import OrcaContext @@ -88,7 +89,7 @@ def compute_gradients(self, weights): if __name__ == "__main__": args = parser.parse_args() cluster_mode = args.cluster_mode - if cluster_mode == "yarn": + if cluster_mode.startswith("yarn"): sc = init_orca_context(cluster_mode=cluster_mode, cores=args.executor_cores, memory=args.executor_memory, @@ -139,4 +140,3 @@ def compute_gradients(self, weights): i += 1 ray_ctx.stop() stop_orca_context() - diff --git a/python/orca/example/ray_on_spark/rl_pong/rl_pong.py b/python/orca/example/ray_on_spark/rl_pong/rl_pong.py index e1cfc0116ba..4acab920e45 100644 --- a/python/orca/example/ray_on_spark/rl_pong/rl_pong.py +++ b/python/orca/example/ray_on_spark/rl_pong/rl_pong.py @@ -210,7 +210,7 @@ def compute_gradient(self, model): args = parser.parse_args() cluster_mode = args.cluster_mode - if cluster_mode == "yarn": + if cluster_mode.startswith("yarn"): sc = init_orca_context(cluster_mode=cluster_mode, cores=args.executor_cores, memory=args.executor_memory, @@ -282,4 +282,3 @@ def compute_gradient(self, model): ray_ctx.stop() stop_orca_context() - diff --git a/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py b/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py index 5058e9e134f..3c73f8a356c 100644 --- a/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py +++ b/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py @@ -72,16 +72,27 @@ if __name__ == "__main__": args = parser.parse_args() cluster_mode = args.cluster_mode - if cluster_mode == "yarn": - sc = init_orca_context(cluster_mode="yarn", - cores=args.executor_cores, - memory=args.executor_memory, - init_ray_on_spark=True, - driver_memory=args.driver_memory, - driver_cores=args.driver_cores, - num_executors=args.slave_num, - extra_executor_memory_for_ray=args.extra_executor_memory_for_ray, - object_store_memory=args.object_store_memory) + if cluster_mode.startswith("yarn"): + if cluster_mode == "yarn-client": + sc = init_orca_context(cluster_mode="yarn-client", + cores=args.executor_cores, + memory=args.executor_memory, + init_ray_on_spark=True, + driver_memory=args.driver_memory, + driver_cores=args.driver_cores, + num_executors=args.slave_num, + extra_executor_memory_for_ray=args.extra_executor_memory_for_ray, + object_store_memory=args.object_store_memory) + else: + sc = init_orca_context(cluster_mode="yarn-cluster", + cores=args.executor_cores, + memory=args.executor_memory, + init_ray_on_spark=True, + driver_memory=args.driver_memory, + driver_cores=args.driver_cores, + num_executors=args.slave_num, + extra_executor_memory_for_ray=args.extra_executor_memory_for_ray, + object_store_memory=args.object_store_memory) ray_ctx = OrcaContext.get_ray_context() elif cluster_mode == "local": sc = init_orca_context(cores=args.driver_cores) @@ -159,4 +170,3 @@ def policy_mapping_fn(agent_id): ray_ctx.stop() stop_orca_context() - diff --git a/python/orca/example/tfpark/estimator/estimator_dataset.py b/python/orca/example/tfpark/estimator/estimator_dataset.py index 294dafff5e4..68f277d7160 100644 --- a/python/orca/example/tfpark/estimator/estimator_dataset.py +++ b/python/orca/example/tfpark/estimator/estimator_dataset.py @@ -19,7 +19,15 @@ from bigdl.dllib.nncontext import init_nncontext from bigdl.orca.tfpark import TFDataset, TFEstimator from bigdl.orca.tfpark import ZooOptimizer +from bigdl.dllib.utils.common import * +import os +import 
argparse + +parser = argparse.ArgumentParser(description="Run the tfpark keras " + "dataset example.") +parser.add_argument('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. local, yarn or spark-submit.') def get_data(dataset): from bigdl.dllib.feature.dataset import mnist @@ -29,7 +37,22 @@ def get_data(dataset): def main(): - sc = init_nncontext() + args = parser.parse_args() + cluster_mode = args.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext() def model_fn(features, labels, mode): from nets import lenet diff --git a/python/orca/example/tfpark/estimator/estimator_inception.py b/python/orca/example/tfpark/estimator/estimator_inception.py index cbbbbc9b727..dec3dc61cf3 100644 --- a/python/orca/example/tfpark/estimator/estimator_inception.py +++ b/python/orca/example/tfpark/estimator/estimator_inception.py @@ -14,6 +14,8 @@ # limitations under the License. # from optparse import OptionParser +import sys +import os import tensorflow as tf @@ -23,11 +25,26 @@ from bigdl.dllib.feature.image.imageset import * from bigdl.orca.tfpark import TFDataset, TFEstimator from bigdl.orca.tfpark import ZooOptimizer +from bigdl.dllib.utils.common import * def main(option): batch_size = 16 if not option.batch_size else int(option.batch_size) - sc = init_nncontext() + cluster_mode = options.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext() def input_fn(mode, params): @@ -88,6 +105,8 @@ def model_fn(features, labels, mode, params): parser.add_option("--image-path", dest="image_path") parser.add_option("--num-classes", dest="num_classes") parser.add_option("--batch_size", dest="batch_size") + parser.add_option('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. 
local, yarn or spark-submit.') (options, args) = parser.parse_args(sys.argv) main(options) diff --git a/python/orca/example/tfpark/gan/gan_train_and_evaluate.py b/python/orca/example/tfpark/gan/gan_train_and_evaluate.py index 6e6a4a32cd8..9e8b842041b 100644 --- a/python/orca/example/tfpark/gan/gan_train_and_evaluate.py +++ b/python/orca/example/tfpark/gan/gan_train_and_evaluate.py @@ -19,6 +19,7 @@ from bigdl.dllib.nncontext import init_nncontext from bigdl.orca.tfpark import TFDataset from bigdl.orca.tfpark import ZooOptimizer +from bigdl.dllib.utils.common import * import numpy as np import matplotlib.pyplot as plt @@ -26,9 +27,15 @@ from tensorflow_gan.python.losses.losses_impl import * import tensorflow_datasets as tfds +import os +import argparse + MODEL_DIR = "/tmp/gan_model" NOISE_DIM = 64 +parser = argparse.ArgumentParser() +parser.add_argument('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. local, yarn or spark-submit.') def eval(): @@ -53,7 +60,23 @@ def eval(): if __name__ == "__main__": - sc = init_nncontext() + conf = {} + args = parser.parse_args() + cluster_mode = args.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext() def input_fn(): def map_func(data): @@ -67,7 +90,7 @@ def map_func(data): ds = tfds.load("mnist", split="train") ds = ds.map(map_func) - dataset = TFDataset.from_tf_data_dataset(ds, batch_size=36) + dataset = TFDataset.from_tf_data_dataset(ds, batch_size=56) return dataset opt = GANEstimator( diff --git a/python/orca/example/tfpark/keras/keras_dataset.py b/python/orca/example/tfpark/keras/keras_dataset.py index ae9f477621c..a268971b8ec 100644 --- a/python/orca/example/tfpark/keras/keras_dataset.py +++ b/python/orca/example/tfpark/keras/keras_dataset.py @@ -13,17 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import sys +import argparse +import os import tensorflow as tf import numpy as np from bigdl.dllib.nncontext import init_nncontext from bigdl.orca.tfpark import KerasModel, TFDataset +from bigdl.dllib.utils.common import * +parser = argparse.ArgumentParser(description="Run the tfpark keras " + "dataset example.") +parser.add_argument('--data_path', type=str, default='/tmp/mnist', + help='training data path.') +parser.add_argument('--max_epoch', type=int, default=5, + help='Set max_epoch for training, it should be integer.') +parser.add_argument('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. 
local, yarn or spark-submit.') def get_data_rdd(dataset, sc): + data_path = args.data_path from bigdl.dllib.feature.dataset import mnist - (images_data, labels_data) = mnist.read_data_sets("/tmp/mnist", dataset) + (images_data, labels_data) = mnist.read_data_sets(data_path, dataset) image_rdd = sc.parallelize(images_data) labels_rdd = sc.parallelize(labels_data) rdd = image_rdd.zip(labels_rdd) \ @@ -31,9 +42,25 @@ def get_data_rdd(dataset, sc): np.array(rec_tuple[1]))) return rdd - def main(max_epoch): - sc = init_nncontext() + args = parser.parse_args() + cluster_mode = args.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.executorEnv.HTTP_PROXY", "http://child-prc.intel.com:913") \ + .set("spark.executorEnv.HTTPS_PROXY", "http://child-prc.intel.com:913") \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext() training_rdd = get_data_rdd("train", sc) testing_rdd = get_data_rdd("test", sc) @@ -79,8 +106,7 @@ def main(max_epoch): if __name__ == '__main__': - max_epoch = 5 + args = parser.parse_args() + max_epoch = args.max_epoch - if len(sys.argv) > 1: - max_epoch = int(sys.argv[1]) main(max_epoch) diff --git a/python/orca/example/tfpark/keras/keras_ndarray.py b/python/orca/example/tfpark/keras/keras_ndarray.py index 9694f696095..8c5219ff1a7 100644 --- a/python/orca/example/tfpark/keras/keras_ndarray.py +++ b/python/orca/example/tfpark/keras/keras_ndarray.py @@ -13,16 +13,40 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import sys +import argparse +import os import tensorflow as tf from bigdl.dllib.nncontext import init_nncontext from bigdl.dllib.feature.dataset import mnist from bigdl.orca.tfpark import KerasModel +from bigdl.dllib.utils.common import * +parser = argparse.ArgumentParser(description="Run the tfpark keras " + "dataset example.") +parser.add_argument('--max_epoch', type=int, default=5, + help='Set max_epoch for training, it should be integer.') +parser.add_argument('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. local, yarn or spark-submit.') def main(max_epoch): - _ = init_nncontext() + args = parser.parse_args() + cluster_mode = args.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. 
Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + _ = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + _ = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + _ = init_nncontext() + (training_images_data, training_labels_data) = mnist.read_data_sets("/tmp/mnist", "train") (testing_images_data, testing_labels_data) = mnist.read_data_sets("/tmp/mnist", "test") @@ -65,8 +89,7 @@ def main(max_epoch): if __name__ == '__main__': - max_epoch = 5 + args = parser.parse_args() + max_epoch = args.max_epoch - if len(sys.argv) > 1: - max_epoch = int(sys.argv[1]) main(max_epoch) diff --git a/python/orca/example/tfpark/tf_optimizer/evaluate.py b/python/orca/example/tfpark/tf_optimizer/evaluate.py index 570f3ea874f..365b00f877e 100644 --- a/python/orca/example/tfpark/tf_optimizer/evaluate.py +++ b/python/orca/example/tfpark/tf_optimizer/evaluate.py @@ -13,12 +13,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from optparse import OptionParser +import argparse import tensorflow as tf from bigdl.dllib.nncontext import init_nncontext from bigdl.orca.tfpark import TFDataset, TFPredictor +from bigdl.dllib.utils.common import * + import numpy as np import sys +import os from bigdl.dllib.feature.dataset import mnist @@ -27,11 +30,35 @@ slim = tf.contrib.slim - -def main(options, data_num): - - data_path = '/tmp/mnist' if not options.data_path else options.data_path - sc = init_nncontext() +parser = argparse.ArgumentParser(description="Run the tfpark keras " + "dataset example.") +parser.add_argument('--data_num', type=int, default=10000, + help='Set data_num for evaluation, it should be integer.') +parser.add_argument("--data_path", type=str, default='/tmp/mnist', + help='Assert the data_path for evaluation' ) +parser.add_argument('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. local, yarn or spark-submit.') + +def main(data_num): + + data_path = '/tmp/mnist' if not args.data_path else args.data_path + cluster_mode = args.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. 
Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.executorEnv.HTTP_PROXY", "http://child-prc.intel.com:913") \ + .set("spark.executorEnv.HTTPS_PROXY", "http://child-prc.intel.com:913") \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext() # get data, pre-process and create TFDataset (images_data, labels_data) = mnist.read_data_sets(data_path, "test") @@ -65,13 +92,7 @@ def main(options, data_num): if __name__ == '__main__': - data_num = 10000 - - if len(sys.argv) > 1: - data_num = int(sys.argv[1]) - - parser = OptionParser() - parser.add_option("--data_path", dest="data_path") - (options, args) = parser.parse_args(sys.argv) - - main(options, data_num) + args = parser.parse_args() + data_num = args.data_num + + main(data_num) diff --git a/python/orca/example/tfpark/tf_optimizer/train.py b/python/orca/example/tfpark/tf_optimizer/train.py index 1c8ed563e95..b067dfee43b 100644 --- a/python/orca/example/tfpark/tf_optimizer/train.py +++ b/python/orca/example/tfpark/tf_optimizer/train.py @@ -17,17 +17,29 @@ from bigdl.dllib.nncontext import init_nncontext from bigdl.orca.tfpark import TFOptimizer, TFDataset from bigdl.dllib.optim.optimizer import * +from bigdl.dllib.utils.common import * import numpy as np -import sys from bigdl.dllib.feature.dataset import mnist from bigdl.dllib.feature.dataset.transformer import * +import os +import sys +import argparse + sys.path.append("/tmp/models/slim") # add the slim library from nets import lenet slim = tf.contrib.slim +parser = argparse.ArgumentParser(description="Run the tfpark keras " + "dataset example.") +parser.add_argument('--max_epoch', type=int, default=5, + help='Set max_epoch for training, it should be integer.') +parser.add_argument('--data_num', type=int, default=60000, + help='Set data_num for training, it should be integer.') +parser.add_argument('--cluster_mode', type=str, default="local", + help='The mode for the Spark cluster. local, yarn or spark-submit.') def accuracy(logits, labels): predictions = tf.argmax(logits, axis=1, output_type=labels.dtype) @@ -36,7 +48,22 @@ def accuracy(logits, labels): def main(max_epoch, data_num): - sc = init_nncontext() + args = parser.parse_args() + cluster_mode = args.cluster_mode + if cluster_mode.startswith("yarn"): + hadoop_conf = os.environ.get("HADOOP_CONF_DIR") + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. 
Please " \ + "set the environment variable HADOOP_CONF_DIR" + spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ + .set("spark.executor.cores", 2) \ + .set("spark.executor.instances", 2) \ + .set("spark.driver.memory", "2g") + if cluster_mode == "yarn-client": + sc = init_nncontext(spark_conf, cluster_mode="yarn-client", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) + else: + sc = init_nncontext() # get data, pre-process and create TFDataset (train_images_data, train_labels_data) = mnist.read_data_sets("/tmp/mnist", "train") @@ -73,10 +100,8 @@ def main(max_epoch, data_num): if __name__ == '__main__': - max_epoch = 5 - data_num = 60000 + args = parser.parse_args() + max_epoch = args.max_epoch + data_num = args.data_num - if len(sys.argv) > 1: - max_epoch = int(sys.argv[1]) - data_num = int(sys.argv[2]) main(max_epoch, data_num) From 8f1ce0564fbd932a241f67b0b86fca7e98e9d37d Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 12:29:47 +0800 Subject: [PATCH 02/11] no horovod --- .../run-example-test-ray-integration.sh | 104 +++++++++--------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 41106e3c11e..b80b6111120 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -29,7 +29,7 @@ python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostClassifie now=$(date "+%s") time2=$((now-start)) -echo "#3 Start autoxgboost example" +echo "#3 Start autoxgboost example"s if [ -f ${BIGDL_ROOT}/data/incd.csv ] then echo "incd.csv already exists" @@ -49,7 +49,7 @@ echo "#4 start test for orca bigdl transformer" start=$(date "+%s") #run the example python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ - --cluster_mode yarn-client + --cluster_mode yarn_client exit_status=$? if [ $exit_status -ne 0 ]; then clear_up @@ -84,20 +84,20 @@ now=$(date "+%s") time=$((now - start)) echo "#5 Total time cost ${time} seconds" -echo "#6 start test for orca pytorch_estimator" -#timer -start=$(date "+%s") -#run the example -python ${BIGDL_ROOT}/python/orca/example/learn/horovod/pytorch_estimator.py --cluster_mode yarn-client -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca pytorch_estimator failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#6 Total time cost ${time} seconds" +# echo "#6 start test for orca pytorch_estimator" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/learn/horovod/pytorch_estimator.py --cluster_mode yarn-client +# exit_status=$? 
+# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca pytorch_estimator failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#6 Total time cost ${time} seconds" # echo "#7 start test for orca simple_pytorch" # #timer @@ -138,44 +138,44 @@ echo "#6 Total time cost ${time} seconds" # time=$((now - start)) # echo "#8 Total time cost ${time} seconds" -echo "#prepare dataset for ray_on_spark" -wget -nv $FTP_URI/analytics-zoo-data/mnist/train-labels-idx1-ubyte.gz -wget -nv $FTP_URI/analytics-zoo-data/mnist/train-images-idx3-ubyte.gz -wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-labels-idx1-ubyte.gz -wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-images-idx3-ubyte.gz -zip ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/MNIST_data.zip train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz +# echo "#prepare dataset for ray_on_spark" +# wget -nv $FTP_URI/analytics-zoo-data/mnist/train-labels-idx1-ubyte.gz +# wget -nv $FTP_URI/analytics-zoo-data/mnist/train-images-idx3-ubyte.gz +# wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-labels-idx1-ubyte.gz +# wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-images-idx3-ubyte.gz +# zip ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/MNIST_data.zip train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz -echo "#9 start test for orca ros async" -#timer -start=$(date "+%s") -#run the example -python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py \ - --iterations 20 --num_workers 2 --cluster_mode yarn -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca ros async failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#9 Total time cost ${time} seconds" +# echo "#9 start test for orca ros async" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py \ +# --iterations 20 --num_workers 2 --cluster_mode yarn +# exit_status=$? +# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca ros async failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#9 Total time cost ${time} seconds" -echo "#10 start test for orca ros sync" -#timer -start=$(date "+%s") -#run the example -python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py \ - --iterations 20 --num_workers 2 --cluster_mode yarn -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca ros sync failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#10 Total time cost ${time} seconds" +# echo "#10 start test for orca ros sync" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py \ +# --iterations 20 --num_workers 2 --cluster_mode yarn +# exit_status=$? 
+# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca ros sync failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#10 Total time cost ${time} seconds" echo "#11 start test for orca rllib" #timer From 120c2103a1f9e748f42a87638b9a64cfed565aac Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 13:12:32 +0800 Subject: [PATCH 03/11] skip automl --- .../run-example-test-ray-integration.sh | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index b80b6111120..0f809d76bbf 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -9,40 +9,40 @@ clear_up () { set -e -echo "#start orca ray example tests" -echo "#1 Start autoestimator example" -start=$(date "+%s") -python ${BIGDL_ROOT}/python/orca/example/automl/autoestimator/autoestimator_pytorch.py --trials 5 --epochs 2 --cluster_mode yarn -now=$(date "+%s") -time1=$((now-start)) +# echo "#start orca ray example tests" +# echo "#1 Start autoestimator example" +# start=$(date "+%s") +# python ${BIGDL_ROOT}/python/orca/example/automl/autoestimator/autoestimator_pytorch.py --trials 5 --epochs 2 --cluster_mode yarn +# now=$(date "+%s") +# time1=$((now-start)) -echo "#2 Start autoxgboost example" -if [ -f ${BIGDL_ROOT}/data/airline_14col.data ] -then - echo "airline_14col.data already exists" -else - wget -nv $FTP_URI/analytics-zoo-data/airline_14col.data -P ${BIGDL_ROOT}/data/ -fi +# echo "#2 Start autoxgboost example" +# if [ -f ${BIGDL_ROOT}/data/airline_14col.data ] +# then +# echo "airline_14col.data already exists" +# else +# wget -nv $FTP_URI/analytics-zoo-data/airline_14col.data -P ${BIGDL_ROOT}/data/ +# fi -start=$(date "+%s") -python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostClassifier.py -p ${BIGDL_ROOT}/data/airline_14col.data --cluster_mode yarn -now=$(date "+%s") -time2=$((now-start)) +# start=$(date "+%s") +# python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostClassifier.py -p ${BIGDL_ROOT}/data/airline_14col.data --cluster_mode yarn +# now=$(date "+%s") +# time2=$((now-start)) -echo "#3 Start autoxgboost example"s -if [ -f ${BIGDL_ROOT}/data/incd.csv ] -then - echo "incd.csv already exists" -else - wget -nv $FTP_URI/analytics-zoo-data/incd.csv -P ${BIGDL_ROOT}/data/ -fi +# echo "#3 Start autoxgboost example" +# if [ -f ${BIGDL_ROOT}/data/incd.csv ] +# then +# echo "incd.csv already exists" +# else +# wget -nv $FTP_URI/analytics-zoo-data/incd.csv -P ${BIGDL_ROOT}/data/ +# fi -start=$(date "+%s") -python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostRegressor.py -p ${BIGDL_ROOT}/data/incd.csv --cluster_mode yarn -now=$(date "+%s") -time3=$((now-start)) +# start=$(date "+%s") +# python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostRegressor.py -p ${BIGDL_ROOT}/data/incd.csv --cluster_mode yarn +# now=$(date "+%s") +# time3=$((now-start)) -set -e +# set -e echo "#4 start test for orca bigdl transformer" #timer From a0709cc5c99dd195919bc71e1c1abc680dfcd22c Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 13:43:21 +0800 Subject: [PATCH 04/11] update --- .../run-example-test-ray-integration.sh | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh 
b/python/orca/dev/example/run-example-test-ray-integration.sh index 0f809d76bbf..0c0927c9179 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -44,21 +44,21 @@ set -e # set -e -echo "#4 start test for orca bigdl transformer" -#timer -start=$(date "+%s") -#run the example -python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ - --cluster_mode yarn_client -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca transformer failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#4 Total time cost ${time} seconds" +# echo "#4 start test for orca bigdl transformer" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ +# --cluster_mode yarn_client +# exit_status=$? +# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca transformer failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#4 Total time cost ${time} seconds" echo "#5 start test for orca bigdl imageInference" From 8d246b7d355f86100b5f538d7c70cfe0266df252 Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 13:46:37 +0800 Subject: [PATCH 05/11] update --- python/orca/dev/example/run-example-test-ray-integration.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 0c0927c9179..8db128c1967 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -70,7 +70,7 @@ else wget -nv $FTP_URI/analytics-zoo-models/image-classification/bigdl_inception-v1_imagenet_0.4.0.model \ -P models fi -run the example +#run the example python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/imageInference/imageInference.py \ -m models/bigdl_inception-v1_imagenet_0.4.0.model \ -f ${HDFS_URI}/kaggle/train_100 --cluster_mode yarn-client From 5e65e24d63b2389444db93c85b3a476ff179ddb0 Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 13:52:08 +0800 Subject: [PATCH 06/11] hot fix imageInference --- .../orca/example/learn/bigdl/imageInference/imageInference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/orca/example/learn/bigdl/imageInference/imageInference.py b/python/orca/example/learn/bigdl/imageInference/imageInference.py index 4f4d747ee60..7714814afba 100644 --- a/python/orca/example/learn/bigdl/imageInference/imageInference.py +++ b/python/orca/example/learn/bigdl/imageInference/imageInference.py @@ -54,7 +54,7 @@ def inference(image_path, model_path, batch_size, sc): help="training data path.") parser.add_option("--b", "--batch_size", type=int, dest="batch_size", default="56", help="The number of samples per gradient update. Default is 56.") - parser.add_option('--cluster_mode', type=str, dest="clusterMode", default="local", + parser.add_option('--cluster_mode', type=str, dest="cluster_mode", default="local", help='The mode for the Spark cluster. 
local, yarn or spark-submit.') (options, args) = parser.parse_args(sys.argv) From 76b7cbae71950e0ff9c21d80d9728d18f1aa46ea Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 15:02:45 +0800 Subject: [PATCH 07/11] new test --- .../run-example-test-ray-integration.sh | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 8db128c1967..715b625e43e 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -42,23 +42,21 @@ set -e # now=$(date "+%s") # time3=$((now-start)) -# set -e - -# echo "#4 start test for orca bigdl transformer" -# #timer -# start=$(date "+%s") -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ -# --cluster_mode yarn_client -# exit_status=$? -# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca transformer failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#4 Total time cost ${time} seconds" +echo "#4 start test for orca bigdl transformer" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ + --cluster_mode yarn_client +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca transformer failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#4 Total time cost ${time} seconds" echo "#5 start test for orca bigdl imageInference" From 0e4f808a1e3b3f015b94adef5bc8fb17df59bab6 Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 17:27:32 +0800 Subject: [PATCH 08/11] ros test --- .../run-example-test-ray-integration.sh | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 715b625e43e..6bdfa49c1ae 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -42,45 +42,45 @@ set -e # now=$(date "+%s") # time3=$((now-start)) -echo "#4 start test for orca bigdl transformer" -#timer -start=$(date "+%s") -#run the example -python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ - --cluster_mode yarn_client -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca transformer failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#4 Total time cost ${time} seconds" +# echo "#4 start test for orca bigdl transformer" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ +# --cluster_mode yarn_client +# exit_status=$? +# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca transformer failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#4 Total time cost ${time} seconds" -echo "#5 start test for orca bigdl imageInference" -#timer -start=$(date "+%s") -if [ -f models/bigdl_inception-v1_imagenet_0.4.0.model ]; then - echo "analytics-zoo-models/bigdl_inception-v1_imagenet_0.4.0.model already exists." 
-else - wget -nv $FTP_URI/analytics-zoo-models/image-classification/bigdl_inception-v1_imagenet_0.4.0.model \ - -P models -fi -#run the example -python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/imageInference/imageInference.py \ - -m models/bigdl_inception-v1_imagenet_0.4.0.model \ - -f ${HDFS_URI}/kaggle/train_100 --cluster_mode yarn-client -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca imageInference failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#5 Total time cost ${time} seconds" +# echo "#5 start test for orca bigdl imageInference" +# #timer +# start=$(date "+%s") +# if [ -f models/bigdl_inception-v1_imagenet_0.4.0.model ]; then +# echo "analytics-zoo-models/bigdl_inception-v1_imagenet_0.4.0.model already exists." +# else +# wget -nv $FTP_URI/analytics-zoo-models/image-classification/bigdl_inception-v1_imagenet_0.4.0.model \ +# -P models +# fi +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/imageInference/imageInference.py \ +# -m models/bigdl_inception-v1_imagenet_0.4.0.model \ +# -f ${HDFS_URI}/kaggle/train_100 --cluster_mode yarn-client +# exit_status=$? +# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca imageInference failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#5 Total time cost ${time} seconds" # echo "#6 start test for orca pytorch_estimator" # #timer From 6889b1b4bc1df1636fc35e9b7d99088e772ccf5b Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 17:41:34 +0800 Subject: [PATCH 09/11] update --- .../run-example-test-ray-integration.sh | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 6bdfa49c1ae..3a5724ec0d5 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -175,22 +175,22 @@ set -e # time=$((now - start)) # echo "#10 Total time cost ${time} seconds" -echo "#11 start test for orca rllib" -#timer -start=$(date "+%s") -#run the example -python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py \ - --iterations 5 \ - --cluster_mode yarn-client -exit_status=$? -if [ $exit_status -ne 0 ]; then - clear_up - echo "orca ros rllib failed" - exit $exit_status -fi -now=$(date "+%s") -time=$((now - start)) -echo "#11 Total time cost ${time} seconds" +# echo "#11 start test for orca rllib" +# #timer +# start=$(date "+%s") +# #run the example +# python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py \ +# --iterations 5 \ +# --cluster_mode yarn-client +# exit_status=$? 
+# if [ $exit_status -ne 0 ]; then +# clear_up +# echo "orca ros rllib failed" +# exit $exit_status +# fi +# now=$(date "+%s") +# time=$((now - start)) +# echo "#11 Total time cost ${time} seconds" echo "#12 start test for orca rl_pong" #timer From 1005238fa583328fec60274bc08a5fbdc6f72c2d Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 18:27:33 +0800 Subject: [PATCH 10/11] update --- .../run-example-test-ray-integration.sh | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 3a5724ec0d5..6bdfa49c1ae 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -175,22 +175,22 @@ set -e # time=$((now - start)) # echo "#10 Total time cost ${time} seconds" -# echo "#11 start test for orca rllib" -# #timer -# start=$(date "+%s") -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py \ -# --iterations 5 \ -# --cluster_mode yarn-client -# exit_status=$? -# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca ros rllib failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#11 Total time cost ${time} seconds" +echo "#11 start test for orca rllib" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/rllib/multiagent_two_trainers.py \ + --iterations 5 \ + --cluster_mode yarn-client +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca ros rllib failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#11 Total time cost ${time} seconds" echo "#12 start test for orca rl_pong" #timer From 6b73caa9fd03e0b93791480dda31d2880915aa42 Mon Sep 17 00:00:00 2001 From: sgwhat Date: Tue, 23 Nov 2021 18:41:44 +0800 Subject: [PATCH 11/11] update --- .../run-example-test-ray-integration.sh | 231 +++++++++--------- 1 file changed, 115 insertions(+), 116 deletions(-) diff --git a/python/orca/dev/example/run-example-test-ray-integration.sh b/python/orca/dev/example/run-example-test-ray-integration.sh index 6bdfa49c1ae..9645e77591e 100644 --- a/python/orca/dev/example/run-example-test-ray-integration.sh +++ b/python/orca/dev/example/run-example-test-ray-integration.sh @@ -9,93 +9,92 @@ clear_up () { set -e -# echo "#start orca ray example tests" -# echo "#1 Start autoestimator example" -# start=$(date "+%s") -# python ${BIGDL_ROOT}/python/orca/example/automl/autoestimator/autoestimator_pytorch.py --trials 5 --epochs 2 --cluster_mode yarn -# now=$(date "+%s") -# time1=$((now-start)) - -# echo "#2 Start autoxgboost example" -# if [ -f ${BIGDL_ROOT}/data/airline_14col.data ] -# then -# echo "airline_14col.data already exists" -# else -# wget -nv $FTP_URI/analytics-zoo-data/airline_14col.data -P ${BIGDL_ROOT}/data/ -# fi +echo "#start orca ray example tests" +echo "#1 Start autoestimator example" +start=$(date "+%s") +python ${BIGDL_ROOT}/python/orca/example/automl/autoestimator/autoestimator_pytorch.py --trials 5 --epochs 2 --cluster_mode yarn +now=$(date "+%s") +time1=$((now-start)) -# start=$(date "+%s") -# python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostClassifier.py -p ${BIGDL_ROOT}/data/airline_14col.data --cluster_mode yarn -# now=$(date "+%s") -# time2=$((now-start)) +echo "#2 Start autoxgboost example" +if [ -f ${BIGDL_ROOT}/data/airline_14col.data ] +then 
+ echo "airline_14col.data already exists" +else + wget -nv $FTP_URI/analytics-zoo-data/airline_14col.data -P ${BIGDL_ROOT}/data/ +fi -# echo "#3 Start autoxgboost example" -# if [ -f ${BIGDL_ROOT}/data/incd.csv ] -# then -# echo "incd.csv already exists" -# else -# wget -nv $FTP_URI/analytics-zoo-data/incd.csv -P ${BIGDL_ROOT}/data/ -# fi +start=$(date "+%s") +python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostClassifier.py -p ${BIGDL_ROOT}/data/airline_14col.data --cluster_mode yarn +now=$(date "+%s") +time2=$((now-start)) -# start=$(date "+%s") -# python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostRegressor.py -p ${BIGDL_ROOT}/data/incd.csv --cluster_mode yarn -# now=$(date "+%s") -# time3=$((now-start)) +echo "#3 Start autoxgboost example" +if [ -f ${BIGDL_ROOT}/data/incd.csv ] +then + echo "incd.csv already exists" +else + wget -nv $FTP_URI/analytics-zoo-data/incd.csv -P ${BIGDL_ROOT}/data/ +fi -# echo "#4 start test for orca bigdl transformer" -# #timer -# start=$(date "+%s") -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ -# --cluster_mode yarn_client -# exit_status=$? -# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca transformer failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#4 Total time cost ${time} seconds" +start=$(date "+%s") +python ${BIGDL_ROOT}/python/orca/example/automl/autoxgboost/AutoXGBoostRegressor.py -p ${BIGDL_ROOT}/data/incd.csv --cluster_mode yarn +now=$(date "+%s") +time3=$((now-start)) +echo "#4 start test for orca bigdl transformer" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py \ + --cluster_mode yarn_client +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca transformer failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#4 Total time cost ${time} seconds" -# echo "#5 start test for orca bigdl imageInference" -# #timer -# start=$(date "+%s") -# if [ -f models/bigdl_inception-v1_imagenet_0.4.0.model ]; then -# echo "analytics-zoo-models/bigdl_inception-v1_imagenet_0.4.0.model already exists." -# else -# wget -nv $FTP_URI/analytics-zoo-models/image-classification/bigdl_inception-v1_imagenet_0.4.0.model \ -# -P models -# fi -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/imageInference/imageInference.py \ -# -m models/bigdl_inception-v1_imagenet_0.4.0.model \ -# -f ${HDFS_URI}/kaggle/train_100 --cluster_mode yarn-client -# exit_status=$? -# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca imageInference failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#5 Total time cost ${time} seconds" +echo "#5 start test for orca bigdl imageInference" +#timer +start=$(date "+%s") +if [ -f models/bigdl_inception-v1_imagenet_0.4.0.model ]; then + echo "analytics-zoo-models/bigdl_inception-v1_imagenet_0.4.0.model already exists." +else + wget -nv $FTP_URI/analytics-zoo-models/image-classification/bigdl_inception-v1_imagenet_0.4.0.model \ + -P models +fi +#run the example +python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/imageInference/imageInference.py \ + -m models/bigdl_inception-v1_imagenet_0.4.0.model \ + -f ${HDFS_URI}/kaggle/train_100 --cluster_mode yarn-client +exit_status=$? 
+if [ $exit_status -ne 0 ]; then + clear_up + echo "orca imageInference failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#5 Total time cost ${time} seconds" -# echo "#6 start test for orca pytorch_estimator" -# #timer -# start=$(date "+%s") -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/learn/horovod/pytorch_estimator.py --cluster_mode yarn-client -# exit_status=$? -# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca pytorch_estimator failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#6 Total time cost ${time} seconds" +echo "#6 start test for orca pytorch_estimator" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/learn/horovod/pytorch_estimator.py --cluster_mode yarn-client +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca pytorch_estimator failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#6 Total time cost ${time} seconds" # echo "#7 start test for orca simple_pytorch" # #timer @@ -136,44 +135,44 @@ set -e # time=$((now - start)) # echo "#8 Total time cost ${time} seconds" -# echo "#prepare dataset for ray_on_spark" -# wget -nv $FTP_URI/analytics-zoo-data/mnist/train-labels-idx1-ubyte.gz -# wget -nv $FTP_URI/analytics-zoo-data/mnist/train-images-idx3-ubyte.gz -# wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-labels-idx1-ubyte.gz -# wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-images-idx3-ubyte.gz -# zip ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/MNIST_data.zip train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz +echo "#prepare dataset for ray_on_spark" +wget -nv $FTP_URI/analytics-zoo-data/mnist/train-labels-idx1-ubyte.gz +wget -nv $FTP_URI/analytics-zoo-data/mnist/train-images-idx3-ubyte.gz +wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-labels-idx1-ubyte.gz +wget -nv $FTP_URI/analytics-zoo-data/mnist/t10k-images-idx3-ubyte.gz +zip ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/MNIST_data.zip train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz -# echo "#9 start test for orca ros async" -# #timer -# start=$(date "+%s") -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py \ -# --iterations 20 --num_workers 2 --cluster_mode yarn -# exit_status=$? -# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca ros async failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#9 Total time cost ${time} seconds" +echo "#9 start test for orca ros async" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/async_parameter_server.py \ + --iterations 20 --num_workers 2 --cluster_mode yarn +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca ros async failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#9 Total time cost ${time} seconds" -# echo "#10 start test for orca ros sync" -# #timer -# start=$(date "+%s") -# #run the example -# python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py \ -# --iterations 20 --num_workers 2 --cluster_mode yarn -# exit_status=$? 
-# if [ $exit_status -ne 0 ]; then -# clear_up -# echo "orca ros sync failed" -# exit $exit_status -# fi -# now=$(date "+%s") -# time=$((now - start)) -# echo "#10 Total time cost ${time} seconds" +echo "#10 start test for orca ros sync" +#timer +start=$(date "+%s") +#run the example +python ${BIGDL_ROOT}/python/orca/example/ray_on_spark/parameter_server/sync_parameter_server.py \ + --iterations 20 --num_workers 2 --cluster_mode yarn +exit_status=$? +if [ $exit_status -ne 0 ]; then + clear_up + echo "orca ros sync failed" + exit $exit_status +fi +now=$(date "+%s") +time=$((now - start)) +echo "#10 Total time cost ${time} seconds" echo "#11 start test for orca rllib" #timer
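
Every test block that these patches toggle follows the same shape: record a start time, run one example, check the exit status (calling clear_up and aborting on failure), then report the elapsed time. The sketch below folds that recurring pattern into a single helper; it is illustrative only: the helper name run_example_test is hypothetical and not part of the script, while clear_up is the cleanup function the script already defines near its top.

run_example_test() {
    # $1: label used in log messages; remaining arguments: the command to run
    local label=$1; shift
    local start now status
    start=$(date "+%s")
    # run the command and capture its status even when "set -e" is active
    "$@" && status=0 || status=$?
    if [ $status -ne 0 ]; then
        clear_up            # cleanup function defined at the top of the script
        echo "${label} failed"
        exit $status
    fi
    now=$(date "+%s")
    echo "${label} total time cost $((now - start)) seconds"
}

Used that way, a block such as test #4 above would reduce to a single call, for example:
run_example_test "orca transformer" \
    python ${BIGDL_ROOT}/python/orca/example/learn/bigdl/attention/transformer.py --cluster_mode yarn-client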