## Step 0: Prepare environment

In [2]:
# !ls ~/analytics-zoo/pyzoo/dev/

In [3]:
# !bash ~/analytics-zoo/pyzoo/dev/prepare_env.sh

In [1]:
# conda create -n zoo python=3.7
# conda activate zoo
# pip install analytics-zoo
# pip install scikit-optimize

## Step 1: Init Orca Context

In [4]:
from zoo.ray import RayContext
from zoo import init_spark_on_local,init_spark_on_yarn

In [5]:
# Selects which Spark initializer is used below: 'local' or 'yarn'.
hadoop_conf = 'local'
if hadoop_conf == 'yarn':
    # NOTE(review): the `hadoop_conf` argument presumably expects the path to the
    # Hadoop/YARN configuration directory rather than the literal 'yarn' — confirm
    # against the Analytics Zoo documentation before running on a cluster.
    sc = init_spark_on_yarn(hadoop_conf='yarn', conda_name='zoo', num_executors=2, executor_cores=4)
elif hadoop_conf == 'local':
    sc = init_spark_on_local(cores='*')
else:
    # Fail early with a clear message instead of a NameError on `sc` below.
    raise ValueError(
        f"Unsupported hadoop_conf {hadoop_conf!r}; expected 'yarn' or 'local'"
    )

# Build the Ray context from the SparkContext created above and start it.
ray_ctx = RayContext(sc=sc)
ray_ctx.init()

Current pyspark location is : /home/zehuan/spark/python/lib/pyspark.zip/pyspark/__init__.py
Start to getOrCreate SparkContext
pyspark_submit_args is:  --driver-class-path /home/zehuan/anaconda3/envs/automl/lib/python3.6/site-packages/zoo/share/lib/analytics-zoo-bigdl_0.12.1-spark_2.4.3-0.10.0-SNAPSHOT-jar-with-dependencies.jar:/home/zehuan/analytics-zoo/zoo/target/analytics-zoo-bigdl_0.12.2-spark_2.4.3-0.10.0-SNAPSHOT-jar-with-dependencies.jar pyspark-shell 
Successfully got a SparkContext
2021-04-28 16:09:36,705	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://10.239.44.145:8265[39m[22m
{'node_ip_address': '10.239.44.145', 'raylet_ip_address': '10.239.44.145', 'redis_address': '10.239.44.145:47599', 'object_store_address': '/tmp/ray/session_2021-04-28_16-09-36_145303_1106488/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2021-04-28_16-09-36_145303_1106488/sockets/raylet', 'webui_url': '10.239.44.145:8265', 'session_dir': '/tmp/ray/session_2021-04-28

{'node_ip_address': '10.239.44.145',
 'raylet_ip_address': '10.239.44.145',
 'redis_address': '10.239.44.145:47599',
 'object_store_address': '/tmp/ray/session_2021-04-28_16-09-36_145303_1106488/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-28_16-09-36_145303_1106488/sockets/raylet',
 'webui_url': '10.239.44.145:8265',
 'session_dir': '/tmp/ray/session_2021-04-28_16-09-36_145303_1106488',
 'metrics_export_port': 59818,
 'node_id': '9d91b7c8e799fbae1b4c6b3c663f893c8a33171be2abf403e9ba2339'}

## Step 2: Prepare data

In [6]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd

In [7]:
# Read the CSV from the notebook's working directory.
df = pd.read_csv('./incd.csv', encoding='latin-1')

# Columns used as model inputs, and the column to predict.
feature_cols = [
    "FIPS",
    "Lower 95% Confidence Interval",
    "Upper 95% Confidence Interval",
    "Average Annual Count",
    "Recent 5-Year Trend",
]
target_col = "Age-Adjusted Incidence Rate"

# Hold out 20% of the rows for validation; the seed is fixed.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=2)

# Settings passed to the regressor as `config` when it is constructed.
config = dict(random_state=2, min_child_weight=3, n_jobs=2)

## Step 3: Init AutoXGBoost

In [8]:
from zoo.orca.automl.xgboost import AutoXGBoost
from zoo.zouwu.config.recipe import XgbRegressorGridRandomRecipe
# from zoo.orca.automl.autoxgboost.AutoXGBoostRegressor import 

In [9]:
# Search-space settings handed to the recipe below.
num_rand_samples = 10
n_estimators_range = (800, 1000)
max_depth_range = (10, 15)
lr = (1e-4, 1e-1)
min_child_weight = [1, 2, 3]

# The two-element tuples for n_estimators and max_depth are passed as lists.
recipe = XgbRegressorGridRandomRecipe(
    num_rand_samples=num_rand_samples,
    n_estimators=[*n_estimators_range],
    max_depth=[*max_depth_range],
    lr=lr,
    min_child_weight=min_child_weight,
)

# Create the regressor with the feature/target columns and the config dict.
estimator = AutoXGBoost().regressor(
    feature_cols=feature_cols,
    target_col=target_col,
    config=config,
)

## Step 4: Fit data

In [10]:
# Fit on the training split, passing the validation split, the "rmse" metric
# and the search recipe to the estimator.
pipeline = estimator.fit(
    train_df,
    validation_df=val_df,
    metric="rmse",
    recipe=recipe,
)
print("Training completed.")

# Run the fitted pipeline on the validation split: predictions, then RMSE.
pred_df = pipeline.predict(val_df)
rmse = pipeline.evaluate(val_df, metrics=["rmse"])
print("Evaluate: the square root of mean square error is ", rmse[0])


2021-04-28 16:10:56,758	INFO tune.py:450 -- Total run time: 63.02 seconds (62.89 seconds for the tuning loop).
best log dir is  /home/zehuan/zoo_automl_logs/automl/train_func_1cd6c_00005_5_imputation=LastFillImpute,lr=0.0027672,max_depth=15,min_child_weight=1,n_estimators=800_2021-04-28_16-10-01
The best configurations are:
model : XGBRegressor
imputation : LastFillImpute
n_estimators : 800
max_depth : 15
min_child_weight : 1
lr : 0.0027672315930411906
Training completed.
Evaluate: the square root of mean square error is  [1.052205690459954]


In [11]:
# Tear down in reverse order of creation: stop the Ray context first,
# then the SparkContext it was built from.
ray_ctx.stop()
sc.stop()