### Quick Start

#### Preprocessing

In [1]:
import ray

In [2]:
# Read the breast-cancer CSV from S3 into a Ray Dataset.
dataset = ray.data.read_csv(
    "s3://anonymous@air-example-data/breast_cancer.csv"
)

# Split off 30% of the data for validation; the rest is used for training.
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)

# Inference input: the validation rows with the `target` column removed.
test_dataset = valid_dataset.drop_columns(cols=["target"])

2022-10-17 17:44:54,027	INFO worker.py:1518 -- Started a local Ray instance.
Map_Batches: 100%|████████████████████████████████| 1/1 [00:00<00:00,  6.19it/s]


#### XGBoost

In [11]:
from ray.data.preprocessors import StandardScaler
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# Preprocessing: standardize two of the feature columns.
preprocessor = StandardScaler(columns=["mean radius", "mean texture"])

# Training: fit XGBoost on the train split and evaluate on the valid split.
trainer = XGBoostTrainer(
    label_column="target",
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    # XGBoost-specific parameters.
    # Optional GPU setting (not enabled here): "tree_method": "gpu_hist"
    params={
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
    },
    num_boost_round=20,
    scaling_config=ScalingConfig(
        # Number of workers to use for data parallelism.
        num_workers=4,
        # Whether to use GPU acceleration.
        use_gpu=False,
    ),
)

result = trainer.fit()
print(result.metrics)

2022-10-16 22:43:34,371	INFO tensorboardx.py:170 -- pip install "ray[tune]" to see TensorBoard files.


Trial name,status,loc,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_87799_00000,TERMINATED,127.0.0.1:42146,21,15.3273,0.0179314,0,0.0879959


[2m[36m(_RemoteRayXGBoostActor pid=42173)[0m [22:43:50] task [xgboost.ray]:140413578906544 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=42175)[0m [22:43:50] task [xgboost.ray]:140233097997328 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=42191)[0m [22:43:50] task [xgboost.ray]:140477513718752 got new rank 2
[2m[36m(_RemoteRayXGBoostActor pid=42192)[0m [22:43:50] task [xgboost.ray]:140655252605824 got new rank 3


Result for XGBoostTrainer_87799_00000:
  date: 2022-10-16_22-43-53
  done: false
  experiment_id: b3191d18ca96442990797af779089a50
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 42146
  time_since_restore: 14.711936950683594
  time_this_iter_s: 14.711936950683594
  time_total_s: 14.711936950683594
  timestamp: 1665927833
  timesteps_since_restore: 0
  train-error: 0.02512562814070352
  train-logloss: 0.46656074401122244
  training_iteration: 1
  trial_id: '87799_00000'
  valid-error: 0.11695906432748537
  valid-logloss: 0.502969495385711
  warmup_time: 0.005882978439331055
  
Result for XGBoostTrainer_87799_00000:
  date: 2022-10-16_22-43-53
  done: true
  experiment_id: b3191d18ca96442990797af779089a50
  experiment_tag: '0'
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 21
  node_ip: 127.0.0.1
  pid: 42146
  time_since_restore: 15.327291011810303
  time_this_iter_s: 0.47252488136291504
  time_total_s: 15.32729101181

2022-10-16 22:43:54,021	INFO tune.py:758 -- Total run time: 19.66 seconds (19.52 seconds for the tuning loop).


{'train-logloss': 0.01793136571220417, 'train-error': 0.0, 'valid-logloss': 0.0879958809778225, 'valid-error': 0.04093567251461988, 'time_this_iter_s': 0.47252488136291504, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 21, 'trial_id': '87799_00000', 'experiment_id': 'b3191d18ca96442990797af779089a50', 'date': '2022-10-16_22-43-53', 'timestamp': 1665927833, 'time_total_s': 15.327291011810303, 'pid': 42146, 'hostname': 'YONGJINs-MacBook-Pro.local', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 15.327291011810303, 'timesteps_since_restore': 0, 'iterations_since_restore': 21, 'warmup_time': 0.005882978439331055, 'experiment_tag': '0'}


In [12]:
from ray import tune
from ray.tune.tuner import Tuner, TuneConfig
from ray.air.config import RunConfig

In [14]:
# Hyperparameter tuning
# Search space: sample `max_depth` from the integers 1..8 (upper bound exclusive).
param_space = {"params": {"max_depth": tune.randint(1, 9)}}
# Rank trials on the validation split rather than the training split:
# selecting on `train-logloss` favours whichever model fits the training
# data most tightly, not the one that generalizes best. The trainer reports
# `valid-logloss` because a dataset is registered under the key "valid".
metric = "valid-logloss"

tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=TuneConfig(num_samples=5, metric=metric, mode="min"),
)

# Execute tuning
result_grid = tuner.fit()

# Fetch the best result (uses the metric/mode configured in TuneConfig)
best_result = result_grid.get_best_result()
print(f"Best Result: {best_result}")



Trial name,status,loc,params/max_depth,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_28481_00000,TERMINATED,127.0.0.1:43130,2,21,12.3457,0.0410722,0.00502513,0.1029
XGBoostTrainer_28481_00001,TERMINATED,127.0.0.1:43203,5,21,13.6486,0.0185925,0.0,0.0818791
XGBoostTrainer_28481_00002,TERMINATED,127.0.0.1:43261,1,21,13.8138,0.0954546,0.0201005,0.11149
XGBoostTrainer_28481_00003,TERMINATED,127.0.0.1:43319,5,21,13.8127,0.0185925,0.0,0.0818791
XGBoostTrainer_28481_00004,TERMINATED,127.0.0.1:43385,2,21,13.7265,0.0410722,0.00502513,0.1029


[2m[36m(_RemoteRayXGBoostActor pid=43161)[0m [22:48:18] task [xgboost.ray]:140566358526800 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=43158)[0m [22:48:18] task [xgboost.ray]:140395063147392 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=43175)[0m [22:48:18] task [xgboost.ray]:140267852003840 got new rank 2
[2m[36m(_RemoteRayXGBoostActor pid=43176)[0m [22:48:18] task [xgboost.ray]:140621523053296 got new rank 3


Result for XGBoostTrainer_28481_00000:
  date: 2022-10-16_22-48-19
  done: false
  experiment_id: e6f83f23135245dabe3aec23525d5504
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 43130
  time_since_restore: 11.232172966003418
  time_this_iter_s: 11.232172966003418
  time_total_s: 11.232172966003418
  timestamp: 1665928099
  timesteps_since_restore: 0
  train-error: 0.04522613065326633
  train-logloss: 0.48458515279856157
  training_iteration: 1
  trial_id: '28481_00000'
  valid-error: 0.1111111111111111
  valid-logloss: 0.5170963152110228
  warmup_time: 0.005660057067871094
  
Result for XGBoostTrainer_28481_00000:
  date: 2022-10-16_22-48-20
  done: true
  experiment_id: e6f83f23135245dabe3aec23525d5504
  experiment_tag: 0_max_depth=2
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 21
  node_ip: 127.0.0.1
  pid: 43130
  time_since_restore: 12.345662832260132
  time_this_iter_s: 0.6764390468597412
  time_total_s: 12.34

[2m[36m(_RemoteRayXGBoostActor pid=43220)[0m [22:48:35] task [xgboost.ray]:140322445052944 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=43221)[0m [22:48:35] task [xgboost.ray]:140210994023152 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=43243)[0m [22:48:35] task [xgboost.ray]:140330614512752 got new rank 3
[2m[36m(_RemoteRayXGBoostActor pid=43242)[0m [22:48:35] task [xgboost.ray]:140679429614608 got new rank 2


Result for XGBoostTrainer_28481_00001:
  date: 2022-10-16_22-48-38
  done: false
  experiment_id: 57cf745d94c74c46b038d67712bc9447
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 43203
  time_since_restore: 13.186815977096558
  time_this_iter_s: 13.186815977096558
  time_total_s: 13.186815977096558
  timestamp: 1665928118
  timesteps_since_restore: 0
  train-error: 0.02512562814070352
  train-logloss: 0.4670509481849383
  training_iteration: 1
  trial_id: '28481_00001'
  valid-error: 0.09941520467836257
  valid-logloss: 0.501814771813956
  warmup_time: 0.005167961120605469
  
Result for XGBoostTrainer_28481_00001:
  date: 2022-10-16_22-48-38
  done: true
  experiment_id: 57cf745d94c74c46b038d67712bc9447
  experiment_tag: 1_max_depth=5
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 21
  node_ip: 127.0.0.1
  pid: 43203
  time_since_restore: 13.648637771606445
  time_this_iter_s: 0.3365809917449951
  time_total_s: 13.648

[2m[36m(_RemoteRayXGBoostActor pid=43284)[0m [22:48:53] task [xgboost.ray]:140684538276928 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=43287)[0m [22:48:53] task [xgboost.ray]:140284058790976 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=43301)[0m [22:48:53] task [xgboost.ray]:140628074556336 got new rank 3
[2m[36m(_RemoteRayXGBoostActor pid=43300)[0m [22:48:53] task [xgboost.ray]:140225933077472 got new rank 2


Result for XGBoostTrainer_28481_00002:
  date: 2022-10-16_22-48-56
  done: false
  experiment_id: 29351ec5345f45098ea25071783863ef
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 43261
  time_since_restore: 13.28571605682373
  time_this_iter_s: 13.28571605682373
  time_total_s: 13.28571605682373
  timestamp: 1665928136
  timesteps_since_restore: 0
  train-error: 0.07537688442211055
  train-logloss: 0.5116842961940334
  training_iteration: 1
  trial_id: '28481_00002'
  valid-error: 0.10526315789473684
  valid-logloss: 0.5233400317660549
  warmup_time: 0.0054357051849365234
  
Result for XGBoostTrainer_28481_00002:
  date: 2022-10-16_22-48-57
  done: true
  experiment_id: 29351ec5345f45098ea25071783863ef
  experiment_tag: 2_max_depth=1
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 21
  node_ip: 127.0.0.1
  pid: 43261
  time_since_restore: 13.813767910003662
  time_this_iter_s: 0.39649200439453125
  time_total_s: 13.813

[2m[36m(_RemoteRayXGBoostActor pid=43345)[0m [22:49:11] task [xgboost.ray]:140218745084944 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=43342)[0m [22:49:11] task [xgboost.ray]:140289721101280 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=43359)[0m [22:49:11] task [xgboost.ray]:140614812167088 got new rank 3
[2m[36m(_RemoteRayXGBoostActor pid=43358)[0m [22:49:11] task [xgboost.ray]:140268613274496 got new rank 2


Result for XGBoostTrainer_28481_00003:
  date: 2022-10-16_22-49-14
  done: false
  experiment_id: bef307c5a147406f81a2e9f0fc2dee87
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 43319
  time_since_restore: 13.43497610092163
  time_this_iter_s: 13.43497610092163
  time_total_s: 13.43497610092163
  timestamp: 1665928154
  timesteps_since_restore: 0
  train-error: 0.02512562814070352
  train-logloss: 0.4670509481849383
  training_iteration: 1
  trial_id: '28481_00003'
  valid-error: 0.09941520467836257
  valid-logloss: 0.501814771813956
  warmup_time: 0.005287885665893555
  
Result for XGBoostTrainer_28481_00003:
  date: 2022-10-16_22-49-15
  done: true
  experiment_id: bef307c5a147406f81a2e9f0fc2dee87
  experiment_tag: 3_max_depth=5
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 21
  node_ip: 127.0.0.1
  pid: 43319
  time_since_restore: 13.81273603439331
  time_this_iter_s: 0.2525599002838135
  time_total_s: 13.8127360

[2m[36m(_RemoteRayXGBoostActor pid=43402)[0m [22:49:29] task [xgboost.ray]:140162951408560 got new rank 1
[2m[36m(_RemoteRayXGBoostActor pid=43401)[0m [22:49:29] task [xgboost.ray]:140440858651568 got new rank 0
[2m[36m(_RemoteRayXGBoostActor pid=43416)[0m [22:49:29] task [xgboost.ray]:140216202292240 got new rank 2
[2m[36m(_RemoteRayXGBoostActor pid=43417)[0m [22:49:29] task [xgboost.ray]:140379561003712 got new rank 3


Result for XGBoostTrainer_28481_00004:
  date: 2022-10-16_22-49-32
  done: false
  experiment_id: cc725e3c8ab04d1a9c2cdede03203dbd
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 43385
  time_since_restore: 13.363611936569214
  time_this_iter_s: 13.363611936569214
  time_total_s: 13.363611936569214
  timestamp: 1665928172
  timesteps_since_restore: 0
  train-error: 0.04522613065326633
  train-logloss: 0.48458515279856157
  training_iteration: 1
  trial_id: '28481_00004'
  valid-error: 0.1111111111111111
  valid-logloss: 0.5170963152110228
  warmup_time: 0.005262136459350586
  
Result for XGBoostTrainer_28481_00004:
  date: 2022-10-16_22-49-32
  done: true
  experiment_id: cc725e3c8ab04d1a9c2cdede03203dbd
  experiment_tag: 4_max_depth=2
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 21
  node_ip: 127.0.0.1
  pid: 43385
  time_since_restore: 13.726532936096191
  time_this_iter_s: 0.23717474937438965
  time_total_s: 13.7

2022-10-16 22:49:33,020	INFO tune.py:758 -- Total run time: 88.87 seconds (88.73 seconds for the tuning loop).


Best Result: Result(metrics={'train-logloss': 0.01859253883974898, 'train-error': 0.0, 'valid-logloss': 0.08187905736866663, 'valid-error': 0.02339181286549707, 'done': True, 'trial_id': '28481_00001', 'experiment_tag': '1_max_depth=5'}, error=None, log_dir=PosixPath('/Users/yjkim/ray_results/XGBoostTrainer_2022-10-16_22-48-04/XGBoostTrainer_28481_00001_1_max_depth=5_2022-10-16_22-48-22'))


In [15]:
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor

In [16]:
# Batch Inference

# Use the checkpoint of the best tuning trial.
# (A checkpoint can also be created from an already-trained model with
# `XGBoostCheckpoint.from_model`.)
checkpoint = best_result.checkpoint

batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint,
    predictor_cls=XGBoostPredictor,
)

# Score the label-free test rows and print a sample of the predictions.
predicted_probabilities = batch_predictor.predict(test_dataset)
predicted_probabilities.show()

Map Progress (1 actors 1 pending): 100%|███████████████████████| 1/1 [00:02<00:00,  2.03s/it]

{'predictions': 0.9967517852783203}
{'predictions': 0.9956080317497253}
{'predictions': 0.0035305859055370092}
{'predictions': 0.9967517852783203}
{'predictions': 0.9968827962875366}
{'predictions': 0.9960047602653503}
{'predictions': 0.9917015433311462}
{'predictions': 0.995203971862793}
{'predictions': 0.27522677183151245}
{'predictions': 0.9821683764457703}
{'predictions': 0.0035305859055370092}
{'predictions': 0.9960752129554749}
{'predictions': 0.9656598567962646}
{'predictions': 0.9889512658119202}
{'predictions': 0.9943472743034363}
{'predictions': 0.26353344321250916}
{'predictions': 0.4275687336921692}
{'predictions': 0.9949140548706055}
{'predictions': 0.9823238849639893}
{'predictions': 0.0035305859055370092}





#### PyTorch

In [5]:
import numpy as np
from ray.data.preprocessors import Concatenator, Chain, StandardScaler

import torch
import torch.nn as nn
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present

from ray import train
from ray.air import session
from ray.air.config import ScalingConfig
from ray.train.torch import TorchCheckpoint, TorchTrainer

In [13]:
# Preprocessing

# Two chained steps: standardize the selected columns, then concatenate
# every column except `target` into one float32 column.
preprocessor = Chain(
    StandardScaler(columns=["mean radius", "mean texture"]),
    Concatenator(dtype=np.float32, exclude=["target"]),
)

# Training
def create_model(input_features):
    """Build the feed-forward binary classifier.

    Args:
        input_features: Width of the input layer (number of feature values
            per example).

    Returns:
        An ``nn.Sequential`` with two 16-unit hidden layers (ReLU) and a
        single sigmoid output unit.
    """
    hidden_units = 16
    layers = [
        nn.Linear(input_features, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
        nn.Sigmoid(),
    ]
    return nn.Sequential(*layers)

def train_loop_per_worker(config):
    """Train the classifier on this worker's shard of the "train" dataset.

    Args:
        config: Dict providing ``batch_size``, ``lr``, ``num_epochs`` and
            ``num_features``.

    Once per epoch, reports ``{"loss": <loss of the epoch's last batch>}``
    together with a checkpoint of the model through ``session.report``.

    Raises:
        RuntimeError: If the shard yields no batches in an epoch, so there
            is no loss to report.
    """
    batch_size = config['batch_size']
    lr = config['lr']
    epochs = config['num_epochs']
    num_features = config['num_features']

    # Get the Ray Dataset shard for this data-parallel worker. It is consumed
    # below as batches of torch tensors. `session` is used here (as for
    # `session.report`) so the loop relies on a single reporting/data API.
    train_data = session.get_dataset_shard('train')

    # Create the model and prepare it for distributed training.
    model = create_model(num_features)
    model = train.torch.prepare_model(model)

    loss_fn = nn.BCELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for cur_epoch in range(epochs):
        # Reset each epoch so an epoch without batches is detected instead of
        # hitting an unbound variable or silently re-reporting a stale loss.
        train_loss = None
        for batch in train_data.iter_torch_batches(
            batch_size=batch_size, dtypes=torch.float32
        ):
            # "concat_out" is the output column of the Concatenator.
            inputs, labels = batch["concat_out"], batch["target"]
            optimizer.zero_grad()
            predictions = model(inputs)
            # The model outputs one value per row; add a trailing dimension
            # to the labels so both tensors have the same shape.
            train_loss = loss_fn(predictions, labels.unsqueeze(1))
            train_loss.backward()
            optimizer.step()

        if train_loss is None:
            raise RuntimeError(
                f"The 'train' dataset shard produced no batches in epoch {cur_epoch}."
            )

        loss = train_loss.item()
        session.report({"loss": loss}, checkpoint=TorchCheckpoint.from_model(model))

In [14]:
# Input width of the model: all dataset columns minus one (the `target` label).
num_features = len(train_dataset.schema().names) - 1

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    datasets={"train": train_dataset},
    preprocessor=preprocessor,
    scaling_config=ScalingConfig(
        # Number of workers to use for data parallelism.
        num_workers=3,
        use_gpu=False,
        # so that the example works on Colab.
        trainer_resources={"CPU": 0},
    ),
    # Passed to `train_loop_per_worker` as its `config` argument.
    train_loop_config={
        "batch_size": 128,
        "num_epochs": 20,
        "num_features": num_features,
        "lr": 0.001,
    },
)
# Execute training.
result = trainer.fit()
print(f"Last result: {result.metrics}")



Trial name,status,loc,iter,total time (s),loss,_timestamp,_time_this_iter_s
TorchTrainer_00f31_00000,TERMINATED,127.0.0.1:19906,20,6.00244,0.240533,1665997605,0.0765009


[2m[36m(RayTrainWorker pid=19921)[0m 2022-10-17 18:06:42,616	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=19921)[0m 2022-10-17 18:06:43,751	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=19921)[0m 2022-10-17 18:06:43,751	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


Result for TorchTrainer_00f31_00000:
  _time_this_iter_s: 0.2908341884613037
  _timestamp: 1665997604
  _training_iteration: 1
  date: 2022-10-17_18-06-44
  done: false
  experiment_id: 7eadc91f541a498b980a041a8f421def
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  loss: 1.672611951828003
  node_ip: 127.0.0.1
  pid: 19906
  should_checkpoint: true
  time_since_restore: 4.489970922470093
  time_this_iter_s: 4.489970922470093
  time_total_s: 4.489970922470093
  timestamp: 1665997604
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 00f31_00000
  warmup_time: 0.004698991775512695
  


[2m[36m(RayTrainWorker pid=19923)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=19922)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=19921)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)


Result for TorchTrainer_00f31_00000:
  _time_this_iter_s: 0.07650089263916016
  _timestamp: 1665997605
  _training_iteration: 20
  date: 2022-10-17_18-06-45
  done: true
  experiment_id: 7eadc91f541a498b980a041a8f421def
  experiment_tag: '0'
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 20
  loss: 0.2405332624912262
  node_ip: 127.0.0.1
  pid: 19906
  should_checkpoint: true
  time_since_restore: 6.002439260482788
  time_this_iter_s: 0.07742428779602051
  time_total_s: 6.002439260482788
  timestamp: 1665997605
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: 00f31_00000
  warmup_time: 0.004698991775512695
  


2022-10-17 18:06:45,768	INFO tune.py:758 -- Total run time: 9.14 seconds (9.00 seconds for the tuning loop).


Last result: {'loss': 0.2405332624912262, '_timestamp': 1665997605, '_time_this_iter_s': 0.07650089263916016, '_training_iteration': 20, 'time_this_iter_s': 0.07742428779602051, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 20, 'trial_id': '00f31_00000', 'experiment_id': '7eadc91f541a498b980a041a8f421def', 'date': '2022-10-17_18-06-45', 'timestamp': 1665997605, 'time_total_s': 6.002439260482788, 'pid': 19906, 'hostname': 'YONGJINs-MacBook-Pro.local', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 6.002439260482788, 'timesteps_since_restore': 0, 'iterations_since_restore': 20, 'warmup_time': 0.004698991775512695, 'experiment_tag': '0'}


In [16]:
from ray import tune
from ray.tune.tuner import Tuner, TuneConfig
from ray.air.config import RunConfig

In [19]:
# Hyperparameter Tuning
# Search space: sample the learning rate log-uniformly between 1e-4 and 1e-2.
param_space = {"train_loop_config": {"lr": tune.loguniform(0.0001, 0.01)}}
# The training loop reports a single metric named "loss".
metric = "loss"

tuner = Tuner(
    trainer,
    tune_config=TuneConfig(metric=metric, mode="min", num_samples=5),
    param_space=param_space,
)

# Execute tuning
result_grid = tuner.fit()

# Fetch the best result
best_result = result_grid.get_best_result()
print(f"best result: {best_result}")

Trial name,status,loc,train_loop_config/lr,iter,total time (s),loss,_timestamp,_time_this_iter_s
TorchTrainer_c9b63_00000,TERMINATED,127.0.0.1:20250,0.000179404,20,5.37057,0.380847,1665997941,0.092993
TorchTrainer_c9b63_00001,TERMINATED,127.0.0.1:20259,0.000192725,20,8.97167,50.0,1665997948,0.0842109
TorchTrainer_c9b63_00002,TERMINATED,127.0.0.1:20292,0.00346125,20,7.25794,0.668383,1665997954,0.101945
TorchTrainer_c9b63_00003,TERMINATED,127.0.0.1:20310,0.00576613,20,6.9862,0.695373,1665997960,0.089077
TorchTrainer_c9b63_00004,TERMINATED,127.0.0.1:20319,0.00473255,20,8.90445,50.0,1665997966,0.0826631


[2m[36m(RayTrainWorker pid=20261)[0m 2022-10-17 18:12:19,782	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=20261)[0m 2022-10-17 18:12:19,910	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=20261)[0m 2022-10-17 18:12:19,910	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


Result for TorchTrainer_c9b63_00000:
  _time_this_iter_s: 0.3430471420288086
  _timestamp: 1665997940
  _training_iteration: 1
  date: 2022-10-17_18-12-20
  done: false
  experiment_id: 88375b47426044188208392f9db1ee8a
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  loss: 0.6551185250282288
  node_ip: 127.0.0.1
  pid: 20250
  should_checkpoint: true
  time_since_restore: 3.632996082305908
  time_this_iter_s: 3.632996082305908
  time_total_s: 3.632996082305908
  timestamp: 1665997940
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c9b63_00000
  warmup_time: 0.005287885665893555
  


[2m[36m(RayTrainWorker pid=20263)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20262)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20261)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)


Result for TorchTrainer_c9b63_00000:
  _time_this_iter_s: 0.09299302101135254
  _timestamp: 1665997941
  _training_iteration: 20
  date: 2022-10-17_18-12-22
  done: true
  experiment_id: 88375b47426044188208392f9db1ee8a
  experiment_tag: 0_lr=0.0002
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 20
  loss: 0.38084739446640015
  node_ip: 127.0.0.1
  pid: 20250
  should_checkpoint: true
  time_since_restore: 5.370573043823242
  time_this_iter_s: 0.08456277847290039
  time_total_s: 5.370573043823242
  timestamp: 1665997942
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: c9b63_00000
  warmup_time: 0.005287885665893555
  


[2m[36m(RayTrainWorker pid=20271)[0m 2022-10-17 18:12:23,500	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=20271)[0m 2022-10-17 18:12:26,676	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=20271)[0m 2022-10-17 18:12:26,676	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


Result for TorchTrainer_c9b63_00001:
  _time_this_iter_s: 0.3045942783355713
  _timestamp: 1665997946
  _training_iteration: 1
  date: 2022-10-17_18-12-27
  done: false
  experiment_id: 490fca2ef6b14065af3a73c4634d5c36
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  loss: 50.0
  node_ip: 127.0.0.1
  pid: 20259
  should_checkpoint: true
  time_since_restore: 7.318837881088257
  time_this_iter_s: 7.318837881088257
  time_total_s: 7.318837881088257
  timestamp: 1665997947
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c9b63_00001
  warmup_time: 0.0056951045989990234
  


[2m[36m(RayTrainWorker pid=20273)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20271)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20272)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)


Result for TorchTrainer_c9b63_00001:
  _time_this_iter_s: 0.08421087265014648
  _timestamp: 1665997948
  _training_iteration: 20
  date: 2022-10-17_18-12-28
  done: true
  experiment_id: 490fca2ef6b14065af3a73c4634d5c36
  experiment_tag: 1_lr=0.0002
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 20
  loss: 50.0
  node_ip: 127.0.0.1
  pid: 20259
  should_checkpoint: true
  time_since_restore: 8.971671104431152
  time_this_iter_s: 0.07729792594909668
  time_total_s: 8.971671104431152
  timestamp: 1665997948
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: c9b63_00001
  warmup_time: 0.0056951045989990234
  


[2m[36m(RayTrainWorker pid=20298)[0m 2022-10-17 18:12:30,831	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=20298)[0m 2022-10-17 18:12:30,965	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=20298)[0m 2022-10-17 18:12:30,966	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.
[2m[36m(RayTrainWorker pid=20300)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20298)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20299)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)


Result for TorchTrainer_c9b63_00002:
  _time_this_iter_s: 0.30349206924438477
  _timestamp: 1665997951
  _training_iteration: 1
  date: 2022-10-17_18-12-31
  done: false
  experiment_id: 420dbc3d981442cf97d61c6cd5d7e7d6
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  loss: 30.769973754882812
  node_ip: 127.0.0.1
  pid: 20292
  should_checkpoint: true
  time_since_restore: 4.161451816558838
  time_this_iter_s: 4.161451816558838
  time_total_s: 4.161451816558838
  timestamp: 1665997951
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c9b63_00002
  warmup_time: 0.004485130310058594
  
Result for TorchTrainer_c9b63_00002:
  _time_this_iter_s: 0.1019449234008789
  _timestamp: 1665997954
  _training_iteration: 20
  date: 2022-10-17_18-12-34
  done: true
  experiment_id: 420dbc3d981442cf97d61c6cd5d7e7d6
  experiment_tag: 2_lr=0.0035
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 20
  loss: 0.6683834791183472
  node_ip: 127.0.0.1
  pid

[2m[36m(RayTrainWorker pid=20313)[0m 2022-10-17 18:12:37,281	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=20313)[0m 2022-10-17 18:12:38,425	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=20313)[0m 2022-10-17 18:12:38,426	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


Result for TorchTrainer_c9b63_00003:
  _time_this_iter_s: 0.46407389640808105
  _timestamp: 1665997958
  _training_iteration: 1
  date: 2022-10-17_18-12-38
  done: false
  experiment_id: 3f68e79a9884401fb72ec2a4e9696302
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  loss: 0.9059874415397644
  node_ip: 127.0.0.1
  pid: 20310
  should_checkpoint: true
  time_since_restore: 5.267439126968384
  time_this_iter_s: 5.267439126968384
  time_total_s: 5.267439126968384
  timestamp: 1665997958
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c9b63_00003
  warmup_time: 0.004579782485961914
  


[2m[36m(RayTrainWorker pid=20314)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20315)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20313)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)


Result for TorchTrainer_c9b63_00003:
  _time_this_iter_s: 0.08907699584960938
  _timestamp: 1665997960
  _training_iteration: 20
  date: 2022-10-17_18-12-40
  done: true
  experiment_id: 3f68e79a9884401fb72ec2a4e9696302
  experiment_tag: 3_lr=0.0058
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 20
  loss: 0.6953727602958679
  node_ip: 127.0.0.1
  pid: 20310
  should_checkpoint: true
  time_since_restore: 6.986196994781494
  time_this_iter_s: 0.08703494071960449
  time_total_s: 6.986196994781494
  timestamp: 1665997960
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: c9b63_00003
  warmup_time: 0.004579782485961914
  


[2m[36m(RayTrainWorker pid=20325)[0m 2022-10-17 18:12:41,879	INFO config.py:71 -- Setting up process group for: env:// [rank=0, world_size=3]
[2m[36m(RayTrainWorker pid=20325)[0m 2022-10-17 18:12:44,994	INFO train_loop_utils.py:300 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=20325)[0m 2022-10-17 18:12:44,995	INFO train_loop_utils.py:347 -- Wrapping provided model in DDP.


Result for TorchTrainer_c9b63_00004:
  _time_this_iter_s: 0.28025007247924805
  _timestamp: 1665997965
  _training_iteration: 1
  date: 2022-10-17_18-12-45
  done: false
  experiment_id: c64bf9308ec74c7aa368de807d0b0b0d
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 1
  loss: 50.0
  node_ip: 127.0.0.1
  pid: 20319
  should_checkpoint: true
  time_since_restore: 7.380793809890747
  time_this_iter_s: 7.380793809890747
  time_total_s: 7.380793809890747
  timestamp: 1665997965
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: c9b63_00004
  warmup_time: 0.004536867141723633
  


[2m[36m(RayTrainWorker pid=20325)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20327)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)
[2m[36m(RayTrainWorker pid=20326)[0m   return torch.as_tensor(ndarray, dtype=dtype, device=device)


Result for TorchTrainer_c9b63_00004:
  _time_this_iter_s: 0.08266305923461914
  _timestamp: 1665997966
  _training_iteration: 20
  date: 2022-10-17_18-12-46
  done: true
  experiment_id: c64bf9308ec74c7aa368de807d0b0b0d
  experiment_tag: 4_lr=0.0047
  hostname: YONGJINs-MacBook-Pro.local
  iterations_since_restore: 20
  loss: 50.0
  node_ip: 127.0.0.1
  pid: 20319
  should_checkpoint: true
  time_since_restore: 8.904445886611938
  time_this_iter_s: 0.07838678359985352
  time_total_s: 8.904445886611938
  timestamp: 1665997966
  timesteps_since_restore: 0
  training_iteration: 20
  trial_id: c9b63_00004
  warmup_time: 0.004536867141723633
  


2022-10-17 18:12:47,035	INFO tune.py:758 -- Total run time: 33.58 seconds (33.42 seconds for the tuning loop).


best result: Result(metrics={'loss': 0.38084739446640015, '_timestamp': 1665997941, '_time_this_iter_s': 0.09299302101135254, '_training_iteration': 20, 'should_checkpoint': True, 'done': True, 'trial_id': 'c9b63_00000', 'experiment_tag': '0_lr=0.0002'}, error=None, log_dir=PosixPath('/Users/yjkim/ray_results/TorchTrainer_2022-10-17_18-12-13/TorchTrainer_c9b63_00000_0_lr=0.0002_2022-10-17_18-12-13'))


In [20]:
from ray.train.batch_predictor import BatchPredictor
from ray.train.torch import TorchPredictor

In [None]:
# Batch inference
# Use the checkpoint of the best tuning trial.
checkpoint = best_result.checkpoint

# Build a batch predictor from that checkpoint using the Torch predictor class.
batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint, TorchPredictor
)