In [1]:
import os
os.environ['http_proxy'] = 'http://proxygate2.ctripcorp.com:8080'
os.environ['https_proxy'] = 'http://proxygate2.ctripcorp.com:8080'
CUDA_VISIBLE_DEVICES=1

In [2]:
# Adapted from Hugging Face tutorial: https://huggingface.co/docs/transformers/training

import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

# Datasets
dataset = load_dataset("yelp_review_full")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

small_train_dataset = dataset["train"].select(range(1000)).map(tokenize_function, batched=True)
small_eval_dataset = dataset["test"].select(range(1000)).map(tokenize_function, batched=True)

# Model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Hugging Face Trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", report_to="none")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# Start Training
trainer.train()

[2023-10-30 07:50:36,485] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.455955,0.372
2,No log,1.095957,0.512
3,No log,1.055615,0.525




TrainOutput(global_step=189, training_loss=1.2343917120070684, metrics={'train_runtime': 102.3268, 'train_samples_per_second': 29.318, 'train_steps_per_second': 1.847, 'total_flos': 789354427392000.0, 'train_loss': 1.2343917120070684, 'epoch': 3.0})

In [2]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

import ray.train.huggingface.transformers
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer

# [1] Encapsulate data preprocessing, training, and evaluation
# logic in a training function
def train_func(config):
    # Datasets
    dataset = load_dataset("yelp_review_full")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    small_train_dataset = dataset["train"].select(range(1000)).map(tokenize_function, batched=True)
    small_eval_dataset = dataset["test"].select(range(1000)).map(tokenize_function, batched=True)

    # Model
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

    # Evaluation Metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    # Hugging Face Trainer
    training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", report_to="none")

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # [2] Report Metrics and Checkpoints to Ray Train
    callback = ray.train.huggingface.transformers.RayTrainReportCallback()
    trainer.add_callback(callback)

    # [3] Prepare Transformers Trainer
    trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)

    # Start Training
    trainer.train()

# [4] Define a Ray TorchTrainer to launch `train_func` on all workers
ray_trainer = TorchTrainer(train_func, scaling_config=ScalingConfig(num_workers=2, use_gpu=True))
ray_trainer.fit()

0,1
Current time:,2023-10-30 08:04:44
Running for:,00:01:52.62
Memory:,44.0/116.0 GiB

Trial name,status,loc
TorchTrainer_b97a0_00000,TERMINATED,10.60.134.127:756917


[2m[36m(TrainTrainable pid=756917)[0m [2023-10-30 08:02:57,057] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[2m[36m(TorchTrainer pid=756917)[0m Starting distributed worker processes: ['757221 (10.60.134.127)', '757222 (10.60.134.127)']
[2m[36m(RayTrainWorker pid=757221)[0m Setting up process group for: env:// [rank=0, world_size=2]


[2m[36m(RayTrainWorker pid=757221)[0m [2023-10-30 08:03:08,085] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2m[36m(RayTrainWorker pid=757222)[0m [2023-10-30 08:03:08,124] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2450.78 examples/s]
Map:   0%|          | 0/1000 [00:00<?, ? examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2944.89 examples/s]
[2m[36m(RayTrainWorker pid=757221)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=757221)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/189 [00:00<?, ?it/s]m 
Map:   0%|          | 0/1000 [00:00<?, ? examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2458.23 examples/s]
[2m[36m(RayTrainWorker pid=757222)[0m Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
[2m[3

[2m[36m(RayTrainWorker pid=757221)[0m {'eval_loss': 1.280670166015625, 'eval_accuracy': 0.404, 'eval_runtime': 6.0052, 'eval_samples_per_second': 166.524, 'eval_steps_per_second': 10.491, 'epoch': 1.0}


[2m[36m(RayTrainWorker pid=757221)[0m 
                                                [A
 33%|███▎      | 63/189 [00:25<00:37,  3.37it/s]
100%|██████████| 63/63 [00:05<00:00, 10.80it/s][A
                                               [A
 34%|███▍      | 64/189 [00:25<04:23,  2.10s/it]
 34%|███▍      | 65/189 [00:26<03:13,  1.56s/it]
 35%|███▍      | 66/189 [00:26<02:25,  1.18s/it]
 35%|███▌      | 67/189 [00:26<01:52,  1.09it/s]
 36%|███▌      | 68/189 [00:27<01:28,  1.36it/s]
 37%|███▋      | 69/189 [00:27<01:12,  1.66it/s]
 37%|███▋      | 70/189 [00:27<01:00,  1.95it/s]
 38%|███▊      | 71/189 [00:27<00:52,  2.23it/s]
 38%|███▊      | 72/189 [00:28<00:47,  2.48it/s]
 39%|███▊      | 73/189 [00:28<00:43,  2.68it/s]
 39%|███▉      | 74/189 [00:28<00:40,  2.85it/s]
 40%|███▉      | 75/189 [00:29<00:38,  2.98it/s]
 40%|████      | 76/189 [00:29<00:36,  3.07it/s]
 41%|████      | 77/189 [00:29<00:35,  3.15it/s]
 41%|████▏     | 78/189 [00:30<00:34,  3.20it/s]
 42%|████▏     | 79/1

[2m[36m(RayTrainWorker pid=757221)[0m {'eval_loss': 1.0747922658920288, 'eval_accuracy': 0.52, 'eval_runtime': 6.0137, 'eval_samples_per_second': 166.287, 'eval_steps_per_second': 10.476, 'epoch': 2.0}


[2m[36m(RayTrainWorker pid=757221)[0m 
                                                 A
 67%|██████▋   | 126/189 [00:50<00:18,  3.37it/s]
100%|██████████| 63/63 [00:05<00:00, 10.82it/s][A
                                               [A
 67%|██████▋   | 127/189 [00:50<02:10,  2.11s/it]
 68%|██████▊   | 128/189 [00:51<01:35,  1.56s/it]
 68%|██████▊   | 129/189 [00:51<01:11,  1.18s/it]
 69%|██████▉   | 130/189 [00:51<00:54,  1.09it/s]
 69%|██████▉   | 131/189 [00:51<00:42,  1.36it/s]
 70%|██████▉   | 132/189 [00:52<00:34,  1.66it/s]
 70%|███████   | 133/189 [00:52<00:28,  1.95it/s]
 71%|███████   | 134/189 [00:52<00:24,  2.22it/s]
 71%|███████▏  | 135/189 [00:53<00:21,  2.47it/s]
 72%|███████▏  | 136/189 [00:53<00:19,  2.68it/s]
 72%|███████▏  | 137/189 [00:53<00:18,  2.84it/s]
 73%|███████▎  | 138/189 [00:54<00:17,  2.98it/s]
 74%|███████▎  | 139/189 [00:54<00:16,  3.08it/s]
 74%|███████▍  | 140/189 [00:54<00:15,  3.15it/s]
 75%|███████▍  | 141/189 [00:54<00:14,  3.20it/s]
 75%|

[2m[36m(RayTrainWorker pid=757221)[0m {'eval_loss': 1.0825996398925781, 'eval_accuracy': 0.531, 'eval_runtime': 6.0244, 'eval_samples_per_second': 165.991, 'eval_steps_per_second': 10.457, 'epoch': 3.0}
[2m[36m(RayTrainWorker pid=757221)[0m {'train_runtime': 75.567, 'train_samples_per_second': 39.7, 'train_steps_per_second': 2.501, 'train_loss': 1.125707636434565, 'epoch': 3.0}
Trial TorchTrainer_b97a0_00000 completed. Last result: 


2023-10-30 08:04:44,721	INFO tune.py:1143 -- Total run time: 112.65 seconds (112.62 seconds for the tuning loop).


Result(
  metrics={},
  path='/root/ray_results/TorchTrainer_2023-10-30_08-02-48/TorchTrainer_b97a0_00000_0_2023-10-30_08-02-52',
  filesystem='local',
  checkpoint=None
)