In [1]:
from config import *
import datasets
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
import ray
from ray.train.huggingface.transformers import (
    RayTrainReportCallback,
    prepare_trainer,
)
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray import tune
from ray.tune.search.optuna import OptunaSearch

In [2]:
# Load the dataset stored on disk and split it into "train"/"test" parts.
# NOTE(review): no seed is passed to train_test_split, so the split is
# presumably different on every run -- confirm whether that is intended.
dataset = datasets.load_from_disk(
    "/data/lab/assignments/proj2/project2/Task1/squad_v2_tokenized_datasets"
).train_test_split(test_size=test_size)

# Wrap both splits as Ray Datasets (train first, then test).
ray_train_ds, ray_eval_ds = (
    ray.data.from_huggingface(dataset[split_name])
    for split_name in ("train", "test")
)

# Number of full batches of size `batch_size * num_workers` in the train split.
max_steps_per_epoch = ray_train_ds.count() // (num_workers * batch_size)

In [None]:
import os

# Checkpoint directory to restore the model and tokenizer from.
checkpoint_dir = "/root/ray_results/TorchTrainer_2024-05-29_07-12-59/TorchTrainer_e1617_00000_0_2024-05-29_07-12-59/checkpoint_000000/checkpoint"

# Fail fast when the checkpoint directory does not exist.
if not os.path.exists(checkpoint_dir):
    raise FileNotFoundError("Checkpoint directory not found: {}".format(checkpoint_dir))

# Load the model and the tokenizer from the checkpoint.
model = T5ForConditionalGeneration.from_pretrained(checkpoint_dir)
tokenizer = T5Tokenizer.from_pretrained(checkpoint_dir)
print("Model and tokenizer loaded from checkpoint:", checkpoint_dir)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model and tokenizer loaded from checkpoint: /root/ray_results/TorchTrainer_2024-05-29_07-12-59/TorchTrainer_e1617_00000_0_2024-05-29_07-12-59/checkpoint_000000/checkpoint


In [6]:
def train_func(config):
    """Fine-tune T5 for a single Ray Tune trial and return its final eval loss.

    Args:
        config: Hyperparameter dict supplied by Ray Tune for this trial.
            The key ``"lr"`` is used as the learning rate; when it is absent
            the module-level ``learning_rate`` is used instead.

    Returns:
        dict: ``{"eval_loss": <loss of the last evaluation run>}``.

    Raises:
        RuntimeError: If training finished without logging any ``eval_loss``.

    Note:
        ``model_dir``, ``test_size``, ``batch_size``, ``eval_steps`` and the
        other bare names are not defined in this cell; presumably they come
        from ``from config import *`` -- verify against that module.
    """
    # Tune passes the sampled hyperparameters here; tolerate a missing config.
    trial_params = config or {}

    model = T5ForConditionalGeneration.from_pretrained(model_dir)
    tokenizer = T5Tokenizer.from_pretrained(model_dir)

    dataset = datasets.load_from_disk("/data/lab/assignments/proj2/project2/Task1/squad_v2_tokenized_datasets")
    # NOTE(review): no seed is passed, so each trial presumably evaluates on a
    # different random split -- confirm whether losses should be comparable.
    dataset = dataset.train_test_split(test_size=test_size)

    training_args = Seq2SeqTrainingArguments(
        per_device_train_batch_size=batch_size,
        overwrite_output_dir=True,
        save_total_limit=save_total_limit,
        logging_dir="./logs",
        # The training log reports that max_steps overrides num_train_epochs.
        num_train_epochs=num_train_epochs,
        report_to="none",
        per_device_eval_batch_size=batch_size,
        save_strategy="steps",
        max_steps=20,
        evaluation_strategy="steps",
        output_dir="./checkpoints",
        warmup_steps=2*eval_steps,
        # Use the learning rate sampled for this trial; previously the global
        # value was always used, so the search space had no effect on training.
        learning_rate=trial_params.get("lr", learning_rate),
        logging_steps=eval_steps,
        predict_with_generate=True,
        gradient_accumulation_steps=gradient_accumulation_steps,
        save_steps=eval_steps,
        eval_steps=eval_steps,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
    )

    trainer.add_callback(RayTrainReportCallback())
    trainer = prepare_trainer(trainer)
    trainer.train()

    # Report the loss of the LAST evaluation. The previous code returned
    # log_history[1], i.e. the very first evaluation of the run.
    eval_losses = [
        entry["eval_loss"]
        for entry in trainer.state.log_history
        if "eval_loss" in entry
    ]
    if not eval_losses:
        raise RuntimeError("Training finished without logging any eval_loss")
    return {"eval_loss": eval_losses[-1]}

In [7]:
# Run the hyperparameter search: one call of train_func per sampled config,
# keeping the configuration with the smallest "eval_loss".
# NOTE(review): no per-trial resources or num_samples are set here, so Ray
# Tune's defaults apply -- confirm that this is intended.
tuner = tune.Tuner(
    trainable=train_func,
    param_space=search_space,
    tune_config=tune.TuneConfig(mode="min", metric="eval_loss"),
)
results = tuner.fit()

0,1
Current time:,2024-05-29 19:35:09
Running for:,03:31:50.69
Memory:,598.9/1007.4 GiB

Trial name,status,loc,lr,iter,total time (s)
train_func_f74df_00000,TERMINATED,172.17.0.38:487477,0.0001,11,12706.6


[36m(train_func pid=487477)[0m You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
[36m(train_func pid=487477)[0m Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[36m(train_func pid=487477)[0m max_steps is given, it will override any value given in num_train_epochs
  0%|          | 0/20 [00:00<?, ?it/s]
  5%|▌         | 1/20 [02:42<51:33, 162.82s/it]


[36m(train_func pid=487477)[0m {'loss': 1.1286, 'grad_norm': 3.6692216396331787, 'learning_rate': 5e-05, 'epoch': 0.0}


 10%|█         | 2/20 [05:16<47:12, 157.36s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:41,  2.87s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<10:46,  4.04s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:20,  4.66s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:22<13:13,  5.02s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:28<13:43,  5.24s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:34<14:02,  5.40s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:39<14:11,  5.49s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:45<14:16,  5.56s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:51<14:30,  5.69s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:57<14:33,  5.75s/it][A
[36m(train_func pid=48

[36m(train_func pid=487477)[0m {'eval_loss': 1.095703363418579, 'eval_runtime': 942.821, 'eval_samples_per_second': 2.765, 'eval_steps_per_second': 0.173, 'epoch': 0.0}


[36m(train_func pid=487477)[0m 
                                               t][A
 10%|█         | 2/20 [20:59<47:12, 157.36s/it]  
100%|██████████| 163/163 [15:37<00:00,  5.69s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000000)
 15%|█▌        | 3/20 [23:34<2:46:20, 587.09s/it]


[36m(train_func pid=487477)[0m {'loss': 1.3568, 'grad_norm': 3.956453800201416, 'learning_rate': 0.0001, 'epoch': 0.0}


 20%|██        | 4/20 [26:07<1:50:48, 415.56s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:52,  2.94s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:06,  4.17s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:39,  4.78s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:33,  5.15s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:07,  5.40s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:24,  5.54s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:40<14:29,  5.61s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:46<14:30,  5.65s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:28,  5.68s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:58<14:26,  5.70s/it][A
[36m(train_func pid=

[36m(train_func pid=487477)[0m {'eval_loss': 0.9346609115600586, 'eval_runtime': 958.082, 'eval_samples_per_second': 2.721, 'eval_steps_per_second': 0.17, 'epoch': 0.0}


[36m(train_func pid=487477)[0m 
                                                 [A
 20%|██        | 4/20 [42:05<1:50:48, 415.56s/it]
100%|██████████| 163/163 [15:52<00:00,  5.77s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000001)
 25%|██▌       | 5/20 [44:41<2:46:52, 667.47s/it]


[36m(train_func pid=487477)[0m {'loss': 1.0755, 'grad_norm': 2.4827215671539307, 'learning_rate': 8.75e-05, 'epoch': 0.01}


 30%|███       | 6/20 [47:14<1:54:54, 492.45s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<08:00,  2.99s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:04,  4.16s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:42,  4.80s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:34,  5.15s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:02,  5.37s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:21,  5.52s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:40<14:29,  5.61s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:46<14:31,  5.66s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:30,  5.69s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:58<14:27,  5.71s/it][A
[36m(train_func pid=

[36m(train_func pid=487477)[0m {'eval_loss': 0.810267448425293, 'eval_runtime': 953.0705, 'eval_samples_per_second': 2.735, 'eval_steps_per_second': 0.171, 'epoch': 0.01}


[36m(train_func pid=487477)[0m 
                                                 [A
 30%|███       | 6/20 [1:03:07<1:54:54, 492.45s/it]
100%|██████████| 163/163 [15:47<00:00,  5.82s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000002)
 35%|███▌      | 7/20 [1:05:43<2:30:21, 693.94s/it]


[36m(train_func pid=487477)[0m {'loss': 0.8239, 'grad_norm': 2.009735345840454, 'learning_rate': 7.500000000000001e-05, 'epoch': 0.01}


 40%|████      | 8/20 [1:08:16<1:44:21, 521.80s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:49,  2.91s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:03,  4.14s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:35,  4.75s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:36,  5.17s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:06,  5.39s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:20,  5.52s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:40<14:30,  5.62s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:46<14:35,  5.68s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:41,  5.76s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:58<14:50,  5.86s/it][A
[36m(train_func pi

[36m(train_func pid=487477)[0m {'eval_loss': 0.7447213530540466, 'eval_runtime': 963.4776, 'eval_samples_per_second': 2.706, 'eval_steps_per_second': 0.169, 'epoch': 0.01}


[36m(train_func pid=487477)[0m 
                                                   A
 40%|████      | 8/20 [1:24:19<1:44:21, 521.80s/it]
100%|██████████| 163/163 [15:57<00:00,  5.77s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000003)
 45%|████▌     | 9/20 [1:27:02<2:10:17, 710.71s/it]


[36m(train_func pid=487477)[0m {'loss': 0.9085, 'grad_norm': 1.8381211757659912, 'learning_rate': 6.25e-05, 'epoch': 0.01}


 50%|█████     | 10/20 [1:29:40<1:30:02, 540.25s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:51,  2.93s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:03,  4.15s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:39,  4.77s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:36,  5.17s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:09,  5.41s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:40,  5.65s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:41<14:46,  5.72s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:47<14:53,  5.80s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:53<15:00,  5.88s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:59<14:53,  5.88s/it][A
[36m(train_func p

[36m(train_func pid=487477)[0m {'eval_loss': 0.6959055066108704, 'eval_runtime': 966.2796, 'eval_samples_per_second': 2.698, 'eval_steps_per_second': 0.169, 'epoch': 0.01}


[36m(train_func pid=487477)[0m 
                                                    
 50%|█████     | 10/20 [1:45:47<1:30:02, 540.25s/it]
100%|██████████| 163/163 [16:00<00:00,  5.93s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000004)
 55%|█████▌    | 11/20 [1:48:26<1:47:53, 719.30s/it]


[36m(train_func pid=487477)[0m {'loss': 0.8445, 'grad_norm': 2.1341114044189453, 'learning_rate': 5e-05, 'epoch': 0.01}


 60%|██████    | 12/20 [1:51:04<1:13:09, 548.71s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:06<08:21,  3.12s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:12<11:51,  4.45s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:18<13:32,  5.11s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:25<14:30,  5.51s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:31<14:53,  5.69s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:37<15:14,  5.86s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:43<15:23,  5.96s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:49<15:32,  6.06s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:55<15:32,  6.10s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [01:02<15:30,  6.12s/it][A
[36m(train_func p

[36m(train_func pid=487477)[0m {'eval_loss': 0.6620151400566101, 'eval_runtime': 977.9725, 'eval_samples_per_second': 2.666, 'eval_steps_per_second': 0.167, 'epoch': 0.01}


[36m(train_func pid=487477)[0m 
                                                    
 60%|██████    | 12/20 [2:07:22<1:13:09, 548.71s/it]
100%|██████████| 163/163 [16:11<00:00,  5.67s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000005)
 65%|██████▌   | 13/20 [2:10:03<1:24:52, 727.51s/it]


[36m(train_func pid=487477)[0m {'loss': 0.8346, 'grad_norm': 2.3615574836730957, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.01}


 70%|███████   | 14/20 [2:12:37<55:26, 554.34s/it]  
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:52,  2.93s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:01,  4.13s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:35,  4.75s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:25,  5.10s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:15,  5.45s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:28,  5.57s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:41<14:35,  5.65s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:46<14:39,  5.71s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:37,  5.74s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:58<14:35,  5.76s/it][A
[36m(train_func p

[36m(train_func pid=487477)[0m {'eval_loss': 0.6375592947006226, 'eval_runtime': 951.5974, 'eval_samples_per_second': 2.74, 'eval_steps_per_second': 0.171, 'epoch': 0.01}


[36m(train_func pid=487477)[0m 
                                                  [A
 70%|███████   | 14/20 [2:28:29<55:26, 554.34s/it]
100%|██████████| 163/163 [15:45<00:00,  5.75s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000006)
 75%|███████▌  | 15/20 [2:31:09<1:00:12, 722.42s/it]


[36m(train_func pid=487477)[0m {'loss': 0.781, 'grad_norm': 2.0171899795532227, 'learning_rate': 2.5e-05, 'epoch': 0.02}


 80%|████████  | 16/20 [2:33:44<36:46, 551.63s/it]  
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:53,  2.94s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:05,  4.16s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:45,  4.82s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:48,  5.25s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:19,  5.47s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:34,  5.61s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:41<14:41,  5.69s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:47<14:38,  5.71s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:35,  5.72s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:58<14:32,  5.74s/it][A
[36m(train_func p

[36m(train_func pid=487477)[0m {'eval_loss': 0.6211398839950562, 'eval_runtime': 954.4857, 'eval_samples_per_second': 2.731, 'eval_steps_per_second': 0.171, 'epoch': 0.02}


[36m(train_func pid=487477)[0m 
                                                  [A
 80%|████████  | 16/20 [2:49:39<36:46, 551.63s/it]
100%|██████████| 163/163 [15:48<00:00,  5.69s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000007)
 85%|████████▌ | 17/20 [2:52:19<36:02, 720.94s/it]


[36m(train_func pid=487477)[0m {'loss': 0.7671, 'grad_norm': 2.1352009773254395, 'learning_rate': 1.25e-05, 'epoch': 0.02}


 90%|█████████ | 18/20 [2:54:52<18:20, 550.30s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:46,  2.90s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<10:57,  4.11s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:35,  4.75s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:26,  5.11s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:28<13:56,  5.33s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:34<14:13,  5.47s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:40<14:26,  5.59s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:46<14:31,  5.66s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:31,  5.70s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:57<14:28,  5.72s/it][A
[36m(train_func pid

[36m(train_func pid=487477)[0m {'eval_loss': 0.6111030578613281, 'eval_runtime': 947.186, 'eval_samples_per_second': 2.752, 'eval_steps_per_second': 0.172, 'epoch': 0.02}


[36m(train_func pid=487477)[0m 
                                                  [A
 90%|█████████ | 18/20 [3:10:39<18:20, 550.30s/it]
100%|██████████| 163/163 [15:41<00:00,  5.59s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000008)
 95%|█████████▌| 19/20 [3:13:19<11:57, 717.40s/it]


[36m(train_func pid=487477)[0m {'loss': 0.828, 'grad_norm': 2.116673231124878, 'learning_rate': 0.0, 'epoch': 0.02}


100%|██████████| 20/20 [3:15:52<00:00, 548.07s/it]
[36m(train_func pid=487477)[0m 
  0%|          | 0/163 [00:00<?, ?it/s][A
[36m(train_func pid=487477)[0m 
  1%|          | 2/163 [00:05<07:48,  2.91s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 3/163 [00:11<11:08,  4.18s/it][A
[36m(train_func pid=487477)[0m 
  2%|▏         | 4/163 [00:17<12:43,  4.80s/it][A
[36m(train_func pid=487477)[0m 
  3%|▎         | 5/163 [00:23<13:32,  5.14s/it][A
[36m(train_func pid=487477)[0m 
  4%|▎         | 6/163 [00:29<14:03,  5.37s/it][A
[36m(train_func pid=487477)[0m 
  4%|▍         | 7/163 [00:35<14:17,  5.50s/it][A
[36m(train_func pid=487477)[0m 
  5%|▍         | 8/163 [00:41<14:36,  5.66s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 9/163 [00:47<14:48,  5.77s/it][A
[36m(train_func pid=487477)[0m 
  6%|▌         | 10/163 [00:52<14:48,  5.81s/it][A
[36m(train_func pid=487477)[0m 
  7%|▋         | 11/163 [00:58<14:43,  5.81s/it][A
[36m(train_func pid

[36m(train_func pid=487477)[0m {'eval_loss': 0.6069474220275879, 'eval_runtime': 949.8699, 'eval_samples_per_second': 2.745, 'eval_steps_per_second': 0.172, 'epoch': 0.02}


[36m(train_func pid=487477)[0m 
                                                  [A
100%|██████████| 20/20 [3:31:42<00:00, 548.07s/it]
100%|██████████| 163/163 [15:44<00:00,  5.72s/it][A
                                                 [A
[36m(train_func pid=487477)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_func_2024-05-29_16-03-19/train_func_f74df_00000_0_lr=0.0001_2024-05-29_16-03-19/checkpoint_000009)
2024-05-29 19:35:09,807	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_func_2024-05-29_16-03-19' in 0.0032s.
2024-05-29 19:35:09,811	INFO tune.py:1039 -- Total run time: 12710.70 seconds (12710.69 seconds for the tuning loop).


In [8]:
# Pick the best trial according to the metric/mode given in TuneConfig.
best_result = results.get_best_result()
print("Best config is:", best_result.config)

[36m(train_func pid=487477)[0m {'train_runtime': 12705.7604, 'train_samples_per_second': 0.201, 'train_steps_per_second': 0.002, 'train_loss': 0.9348451197147369, 'epoch': 0.02}Best config is: {'lr': 0.0001}



100%|██████████| 20/20 [3:31:45<00:00, 635.29s/it]
