In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Define a simple model
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.layer1 = nn.Linear(10000, 10000)
        self.layer2 = nn.Linear(10000, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)
        return x

model = MyModel().cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Dummy data
inputs = torch.randn(64, 10000).to("cuda")
targets = torch.randn(64, 1).to("cuda")

# Training loop
model.train()
for epoch in tqdm(range(1000)):
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = nn.MSELoss()(outputs, targets)
    loss.backward()
    optimizer.step()

100%|██████████| 1000/1000 [00:07<00:00, 132.66it/s]


In [1]:
import deepspeed
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Define a simple model
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.layer1 = nn.Linear(10000, 10000)
        self.layer2 = nn.Linear(10000, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)
        return x

model = MyModel()
# Configure DeepSpeed
config = {
    "train_batch_size": 64,
    "gradient_accumulation_steps": 1,
    "fp16": {
        "enabled": False
    },
   "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.001
        }
    }
}

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Initialize DeepSpeed
model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                                      model_parameters=model.parameters(),
                                                      config=config,
                                                     optimizer=optimizer,
                                                     args=None
                                                     )
# Dummy data
inputs = torch.randn(64, 10000).to("cuda")
targets = torch.randn(64, 1).to("cuda")
# Training loop with DeepSpeed
model_engine.train()
for epoch in tqdm(range(1000)):
    optimizer.zero_grad()
    outputs = model_engine(inputs)
    loss = nn.MSELoss()(outputs, targets)
    model_engine.backward(loss)
    model_engine.step()

[2024-04-15 18:45:58,364] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-04-15 18:46:03,511] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown
[2024-04-15 18:46:03,512] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-04-15 18:46:03,513] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2024-04-15 18:46:05,364] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=10.151.0.1, master_port=29500
[2024-04-15 18:46:05,365] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
[2024-04-15 18:46:08,403] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2024-04-15 18:46:08,404] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2024-04-15 18

  1%|          | 8/1000 [00:07<10:52,  1.52it/s]  

[2024-04-15 18:46:15,938] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:15,952] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=4401.916253289932, CurrSamplesPerSec=4419.418109976951, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  2%|▏         | 15/1000 [00:07<04:50,  3.39it/s]

[2024-04-15 18:46:16,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:16,098] [INFO] [timer.py:260:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=4426.507355925279, CurrSamplesPerSec=4434.1645908355085, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  3%|▎         | 29/1000 [00:07<01:46,  9.16it/s]

[2024-04-15 18:46:16,231] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:16,244] [INFO] [timer.py:260:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=4447.327658934929, CurrSamplesPerSec=4463.435193960859, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  4%|▎         | 36/1000 [00:07<01:12, 13.25it/s]

[2024-04-15 18:46:16,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:16,393] [INFO] [timer.py:260:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=4450.387960514088, CurrSamplesPerSec=4424.152550473836, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  4%|▍         | 43/1000 [00:07<00:52, 18.18it/s]

[2024-04-15 18:46:16,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:16,542] [INFO] [timer.py:260:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=4449.628983415207, CurrSamplesPerSec=4423.277736582793, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  6%|▌         | 57/1000 [00:07<00:31, 29.99it/s]

[2024-04-15 18:46:16,679] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:16,714] [INFO] [timer.py:260:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=4449.074650579508, CurrSamplesPerSec=4433.432252097509, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  6%|▋         | 64/1000 [00:08<00:26, 35.03it/s]

[2024-04-15 18:46:16,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:16,860] [INFO] [timer.py:260:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=4453.989389157987, CurrSamplesPerSec=4467.223431519387, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  8%|▊         | 78/1000 [00:08<00:19, 46.97it/s]

[2024-04-15 18:46:16,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,007] [INFO] [timer.py:260:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=4456.622477394917, CurrSamplesPerSec=4433.725158562368, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


  8%|▊         | 85/1000 [00:08<00:17, 51.61it/s]

[2024-04-15 18:46:17,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,156] [INFO] [timer.py:260:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=4455.669523648567, CurrSamplesPerSec=4394.098150270093, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 10%|▉         | 99/1000 [00:08<00:15, 58.56it/s]

[2024-04-15 18:46:17,293] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,306] [INFO] [timer.py:260:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=4453.8742578900765, CurrSamplesPerSec=4428.897145685531, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 11%|█         | 106/1000 [00:08<00:14, 60.78it/s]

[2024-04-15 18:46:17,441] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,454] [INFO] [timer.py:260:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=4453.892062321549, CurrSamplesPerSec=4458.83852964138, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 11%|█▏        | 113/1000 [00:08<00:14, 62.77it/s]

[2024-04-15 18:46:17,587] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,600] [INFO] [timer.py:260:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=4456.200171070995, CurrSamplesPerSec=4462.322228871601, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 13%|█▎        | 127/1000 [00:08<00:13, 65.70it/s]

[2024-04-15 18:46:17,733] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,746] [INFO] [timer.py:260:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=4458.70545095166, CurrSamplesPerSec=4470.050223139945, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 13%|█▎        | 134/1000 [00:09<00:13, 66.51it/s]

[2024-04-15 18:46:17,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:17,893] [INFO] [timer.py:260:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=4459.705454553772, CurrSamplesPerSec=4428.9702189443815, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 15%|█▍        | 148/1000 [00:09<00:12, 66.96it/s]

[2024-04-15 18:46:18,030] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,042] [INFO] [timer.py:260:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=4459.183851938086, CurrSamplesPerSec=4430.8709704042385, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 16%|█▌        | 155/1000 [00:09<00:12, 66.89it/s]

[2024-04-15 18:46:18,178] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,191] [INFO] [timer.py:260:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=4458.3815397145545, CurrSamplesPerSec=4427.217125987499, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 17%|█▋        | 169/1000 [00:09<00:12, 67.33it/s]

[2024-04-15 18:46:18,326] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,339] [INFO] [timer.py:260:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=4458.59695499163, CurrSamplesPerSec=4452.182774119716, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 18%|█▊        | 176/1000 [00:09<00:12, 67.65it/s]

[2024-04-15 18:46:18,472] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,485] [INFO] [timer.py:260:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=4460.227442749078, CurrSamplesPerSec=4461.209818683419, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 18%|█▊        | 183/1000 [00:09<00:12, 67.87it/s]

[2024-04-15 18:46:18,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,631] [INFO] [timer.py:260:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=4461.492996729857, CurrSamplesPerSec=4420.436979218127, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 20%|█▉        | 197/1000 [00:10<00:11, 68.37it/s]

[2024-04-15 18:46:18,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,776] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=4462.856157998056, CurrSamplesPerSec=4461.135677723858, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 20%|██        | 204/1000 [00:10<00:11, 68.42it/s]

[2024-04-15 18:46:18,910] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:18,923] [INFO] [timer.py:260:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=4463.34385284115, CurrSamplesPerSec=4423.642200322995, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 22%|██▏       | 218/1000 [00:10<00:11, 67.89it/s]

[2024-04-15 18:46:19,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,073] [INFO] [timer.py:260:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=4462.702344433428, CurrSamplesPerSec=4425.538380374572, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 22%|██▎       | 225/1000 [00:10<00:11, 67.50it/s]

[2024-04-15 18:46:19,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,222] [INFO] [timer.py:260:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=4461.84662189993, CurrSamplesPerSec=4424.51715839789, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 24%|██▍       | 239/1000 [00:10<00:11, 67.50it/s]

[2024-04-15 18:46:19,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,371] [INFO] [timer.py:260:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=4461.53507116862, CurrSamplesPerSec=4463.2125565309925, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 25%|██▍       | 246/1000 [00:10<00:11, 67.63it/s]

[2024-04-15 18:46:19,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,517] [INFO] [timer.py:260:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=4462.418544229307, CurrSamplesPerSec=4465.217093334664, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 25%|██▌       | 253/1000 [00:10<00:11, 67.84it/s]

[2024-04-15 18:46:19,651] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,664] [INFO] [timer.py:260:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=4462.548803115573, CurrSamplesPerSec=4424.371308016877, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 27%|██▋       | 267/1000 [00:11<00:10, 67.61it/s]

[2024-04-15 18:46:19,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,813] [INFO] [timer.py:260:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=4461.908192514387, CurrSamplesPerSec=4426.925078747299, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 27%|██▋       | 274/1000 [00:11<00:10, 67.34it/s]

[2024-04-15 18:46:19,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:19,962] [INFO] [timer.py:260:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=4461.448526641, CurrSamplesPerSec=4423.933814562115, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 29%|██▉       | 288/1000 [00:11<00:10, 67.49it/s]

[2024-04-15 18:46:20,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,110] [INFO] [timer.py:260:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=4461.51619113925, CurrSamplesPerSec=4466.777422790203, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 30%|██▉       | 295/1000 [00:11<00:10, 67.69it/s]

[2024-04-15 18:46:20,243] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,256] [INFO] [timer.py:260:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=4462.367533389199, CurrSamplesPerSec=4472.135412501666, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 31%|███       | 309/1000 [00:11<00:10, 68.24it/s]

[2024-04-15 18:46:20,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,402] [INFO] [timer.py:260:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=4463.166056062376, CurrSamplesPerSec=4469.9757880538855, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 32%|███▏      | 316/1000 [00:11<00:10, 68.10it/s]

[2024-04-15 18:46:20,537] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,550] [INFO] [timer.py:260:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=4463.2526949714575, CurrSamplesPerSec=4428.458756763891, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 32%|███▏      | 323/1000 [00:11<00:09, 67.74it/s]

[2024-04-15 18:46:20,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,699] [INFO] [timer.py:260:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=4462.87909445252, CurrSamplesPerSec=4431.675625701644, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 34%|███▎      | 337/1000 [00:12<00:09, 67.55it/s]

[2024-04-15 18:46:20,835] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,848] [INFO] [timer.py:260:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=4462.450400115207, CurrSamplesPerSec=4438.343545906979, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 34%|███▍      | 344/1000 [00:12<00:09, 67.56it/s]

[2024-04-15 18:46:20,980] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:20,994] [INFO] [timer.py:260:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=4463.005933042828, CurrSamplesPerSec=4459.801561721216, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 36%|███▌      | 358/1000 [00:12<00:09, 68.12it/s]

[2024-04-15 18:46:21,127] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:21,140] [INFO] [timer.py:260:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=4463.693307049071, CurrSamplesPerSec=4473.327822956939, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 36%|███▋      | 365/1000 [00:12<00:09, 68.22it/s]

[2024-04-15 18:46:21,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:21,286] [INFO] [timer.py:260:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=4464.369747083404, CurrSamplesPerSec=4465.514214895281, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 38%|███▊      | 379/1000 [00:12<00:09, 68.53it/s]

[2024-04-15 18:46:21,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:21,432] [INFO] [timer.py:260:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=4464.906062002017, CurrSamplesPerSec=4428.385700381081, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 39%|███▊      | 386/1000 [00:12<00:08, 68.26it/s]

[2024-04-15 18:46:21,567] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:21,580] [INFO] [timer.py:260:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=4464.810146915734, CurrSamplesPerSec=4431.529303001288, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 39%|███▉      | 393/1000 [00:12<00:08, 67.79it/s]

[2024-04-15 18:46:21,716] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:21,729] [INFO] [timer.py:260:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=4464.341822412421, CurrSamplesPerSec=4430.066607254844, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 41%|████      | 407/1000 [00:13<00:08, 67.52it/s]

[2024-04-15 18:46:21,865] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:21,878] [INFO] [timer.py:260:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=4463.827591928404, CurrSamplesPerSec=4422.986208828327, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 41%|████▏     | 414/1000 [00:13<00:08, 67.36it/s]

[2024-04-15 18:46:22,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,025] [INFO] [timer.py:260:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=4464.043210256214, CurrSamplesPerSec=4463.2125565309925, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 43%|████▎     | 428/1000 [00:13<00:08, 68.01it/s]

[2024-04-15 18:46:22,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,171] [INFO] [timer.py:260:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=4464.49250469763, CurrSamplesPerSec=4468.561992275935, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 44%|████▎     | 435/1000 [00:13<00:08, 67.86it/s]

[2024-04-15 18:46:22,307] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,320] [INFO] [timer.py:260:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=4464.370717184436, CurrSamplesPerSec=4432.40738416829, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 45%|████▍     | 449/1000 [00:13<00:08, 67.51it/s]

[2024-04-15 18:46:22,457] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,470] [INFO] [timer.py:260:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=4463.800837987925, CurrSamplesPerSec=4415.855763378243, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 46%|████▌     | 456/1000 [00:13<00:08, 67.24it/s]

[2024-04-15 18:46:22,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,619] [INFO] [timer.py:260:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=4463.323710433582, CurrSamplesPerSec=4440.986946811151, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 46%|████▋     | 463/1000 [00:13<00:07, 67.14it/s]

[2024-04-15 18:46:22,752] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,766] [INFO] [timer.py:260:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=4463.589499170635, CurrSamplesPerSec=4461.135677723858, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 48%|████▊     | 477/1000 [00:14<00:07, 67.91it/s]

[2024-04-15 18:46:22,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:22,912] [INFO] [timer.py:260:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=4463.9982521431075, CurrSamplesPerSec=4463.732078420939, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 48%|████▊     | 484/1000 [00:14<00:07, 68.04it/s]

[2024-04-15 18:46:23,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,058] [INFO] [timer.py:260:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=4464.481917905392, CurrSamplesPerSec=4456.321795574148, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 50%|████▉     | 498/1000 [00:14<00:07, 67.91it/s]

[2024-04-15 18:46:23,194] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,207] [INFO] [timer.py:260:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=4464.2913777012745, CurrSamplesPerSec=4427.217125987499, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 50%|█████     | 505/1000 [00:14<00:07, 67.52it/s]

[2024-04-15 18:46:23,343] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,356] [INFO] [timer.py:260:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=4463.86650760249, CurrSamplesPerSec=4423.933814562115, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 52%|█████▏    | 519/1000 [00:14<00:07, 67.42it/s]

[2024-04-15 18:46:23,492] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,505] [INFO] [timer.py:260:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=4463.521733422178, CurrSamplesPerSec=4459.357033689947, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 53%|█████▎    | 526/1000 [00:14<00:07, 67.61it/s]

[2024-04-15 18:46:23,638] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,651] [INFO] [timer.py:260:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=4463.866336018582, CurrSamplesPerSec=4463.509411373462, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 53%|█████▎    | 533/1000 [00:14<00:06, 67.84it/s]

[2024-04-15 18:46:23,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,797] [INFO] [timer.py:260:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=4464.275595184349, CurrSamplesPerSec=4466.108576657516, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 55%|█████▍    | 547/1000 [00:15<00:06, 68.34it/s]

[2024-04-15 18:46:23,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:23,943] [INFO] [timer.py:260:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=4464.690721379143, CurrSamplesPerSec=4461.506407166719, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 55%|█████▌    | 554/1000 [00:15<00:06, 68.36it/s]

[2024-04-15 18:46:24,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,089] [INFO] [timer.py:260:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=4465.038865996207, CurrSamplesPerSec=4428.751006401372, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 57%|█████▋    | 568/1000 [00:15<00:06, 67.98it/s]

[2024-04-15 18:46:24,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,239] [INFO] [timer.py:260:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=4464.804562722545, CurrSamplesPerSec=4426.487080124664, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 57%|█████▊    | 575/1000 [00:15<00:06, 67.50it/s]

[2024-04-15 18:46:24,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,388] [INFO] [timer.py:260:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=4464.398799850204, CurrSamplesPerSec=4424.808888009758, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 59%|█████▉    | 589/1000 [00:15<00:06, 67.34it/s]

[2024-04-15 18:46:24,524] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,537] [INFO] [timer.py:260:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=4464.024457325931, CurrSamplesPerSec=4464.251721270581, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 60%|█████▉    | 596/1000 [00:15<00:05, 67.55it/s]

[2024-04-15 18:46:24,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,683] [INFO] [timer.py:260:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=4464.262398431221, CurrSamplesPerSec=4460.394403642285, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 60%|██████    | 603/1000 [00:15<00:05, 67.78it/s]

[2024-04-15 18:46:24,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,830] [INFO] [timer.py:260:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=4464.459441317652, CurrSamplesPerSec=4428.0934989525085, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 62%|██████▏   | 617/1000 [00:16<00:05, 67.73it/s]

[2024-04-15 18:46:24,966] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:24,979] [INFO] [timer.py:260:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=4464.269861706031, CurrSamplesPerSec=4426.998086945049, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 62%|██████▏   | 624/1000 [00:16<00:05, 67.42it/s]

[2024-04-15 18:46:25,115] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:25,128] [INFO] [timer.py:260:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=4463.980536773526, CurrSamplesPerSec=4424.51715839789, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 64%|██████▍   | 638/1000 [00:16<00:05, 67.34it/s]

[2024-04-15 18:46:25,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:25,277] [INFO] [timer.py:260:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=4463.78582896777, CurrSamplesPerSec=4462.025531914894, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 64%|██████▍   | 645/1000 [00:16<00:05, 67.57it/s]

[2024-04-15 18:46:25,410] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:25,423] [INFO] [timer.py:260:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=4464.107593030121, CurrSamplesPerSec=4462.322228871601, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 66%|██████▌   | 659/1000 [00:16<00:05, 68.14it/s]

[2024-04-15 18:46:25,556] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:25,569] [INFO] [timer.py:260:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=4464.421539612934, CurrSamplesPerSec=4449.52603225646, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 67%|██████▋   | 666/1000 [00:16<00:04, 68.11it/s]

[2024-04-15 18:46:25,704] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:25,717] [INFO] [timer.py:260:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=4464.55593961603, CurrSamplesPerSec=4427.290143818445, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 67%|██████▋   | 673/1000 [00:17<00:04, 67.59it/s]

[2024-04-15 18:46:25,853] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:25,867] [INFO] [timer.py:260:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=4464.286872152763, CurrSamplesPerSec=4424.07963609994, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 69%|██████▊   | 687/1000 [00:17<00:04, 67.41it/s]

[2024-04-15 18:46:26,003] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,016] [INFO] [timer.py:260:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=4463.987784554952, CurrSamplesPerSec=4424.8818264238025, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 69%|██████▉   | 694/1000 [00:17<00:04, 67.26it/s]

[2024-04-15 18:46:26,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,163] [INFO] [timer.py:260:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=4464.078670807391, CurrSamplesPerSec=4462.841543500307, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 71%|███████   | 708/1000 [00:17<00:04, 67.94it/s]

[2024-04-15 18:46:26,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,310] [INFO] [timer.py:260:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=4464.211558923954, CurrSamplesPerSec=4388.351414091875, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 72%|███████▏  | 715/1000 [00:17<00:04, 67.90it/s]

[2024-04-15 18:46:26,443] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,456] [INFO] [timer.py:260:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=4464.4420942622355, CurrSamplesPerSec=4460.320289782829, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 73%|███████▎  | 729/1000 [00:17<00:03, 68.34it/s]

[2024-04-15 18:46:26,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,602] [INFO] [timer.py:260:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=4464.756490238873, CurrSamplesPerSec=4461.5805604494235, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 74%|███████▎  | 736/1000 [00:17<00:03, 68.39it/s]

[2024-04-15 18:46:26,735] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,748] [INFO] [timer.py:260:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=4464.979082398855, CurrSamplesPerSec=4429.481799280552, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 74%|███████▍  | 743/1000 [00:18<00:03, 68.06it/s]

[2024-04-15 18:46:26,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:26,897] [INFO] [timer.py:260:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=4464.857462312828, CurrSamplesPerSec=4433.725158562368, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 76%|███████▌  | 757/1000 [00:18<00:03, 67.67it/s]

[2024-04-15 18:46:27,034] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,046] [INFO] [timer.py:260:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=4464.5905434156, CurrSamplesPerSec=4427.071097550919, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 76%|███████▋  | 764/1000 [00:18<00:03, 67.38it/s]

[2024-04-15 18:46:27,182] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,195] [INFO] [timer.py:260:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=4464.433663524056, CurrSamplesPerSec=4462.6189652879375, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 78%|███████▊  | 778/1000 [00:18<00:03, 67.82it/s]

[2024-04-15 18:46:27,328] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,341] [INFO] [timer.py:260:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=4464.603402814454, CurrSamplesPerSec=4462.099702455161, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 78%|███████▊  | 785/1000 [00:18<00:03, 67.93it/s]

[2024-04-15 18:46:27,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,489] [INFO] [timer.py:260:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=4464.3666694947915, CurrSamplesPerSec=4416.800315914176, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 80%|███████▉  | 799/1000 [00:18<00:02, 67.49it/s]

[2024-04-15 18:46:27,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,639] [INFO] [timer.py:260:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=4464.076819013696, CurrSamplesPerSec=4426.341099843351, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 81%|████████  | 806/1000 [00:18<00:02, 67.25it/s]

[2024-04-15 18:46:27,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,788] [INFO] [timer.py:260:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=4463.801069295304, CurrSamplesPerSec=4443.412831887705, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 81%|████████▏ | 813/1000 [00:19<00:02, 67.13it/s]

[2024-04-15 18:46:27,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:27,936] [INFO] [timer.py:260:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=4463.723730274244, CurrSamplesPerSec=4458.172058725835, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 83%|████████▎ | 827/1000 [00:19<00:02, 67.74it/s]

[2024-04-15 18:46:28,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,082] [INFO] [timer.py:260:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=4463.95306426139, CurrSamplesPerSec=4464.177479170477, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 83%|████████▎ | 834/1000 [00:19<00:02, 67.91it/s]

[2024-04-15 18:46:28,214] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,228] [INFO] [timer.py:260:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=4464.242064342195, CurrSamplesPerSec=4464.177479170477, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 85%|████████▍ | 848/1000 [00:19<00:02, 68.26it/s]

[2024-04-15 18:46:28,362] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,375] [INFO] [timer.py:260:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=4464.36300149548, CurrSamplesPerSec=4436.436379261903, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 86%|████████▌ | 855/1000 [00:19<00:02, 67.82it/s]

[2024-04-15 18:46:28,511] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,524] [INFO] [timer.py:260:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=4464.243414339487, CurrSamplesPerSec=4433.725158562368, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 87%|████████▋ | 869/1000 [00:19<00:01, 67.54it/s]

[2024-04-15 18:46:28,661] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,674] [INFO] [timer.py:260:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=4463.889686074926, CurrSamplesPerSec=4339.753552663487, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 88%|████████▊ | 876/1000 [00:20<00:01, 67.32it/s]

[2024-04-15 18:46:28,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,821] [INFO] [timer.py:260:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=4463.816535401321, CurrSamplesPerSec=4459.06073089701, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 88%|████████▊ | 883/1000 [00:20<00:01, 67.57it/s]

[2024-04-15 18:46:28,954] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:28,967] [INFO] [timer.py:260:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=4464.03677710018, CurrSamplesPerSec=4457.949946026738, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 90%|████████▉ | 897/1000 [00:20<00:01, 68.13it/s]

[2024-04-15 18:46:29,100] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,113] [INFO] [timer.py:260:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=4464.303559924767, CurrSamplesPerSec=4464.400212879191, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 90%|█████████ | 904/1000 [00:20<00:01, 68.22it/s]

[2024-04-15 18:46:29,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,259] [INFO] [timer.py:260:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=4464.568913294592, CurrSamplesPerSec=4461.951363840361, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 92%|█████████▏| 918/1000 [00:20<00:01, 68.40it/s]

[2024-04-15 18:46:29,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,406] [INFO] [timer.py:260:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=4464.654272901021, CurrSamplesPerSec=4424.590087194449, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 92%|█████████▎| 925/1000 [00:20<00:01, 67.88it/s]

[2024-04-15 18:46:29,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,555] [INFO] [timer.py:260:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=4464.5123881977115, CurrSamplesPerSec=4428.458756763891, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 94%|█████████▍| 939/1000 [00:20<00:00, 67.51it/s]

[2024-04-15 18:46:29,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,705] [INFO] [timer.py:260:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=4464.232962639013, CurrSamplesPerSec=4429.701084176307, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 95%|█████████▍| 946/1000 [00:21<00:00, 67.24it/s]

[2024-04-15 18:46:29,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,853] [INFO] [timer.py:260:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=4464.099872188805, CurrSamplesPerSec=4452.995189277064, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 95%|█████████▌| 953/1000 [00:21<00:00, 67.48it/s]

[2024-04-15 18:46:29,986] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:29,999] [INFO] [timer.py:260:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=4464.256293672709, CurrSamplesPerSec=4456.025895984462, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 97%|█████████▋| 967/1000 [00:21<00:00, 68.02it/s]

[2024-04-15 18:46:30,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:30,147] [INFO] [timer.py:260:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=4464.318986131819, CurrSamplesPerSec=4420.582570318161, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 97%|█████████▋| 974/1000 [00:21<00:00, 67.61it/s]

[2024-04-15 18:46:30,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:30,296] [INFO] [timer.py:260:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=4464.183324317843, CurrSamplesPerSec=4426.779069575686, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


 99%|█████████▉| 988/1000 [00:21<00:00, 67.39it/s]

[2024-04-15 18:46:30,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:30,446] [INFO] [timer.py:260:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=4463.924788990304, CurrSamplesPerSec=4416.291660497179, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


100%|█████████▉| 995/1000 [00:21<00:00, 67.19it/s]

[2024-04-15 18:46:30,580] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:46:30,594] [INFO] [timer.py:260:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=4463.89466765887, CurrSamplesPerSec=4463.06414391647, MemAllocated=1.14GB, MaxMemAllocated=1.88GB


100%|██████████| 1000/1000 [00:21<00:00, 45.73it/s]


In [5]:
import deepspeed
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Define a simple model
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.layer1 = nn.Linear(10000, 10000)
        self.layer2 = nn.Linear(10000, 1)

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = self.layer2(x)
        return x

def training_function():
    model = MyModel()
    # Configure DeepSpeed
    config = {
        "train_batch_size": 64,
        "gradient_accumulation_steps": 1,
        "fp16": {
            "enabled": False
            },
        "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.001
            }
        }
    }

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Initialize DeepSpeed
    model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                                      model_parameters=model.parameters(),
                                                      config=config,
                                                     optimizer=optimizer,
                                                     args=None
                                                     )
    # Dummy data
    inputs = torch.randn(64, 10000).to("cuda")
    targets = torch.randn(64, 1).to("cuda")
    # Training loop with DeepSpeed
    model_engine.train()
    for epoch in tqdm(range(1000)):
        optimizer.zero_grad()
        outputs = model_engine(inputs)
        loss = nn.MSELoss()(outputs, targets)
        model_engine.backward(loss)
        model_engine.step()

from accelerate import notebook_launcher
notebook_launcher(training_function, num_processes=1)

Launching training on one GPU.
[2024-04-15 18:52:32,980] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.0, git-hash=unknown, git-branch=unknown
[2024-04-15 18:52:33,068] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
[2024-04-15 18:52:33,069] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
[2024-04-15 18:52:33,070] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
[2024-04-15 18:52:33,070] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = Adam
[2024-04-15 18:52:33,070] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = Adam
[2024-04-15 18:52:33,071] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
[2024-04-15 18:52:33,071] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = None
[2024-04-15 18:52:33,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0,

  1%|          | 7/1000 [00:00<00:14, 66.54it/s]

[2024-04-15 18:52:33,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:33,250] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=4356.560346174211, CurrSamplesPerSec=4324.651705305215, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  1%|▏         | 14/1000 [00:00<00:14, 65.99it/s]

[2024-04-15 18:52:33,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:33,399] [INFO] [timer.py:260:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=4387.817412735631, CurrSamplesPerSec=4422.330411861614, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  3%|▎         | 28/1000 [00:00<00:14, 67.46it/s]

[2024-04-15 18:52:33,533] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:33,546] [INFO] [timer.py:260:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=4414.758617692061, CurrSamplesPerSec=4461.506407166719, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  4%|▎         | 35/1000 [00:00<00:14, 67.79it/s]

[2024-04-15 18:52:33,678] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:33,691] [INFO] [timer.py:260:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=4433.191404068495, CurrSamplesPerSec=4458.023981133955, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  5%|▍         | 49/1000 [00:00<00:14, 67.86it/s]

[2024-04-15 18:52:33,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:33,840] [INFO] [timer.py:260:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=4438.644746909202, CurrSamplesPerSec=4425.6843077126, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  6%|▌         | 56/1000 [00:00<00:13, 67.50it/s]

[2024-04-15 18:52:33,976] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:33,989] [INFO] [timer.py:260:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=4439.367363645462, CurrSamplesPerSec=4427.655269104524, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  6%|▋         | 63/1000 [00:00<00:13, 67.29it/s]

[2024-04-15 18:52:34,124] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:34,138] [INFO] [timer.py:260:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=4441.098237366602, CurrSamplesPerSec=4454.473067604792, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  8%|▊         | 77/1000 [00:01<00:13, 67.70it/s]

[2024-04-15 18:52:34,271] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:34,284] [INFO] [timer.py:260:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=4445.881961193913, CurrSamplesPerSec=4458.764467477244, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


  8%|▊         | 84/1000 [00:01<00:13, 67.89it/s]

[2024-04-15 18:52:34,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:34,430] [INFO] [timer.py:260:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=4450.336639609946, CurrSamplesPerSec=4463.435193960859, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 10%|▉         | 98/1000 [00:01<00:13, 68.33it/s]

[2024-04-15 18:52:34,563] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:34,576] [INFO] [timer.py:260:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=4453.470114640141, CurrSamplesPerSec=4421.2378489664825, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 10%|█         | 105/1000 [00:01<00:13, 68.35it/s]

[2024-04-15 18:52:34,708] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:34,722] [INFO] [timer.py:260:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=4456.290970866668, CurrSamplesPerSec=4462.6189652879375, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 12%|█▏        | 119/1000 [00:01<00:12, 68.04it/s]

[2024-04-15 18:52:34,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:34,871] [INFO] [timer.py:260:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=4456.1481378731705, CurrSamplesPerSec=4426.049168164355, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 13%|█▎        | 126/1000 [00:01<00:12, 67.58it/s]

[2024-04-15 18:52:35,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,020] [INFO] [timer.py:260:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=4454.965141577173, CurrSamplesPerSec=4435.336836191798, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 13%|█▎        | 133/1000 [00:01<00:12, 67.35it/s]

[2024-04-15 18:52:35,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,169] [INFO] [timer.py:260:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=4454.234720169935, CurrSamplesPerSec=4428.385700381081, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 15%|█▍        | 147/1000 [00:02<00:12, 67.68it/s]

[2024-04-15 18:52:35,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,316] [INFO] [timer.py:260:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=4455.4237217464015, CurrSamplesPerSec=4454.325235629895, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 15%|█▌        | 154/1000 [00:02<00:12, 67.84it/s]

[2024-04-15 18:52:35,449] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,462] [INFO] [timer.py:260:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=4456.798032959632, CurrSamplesPerSec=4439.077508227084, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 17%|█▋        | 168/1000 [00:02<00:12, 67.71it/s]

[2024-04-15 18:52:35,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,612] [INFO] [timer.py:260:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=4456.469760106251, CurrSamplesPerSec=4432.041474730464, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 18%|█▊        | 175/1000 [00:02<00:12, 67.39it/s]

[2024-04-15 18:52:35,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,761] [INFO] [timer.py:260:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=4455.602894315844, CurrSamplesPerSec=4429.116372696223, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 19%|█▉        | 189/1000 [00:02<00:12, 67.34it/s]

[2024-04-15 18:52:35,897] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:35,910] [INFO] [timer.py:260:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=4455.04561591836, CurrSamplesPerSec=4461.803035088011, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 20%|█▉        | 196/1000 [00:02<00:11, 67.53it/s]

[2024-04-15 18:52:36,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,056] [INFO] [timer.py:260:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=4455.959772281698, CurrSamplesPerSec=4447.83032873807, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 20%|██        | 203/1000 [00:03<00:11, 67.75it/s]

[2024-04-15 18:52:36,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,202] [INFO] [timer.py:260:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=4457.274489251608, CurrSamplesPerSec=4460.023858972868, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 22%|██▏       | 217/1000 [00:03<00:11, 68.24it/s]

[2024-04-15 18:52:36,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,349] [INFO] [timer.py:260:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=4457.984925610396, CurrSamplesPerSec=4425.8302446745365, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 22%|██▏       | 224/1000 [00:03<00:11, 67.78it/s]

[2024-04-15 18:52:36,485] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,498] [INFO] [timer.py:260:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=4457.6486355228735, CurrSamplesPerSec=4427.290143818445, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 24%|██▍       | 238/1000 [00:03<00:11, 67.52it/s]

[2024-04-15 18:52:36,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,648] [INFO] [timer.py:260:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=4456.872360405409, CurrSamplesPerSec=4420.073043420987, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 24%|██▍       | 245/1000 [00:03<00:11, 67.34it/s]

[2024-04-15 18:52:36,782] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,795] [INFO] [timer.py:260:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=4457.03694904903, CurrSamplesPerSec=4450.632622608349, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 26%|██▌       | 259/1000 [00:03<00:10, 67.98it/s]

[2024-04-15 18:52:36,928] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:36,942] [INFO] [timer.py:260:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=4457.860705425753, CurrSamplesPerSec=4452.995189277064, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 27%|██▋       | 266/1000 [00:03<00:10, 68.07it/s]

[2024-04-15 18:52:37,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,087] [INFO] [timer.py:260:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=4458.871139881864, CurrSamplesPerSec=4459.357033689947, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 27%|██▋       | 273/1000 [00:04<00:10, 68.10it/s]

[2024-04-15 18:52:37,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,233] [INFO] [timer.py:260:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=4459.825283033594, CurrSamplesPerSec=4459.727467561595, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 29%|██▊       | 287/1000 [00:04<00:10, 68.38it/s]

[2024-04-15 18:52:37,368] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,380] [INFO] [timer.py:260:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=4460.207836998499, CurrSamplesPerSec=4430.285950058589, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 29%|██▉       | 294/1000 [00:04<00:10, 67.90it/s]

[2024-04-15 18:52:37,516] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,529] [INFO] [timer.py:260:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=4459.907983143802, CurrSamplesPerSec=4425.903216764769, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 31%|███       | 308/1000 [00:04<00:10, 67.54it/s]

[2024-04-15 18:52:37,666] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,679] [INFO] [timer.py:260:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=4459.208396360366, CurrSamplesPerSec=4422.039008961518, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 32%|███▏      | 315/1000 [00:04<00:10, 67.28it/s]

[2024-04-15 18:52:37,814] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,827] [INFO] [timer.py:260:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=4459.119662199092, CurrSamplesPerSec=4459.134802903703, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 33%|███▎      | 329/1000 [00:04<00:09, 67.84it/s]

[2024-04-15 18:52:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:37,974] [INFO] [timer.py:260:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=4459.4751592718, CurrSamplesPerSec=4429.408709139811, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 34%|███▎      | 336/1000 [00:04<00:09, 67.79it/s]

[2024-04-15 18:52:38,109] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:38,122] [INFO] [timer.py:260:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=4459.578848237495, CurrSamplesPerSec=4418.908851465916, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 34%|███▍      | 343/1000 [00:05<00:09, 67.49it/s]

[2024-04-15 18:52:38,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:38,271] [INFO] [timer.py:260:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=4459.026675627788, CurrSamplesPerSec=4420.50977356937, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 36%|███▌      | 357/1000 [00:05<00:09, 67.37it/s]

[2024-04-15 18:52:38,408] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:38,420] [INFO] [timer.py:260:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=4458.574771982619, CurrSamplesPerSec=4426.925078747299, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 36%|███▋      | 364/1000 [00:05<00:09, 67.23it/s]

[2024-04-15 18:52:38,554] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:38,567] [INFO] [timer.py:260:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=4458.953049080906, CurrSamplesPerSec=4462.470592146823, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 38%|███▊      | 378/1000 [00:05<00:09, 67.96it/s]

[2024-04-15 18:52:38,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:38,714] [INFO] [timer.py:260:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=4459.554394765262, CurrSamplesPerSec=4464.251721270581, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 38%|███▊      | 385/1000 [00:05<00:09, 67.97it/s]

[2024-04-15 18:52:38,846] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:38,859] [INFO] [timer.py:260:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=4460.111714511622, CurrSamplesPerSec=4468.561992275935, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 40%|███▉      | 399/1000 [00:05<00:08, 67.97it/s]

[2024-04-15 18:52:38,995] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,008] [INFO] [timer.py:260:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=4460.123285518695, CurrSamplesPerSec=4423.204851041392, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 41%|████      | 406/1000 [00:05<00:08, 67.60it/s]

[2024-04-15 18:52:39,144] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,157] [INFO] [timer.py:260:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=4459.716389969879, CurrSamplesPerSec=4417.890686460065, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 41%|████▏     | 413/1000 [00:06<00:08, 67.37it/s]

[2024-04-15 18:52:39,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,306] [INFO] [timer.py:260:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=4459.511757674708, CurrSamplesPerSec=4457.505787017818, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 43%|████▎     | 427/1000 [00:06<00:08, 67.79it/s]

[2024-04-15 18:52:39,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,452] [INFO] [timer.py:260:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=4459.976419848985, CurrSamplesPerSec=4462.099702455161, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 43%|████▎     | 434/1000 [00:06<00:08, 67.95it/s]

[2024-04-15 18:52:39,584] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,597] [INFO] [timer.py:260:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=4460.578176028117, CurrSamplesPerSec=4465.9599713843645, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 45%|████▍     | 448/1000 [00:06<00:08, 68.38it/s]

[2024-04-15 18:52:39,730] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,744] [INFO] [timer.py:260:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=4461.118466782109, CurrSamplesPerSec=4456.173840867213, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 46%|████▌     | 455/1000 [00:06<00:07, 68.38it/s]

[2024-04-15 18:52:39,876] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:39,889] [INFO] [timer.py:260:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=4461.6101900703825, CurrSamplesPerSec=4451.223029217656, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 47%|████▋     | 469/1000 [00:06<00:07, 68.01it/s]

[2024-04-15 18:52:40,026] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,039] [INFO] [timer.py:260:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=4461.4319394703425, CurrSamplesPerSec=4416.436978661095, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 48%|████▊     | 476/1000 [00:07<00:07, 67.55it/s]

[2024-04-15 18:52:40,175] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,188] [INFO] [timer.py:260:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=4460.978252686564, CurrSamplesPerSec=4417.163712955193, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 48%|████▊     | 483/1000 [00:07<00:07, 67.33it/s]

[2024-04-15 18:52:40,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,337] [INFO] [timer.py:260:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=4460.720047814748, CurrSamplesPerSec=4445.105168159765, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 50%|████▉     | 497/1000 [00:07<00:07, 67.64it/s]

[2024-04-15 18:52:40,471] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,484] [INFO] [timer.py:260:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=4461.015836604796, CurrSamplesPerSec=4457.431769120919, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 50%|█████     | 504/1000 [00:07<00:07, 67.84it/s]

[2024-04-15 18:52:40,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,630] [INFO] [timer.py:260:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=4461.3233700483115, CurrSamplesPerSec=4421.310669697269, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 52%|█████▏    | 518/1000 [00:07<00:07, 67.68it/s]

[2024-04-15 18:52:40,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,780] [INFO] [timer.py:260:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=4461.10605065503, CurrSamplesPerSec=4426.195129190232, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 52%|█████▎    | 525/1000 [00:07<00:07, 67.34it/s]

[2024-04-15 18:52:40,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:40,929] [INFO] [timer.py:260:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=4460.732719773211, CurrSamplesPerSec=4422.986208828327, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 54%|█████▍    | 539/1000 [00:07<00:06, 67.31it/s]

[2024-04-15 18:52:41,065] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,078] [INFO] [timer.py:260:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=4460.414654511866, CurrSamplesPerSec=4457.949946026738, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 55%|█████▍    | 546/1000 [00:08<00:06, 67.53it/s]

[2024-04-15 18:52:41,211] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,224] [INFO] [timer.py:260:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=4460.750806688557, CurrSamplesPerSec=4463.732078420939, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 55%|█████▌    | 553/1000 [00:08<00:06, 67.77it/s]

[2024-04-15 18:52:41,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,370] [INFO] [timer.py:260:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=4461.200517711016, CurrSamplesPerSec=4466.48013311148, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 57%|█████▋    | 567/1000 [00:08<00:06, 68.24it/s]

[2024-04-15 18:52:41,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,517] [INFO] [timer.py:260:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=4461.403667167287, CurrSamplesPerSec=4422.039008961518, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 57%|█████▋    | 574/1000 [00:08<00:06, 67.78it/s]

[2024-04-15 18:52:41,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,666] [INFO] [timer.py:260:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=4461.151839729908, CurrSamplesPerSec=4410.486765358263, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 59%|█████▉    | 588/1000 [00:08<00:06, 67.51it/s]

[2024-04-15 18:52:41,803] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,816] [INFO] [timer.py:260:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=4460.78316361223, CurrSamplesPerSec=4417.745272616559, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 60%|█████▉    | 595/1000 [00:08<00:06, 67.34it/s]

[2024-04-15 18:52:41,950] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:41,964] [INFO] [timer.py:260:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=4460.881038413046, CurrSamplesPerSec=4459.875658342887, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 61%|██████    | 609/1000 [00:08<00:05, 67.84it/s]

[2024-04-15 18:52:42,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,110] [INFO] [timer.py:260:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=4461.164455830009, CurrSamplesPerSec=4464.548714366497, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 62%|██████▏   | 616/1000 [00:09<00:05, 68.01it/s]

[2024-04-15 18:52:42,243] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,256] [INFO] [timer.py:260:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=4461.560642011616, CurrSamplesPerSec=4465.142819122392, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 62%|██████▏   | 623/1000 [00:09<00:05, 68.14it/s]

[2024-04-15 18:52:42,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,402] [INFO] [timer.py:260:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=4461.873419158151, CurrSamplesPerSec=4419.418109976951, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 64%|██████▎   | 637/1000 [00:09<00:05, 68.37it/s]

[2024-04-15 18:52:42,536] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,549] [INFO] [timer.py:260:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=4461.978566264757, CurrSamplesPerSec=4419.272595568141, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 64%|██████▍   | 644/1000 [00:09<00:05, 67.86it/s]

[2024-04-15 18:52:42,685] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,698] [INFO] [timer.py:260:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=4461.789759229913, CurrSamplesPerSec=4422.039008961518, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 66%|██████▌   | 658/1000 [00:09<00:05, 67.50it/s]

[2024-04-15 18:52:42,835] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,848] [INFO] [timer.py:260:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=4461.408817089405, CurrSamplesPerSec=4425.246554566436, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 66%|██████▋   | 665/1000 [00:09<00:04, 67.26it/s]

[2024-04-15 18:52:42,982] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:42,996] [INFO] [timer.py:260:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=4461.366987908186, CurrSamplesPerSec=4460.542638750416, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 68%|██████▊   | 679/1000 [00:10<00:04, 67.90it/s]

[2024-04-15 18:52:43,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:43,142] [INFO] [timer.py:260:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=4461.531671503894, CurrSamplesPerSec=4421.674809336342, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 69%|██████▊   | 686/1000 [00:10<00:04, 67.87it/s]

[2024-04-15 18:52:43,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:43,290] [INFO] [timer.py:260:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=4461.50511381738, CurrSamplesPerSec=4384.337634338353, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 69%|██████▉   | 693/1000 [00:10<00:04, 67.53it/s]

[2024-04-15 18:52:43,426] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:43,439] [INFO] [timer.py:260:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=4461.257512379667, CurrSamplesPerSec=4425.027710466017, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 71%|███████   | 707/1000 [00:10<00:04, 67.41it/s]

[2024-04-15 18:52:43,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:43,589] [INFO] [timer.py:260:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=4461.012324356663, CurrSamplesPerSec=4419.56363396885, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 71%|███████▏  | 714/1000 [00:10<00:04, 67.19it/s]

[2024-04-15 18:52:43,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:43,736] [INFO] [timer.py:260:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=4461.135058171904, CurrSamplesPerSec=4465.885672456246, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 73%|███████▎  | 728/1000 [00:10<00:04, 67.91it/s]

[2024-04-15 18:52:43,869] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:43,882] [INFO] [timer.py:260:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=4461.40424672255, CurrSamplesPerSec=4467.966977363515, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 74%|███████▎  | 735/1000 [00:10<00:03, 68.02it/s]

[2024-04-15 18:52:44,015] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,028] [INFO] [timer.py:260:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=4461.705461049364, CurrSamplesPerSec=4462.767348295927, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 75%|███████▍  | 749/1000 [00:11<00:03, 67.96it/s]

[2024-04-15 18:52:44,164] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,177] [INFO] [timer.py:260:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=4461.648172943758, CurrSamplesPerSec=4424.444232005406, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 76%|███████▌  | 756/1000 [00:11<00:03, 67.60it/s]

[2024-04-15 18:52:44,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,326] [INFO] [timer.py:260:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=4461.43871265459, CurrSamplesPerSec=4428.458756763891, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 76%|███████▋  | 763/1000 [00:11<00:03, 67.39it/s]

[2024-04-15 18:52:44,461] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,474] [INFO] [timer.py:260:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=4461.325186144779, CurrSamplesPerSec=4463.6578536033785, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 78%|███████▊  | 777/1000 [00:11<00:03, 67.83it/s]

[2024-04-15 18:52:44,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,620] [INFO] [timer.py:260:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=4461.585516797846, CurrSamplesPerSec=4464.771485122166, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 78%|███████▊  | 784/1000 [00:11<00:03, 67.97it/s]

[2024-04-15 18:52:44,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,766] [INFO] [timer.py:260:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=4461.883409977178, CurrSamplesPerSec=4458.098018700281, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 80%|███████▉  | 798/1000 [00:11<00:02, 68.36it/s]

[2024-04-15 18:52:44,899] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:44,912] [INFO] [timer.py:260:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=4462.1804749334215, CurrSamplesPerSec=4465.142819122392, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 80%|████████  | 805/1000 [00:11<00:02, 68.36it/s]

[2024-04-15 18:52:45,044] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,058] [INFO] [timer.py:260:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=4462.46765416444, CurrSamplesPerSec=4453.512335130651, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 82%|████████▏ | 819/1000 [00:12<00:02, 68.01it/s]

[2024-04-15 18:52:45,195] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,208] [INFO] [timer.py:260:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=4462.342179349166, CurrSamplesPerSec=4417.890686460065, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 83%|████████▎ | 826/1000 [00:12<00:02, 67.47it/s]

[2024-04-15 18:52:45,344] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,357] [INFO] [timer.py:260:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=4462.062796032704, CurrSamplesPerSec=4419.199841957098, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 83%|████████▎ | 833/1000 [00:12<00:02, 67.27it/s]

[2024-04-15 18:52:45,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,506] [INFO] [timer.py:260:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=4461.891889552418, CurrSamplesPerSec=4458.83852964138, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 85%|████████▍ | 847/1000 [00:12<00:02, 67.70it/s]

[2024-04-15 18:52:45,639] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,652] [INFO] [timer.py:260:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=4462.081859316928, CurrSamplesPerSec=4462.322228871601, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 85%|████████▌ | 854/1000 [00:12<00:02, 67.87it/s]

[2024-04-15 18:52:45,786] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,799] [INFO] [timer.py:260:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=4462.232662346001, CurrSamplesPerSec=4424.371308016877, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 87%|████████▋ | 868/1000 [00:12<00:01, 67.66it/s]

[2024-04-15 18:52:45,936] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:45,949] [INFO] [timer.py:260:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=4462.06381317616, CurrSamplesPerSec=4423.933814562115, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 88%|████████▊ | 875/1000 [00:12<00:01, 67.34it/s]

[2024-04-15 18:52:46,085] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,098] [INFO] [timer.py:260:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=4461.832345283995, CurrSamplesPerSec=4423.204851041392, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 89%|████████▉ | 889/1000 [00:13<00:01, 67.34it/s]

[2024-04-15 18:52:46,234] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,247] [INFO] [timer.py:260:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=4461.643108321634, CurrSamplesPerSec=4460.616759999335, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 90%|████████▉ | 896/1000 [00:13<00:01, 67.53it/s]

[2024-04-15 18:52:46,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,393] [INFO] [timer.py:260:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=4461.789656253364, CurrSamplesPerSec=4465.588501463934, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 90%|█████████ | 903/1000 [00:13<00:01, 67.80it/s]

[2024-04-15 18:52:46,525] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,538] [INFO] [timer.py:260:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=4462.044237688675, CurrSamplesPerSec=4462.6189652879375, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 92%|█████████▏| 917/1000 [00:13<00:01, 68.08it/s]

[2024-04-15 18:52:46,674] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,687] [INFO] [timer.py:260:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=4462.102530356801, CurrSamplesPerSec=4429.62798679868, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 92%|█████████▏| 924/1000 [00:13<00:01, 67.50it/s]

[2024-04-15 18:52:46,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,836] [INFO] [timer.py:260:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=4461.935939108718, CurrSamplesPerSec=4427.5822392293985, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 94%|█████████▍| 938/1000 [00:13<00:00, 67.36it/s]

[2024-04-15 18:52:46,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:46,986] [INFO] [timer.py:260:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=4461.659855012975, CurrSamplesPerSec=4432.041474730464, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 94%|█████████▍| 945/1000 [00:13<00:00, 67.25it/s]

[2024-04-15 18:52:47,120] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:47,133] [INFO] [timer.py:260:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=4461.700086962864, CurrSamplesPerSec=4460.320289782829, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 96%|█████████▌| 959/1000 [00:14<00:00, 67.95it/s]

[2024-04-15 18:52:47,266] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:47,280] [INFO] [timer.py:260:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=4461.9089389522505, CurrSamplesPerSec=4462.470592146823, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 97%|█████████▋| 966/1000 [00:14<00:00, 68.07it/s]

[2024-04-15 18:52:47,412] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:47,425] [INFO] [timer.py:260:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=4462.164297196839, CurrSamplesPerSec=4463.435193960859, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 97%|█████████▋| 973/1000 [00:14<00:00, 68.18it/s]

[2024-04-15 18:52:47,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:47,571] [INFO] [timer.py:260:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=4462.400277646995, CurrSamplesPerSec=4457.061716505886, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 99%|█████████▊| 987/1000 [00:14<00:00, 68.35it/s]

[2024-04-15 18:52:47,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:47,718] [INFO] [timer.py:260:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=4462.446414867401, CurrSamplesPerSec=4419.127090741473, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


 99%|█████████▉| 994/1000 [00:14<00:00, 67.84it/s]

[2024-04-15 18:52:47,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=0, lr=[0.001], mom=[(0.9, 0.999)]
[2024-04-15 18:52:47,867] [INFO] [timer.py:260:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=4462.318661138759, CurrSamplesPerSec=4424.152550473836, MemAllocated=2.26GB, MaxMemAllocated=3.01GB


100%|██████████| 1000/1000 [00:14<00:00, 67.71it/s]


In [6]:
!torchrun --nproc_per_node 2 deepspeed.py

[W socket.cpp:436] [c10d] The server socket has failed to bind to [::]:29500 (errno: 98 - Address already in use).
[W socket.cpp:436] [c10d] The server socket has failed to bind to 0.0.0.0:29500 (errno: 98 - Address already in use).
[E socket.cpp:472] [c10d] The server socket has failed to listen on any local network address.
Traceback (most recent call last):
  File "/scratch/qualis/miniconda3/envs/dp/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.1.1', 'console_scripts', 'torchrun')())
  File "/scratch/qualis/miniconda3/envs/dp/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/scratch/qualis/miniconda3/envs/dp/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/scratch/qualis/miniconda3/envs/dp/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/scratch/qualis/m