In [1]:
import torch
from functions import get_loader, get_model
from accelerate import Accelerator
import datetime

# Only the third value returned by get_loader() is used in this cell.
_, _, loader = get_loader()
model, optimizer, scheduler = get_model()

# Use 4-step gradient accumulation.
# Change mixed_precision here to compare run time and GPU memory usage
# across precisions. Options: no, fp8, fp16, bf16.
accelerator = Accelerator(gradient_accumulation_steps=4,
                          mixed_precision='fp16')

loader, model, optimizer, scheduler = accelerator.prepare(
    loader, model, optimizer, scheduler)

# Print metrics every LOG_EVERY batches; 1 logs every batch.
LOG_EVERY = 1

now = datetime.datetime.now()
for i, data in enumerate(loader):
    # Gradients are accumulated inside this context.
    with accelerator.accumulate(model):
        out = model(**data)
        accelerator.backward(out.loss)
        # Clip the gradient norm to 1.0 only when accelerate reports that
        # gradients are being synchronized on this iteration.
        if accelerator.sync_gradients:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if i % LOG_EVERY == 0:
        # Read the learning rate straight from the first param group instead
        # of building a full optimizer.state_dict() on every logged batch.
        lr = optimizer.param_groups[0]['lr']

        labels = data['labels']
        # NOTE: despite the name, this holds the argmax over dim 1 of the
        # logits (presumably the predicted class index per sample).
        logits = out['logits'].argmax(1)
        acc = (labels == logits).sum().item() / len(labels)

        print(i, len(loader), out.loss.item(), lr, acc)

# Last expression of the cell: elapsed wall-clock time as a timedelta.
datetime.datetime.now() - now

  from .autonotebook import tqdm as notebook_tqdm
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


0 62 2.447235107421875 0.0001 0.03125
1 62 2.36968994140625 0.0001 0.03125
2 62 2.384185791015625 0.0001 0.125
3 62 2.410858154296875 9.990133642141359e-05 0.0
4 62 1.7743988037109375 9.990133642141359e-05 0.375
5 62 1.9027099609375 9.990133642141359e-05 0.21875
6 62 1.891082763671875 9.990133642141359e-05 0.34375
7 62 1.7799530029296875 9.96057350657239e-05 0.3125
8 62 1.593292236328125 9.96057350657239e-05 0.5
9 62 1.4858932495117188 9.96057350657239e-05 0.59375
10 62 1.5505752563476562 9.96057350657239e-05 0.4375
11 62 1.83154296875 9.911436253643445e-05 0.3125
12 62 1.01068115234375 9.911436253643445e-05 0.625
13 62 0.9045181274414062 9.911436253643445e-05 0.71875
14 62 0.8840827941894531 9.911436253643445e-05 0.71875
15 62 1.1139945983886719 9.842915805643155e-05 0.5
16 62 0.5190963745117188 9.842915805643155e-05 0.9375
17 62 0.5722999572753906 9.842915805643155e-05 0.90625
18 62 0.5345573425292969 9.842915805643155e-05 0.90625
19 62 0.5995712280273438 9.755282581475769e-05 0.9062

datetime.timedelta(seconds=14, microseconds=139662)

In [2]:
from accelerate import notebook_launcher


def f():
    """Print a fixed marker line showing that the launched function ran."""
    message = 'f runed'
    print(message)


# Code can also be run this way inside Jupyter; the main benefit is that
# extra launch parameters (process count, mixed precision) can be passed.
# num_processes=1 requests a single process explicitly; the previous value 0
# is not a meaningful process count.
notebook_launcher(f, num_processes=1, mixed_precision='fp16')

Launching training on one GPU.
f runed
