In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator
def same_seeds(seed=42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
# 设置随机种子
same_seeds()
# 数据集参数
N = 10000  # 数据集总样本数
B = 1     # 理想batch_size
b = 1      # 实际 batch size
E = 50      # epochs
gradient_accumulation_steps = B // b  # 等于 4，可以手动设置
# gradient_accumulation_steps：梯度累积步数，这是一个重要的训练技巧。它允许你在较小的实际batch size下模拟较大的有效batch size。
accelerator=Accelerator(gradient_accumulation_steps=gradient_accumulation_steps)


# 创建一个示例数据集
X = torch.randn(N, 10)        # 生成服从标准正态分布（均值为0，方差为1）的随机数数列N*10
y = torch.randint(0, 2, (N,)) # 二分类标签  如果N=5，可能会得到类似这样的结果： tensor([0, 1, 1, 0, 1])
# 这里的 X 和 y 都是 torch.tensor 类型

# 定义数据集和 DataLoader
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=B, shuffle=True, drop_last=False)

# 定义模型
model = nn.Sequential(
    nn.Linear(10, 50),
    nn.ReLU(),
    nn.Linear(50, 2)
)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# scheduler = optim.CosineAnnealingLR(optimizer, T_max=E)
# 将模型和优化器交给 Accelerator 管理
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

# 训练模型
steps_per_epoch = len(dataloader)   # 一个 epoch 内 batch 的数量
total_steps = steps_per_epoch * E

device = "cuda" if torch.cuda.is_available() else "cpu"
# 这里设置 device 为 cuda，如果没有 cuda 设备，则使用 cpu
model.to(device)
# 记录最佳损失
best_loss = float('inf')
# 记录当前 step
current_step = 0
# 这里设置 patience=2，即如果连续 2 次没有提升，则提前结束训练
patience=5
origin_patience=patience
def train_one_epoch(model, dataloader,optimizer,criterion,current_step=0):
    epoch_best_loss=float('inf')
    for batch_idx, (inputs, targets) in enumerate(dataloader, start=1): 
        # with accelerator
        with accelerator.accumulate(model):
            inputs = inputs.to(device)
            targets = targets.to(device)
            
            # 清零梯度（这一步放在反向传播前和参数更新之后都可以）
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(inputs)
            
            # 计算损失
            loss = criterion(outputs, targets)

            # 反向传播（计算梯度）
            # loss.backward()
            # accelerator.backward(loss) 会自动把 loss 累积到当前的 batch 内，然后反向传播
            accelerator.backward(loss)
            # 更新参数
            # optimizer.step()

            epoch_best_loss = min(epoch_best_loss, loss.item())
            current_step += 1
            # 每 50 步打印一次
            if batch_idx % 50 == 0:
                # 如果不需要打印累积 step，可以去除 current_step 项直接使用 batch_idx
                print(f"Epoch [{epoch}/{E}], Batch [{batch_idx}/{steps_per_epoch}], "
                    f"Step [{current_step}/{total_steps}], Loss: {loss.item():.4f}")
    optimizer.step()
    return epoch_best_loss
    
# 这里设置成从 1 开始只是为了不在 print 中额外设置，实际写代码的时候不需要纠结这一点
for epoch in range(1, E + 1):
    current_loss=train_one_epoch(model, dataloader,optimizer,criterion,current_step=current_step)
    if current_loss<best_loss:
        best_loss=current_loss
        patience=origin_patience
         # 保存最佳模型参数
        print(f"Best model saved with loss {best_loss:.4f}")
        torch.save(model.state_dict(), "best_model.pth")
    else:
        patience-=1
        if patience==0:
            print(f"No improvement in validation for {epoch} epochs, stopping early.")
            break
    
print(f'Training complete with best loss {best_loss:.6f}')
# 可以看到:
# - epoch 从 1 到 E
# - batch 从 1 到 steps_per_epoch
# - step 累计从 1 到 total_steps
# - 最佳模型保存了

Epoch [1/50], Batch [50/10000], Step [50/500000], Loss: 0.8390
Epoch [1/50], Batch [100/10000], Step [100/500000], Loss: 0.6147
Epoch [1/50], Batch [150/10000], Step [150/500000], Loss: 0.5295
Epoch [1/50], Batch [200/10000], Step [200/500000], Loss: 0.6609
Epoch [1/50], Batch [250/10000], Step [250/500000], Loss: 0.4225
Epoch [1/50], Batch [300/10000], Step [300/500000], Loss: 1.0206
Epoch [1/50], Batch [350/10000], Step [350/500000], Loss: 0.8659
Epoch [1/50], Batch [400/10000], Step [400/500000], Loss: 0.5234
Epoch [1/50], Batch [450/10000], Step [450/500000], Loss: 0.8742
Epoch [1/50], Batch [500/10000], Step [500/500000], Loss: 0.8313
Epoch [1/50], Batch [550/10000], Step [550/500000], Loss: 0.5952
Epoch [1/50], Batch [600/10000], Step [600/500000], Loss: 0.7431
Epoch [1/50], Batch [650/10000], Step [650/500000], Loss: 0.5761
Epoch [1/50], Batch [700/10000], Step [700/500000], Loss: 0.6298
Epoch [1/50], Batch [750/10000], Step [750/500000], Loss: 0.6725
Epoch [1/50], Batch [800/10