In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Subset, DataLoader, TensorDataset

import torchvision
import torchvision.transforms as transforms

import time
import copy
import random
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [31]:
# from accelerate import Accelerator

In [2]:
device_count = torch.cuda.device_count()
print(f"Device count: {device_count}")

for i in range(device_count):
    print(torch.cuda.get_device_properties(i))

Device count: 2
_CudaDeviceProperties(name='Z100SM', major=7, minor=5, total_memory=16368MB, multi_processor_count=64)
_CudaDeviceProperties(name='Z100SM', major=7, minor=5, total_memory=16368MB, multi_processor_count=64)


In [3]:
computing_device = "cuda"
if not torch.cuda.is_available():
    computing_device = "cpu"
print("Computing Device: ", computing_device)

Computing Device:  cuda


In [33]:
# accelerator = Accelerator()
# print(accelerator)

<accelerate.accelerator.Accelerator object at 0x7fde7dd9c150>


In [14]:
batch_size = 256

# transform = transforms.Compose([transforms.ToTensor(),
#                                 transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])

transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),  # 先四周填充0，在吧图像随机裁剪成32*32
    transforms.RandomHorizontalFlip(),  # 图像一半的概率翻转，一半的概率不翻转
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),  # R,G,B每层的归一化用到的均值和方差
])

transform_val = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
validate_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_val)

trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
valloader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

print(len(trainloader.dataset))
print(len(valloader.dataset))

Files already downloaded and verified
Files already downloaded and verified
50000
10000


In [24]:
# print(torchvision.models())
resnet = torchvision.models.resnet101(num_classes=10).to(computing_device)


# resnet = torchvision.models.resnet101(num_classes=10)




if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(resnet)

lr = 0.001
lossfn = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(resnet.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
optimizer = torch.optim.Adam(resnet.parameters(), lr=lr)



# resnet, optimizer, trainloader, valloader = accelerator.prepare(model, optimizer, trainloader)


Using 2 GPUs!


In [21]:
print(sum(p.numel() for p in resnet.parameters()))   # 查看一下模型参数量

42520650


In [28]:
# print(model)
# for X, y in valloader:
#     print(X.size(), y.size())
#     break

# 测试一下模型的输入输出维度
tmpdata = torch.randn([128, 3, 32, 32]).to(computing_device)
tmpout = resnet(tmpdata)

print(tmpout.size())
print(tmpout)


torch.Size([128, 10])
tensor([[ 0.7716, -0.1400,  0.5104,  ...,  0.9468, -0.1179,  0.0514],
        [ 2.3137, -0.8620, -1.6912,  ...,  1.0892, -0.3543, -1.1141],
        [-0.2872,  0.0983, -0.1753,  ...,  0.8857, -0.4489, -0.3886],
        ...,
        [ 0.8075, -0.0333, -0.6333,  ...,  0.3251,  0.5447,  0.0713],
        [ 1.5661,  0.1089,  0.5665,  ...,  0.1866, -0.2049,  0.3054],
        [ 0.5656, -0.9399, -1.8978,  ..., -0.2961, -0.2272,  0.9700]],
       device='cuda:0', grad_fn=<AddmmBackward0>)


In [None]:
for k in resnet.state_dict():
    if k.find("bias") >= 1:
        continue
    print(k)

In [29]:
# 训练
def train_model(epoch, model, loss_fn, optimizer, trainloader):
    # training
    num_batches = len(trainloader)
    model.train()
    train_loss = 0
    for batch, (X, y) in enumerate(trainloader):
        X, y = X.to(computing_device), y.to(computing_device)
        optimizer.zero_grad()
        
        predict = model(X)

        loss = loss_fn(predict, y)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        
    train_loss /= num_batches

    return train_loss

# 验证
def val_model(epoch, model, loss_fn, valloader):
    size = len(valloader.dataset)
    num_batches = len(valloader)
    
    model.eval()
    test_loss, val_correct = 0, 0
    with torch.no_grad():        
        for batch, (X, y) in enumerate(valloader):
            X, y = X.to(computing_device), y.to(computing_device)

            predict = model(X)
            loss = loss_fn(predict, y)
            test_loss += loss.item()
            val_correct += (predict.argmax(1) == y).type(torch.float).sum().item() 
            
    test_loss /= num_batches
    val_correct /= size

    return test_loss, val_correct

# 测试
def test_model(model, loss_fn, testloader):
    pass


In [30]:
start_epoch = 0 # 从哪一个epoch开始
num_epochs = 64 # 要训练多少个epoch

# time_all = 0    # 消耗的总时长，单位s

for epoch in range(start_epoch, start_epoch + num_epochs):    
    ts = time.perf_counter() # 打一个时间戳

    train_loss = train_model(epoch, resnet, lossfn, optimizer, trainloader)
    td = time.perf_counter()    # 打一个时间戳
    
    val_loss, val_correct = val_model(epoch, resnet, lossfn, valloader)
    
    # writer.add_scalar("Loss/train", train_loss, epoch)
    
    print(f"Epoch {epoch} | TrainLoss {train_loss:.5f} | ValLoss {val_loss:.5f} | ValCorrect {val_correct:.5f}| TrainTime {(td - ts):.5f}s")

    # print(f"Epoch: {epoch} | TrainLoss: {train_loss:.5f} | ValLoss: {test_loss:.5f} | EpochTime: {(td - ts):.5f}s ｜ TimeRemaining: {(time_all / (epoch - start_epoch + 1)) * (start_epoch + num_epochs - epoch - 1):.5f}s")
    print("----- ----- ----- ----- -----")


Epoch 0 | TrainLoss 2.12057 | ValLoss 2.34432 | ValCorrect 0.25860| TrainTime 27.38683s
----- ----- ----- ----- -----
Epoch 1 | TrainLoss 1.79553 | ValLoss 1.59346 | ValCorrect 0.39810| TrainTime 26.76777s
----- ----- ----- ----- -----
Epoch 2 | TrainLoss 1.65222 | ValLoss 1.41069 | ValCorrect 0.48030| TrainTime 26.92491s
----- ----- ----- ----- -----
Epoch 3 | TrainLoss 1.68830 | ValLoss 1.78956 | ValCorrect 0.34410| TrainTime 27.18425s
----- ----- ----- ----- -----
Epoch 4 | TrainLoss 1.92030 | ValLoss 1.64077 | ValCorrect 0.40100| TrainTime 27.29303s
----- ----- ----- ----- -----
Epoch 5 | TrainLoss 1.74762 | ValLoss 1.75225 | ValCorrect 0.35700| TrainTime 27.50385s
----- ----- ----- ----- -----


KeyboardInterrupt: 

In [None]:
# writer.flush()