In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim

In [None]:
# The 20 Pascal VOC object categories. The position of a name in this tuple is
# its column in the multi-hot label matrix built by make_batch; the order is the
# same as the `classes` list defined further down in this notebook.
_VOC_CLASS_NAMES = (
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
)


def make_batch(samples, class_names=None):
    """Collate (image, target) samples into one multi-label classification batch.

    Args:
        samples: Sequence of ``(image_tensor, target)`` pairs, where
            ``target['annotation']['object']`` is a list of dicts that each
            carry a ``'name'`` entry. All image tensors must share one shape.
        class_names: Optional ordered sequence of class names that defines the
            label columns. Defaults to the 20 Pascal VOC categories.

    Returns:
        dict with
            ``'input'``: float tensor of shape ``(N, *image_shape)``;
            ``'label'``: float32 tensor of shape ``(N, len(class_names))`` with
            1.0 in the column of every class that appears among the sample's
            annotated objects and 0.0 elsewhere.

    Raises:
        ValueError: If an annotated object name is not in ``class_names``.
    """
    if class_names is None:
        class_names = _VOC_CLASS_NAMES
    class_to_index = {name: index for index, name in enumerate(class_names)}

    inputs = [sample[0] for sample in samples]

    # Class names are strings and cannot be stacked into a tensor directly, so
    # encode them as one multi-hot row per sample.
    labels = torch.zeros(len(samples), len(class_to_index), dtype=torch.float32)
    for row, sample in enumerate(samples):
        for annotated_object in sample[1]['annotation']['object']:
            name = annotated_object['name']
            if name not in class_to_index:
                raise ValueError(f"unknown class name in annotation: {name!r}")
            labels[row, class_to_index[name]] = 1.0

    # Stack rather than pad: images are expected to be the same size already,
    # and a shape mismatch should fail loudly instead of being zero-padded.
    return {'input': torch.stack(inputs).contiguous(),
            'label': labels.contiguous()}

In [None]:
# 1. Dataset preparation
def _build_transform(augment):
    """Return the resize -> (optional random flip) -> tensor -> normalise pipeline."""
    steps = [transforms.Resize((224, 224))]
    if augment:
        # Only the training pipeline applies the random horizontal flip.
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)))
    return transforms.Compose(steps)


def _open_voc2007(image_set, transform):
    """Open one split of the VOC 2007 detection data under ./data (no download)."""
    return torchvision.datasets.VOCDetection(
        root='./data',
        year='2007',
        image_set=image_set,
        transform=transform,
        download=False,
    )


def _batched(dataset, shuffle):
    """Wrap a dataset in a DataLoader of 32-sample batches collated by make_batch."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        shuffle=shuffle,
        collate_fn=make_batch,
    )


train_transform = _build_transform(augment=True)
val_transform = _build_transform(augment=False)
test_transform = _build_transform(augment=False)

trainset = _open_voc2007('train', train_transform)
valset = _open_voc2007('val', val_transform)
testset = _open_voc2007('test', test_transform)

# Only the training loader shuffles; validation and test keep dataset order.
trainloader = _batched(trainset, shuffle=True)
valloader = _batched(valset, shuffle=False)
testloader = _batched(testset, shuffle=False)

In [None]:
# Display the 'name' of the first annotated object in the first training sample.
trainset[0][1]['annotation']['object'][0]['name']

In [None]:
# Pull a single batch from the training loader, which runs the loader's
# collate_fn (make_batch) once — presumably a smoke test of the collate step.
batch = next(iter(trainloader))

In [None]:
# 3. Model definition
# Load ResNet-152 with pretrained weights, then swap its final fully connected
# layer for a fresh 20-output head (input width taken from the layer it replaces).
resnet152 = torchvision.models.resnet152(pretrained=True)
resnet152.fc = nn.Linear(in_features=resnet152.fc.in_features, out_features=20)

In [None]:
# 4. Loss function
criterion = nn.MultiLabelSoftMarginLoss()

# 5. Optimizer: SGD with momentum over every parameter of the network
optimizer = optim.SGD(
    params=resnet152.parameters(),
    lr=1e-3,
    momentum=0.9,
)

In [None]:
# VOC class names (20 entries, alphabetical order)
classes = [
    "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog",
    "horse", "motorbike", "person", "pottedplant",
    "sheep", "sofa", "train", "tvmonitor",
]

In [None]:
# 6. Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
resnet152.to(device)

NUM_EPOCHS = 10
LOG_INTERVAL = 100  # report the mean training loss every this many mini-batches

# 7. Training
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    resnet152.train()
    for i, data in enumerate(trainloader, 0):
        # The loaders use make_batch as collate_fn, which returns a dict with
        # the keys 'input' (images) and 'label' (targets).
        inputs = data['input'].to(device)
        # The loss compares targets element-wise with the logits, so they are
        # moved to the same device and cast to floating point.
        labels = data['label'].to(device=device, dtype=torch.float32)

        optimizer.zero_grad()

        outputs = resnet152(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # NOTE(review): if an epoch has fewer than LOG_INTERVAL batches this
        # branch never fires and no training loss is reported.
        if (i + 1) % LOG_INTERVAL == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / LOG_INTERVAL))
            running_loss = 0.0

    # 8. Evaluation
    # NOTE(review): this scores the test split after every epoch while
    # valloader is never used — confirm whether valloader was intended here.
    resnet152.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images = data['input'].to(device)
            labels = data['label'].to(device=device, dtype=torch.float32)
            outputs = resnet152(images)
            # Independent per-class decision: sigmoid probability above 0.5.
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            # Accuracy is counted per (sample, class) entry, not per image.
            total += labels.size(0) * labels.size(1)
            correct += (predicted == labels).sum().item()

    # Guard against an empty test loader so the report cannot divide by zero.
    accuracy = 100 * correct / total if total else 0.0
    print('Accuracy of the network on the test images: %d %%' % (
        accuracy))

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer CUDA but fall back to the CPU, so that re-running this cell does not
# replace a usable device with one that fails on first use on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# Scratch tensor: 4 rows by 2 columns of uniform random values in [0, 1).
x = torch.rand((4, 2))