In [1]:
%%capture
!wget https://thor.robots.ox.ac.uk/~vgg/data/pets/images.tar.gz
!tar -xzvf images.tar.gz

!wget https://thor.robots.ox.ac.uk/~vgg/data/pets/annotations.tar.gz
!tar -xzvf annotations.tar.gz

## Data load

In [2]:
from torch.utils.data import Dataset
import xml.etree.ElementTree as ET
import os
import collections
from PIL import Image
import numpy as np

class PetDataset(Dataset):

	def __init__(self, transform):
		self.class_idx = {}
		self.idx_class = {}
		self.transform = transform
		self.data = self.load_data()		

  	# 총 데이터의 개수를 리턴
	def __len__(self): 
		return len(self.data)

  	# 인덱스를 입력받아 그에 맵핑되는 입출력 데이터를 파이토치의 Tensor 형태로 리턴
	def __getitem__(self, idx): 
		imgpath, bbox, label = self.data[idx]
		img = Image.open(imgpath).convert('RGB')
		if self.transform:
			img = self.transform(img)
		sample = (img, bbox, label)
		return sample

	def parse_voc_xml(self, node): # xml 파일을 dictionary로 반환
		voc_dict = {}
		children = list(node)
		if children:
			def_dic: Dict[str, Any] = collections.defaultdict(list)
			for dc in map(self.parse_voc_xml, children):
				for ind, v in dc.items():
					def_dic[ind].append(v)
			if node.tag == "annotation":
				def_dic["object"] = [def_dic["object"]]
			voc_dict = {node.tag: {ind: v[0] if len(v) == 1 else v for ind, v in def_dic.items()}}
		if node.text:
			text = node.text.strip()
			if not children:
				voc_dict[node.tag] = text
		return voc_dict
	
	def load_data(self):
		xmlpath = '/content/annotations/xmls'
		imgpath = '/content/images'
		data = []
		for xmlfile in os.listdir(xmlpath):
			node = ET.parse(os.path.join(xmlpath, xmlfile)).getroot()
			xmls = self.parse_voc_xml(node)
			imgfile = os.path.join(imgpath, xmls['annotation']['filename'])
			dogobject = xmls['annotation']['object'][0]
			bbox = np.array([int(dogobject['bndbox']['xmin']), int(dogobject['bndbox']['ymin']), int(dogobject['bndbox']['xmax']), int(dogobject['bndbox']['ymax'])], dtype=np.float32)
			class_name = xmls['annotation']['filename'].split('_')[0]
			if class_name not in self.class_idx:
				classidx = len(class_name)
				self.class_idx[class_name] = classidx
				self.idx_class[classidx] = class_name
			else:
				classidx = self.class_idx[class_name]
			data.append((imgfile, bbox, classidx))
		return data

if __name__=="__main__":
	import torch
	from torchvision import transforms
	transform = transforms.Compose([transforms.ToTensor(), 
							transforms.Resize((224, 224)),
							transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
							])
	dataset = PetDataset(transform=transform)
	train_set , val_set = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])
	
	train_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
	val_loader = torch.utils.data.DataLoader(dataset,batch_size=32, shuffle=False) 

	bbox = (next(iter(train_loader)))[1]
	print(bbox)

tensor([[ 87.,  39., 319., 356.],
        [214., 148., 300., 243.]])


## Model

In [3]:
# import the necessary packages
from torch.nn import Dropout
from torch.nn import Identity
from torch.nn import Linear
from torch.nn import Module
from torch.nn import ReLU
from torch.nn import Sequential
from torch.nn import Sigmoid

class ObjectDetector(Module):
	def __init__(self, baseModel, numClasses):
		super(ObjectDetector, self).__init__()
		# initialize the base model and the number of classes
		self.baseModel = baseModel
		self.numClasses = numClasses
	    # build the regressor head for outputting the bounding box
		# coordinates
		self.regressor = Sequential(
			Linear(baseModel.fc.in_features, 128),
			ReLU(),
			Linear(128, 64),
			ReLU(),
			Linear(64, 32),
			ReLU(),
			Linear(32, 4),
			Sigmoid()
		)
	    # build the classifier head to predict the class labels
		self.classifier = Sequential(
			Linear(baseModel.fc.in_features, 512),
			ReLU(),
			Dropout(),
			Linear(512, 512),
			ReLU(),
			Dropout(),
			Linear(512, self.numClasses)
		)
		# set the classifier of our base model to produce outputs
		# from the last convolution block
		self.baseModel.fc = Identity()
  
	def forward(self, x):
		# pass the inputs through the base model and then obtain
		# predictions from two different branches of the network
		features = self.baseModel(x)
		bboxes = self.regressor(features)
		classLogits = self.classifier(features)
		# return the outputs as a tuple
		return (bboxes, classLogits)

## Train

In [4]:
import torch
from torchvision import transforms
# from dataload import PetDataset
# from model import ObjectDetector
import tqdm
from torchvision.models import resnet50
from torch.nn import CrossEntropyLoss
from torch.nn import MSELoss, L1Loss
from torch.optim import Adam
import numpy as np

def train_one_epoch(train_loader, detector):
	# set the model in training mode
	detector.train()
	# initialize the total training and validation loss
	totalTrainLoss = 0
	# initialize the number of correct predictions in the training
	# and validation step
	trainCorrect = 0

	classLossFunc = CrossEntropyLoss()
	bboxLossFunc = L1Loss()
	opt = Adam(detector.parameters(), lr=1e-4)

	for imgs, bboxes, labels in train_loader:
		imgs = imgs.to("cuda")
		bboxes = bboxes.to("cuda")
		labels = labels.to("cuda")

		# perform a forward pass and calculate the training loss
		predictions = detector(imgs)

		bboxLoss = bboxLossFunc(predictions[0], bboxes)
		classLoss = classLossFunc(predictions[1], labels)					
		totalLoss = (0.1 * bboxLoss) + (0.9 * classLoss)

		# zero out the gradients, perform the backpropagation step,
		# and update the weights
		opt.zero_grad()
		totalLoss.backward()
		opt.step()

		# add the loss to the total training loss so far and
		# calculate the number of correct predictions
		totalTrainLoss += totalLoss
		trainCorrect += (predictions[1].argmax(1) == labels).type(
			torch.float).sum().item()

	return totalTrainLoss, trainCorrect

def eval(val_loader, detector):
	classLossFunc = CrossEntropyLoss()
	bboxLossFunc = L1Loss()

	# switch off autograd
	with torch.no_grad():
		detector.eval()
		# initialize the total training and validation loss
		totalValLoss = 0
		# initialize the number of correct predictions in the training
		# and validation step
		valCorrect = 0

		for imgs, bboxes, labels in val_loader:        
			imgs = imgs.to("cuda")
			bboxes = bboxes.to("cuda")
			labels = labels.to("cuda")

			# perform a forward pass and calculate the training loss
			predictions = detector(imgs)
			bboxLoss = bboxLossFunc(predictions[0], bboxes)
			classLoss = classLossFunc(predictions[1], labels)
			totalLoss = (0.1 * bboxLoss) + (0.9 * classLoss)

			# add the loss to the total training loss so far and
			# calculate the number of correct predictions
			totalValLoss += totalLoss
			valCorrect += (predictions[1].argmax(1) == labels).type(
			torch.float).sum().item()
		return totalValLoss, valCorrect

def train(epoch):
	transform = transforms.Compose([transforms.ToTensor(), 
							transforms.Resize((224, 224)),
							transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
							])
	dataset = PetDataset(transform=transform)
	train_set , val_set = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])
	
	train_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)
	val_loader = torch.utils.data.DataLoader(dataset,batch_size=32, shuffle=False) 


	resnet = resnet50(pretrained=True)

	# freeze all ResNet50 layers so they will *not* be updated during the
	# training process
	for param in resnet.parameters():
		param.requires_grad = False

	num_classes = len(dataset.class_idx)
	objectDetector = ObjectDetector(resnet, num_classes)
	objectDetector = objectDetector.to("cuda")
	print(objectDetector)


	print("[INFO] training the network...")
	for e in tqdm.tqdm(range(epoch)):
		totalTrainLoss, trainCorrect = train_one_epoch(train_loader, objectDetector)
		print("Train loss", float(totalTrainLoss), "Correct", trainCorrect/len(train_loader))
		totalValLoss, valCorrect = eval(val_loader, objectDetector)
		print("Val loss", float(totalValLoss), "Correct", valCorrect/len(val_loader))

if __name__=="__main__":
	train(3)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

ObjectDetector(
  (baseModel): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
 

  0%|          | 0/3 [00:00<?, ?it/s]

Train loss 4496.38232421875 Correct 4.822510822510822


 33%|███▎      | 1/3 [01:10<02:20, 70.36s/it]

Val loss 2193.388427734375 Correct 16.017241379310345
Train loss 4347.28369140625 Correct 8.796536796536797


 67%|██████▋   | 2/3 [02:14<01:06, 66.79s/it]

Val loss 2136.343994140625 Correct 22.370689655172413
Train loss 4279.10986328125 Correct 10.38961038961039


100%|██████████| 3/3 [03:18<00:00, 66.30s/it]

Val loss 2109.836669921875 Correct 25.70689655172414



