In [1]:
import warnings
warnings.filterwarnings('ignore')

import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import albumentations as A
import cv2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import yaml
import os
import random

data_config = open('../data/IndoorObjectsDetection/data.yaml')

data_info = yaml.load(data_config, Loader=yaml.FullLoader)

train_data_path = data_info['train_data_path']
val_data_path = data_info['val_data_path']
test_data_path = data_info['test_data_path']

train_labels_path = data_info['train_labels_path']
val_labels_path = data_info['val_labels_path']
test_labels_path = data_info['test_labels_path']

target_list = data_info['names']
target_dict = dict(zip(range(len(target_list)), target_list))

target_dict

{0: 'door',
 1: 'cabinetDoor',
 2: 'refrigeratorDoor',
 3: 'window',
 4: 'chair',
 5: 'table',
 6: 'cabinet',
 7: 'couch',
 8: 'openedDoor',
 9: 'pole'}

In [3]:
IMAGE_SIZE = 224

resnet18 = torchvision.models.resnet18(pretrained = False)
layers = [m for m in resnet18.children()]

# 마지막 2층인 average pooling & fully connected layer 은 back bone으로 사용하지 않음
test_net = nn.Sequential(*layers[:-2]) 

temp_x = torch.randn(1,3,IMAGE_SIZE,IMAGE_SIZE)
temp_y = test_net(temp_x)


print(type(temp_x))
print(temp_x.shape)
print(temp_y.shape)

<class 'torch.Tensor'>
torch.Size([1, 3, 224, 224])
torch.Size([1, 512, 7, 7])


In [4]:
class YOLOv1_RESNET(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        
        self.num_classes = num_classes
        self.num_bboxes = 2
        self.grid_size = 7
        
        resnet18 = torchvision.models.resnet18(pretrained = False)
        layers = [m for m in resnet18.children()]
        
        self.backbone = nn.Sequential(*layers[:-2])
        
        self.neck = nn.Sequential(
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=1, padding=0, bias=False),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True)
        )
        
        self.head = nn.Sequential(
            nn.Conv2d(in_channels=1024, out_channels=5*self.num_bboxes+self.num_classes, kernel_size=1, padding=0, bias=False),
            nn.AdaptiveAvgPool2d(output_size=(self.grid_size, self.grid_size))
        )
        
    def forward(self, x):
        out = self.backbone(x)
        out = self.neck(out)
        out = self.head(out)
        return out

In [5]:
NUM_CLASSES = len(target_list)
model = YOLOv1_RESNET(num_classes = NUM_CLASSES)

model

YOLOv1_RESNET(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stat

In [6]:
class Detection_dataset():
    def __init__(self, data_dir, phase, transform=None):
        self.data_dir = data_dir
        self.phase = phase
        self.image_files = []
        self.transform = transform
        
        for fn in os.listdir(os.path.join(self.data_dir, phase, 'images')):
            bboxes, class_ids = self.get_label(fn)
                                
            if(fn.endswith("jpg") and bboxes.size != 0 and class_ids.size != 0):
                self.image_files.append(fn)
        
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, index):
        filename, image = self.get_image(index)
        bboxes, class_ids = self.get_label(filename)
        
        if self.transform: 
            transformed_data = self.transform(image=image, bboxes=bboxes, class_ids=class_ids)
            image = transformed_data['image']
            bboxes = np.array(transformed_data['bboxes'])
            class_ids = np.array(transformed_data['class_ids'])
        else:
            #transform 을 하지 않을경우 reshape to (C,W,H)
            image = torch.Tensor(image).permute(2,0,1)
        
        target = np.concatenate((bboxes, class_ids[:, np.newaxis]), axis=1)
        return image, target, filename
    
    def get_image(self, index):
        filename = self.image_files[index]
        image_path = os.path.join(self.data_dir, self.phase, 'images', filename)
        image = cv2.imread(image_path)
        return filename, image
    
    
    def get_label(self, filename):
        image_id = filename.split('.')[0]
        label_file_path = os.path.join(self.data_dir, self.phase, 'labels') + '/' + image_id + '.txt'
        try:
            bbox_df = pd.read_csv(label_file_path, sep=' ', header=None)
            
            bboxes = np.asarray(bbox_df[[1,2,3,4]])
            class_ids = np.asarray(bbox_df[0])
            
        except Exception as e:
            bboxes = np.array([])
            class_ids = np.array([])
            
            
        return bboxes, class_ids

In [7]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

IMAGE_SIZE = 448

"""
    when you use yolo format bbox param
    need to add logic in albumentations/augmentations/bbox_utils.py - check_bbox() method
    to make bbox boundery in [0,1]
    
    -------------------
    bbox=list(bbox)
    
    for i in range(4):
      if (bbox[i]<0) :
        bbox[i]=0
      elif (bbox[i]>1) :
        bbox[i]=1
    
    bbox=tuple(bbox)
    --------------------
"""

#mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225) -> imageNet 데이터셋에 기반한 계산된 수치 
transform = A.Compose([
        A.Resize(height=IMAGE_SIZE, width=IMAGE_SIZE),
        A.Normalize(mean=(0.485, 0.456, 0.406),std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(format='yolo', label_fields=['class_ids']),
)

transform

Compose([
  Resize(p=1.0, height=448, width=448, interpolation=1),
  Normalize(p=1.0, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), max_pixel_value=255.0, normalization='standard'),
  ToTensorV2(p=1.0, transpose_mask=False),
], p=1.0, bbox_params={'format': 'yolo', 'label_fields': ['class_ids'], 'min_area': 0.0, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False}, keypoint_params=None, additional_targets={}, is_check_shapes=True)

In [8]:
from torchvision.utils import make_grid
from my_util import set_bounding_boxes, set_bounding_box, get_random_color_dict
from ipywidgets import interact

transformed_train_dataset = Detection_dataset(data_dir='../data/IndoorObjectsDetection', phase="train", transform=transform)

@interact(index=(0, len(transformed_train_dataset)-1))
def show_transformed_image(index=0):
    img, target, filename = transformed_train_dataset[index]
    
    np_image = make_grid(img, normalize=True).permute(1,2,0).numpy()
    np_image_unit8 = (np_image*255).astype(np.uint8)
    
    res = set_bounding_boxes(np_image_unit8, target[:,0:4], 'yolo', target[:,4].astype(int), target_dict, get_random_color_dict(target_dict))
    plt.imshow(res)

interactive(children=(IntSlider(value=0, description='index', max=861), Output()), _dom_classes=('widget-inter…

In [9]:
from torch.utils.data import DataLoader

BATCH_SIZE = 10
data_dir = '../data/IndoorObjectsDetection'

def collate_fn(batch):
    image_list = []
    target_list = []
    filename_list = []
    
    for a,b,c in batch:
        image_list.append(a)
        target_list.append(b)
        filename_list.append(c)
        
    return torch.stack(image_list, dim=0), target_list, filename_list

def train_valid_dataloader(data_dir, batch_size=4, transform=None):
    dataloaders = {}
    
    train_dataset = Detection_dataset(data_dir=data_dir, phase="train", transform=transform)
    dataloaders["train"] = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    
    valid_dataset = Detection_dataset(data_dir=data_dir, phase="valid", transform=transform)    
    dataloaders["val"] = DataLoader(valid_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)
    
    return dataloaders

dataloaders = train_valid_dataloader(data_dir, BATCH_SIZE, transform)

In [10]:
t1 = torch.tensor([[1,2],[3,4]])
t2 = torch.tensor([[5,6],[7,8]])

print(torch.cat((t1,t2),dim=0))
print(torch.cat((t1,t2),dim=1))

print(torch.stack([t1,t2],dim=0))
print(torch.stack([t1,t2],dim=1))


tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])
tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])
tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])
tensor([[[1, 2],
         [5, 6]],

        [[3, 4],
         [7, 8]]])


In [11]:
'''
pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org torchinfo
'''

'\npip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org torchinfo\n'

In [12]:
class BasicConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding=(kernel_size-1)//2, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.LeakyReLU(0.1, inplace=True)
        )

    def forward(self, x):
        return self.conv(x)

In [13]:
#Backbone
class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()

        self.residual = nn.Sequential(
            BasicConvBlock(channels, channels//2, kernel_size=1, stride=1),
            BasicConvBlock(channels//2, channels, kernel_size=3, stride=1)
        )

    def forward(self, x):
        return self.residual(x) + x

class DarkNet53(nn.Module):
    def __init__(self):
        super().__init__()

        self.conv1 = BasicConvBlock(3, 32, 3, 1)
        self.block1 = nn.Sequential(
            BasicConvBlock(32, 64, 3, 2),
            ResidualBlock(64)
        )
        self.block2 = nn.Sequential(
            BasicConvBlock(64, 128, 3, 2),
            nn.Sequential(*[ResidualBlock(128) for _ in range(2)])
        )
        self.block3 = nn.Sequential(
            BasicConvBlock(128, 256, 3, 2),
            nn.Sequential(*[ResidualBlock(256) for _ in range(8)])
        )
        self.block4 = nn.Sequential(
            BasicConvBlock(256, 512, 3, 2),
            nn.Sequential(*[ResidualBlock(512) for _ in range(8)])
        )
        self.block5 = nn.Sequential(
            BasicConvBlock(512, 1024, 3, 2),
            nn.Sequential(*[ResidualBlock(1024) for _ in range(4)])
        )
        
    def forward(self, x):
        x = self.conv1(x)
        x = self.block1(x)
        x = self.block2(x)
        feature_map1 = self.block3(x)
        feature_map2 = self.block4(feature_map1)
        feature_map3 = self.block5(feature_map2)

        return feature_map1, feature_map2, feature_map3

In [14]:
#Neck : FPN top-down
class FPN_featureBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.conv = nn.Sequential(
            BasicConvBlock(in_channels, out_channels, 1),
            BasicConvBlock(out_channels, out_channels*2, 3),
            BasicConvBlock(out_channels*2, out_channels, 1),
            BasicConvBlock(out_channels, out_channels*2, 3),
            BasicConvBlock(out_channels*2, out_channels, 1)
        )

    def forward(self, x):
        return self.conv(x)
        
class UpSampling(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.upsample = nn.Sequential(
            BasicConvBlock(in_channels, out_channels, 1),
            nn.Upsample(scale_factor = 2)
        )
    
    def forward(self, x):
        return self.upsample(x)

In [15]:
#Head
class DetectionLayer(nn.Module):
    def __init__(self, in_channels, num_classes):
        super().__init__()

        self.pred = nn.Sequential(
            BasicConvBlock(in_channels, in_channels*2, 3),
            nn.Conv2d(in_channels*2, (num_classes+5)*3, 1)
        )
        self.num_classes = num_classes

    def forward(self, x):
        output = self.pred(x)
        output = output.view(x.size(0), 3, self.num_classes+5, x.size(2), x.size(3))
        output = output.permute(0, 1, 3, 4, 2)

        return output

In [16]:
class Yolov3(nn.Module):
    def __init__(self, num_classes):
        super().__init__()

        self.num_classes = num_classes

        self.darknet53 = DarkNet53()

        self.fpn_feature_block1 = FPN_featureBlock(1024, 512)
        self.detectionlayer1 = DetectionLayer(512, num_classes)
        self.upsampling1 = UpSampling(512, 256)

        self.fpn_feature_block2 = FPN_featureBlock(512+256, 256)
        self.detectionlayer2 = DetectionLayer(256, num_classes)
        self.upsampling2 = UpSampling(256, 128)
        
        self.fpn_feature_block3 = FPN_featureBlock(256+128, 128)
        self.detectionlayer3 = DetectionLayer(128, num_classes)

    def forward(self, x):
        
        self.feature1, self.feature2, self.feature3 = self.darknet53(x)
        
        x = self.fpn_feature_block1(self.feature3)
        output1 = self.detectionlayer1(x)
        x = self.upsampling1(x)

        x = self.fpn_feature_block2(torch.cat([x, self.feature2], dim=1))
        output2 = self.detectionlayer2(x)
        x = self.upsampling2(x)

        x = self.fpn_feature_block3(torch.cat([x, self.feature1], dim=1))
        output3 = self.detectionlayer3(x)

        return output1, output2, output3

In [17]:
x = torch.randn((1, 3, 416, 416))
model = Yolov3(num_classes = 20)
out = model(x)
print(out[0].shape) # torch.Size([1, 3, 13, 13, 25])
print(out[1].shape) # torch.Size([1, 3, 26, 26, 25])
print(out[2].shape) # torch.Size([1, 3, 52, 52, 25]) 

torch.Size([1, 3, 13, 13, 25])
torch.Size([1, 3, 26, 26, 25])
torch.Size([1, 3, 52, 52, 25])


In [18]:
from torchinfo import summary

summary(model, input_size = (1, 3, 416, 416), device = "cpu")

Layer (type:depth-idx)                                       Output Shape              Param #
Yolov3                                                       [2, 3, 13, 13, 25]        --
├─DarkNet53: 1-1                                             [2, 256, 52, 52]          --
│    └─BasicConvBlock: 2-1                                   [2, 32, 416, 416]         --
│    │    └─Sequential: 3-1                                  [2, 32, 416, 416]         928
│    └─Sequential: 2-2                                       [2, 64, 208, 208]         --
│    │    └─BasicConvBlock: 3-2                              [2, 64, 208, 208]         18,560
│    │    └─ResidualBlock: 3-3                               [2, 64, 208, 208]         20,672
│    └─Sequential: 2-3                                       [2, 128, 104, 104]        --
│    │    └─BasicConvBlock: 3-4                              [2, 128, 104, 104]        73,984
│    │    └─Sequential: 3-5                                  [2, 128, 104, 104]   