In [1]:
import sys
print(sys.version)

CURR_DIR = '/content/drive/My Drive/google_colab_work/advanced_deep_learning_by_pytorch/'
CURR_DIR += '3_semantic_segmentation/'
sys.path.append(CURR_DIR)

3.6.9 (default, Jul 17 2020, 12:50:27) 
[GCC 8.4.0]


In [2]:
import os.path
import PIL.Image
import random
import math
import time
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
print('torch.__version__ =', torch.__version__)


from utils.data_augumentation import Compose, Scale, RandomRotation, RandomMirror, Resize, Normalize_Tensor

torch.__version__ = 1.6.0+cu101


In [3]:
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x7f05bc5b6138>

# DataLoader

In [4]:
def MakeDatapathList(root_path):
    img_path_template = os.path.join(root_path, 'JPEGImages', '%s.jpg')
    anno_path_template = os.path.join(root_path, 'SegmentationClass', '%s.png')

    train_id_names = os.path.join(root_path + 'ImageSets/Segmentation/train.txt')
    val_id_names = os.path.join(root_path + 'ImageSets/Segmentation/val.txt')

    train_img_list = []
    train_anno_list = []
    for line in open(train_id_names):
        file_id = line.strip()
        img_path = (img_path_template % file_id)
        anno_path = (anno_path_template % file_id)
        train_img_list.append(img_path)
        train_anno_list.append(anno_path)

    val_img_list = []
    val_anno_list = []
    for line in open(val_id_names):
        file_id = line.strip()
        img_path = (img_path_template % file_id)
        anno_path = (anno_path_template % file_id)
        val_img_list.append(img_path)
        val_anno_list.append(anno_path)

    return train_img_list, train_anno_list, val_img_list, val_anno_list

In [5]:
class DataTransform():
    def __init__(self, input_size, color_mean, color_std):
        self.data_transform = {
            'train': Compose([
                Scale(scale=[0.5, 1.5]),
                RandomRotation(angle=[-10, 10]), # [deg]
                RandomMirror(),
                Resize(input_size),
                Normalize_Tensor(color_mean, color_std) ### PIL(h, w, c) ---> Torch(c, h, w)
            ]),
            'val': Compose([
                Resize(input_size),
                Normalize_Tensor(color_mean, color_std)
            ])
        }


    def __call__(self, phase, img, anno_class_img):
        return self.data_transform[phase](img, anno_class_img)

In [6]:
class VOCDataset(torch.utils.data.Dataset):
    def __init__(self, img_list, anno_list, phase, data_transform):
        assert len(img_list) == len(anno_list)
        self.img_list = img_list
        self.anno_list = anno_list
        self.phase = phase
        self.data_transform = data_transform

    
    def __len__(self):
        return len(self.img_list)


    def __getitem__(self, idx):
        img, anno_class_img = self.RetrieveItem(idx)
        return img, anno_class_img 


    def RetrieveItem(self, idx):
        img_file_path = self.img_list[idx]
        img = PIL.Image.open(img_file_path) ### (h, w, c=RGB)

        anno_file_path = self.anno_list[idx]
        anno_class_img = PIL.Image.open(anno_file_path) ### (h, w)

        img, anno_class_img = self.data_transform(self.phase, img, anno_class_img)
        return img, anno_class_img 

In [7]:
### File path list

root_path = CURR_DIR + 'data/VOCdevkit/VOC2012/'
train_img_list, train_anno_list, val_img_list, val_anno_list = MakeDatapathList(root_path)

print('train_img_list[0] =', train_img_list[0])
print('train_anno_list[0] =', train_anno_list[0])

train_img_list[0] = /content/drive/My Drive/google_colab_work/advanced_deep_learning_by_pytorch/3_semantic_segmentation/data/VOCdevkit/VOC2012/JPEGImages/2007_000032.jpg
train_anno_list[0] = /content/drive/My Drive/google_colab_work/advanced_deep_learning_by_pytorch/3_semantic_segmentation/data/VOCdevkit/VOC2012/SegmentationClass/2007_000032.png


In [8]:
### Dataset

input_size = 475
color_mean = (0.485, 0.456, 0.406)
color_std = (0.229, 0.224, 0.225)

data_transform = DataTransform(input_size, color_mean, color_std)

train_dataset = VOCDataset(train_img_list, train_anno_list, 'train', data_transform)
val_dataset = VOCDataset(val_img_list, val_anno_list, 'val', data_transform)

img, anno_class_img = val_dataset.__getitem__(0)
print('img.size() =', img.size())
print('anno_class_img.size() =', anno_class_img.size())

img.size() = torch.Size([3, 475, 475])
anno_class_img.size() = torch.Size([475, 475])


In [9]:
### Dataset
batch_size = 8

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size, shuffle=False)

dataloaders_dict = {
    'train': train_dataloader,
    'val': val_dataloader,
}

val_batch_iter = iter(dataloaders_dict['val'])
imgs, anno_class_imgs = next(val_batch_iter)
print('imgs.size() =', imgs.size()) ### (n, c, h, w)
print('anno_class_imgs.size() =', anno_class_imgs.size()) ### (n, h, w)

imgs.size() = torch.Size([8, 3, 475, 475])
anno_class_imgs.size() = torch.Size([8, 475, 475])


# Feature module (Encoder)

In [10]:
class ConvBatchNormRelu(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, bias):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)


    def forward(self, x):
        outputs = self.conv(x)
        outputs = self.batchnorm(outputs)
        outputs = self.relu(outputs)

        return outputs

In [11]:
class FeatureMapConv(nn.Module):
    def __init__(self):
        super().__init__()

        in_channels=3; out_channels=64; kernel_size=3; stride=2; padding=1; dilation=1; bias=False
        self.cbnr_1 = ConvBatchNormRelu(in_channels, out_channels, kernel_size, stride, padding, dilation, bias)

        in_channels=64; out_channels=64; kernel_size=3; stride=1; padding=1; dilation=1; bias=False
        self.cbnr_2 = ConvBatchNormRelu(in_channels, out_channels, kernel_size, stride, padding, dilation, bias)

        in_channels=64; out_channels=128; kernel_size=3; stride=1; padding=1; dilation=1; bias=False
        self.cbnr_3 = ConvBatchNormRelu(in_channels, out_channels, kernel_size, stride, padding, dilation, bias)

        kernel_size=3; stride=2; padding=1 
        self.maxpool = nn.MaxPool2d(kernel_size, stride, padding)


    def forward(self, x):
        outputs = self.cbnr_1(x)
        outputs = self.cbnr_2(outputs)
        outputs = self.cbnr_3(outputs)
        outputs = self.maxpool(outputs)

        return outputs

In [12]:
class ConvBatchNorm(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, bias):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
        self.batchnorm = nn.BatchNorm2d(out_channels)


    def forward(self, x):
        outputs = self.conv(x)
        outputs = self.batchnorm(outputs)

        return outputs

In [13]:
class BottleNeckPSP(nn.Module):
    def __init__(self, in_channels, mid_channels, out_channels, stride, dilation):
        assert in_channels == 2*mid_channels
        assert 4*mid_channels == out_channels

        super().__init__()

        self.cbr_1 = ConvBatchNormRelu(in_channels, mid_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)
        self.cbr_2 = ConvBatchNormRelu(mid_channels, mid_channels, kernel_size=3, stride=stride, padding=dilation, dilation=dilation, bias=False)
        self.cb_3 = ConvBatchNorm(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)

        self.cb_residual = ConvBatchNorm(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, dilation=1, bias=False)

        self.relu = nn.ReLU(inplace=True)

    
    def forward(self, x):
        conv = self.cbr_1(x)
        conv = self.cbr_2(conv)
        conv = self.cb_3(conv)

        res = self.cb_residual(x)

        return self.relu(conv + res) ### Residual skip connection

In [14]:
class BottleNeckIdentityPSP(nn.Module):
    def __init__(self, out_channels, mid_channels, dilation):
        assert out_channels == 4*mid_channels

        super().__init__()

        self.cbr_1 = ConvBatchNormRelu(out_channels, mid_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)
        self.cbr_2 = ConvBatchNormRelu(mid_channels, mid_channels, kernel_size=3, stride=1, padding=dilation, dilation=dilation, bias=False)
        self.cb_3 = ConvBatchNorm(mid_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)

        self.relu = nn.ReLU(inplace=True)

    
    def forward(self, x):
        conv = self.cbr_1(x)
        conv = self.cbr_2(conv)
        conv = self.cb_3(conv)

        res = x

        return self.relu(conv + res) ### Residual skip connection

In [15]:
class ResBlockPSP(nn.Sequential):
    def __init__(self, n_blocks, in_channels, mid_channels, out_channels, stride, dilation):
        super().__init__()

        self.add_module(
            'block1',
            BottleNeckPSP(in_channels, mid_channels, out_channels, stride, dilation)
        )

        for i in range(n_blocks - 1):
            self.add_module(
                'block'+str(i+2),
                BottleNeckIdentityPSP(out_channels, mid_channels, dilation)
            )
            # print('[ResBlockPSP::__init__()] block', str(i+2))

# Pyramid Pooling module

In [16]:
class PyramidPooling(nn.Module):
    def __init__(self, in_channels, pool_sizes, height, width):
        super().__init__()
        self.height = height
        self.width = width

        out_channels = int(in_channels / len(pool_sizes))

        self.ada_avg_pool_1 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[0])
        self.cbr_1 = ConvBatchNormRelu(in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)

        self.ada_avg_pool_2 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[1])
        self.cbr_2 = ConvBatchNormRelu(in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)

        self.ada_avg_pool_3 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[2])
        self.cbr_3 = ConvBatchNormRelu(in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)

        self.ada_avg_pool_4 = nn.AdaptiveAvgPool2d(output_size=pool_sizes[3])
        self.cbr_4 = ConvBatchNormRelu(in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=False)


    def forward(self, x):
        out1 = self.ada_avg_pool_1(x)
        out1 = self.cbr_1(out1)
        out1 = F.interpolate(out1, size=(self.height, self.width), mode='bilinear', align_corners=True)

        out2 = self.ada_avg_pool_2(x)
        out2 = self.cbr_2(out2)
        out2 = F.interpolate(out2, size=(self.height, self.width), mode='bilinear', align_corners=True)

        out3 = self.ada_avg_pool_3(x)
        out3 = self.cbr_3(out3)
        out3 = F.interpolate(out3, size=(self.height, self.width), mode='bilinear', align_corners=True)

        out4 = self.ada_avg_pool_4(x)
        out4 = self.cbr_4(out4)
        out4 = F.interpolate(out4, size=(self.height, self.width), mode='bilinear', align_corners=True)

        outputs = torch.cat([x, out1, out2, out3, out4], dim=1)

        return outputs

# Up Sampling module, Auxiliary Loss module (Decoder)

In [17]:
class UpSampling(nn.Module):
    def __init__(self, in_channels, height, width, n_classes):
        super().__init__()

        self.height = height
        self.width = width
        self.mid_channels = 512

        self.cbr = ConvBatchNormRelu(in_channels, out_channels=self.mid_channels, kernel_size=3, stride=1, padding=1, dilation=1, bias=False)
        self.dropout = nn.Dropout2d(p=0.1)
        self.classification = nn.Conv2d(in_channels=self.mid_channels, out_channels=n_classes, kernel_size=1, stride=1, padding=0)

    
    def forward(self, x):
        outputs = self.cbr(x)
        outputs = self.dropout(outputs)
        outputs = self.classification(outputs)
        outputs = F.interpolate(outputs, size=(self.height, self.width), mode='bilinear', align_corners=True)

        return outputs

In [18]:
class AuxLoss(nn.Module):
    def __init__(self, in_channels, height, width, n_classes):
        super().__init__()

        self.height = height
        self.width = width
        self.mid_channels = 256

        self.cbr = ConvBatchNormRelu(in_channels, out_channels=self.mid_channels, kernel_size=3, stride=1, padding=1, dilation=1, bias=False)
        self.dropout = nn.Dropout2d(p=0.1)
        self.classification = nn.Conv2d(in_channels=self.mid_channels, out_channels=n_classes, kernel_size=1, stride=1, padding=0)

    
    def forward(self, x):
        outputs = self.cbr(x)
        outputs = self.dropout(outputs)
        outputs = self.classification(outputs)
        outputs = F.interpolate(outputs, size=(self.height, self.width), mode='bilinear', align_corners=True)

        return outputs

# PSPNet

In [19]:
class PSPNet(nn.Module):
    def __init__(self, n_classes):
        super().__init__()

        img_size = 475
        img_szie_small = 60

        block_nums = [3, 4, 6, 3]
        pool_sizes = [6, 3, 2, 1]

        ### Feature module
        self.feature_conv = FeatureMapConv()
        self.feature_res_1 = ResBlockPSP(n_blocks=block_nums[0], in_channels=128, mid_channels=64, out_channels=256, stride=1, dilation=1)
        self.feature_res_2 = ResBlockPSP(n_blocks=block_nums[1], in_channels=256, mid_channels=128, out_channels=512, stride=2, dilation=1)
        self.feature_dilated_res_1 = ResBlockPSP(n_blocks=block_nums[2], in_channels=512, mid_channels=256, out_channels=1024, stride=1, dilation=2)
        self.feature_dilated_res_2 = ResBlockPSP(n_blocks=block_nums[3], in_channels=1024, mid_channels=512, out_channels=2048, stride=1, dilation=4)

        ### Pyramid Pooling module
        self.pyramid_pooling = PyramidPooling(in_channels=2048, pool_sizes=pool_sizes, height=img_szie_small, width=img_szie_small)

        ### Up Sampling module
        self.decode_feature = UpSampling(in_channels=4096, height=img_size, width=img_size, n_classes=n_classes)

        ### Auxiliary Loss module
        self.aux = AuxLoss(in_channels=1024, height=img_size, width=img_size, n_classes=n_classes)


    def forward(self, x):
        y = self.feature_conv(x)
        y = self.feature_res_1(y)
        y = self.feature_res_2(y)
        y = self.feature_dilated_res_1(y)
        outputs_aux = self.aux(y)
        y = self.feature_dilated_res_2(y)

        y = self.pyramid_pooling(y)

        outputs = self.decode_feature(y)

        return (outputs, outputs_aux)

# Network model

In [20]:
### ADE20K dataset has 150 classes
net = PSPNet(n_classes=150)

### First, load pre-trained parameters by ADE20K dataset
state_dict = torch.load(CURR_DIR+'weights/pspnet50_ADE20K.pth')
net.load_state_dict(state_dict)


def InitWeightFunc(layer):
    if isinstance(layer, nn.Conv2d):
        nn.init.xavier_normal_(layer.weight.data)
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0.0)


### Then, switch the last layer for VOC dataset, whtch has 21 classes
n_classes = 21

net.decode_feature.classification = nn.Conv2d(
    in_channels=net.decode_feature.mid_channels, out_channels=n_classes,
    kernel_size=1, stride=1, padding=0
)
net.decode_feature.classification.apply(InitWeightFunc)

net.aux.classification = nn.Conv2d(
    in_channels=net.aux.mid_channels, out_channels=n_classes,
    kernel_size=1, stride=1, padding=0
)
net.aux.classification.apply(InitWeightFunc)

print(net)

PSPNet(
  (feature_conv): FeatureMapConv(
    (cbnr_1): ConvBatchNormRelu(
      (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (cbnr_2): ConvBatchNormRelu(
      (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (cbnr_3): ConvBatchNormRelu(
      (conv): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (batchnorm): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
    )
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (feature_res_1): ResBlockPSP(
    (block1): BottleNeckPSP(
      (cbr_1)

# Loss function

In [21]:
class PSPLoss(nn.Module):
    def __init__(self, aux_weight):
        super().__init__()
        self.aux_weight = aux_weight


    def forward(self, outputs, outputs_aux, targets):
        loss = F.cross_entropy(outputs, targets, reduction='mean')
        loss_aux = F.cross_entropy(outputs_aux, targets, reduction='mean')

        return loss + self.aux_weight*loss_aux

In [22]:
criterion = PSPLoss(aux_weight=0.4)

# Optimizer

In [23]:
optimizer = torch.optim.SGD([
    { 'params': net.feature_conv.parameters(), 'lr': 0.1e-2 },
    { 'params': net.feature_res_1.parameters(), 'lr': 0.1e-2 },
    { 'params': net.feature_res_2.parameters(), 'lr': 0.1e-2 },
    { 'params': net.feature_dilated_res_1.parameters(), 'lr': 0.1e-2 },
    { 'params': net.feature_dilated_res_2.parameters(), 'lr': 0.1e-2 },
    { 'params': net.pyramid_pooling.parameters(), 'lr': 0.1e-2 },
    { 'params': net.decode_feature.parameters(), 'lr': 1.0e-2 },
    { 'params': net.aux.parameters(), 'lr': 1.0e-2 },
], momentum=0.9, weight_decay=1.0e-4)

MAX_EPOCH = 10

def LambdaEpochFunc(epoch):
    return math.pow((1.0 - epoch/MAX_EPOCH), 0.9)

In [24]:
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=LambdaEpochFunc)

# Train

In [25]:
def TrainModel(net, dataloaders_dict, criterion, scheduler, optimizer):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('device =', device)

    net.to(device)
    torch.backends.cudnn.benchmark =True

    num_train_imgs = len(dataloaders_dict['train'].dataset)
    num_val_imgs = len(dataloaders_dict['val'].dataset)
    batch_size = dataloaders_dict['train'].batch_size

    ### Multiple minibatch
    batch_multiplier = 3

    iter_count = 1
    logs = []

    for epoch in range(MAX_EPOCH):
        epoch_start_time = time.time() 

        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        print('----------------------------------------------')
        print('Epoch: ', epoch+1, '/', MAX_EPOCH)
        print('----------------------------------------------')

        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                scheduler.step()
                optimizer.zero_grad()
            elif phase == 'val':
                if (epoch + 1) % 5 == 0:
                    net.eval() 
                else:
                    continue

            print('phase =', phase)

            ### Multiple minibatch
            batch_count = 0
            for imgs, anno_class_imgs in dataloaders_dict[phase]:
                if imgs.size()[0] == 1:
                    continue

                imgs = imgs.to(device)
                anno_class_imgs = anno_class_imgs.to(device)

                if phase=='train' and batch_count==0:
                    optimizer.step()
                    optimizer.zero_grad()
                    batch_count = batch_multiplier

                with torch.set_grad_enabled(phase=='train'):
                    outputs, outputs_aux = net(imgs)
                    loss = criterion(outputs, outputs_aux, anno_class_imgs.long()) / batch_multiplier

                    if phase == 'train':
                        loss.backward()
                        batch_count -= 1
                        epoch_train_loss += loss.item() * batch_multiplier 
                        iter_count += 1
                    elif phase == 'val':
                        epoch_val_loss += loss.item() * batch_multiplier 
            # end for imgs, anno_class_imgs
        # end for phase

        mean_train_loss = epoch_train_loss / num_train_imgs
        mean_val_loss = epoch_val_loss / num_train_imgs
        epoch_time = time.time() - epoch_start_time

        print('----------------------------------------------')
        print('epoch =', epoch+1, ', mean_train_loss =', mean_train_loss, ', mean_val_loss =', mean_val_loss)
        print('epoch_time =', epoch_time)

        log = {
            'epoch': epoch+1,
            'mean_train_loss': mean_train_loss,
            'mean_val_loss': mean_val_loss,
        }
        logs.append(log)
        pd.DataFrame(logs).to_csv('log_output.csv')
    # end for epoch

    torch.save(net.state_dict(), CURR_DIR+'weights/pspnet50_'+str(epoch+1)+'.pth')

In [26]:
TrainModel(net, dataloaders_dict, criterion, scheduler, optimizer)

device = cuda:0
----------------------------------------------
Epoch:  1 / 10
----------------------------------------------
phase = train




----------------------------------------------
epoch = 1 , mean_train_loss = 0.17455680188951922 , mean_val_loss = 0.0
epoch_time = 2302.1536090373993
----------------------------------------------
Epoch:  2 / 10
----------------------------------------------
phase = train
----------------------------------------------
epoch = 2 , mean_train_loss = 0.09203550879095422 , mean_val_loss = 0.0
epoch_time = 440.65367364883423
----------------------------------------------
Epoch:  3 / 10
----------------------------------------------
phase = train
----------------------------------------------
epoch = 3 , mean_train_loss = 0.08191247568389431 , mean_val_loss = 0.0
epoch_time = 440.6142158508301
----------------------------------------------
Epoch:  4 / 10
----------------------------------------------
phase = train
----------------------------------------------
epoch = 4 , mean_train_loss = 0.07174161059751374 , mean_val_loss = 0.0
epoch_time = 440.62203669548035
----------------------------