In [1]:
import os
import tarfile
import urllib
import urllib.request

import cv2
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf

DeepLab V3+
- https://blog.lunit.io/2018/07/02/deeplab-v3-encoder-decoder-with-atrous-separable-convolution-for-semantic-image-segmentation/
- https://github.com/tensorflow/models/blob/master/research/deeplab/deeplab_demo.ipynb

In [2]:
# Location of the pretrained DeepLab archives and the specific archive used here.
DOWNLOAD_URL_PREFIX = 'http://download.tensorflow.org/models/'
MODEL_ARCHIVE_NAME = 'deeplabv3_mnv2_pascal_train_aug_2018_01_29.tar.gz'

model_dir = 'human_segmentation/models'
tf.io.gfile.makedirs(model_dir)

model_path = os.path.join(model_dir, 'deeplab_model.tar.gz')

# Download only once. The archive is first written to a temporary ".part" file
# and renamed afterwards, so an interrupted download never leaves a truncated
# archive at `model_path` that the existence check would mistake for a good one.
if not os.path.exists(model_path):
    partial_path = model_path + '.part'
    try:
        urllib.request.urlretrieve(DOWNLOAD_URL_PREFIX + MODEL_ARCHIVE_NAME, partial_path)
        os.replace(partial_path, model_path)
    finally:
        # After a successful rename this is a no-op; after a failure it removes
        # the incomplete file.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [3]:
class DeepLabModel(object):
    """Wrapper around a frozen DeepLab inference graph stored in a tar archive.

    The graph is loaded once in ``__init__`` and kept in a TF1-style session
    (``self.sess``) that ``run`` reuses for every image. Call ``close`` when the
    model is no longer needed to release the session.
    """

    INPUT_TENSOR_NAME = 'ImageTensor:0'
    OUTPUT_TENSOR_NAME = 'SemanticPredictions:0'
    INPUT_SIZE = 513  # the longer image side is scaled to this many pixels
    FROZEN_GRAPH_NAME = 'frozen_inference_graph'

    def __init__(self, tarball_path):
        """Load the frozen graph from ``tarball_path`` and open a session on it.

        Args:
            tarball_path: Path to a tar archive containing a member whose base
                name includes ``FROZEN_GRAPH_NAME``.

        Raises:
            RuntimeError: If the archive contains no such member.
        """
        graph_def = None
        # `with` guarantees the archive is closed even if parsing fails.
        with tarfile.open(tarball_path) as tar_file:
            for tar_info in tar_file.getmembers():
                if self.FROZEN_GRAPH_NAME not in os.path.basename(tar_info.name):
                    continue
                file_handle = tar_file.extractfile(tar_info)
                if file_handle is None:
                    # Not a regular file (e.g. a directory entry); keep looking.
                    continue
                graph_def = tf.compat.v1.GraphDef.FromString(file_handle.read())
                break

        if graph_def is None:
            raise RuntimeError(
                'Cannot find inference graph in tar archive: %s' % tarball_path)

        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.compat.v1.import_graph_def(graph_def, name='')

        self.sess = tf.compat.v1.Session(graph=self.graph)

    def close(self):
        """Release the TensorFlow session held by this model."""
        self.sess.close()

    def preprocess(self, img_orig):
        """Resize a BGR image so its longer side is ``INPUT_SIZE`` and convert it to RGB.

        Args:
            img_orig: BGR image array as returned by ``cv2.imread``.

        Returns:
            The resized image in RGB channel order.

        Raises:
            ValueError: If ``img_orig`` is ``None`` (what ``cv2.imread`` returns
                when the file could not be read).
        """
        if img_orig is None:
            raise ValueError('img_orig is None; the image could not be loaded')

        height, width = img_orig.shape[:2]
        resize_ratio = 1.0 * self.INPUT_SIZE / max(width, height)
        # int() truncates, so clamp to 1 to avoid a zero-sized target for
        # extremely elongated images.
        target_size = (max(1, int(resize_ratio * width)),
                       max(1, int(resize_ratio * height)))
        resized_image = cv2.resize(img_orig, target_size)
        return cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB)

    def run(self, image):
        """Run the segmentation graph on a single BGR image.

        Args:
            image: BGR image array as returned by ``cv2.imread``.

        Returns:
            A tuple ``(resized_bgr, seg_map)``: the resized input converted back
            to BGR, and the first (only) element of the batch produced by the
            ``SemanticPredictions:0`` tensor.
        """
        img_input = self.preprocess(image)
        batch_seg_map = self.sess.run(
            self.OUTPUT_TENSOR_NAME,
            feed_dict={self.INPUT_TENSOR_NAME: [img_input]})

        seg_map = batch_seg_map[0]
        return cv2.cvtColor(img_input, cv2.COLOR_RGB2BGR), seg_map

# 1. 인물모드

In [4]:
class ShallowFocus:
    """Shallow-depth-of-field effect: keep one segmented class sharp, blur the rest.

    Attributes:
        model: The ``DeepLabModel`` used for segmentation.
        img: Original BGR image loaded from ``img_path``.
        resize_img: Resized BGR image returned by ``model.run``.
        segmentation_map: Segmentation map with every pixel that is not
            ``label`` set to 0 (pixels of ``label`` keep the label index).
        label_mask: ``uint8`` mask at segmentation resolution, 255 where the
            pixel belongs to ``label`` and 0 elsewhere.
    """

    # Class names in index order; the index is the value compared against the
    # segmentation map.
    LABEL_NAMES = [
        'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tv'
    ]

    def __init__(self, model_path, img_path, label):
        """Load the image, segment it, and keep only the requested class.

        Args:
            model_path: Path to the model tar archive, or an already
                constructed ``DeepLabModel`` to reuse (avoids reloading the
                graph and opening a new session for every image).
            img_path: Path of the image to process.
            label: Name of the class to keep in focus; one of ``LABEL_NAMES``.

        Raises:
            ValueError: If ``label`` is not in ``LABEL_NAMES``.
            OSError: If the image cannot be read.
        """
        # Validate the label before doing any expensive work.
        idx = self._label_index(label)

        if isinstance(model_path, DeepLabModel):
            self.model = model_path
        else:
            self.model = DeepLabModel(model_path)

        self.img = cv2.imread(img_path)
        if self.img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError('Could not read image (missing or undecodable): %s' % img_path)

        self.resize_img, raw_map = self.model.run(self.img)
        self.label_mask = self._binary_mask(raw_map, idx)
        self.segmentation_map = self.extract_label(raw_map, label)

    @classmethod
    def _label_index(cls, label):
        """Return the index of ``label`` in ``LABEL_NAMES`` or raise ``ValueError``."""
        try:
            return cls.LABEL_NAMES.index(label)
        except ValueError:
            raise ValueError(
                'Unknown label %r; expected one of %s' % (label, cls.LABEL_NAMES)) from None

    @staticmethod
    def _binary_mask(segmentation_map, idx):
        """Return a uint8 mask: 255 where ``segmentation_map == idx``, else 0.

        Unlike scaling the filtered map by ``255 / max``, this works when the
        class is absent from the image (all zeros, no division by zero) and
        for index 0.
        """
        return np.where(np.asarray(segmentation_map) == idx, 255, 0).astype(np.uint8)

    def extract_label(self, segmentation_map, label):
        """Zero out every pixel of ``segmentation_map`` that is not ``label``.

        Args:
            segmentation_map: Array of class indices.
            label: Class name; one of ``LABEL_NAMES``.

        Returns:
            Array of the same shape holding the label index where the pixel
            matches and 0 elsewhere. For ``'background'`` (index 0) the result
            is therefore all zeros; use ``label_mask`` for a mask that also
            distinguishes that case.

        Raises:
            ValueError: If ``label`` is not in ``LABEL_NAMES``.
        """
        idx = self._label_index(label)
        return np.where(segmentation_map == idx, idx, 0)

    def make_shallow(self, show_origin=True, show_mask=False, blur_ksize=(20, 20)):
        """Render the image with everything outside the selected class blurred.

        Args:
            show_origin: If true, display the original image first.
            show_mask: If true, also display the blurred background with the
                selected class blacked out.
            blur_ksize: Kernel size passed to ``cv2.blur`` for the background.
        """
        if show_origin:
            plt.imshow(cv2.cvtColor(self.img, cv2.COLOR_BGR2RGB))
            plt.show()

        # Upscale the mask to the original resolution (cv2.resize takes
        # (width, height)), then re-binarize the interpolated edge values.
        img_mask_up = cv2.resize(self.label_mask, self.img.shape[:2][::-1], interpolation=cv2.INTER_LINEAR)
        _, img_mask_up = cv2.threshold(img_mask_up, 128, 255, cv2.THRESH_BINARY)

        img_orig_blur = cv2.blur(self.img, blur_ksize)
        img_mask_color = cv2.cvtColor(img_mask_up, cv2.COLOR_GRAY2BGR)
        img_bg_mask = cv2.bitwise_not(img_mask_color)
        img_bg_blur = cv2.bitwise_and(img_orig_blur, img_bg_mask)

        if show_mask:
            plt.imshow(cv2.cvtColor(img_bg_blur, cv2.COLOR_BGR2RGB))
            plt.show()

        # Sharp pixels where the mask is set, blurred background elsewhere.
        img_concat = np.where(img_mask_color == 255, self.img, img_bg_blur)
        plt.imshow(cv2.cvtColor(img_concat, cv2.COLOR_BGR2RGB))
        plt.show()

In [5]:
# Portrait mode: keep the person sharp and blur the background.
img_path, label = 'human_segmentation/images/my_image.jpg', 'person'
app = ShallowFocus(model_path=model_path, img_path=img_path, label=label)
app.make_shallow()

<img src=https://i.ibb.co/3NFJX0M/2021-10-23-11-31-10.png width=300></img>

In [6]:
# Keep the bus sharp and blur the background.
img_path, label = 'human_segmentation/images/bus.jpg', 'bus'
app = ShallowFocus(model_path=model_path, img_path=img_path, label=label)
app.make_shallow()

<img src=https://i.ibb.co/RpNfS2K/2021-10-23-11-31-19.png width=300></img>


In [7]:
# Keep the dogs sharp and blur the background.
img_path, label = 'human_segmentation/images/dogs.jpg', 'dog'
app = ShallowFocus(model_path=model_path, img_path=img_path, label=label)
app.make_shallow()

<img src=https://i.ibb.co/ryLps73/2021-10-23-11-31-25.png width=300></img>

In [8]:
# Keep the dog sharp and blur the background.
img_path, label = 'human_segmentation/images/dog.jpg', 'dog'
app = ShallowFocus(model_path=model_path, img_path=img_path, label=label)
app.make_shallow()

<img src=https://i.ibb.co/RcqNZSk/2021-10-23-11-31-32.png width=300></img>

    

In [9]:
# Same photo as the next cell, this time keeping the person in focus.
img_path, label = 'human_segmentation/images/bicycle person.jpg', 'person'
app = ShallowFocus(model_path=model_path, img_path=img_path, label=label)
app.make_shallow()

<img src=https://i.ibb.co/gv7BGh2/2021-10-23-11-31-38.png width=300></img>


In [10]:
# Same photo as the previous cell, this time keeping the bicycle in focus.
img_path, label = 'human_segmentation/images/bicycle person.jpg', 'bicycle'
app = ShallowFocus(model_path=model_path, img_path=img_path, label=label)
app.make_shallow()

<img src=https://i.ibb.co/KFjWgSL/2021-10-23-11-31-42.png width=300></img>

# 2. 사진에서 문제점 찾기

<img src=https://i.ibb.co/4tKYg1q/2021-10-23-12-40-09.png></img>

- 첫번째 사진의 경우, 사람을 segmentation하는 사진인데, 발 부분을 탐지하지 못했습니다.
- 두번째 사진은 개를 탐지해야 하는데, 귀 부분을 탐지하지 못했습니다.
- 세번째 사진은 버스를 탐지해야 하는데, 버스 뒤쪽 영역을 탐지하지 못했습니다.

# 3. 해결 방법 제안

## FuseNet: Incorporating Depth into Semantic Segmentation via Fusion-based CNN Architecture
Caner Hazirbas, Lingni Ma, Csaba Domokos, and Daniel Cremers / 382회 인용


https://vision.in.tum.de/_media/spezial/bib/hazirbasma2016fusenet.pdf

## Architecture
<img src=https://i.ibb.co/mzXFvBf/2021-10-26-00-30-28.png></img>

<img src=https://i.ibb.co/CHTjRry/2021-10-26-00-42-33.png></img>

모델은 인코더, 디코더로 구성되어 있으며, 2개의 이미지(RGB image, Depth image)를 input으로 받는 모델입니다.
각 이미지별로 Convolution, Batch Normalization, ReLU 블록(CBR 블록)을 2번 거친 후, 요소합을 수행합니다.
다만 논문에서는 fusion 위치에 따라 다음 2가지 전략을 실험했다고 합니다.
1. 앞서 말했듯이 CBR 블록을 2번 거친 후 요소합을 수행하는 방법 (a. Sparse fusion (SF))
2. CBR 블록을 1번 거친 후 요소합을 하고, CBR 블록을 다시 1번 거친 후 요소합을 하는 방법 (b. Dense fusion (DF))

<img src=https://i.ibb.co/fYbDzSz/2021-10-26-00-52-53.png></img>

테스트 결과, Dense fusion보다는 Sparse fusion이 성능이 더 좋았습니다.  
그리고 Sparse fusion의 경우, fusion layer가 많아질수록 성능이 높아지는 경향을 확인할 수 있습니다.

위 FuseNet과 앞선 DeepLab V3+의 아이디어(Atrous convolution)를 결합하면 효과가 높아질 수 있을 것이라 생각했고,  
이러한 아이디어는 아래 논문에서 확인할 수 있었습니다.

## Improved Multi-modal Network Using Dilated Convolution Pyramid Pooling 
박준영, 호요성 
- https://scienceon.kisti.re.kr/commons/util/originalView.do?cn=CFKO201815540967076&oCn=NPAP12901063&dbt=CFKO&journal=NPRO00379579  
  
NYUDv2 데이터셋을 사용한 실험 결과는 아래와 같고, IoU가 1.8%p 상승함을 확인할 수 있었습니다.

<img src=https://i.ibb.co/QfRy0Lm/2021-10-26-10-44-42.png width=400></img>