# Semantic image segmentation using fully convolutional networks

> Michal Gallus (s172679) Julien Hoareau (s161088) Wazir Sahebali (s172062)

This notebook will walk you through the steps to be taken for producing the eventual evaluation results. Below, the test set of the [CamVid dataset](https://github.com/alexgkendall/SegNet-Tutorial/tree/master/CamVid) will be fed to our implementation of the 56-layer Fully Convolutional DenseNet.

Our notebook on human segmentation within the ADE20K dataset can be found [here](./ADE%20Notebook.ipynb).

Below you can specify the number of test images you want to pass through the model. The maximum is 233, as the test set contains only that many images.

In [None]:
# Number of test images to evaluate; it is also used as the batch size further down.
n_images = 3 # total 233

## Initialization
We import the main libraries along with the 56 layer version of the network and its parameters. The input images all have a size of 360 by 480 pixels and will be randomly cropped to a 224 by 224 resolution. In this application the network has 12 classes (if the void class is included). In the next cell we also specify those classes along with their respective RGB colour code.

In [None]:
import os
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from network import net
import json
from data_utils import *
from colorise_camvid import colorize, legend, _mask_labels
from iou_calculation import intersection_over_union
slim = tf.contrib.slim

# Network specifications: all requested test images are evaluated as one batch
batch_size = n_images
height, width = 360, 480
nchannels = 3
final_resized = 224
model_version = 56

# Mapping of the classes onto colours and strings (list index == class id)
_cmap = [
    (128, 128, 128),
    (128, 0, 0),
    (192, 192, 128),
    (128, 64, 128),
    (0, 0, 192),
    (128, 128, 0),
    (192, 128, 128),
    (64, 64, 128),
    (64, 0, 128),
    (64, 64, 0),
    (0, 128, 192),
    (0, 0, 0),
]
_mask_labels = {
    0: 'sky',
    1: 'building',
    2: 'column_pole',
    3: 'road',
    4: 'sidewalk',
    5: 'tree',
    6: 'sign',
    7: 'fence',
    8: 'car',
    9: 'pedestrian',
    10: 'byciclist',
    11: 'void',
}

# Model parameters: pick the entry of the JSON file that belongs to this model version
with open('model_parameters.json') as params:
    params_dict = json.load(params)[repr(model_version)]

# Settings that are fixed for this application and therefore not read from the JSON file
params_dict.update(input_num_features=48, output_classes=12)

## Dataloading
The TensorFlow record file is already included in the GitHub folder, so there is no need to recreate it. Nevertheless, the code for recreating it is given below as well. If you want to recreate the record file, don't forget to adapt the file path of the CamVid folder (the variable `camvid_path`) to your situation.

In [None]:
#TF RECORD creation code
# Root folder of the CamVid dataset; the split files (test.txt, train.txt, val.txt) are looked up inside it.
camvid_path = "../../CamVid"  # Path to the CamVid dataset
# Separator inserted between path components when the dataset paths are assembled.
path_separator = "/"  

from PIL import Image
import numpy as np
import skimage.io as io
import os
import random

"""Based on: http://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/
   and https://kwotsin.github.io/tech/2017/01/29/tfrecords.html"""

# Helpers that wrap raw bytes and integers in the tf.train.Feature format stored in TFRecord files
def _bytes_feature(value):
    """Wrap a single bytes `value` in a `tf.train.Feature`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def _int64_feature(value):
    """Wrap a single integer `value` in a `tf.train.Feature`."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

# Reader class handed to the slim dataset when the record file is decoded
reader = tf.TFRecordReader

# Every line of the split files couples an image path with its annotation path,
# so these text files have to be parsed to obtain the pairs.
original_images = []
test_paths = path_separator.join([camvid_path, "test.txt"])
train_paths = path_separator.join([camvid_path, "train.txt"])
valid_paths = path_separator.join([camvid_path, "val.txt"])

def parsepaths(location):
    """Parse a split file into (image path, annotation path, file name) triples.

    Every usable line of the file holds an image path and an annotation path
    separated by a single space.

    Args:
        location: Path of the split file, e.g. `test_paths`.

    Returns:
        A `zip` of `(image_path, annotation_path, file_name)` where both paths
        were translated by `path_replace` and `file_name` is the bare file
        name of the image, whichever split (train/test/val) it comes from.
    """
    image_paths = []
    annotation_paths = []
    file_names = []
    with open(location) as f:
        data = f.read()
    for line in data.split("\n"):
        # Skip lines too short to hold even the typical file name of one sample image
        if len(line) <= 17:
            continue
        fields = line.split(" ")
        image_entry, annotation_entry = fields[0], fields[1]
        image_paths.append(path_replace(image_entry))
        annotation_paths.append(path_replace(annotation_entry))
        # basename strips the directory for every split, not only for ".../train/"
        file_names.append(os.path.basename(image_entry))
    return zip(image_paths, annotation_paths, file_names)

def path_replace(path):
    """Translate a path taken from a split file into a path below `camvid_path`.

    The "/SegNet/CamVid/" prefix becomes "../", every "/" becomes
    `path_separator`, and finally every ".." becomes `camvid_path`.
    """
    relative = path.replace("/SegNet/CamVid/", "../")
    with_separator = relative.replace("/", path_separator)
    return with_separator.replace("..", camvid_path)

# This is the main function which saves the images of the selected dataset to a tfrec format
def tfrec_dump(dataset_paths, save_path):  # Either test_paths, train_paths or valid_paths
    """Write every image/annotation pair of a split into one TFRecord file.

    Args:
        dataset_paths: Path of the split file listing the pairs
            (`test_paths`, `train_paths` or `valid_paths`).
        save_path: Destination of the TFRecord file.

    Side effects:
        Appends the raw `(image, annotation)` bytes of every pair to the
        module-level `original_images` list.
    """
    filename_pairs = parsepaths(dataset_paths)
    writer = tf.python_io.TFRecordWriter(save_path)
    try:
        for img_path, annotation_path, file_name in filename_pairs:
            # Read the still-encoded files; the context managers close the handles again
            with tf.gfile.FastGFile(img_path, 'rb') as img_file:
                img = img_file.read()
            with tf.gfile.FastGFile(annotation_path, 'rb') as annotation_file:
                annotation = annotation_file.read()
            # PIL reports the size as (width, height) without decoding the pixel data
            with Image.open(img_path) as pil_image:
                width, height = pil_image.size
            original_images.append((img, annotation))

            # Because the image is stored 1D we need to keep track of the image width and height
            example = tf.train.Example(features=tf.train.Features(feature={
                'image/format': _bytes_feature(file_name[-3:].encode('ascii')),
                'image/height': _int64_feature(height),
                'image/width': _int64_feature(width),
                'file_name': _bytes_feature(file_name.encode('ascii')),
                'image/encoded': _bytes_feature(img),
                # We assume here that the other features of the annotation image are the same as for the photo image
                'annotation/encoded': _bytes_feature(annotation)}))
            writer.write(example.SerializeToString())
    finally:
        # Close the record file even when one of the images could not be processed
        writer.close()
    
# The following function defines how to decode the dataset file
def slim_dataset(tfrec_location, num_samples):
    """Build a `slim.dataset.Dataset` that decodes the TFRecord file.

    Args:
        tfrec_location: Path of the TFRecord file (passed on as `data_sources`).
        num_samples: Number of samples declared for the dataset.

    Returns:
        A `slim.dataset.Dataset` providing the items 'image' and 'annotation'.
    """
    # Raw features stored per example and the defaults used when one is absent
    feature_spec = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'annotation/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='png'),
    }

    # The annotation is decoded as a single-channel image and shares the format key of the photo
    image_handler = slim.tfexample_decoder.Image()
    annotation_handler = slim.tfexample_decoder.Image(
        image_key='annotation/encoded', format_key='image/format', channels=1)
    handlers = {
        'image': image_handler,
        'annotation': annotation_handler,
    }

    descriptions = {
        'image': 'A 3-channel RGB coloured street image.',
        'annotation': 'A 1-channel image where everything is annotated.'
    }

    return slim.dataset.Dataset(
        data_sources=tfrec_location,
        decoder=slim.tfexample_decoder.TFExampleDecoder(feature_spec, handlers),
        reader=reader,
        num_readers=4,
        num_samples=num_samples,
        items_to_descriptions=descriptions)


Then we can read in the dataset.

In [None]:
datasetfilename = "testset.tfrec"

# The record file is only generated when it is not on disk yet
if not os.path.isfile(datasetfilename):
    tfrec_dump(test_paths, datasetfilename)

tfsdataset = slim_dataset(datasetfilename, n_images)

In order for the network to load the images, it has to perform several operations on them. The image has to be randomly cropped and randomly flipped. In order to perform this in the same way for the image and the annotation the annotation is concatenated as an extra feature map to the image. Then these data augmentation methods are performed and the image is separated again from its annotation.  

In [None]:
def random_flip_crop_image_and_labels(image, labels, feature_maps_image, feature_maps_annot, height, width):
    """Randomly crop and horizontally flip `image` together with `labels`.

    Both tensors are concatenated along the last axis so that exactly the same
    crop and flip are applied to them, and they are separated again afterwards.
    Based on <https://stackoverflow.com/questions/42147427/tensorflow-how-to-randomly-crop-input-images-and-labels-in-the-same-way>

    Args:
        image: Image tensor whose last axis holds `feature_maps_image` maps.
        labels: Annotation tensor whose last axis holds `feature_maps_annot`
            maps; all other dimensions must match those of `image`.
        feature_maps_image: Number of feature maps (channels) of `image`.
        feature_maps_annot: Number of feature maps (channels) of `labels`.
        height: Height of the crop.
        width: Width of the crop.

    Returns:
        A tuple `(cropped_image, cropped_labels)`.
    """
    # randint needs integer bounds: a float bound such as 1e10 is rejected by Python 3.12+
    seed = random.randint(0, 10 ** 10)
    total_feature_maps = feature_maps_image + feature_maps_annot
    combined = tf.concat([image, labels], axis=-1)

    combined_crop = tf.random_crop(
        combined,
        size=[height, width, total_feature_maps],
        seed=seed)
    # The reshape pins the static shape of the crop
    combined_crop = tf.reshape(combined_crop, shape=(height, width, total_feature_maps))
    maybe_flipped_images = tf.image.random_flip_left_right(combined_crop)

    # Split the combined tensor back into its image and annotation feature maps
    crop_feature_maps = tf.unstack(maybe_flipped_images, axis=-1)
    cropped_image = tf.stack(crop_feature_maps[:feature_maps_image], axis=-1)
    cropped_labels = tf.stack(crop_feature_maps[feature_maps_image:], axis=-1)
    return cropped_image, cropped_labels

# Convert image and annotation to float, augment them together and add a leading batch axis
def imagepreprocessor(image, annot, height, width, scope=None):
    """Prepare one image/annotation pair for the network.

    Args:
        image: Image tensor; converted to float32 when it has another dtype.
        annot: Annotation tensor; converted to float32 for the augmentation.
        height: Height of the random crop.
        width: Width of the random crop.
        scope: Optional name scope; "crop" is used as the default name.

    Returns:
        A tuple `(image, annotation)`, each with an extra leading axis of
        size 1; the annotation is converted back to uint8.
    """
    with tf.name_scope(scope, "crop", [image, annot, height, width]):
        float_image = image
        if float_image.dtype != tf.float32:
            float_image = tf.image.convert_image_dtype(float_image, dtype=tf.float32)
        float_annot = annot
        if float_annot.dtype != tf.float32:
            float_annot = tf.image.convert_image_dtype(float_annot, dtype=tf.float32)
        # 3 image channels and 1 annotation channel are cropped and flipped jointly
        cropped_image, cropped_annot = random_flip_crop_image_and_labels(
            float_image, float_annot,
            feature_maps_image=3,
            feature_maps_annot=1,
            height=height,
            width=width)
    batched_image = tf.expand_dims(cropped_image, 0)
    uint8_annot = tf.image.convert_image_dtype(cropped_annot, dtype=tf.uint8)
    batched_annot = tf.expand_dims(uint8_annot, 0)
    return batched_image, batched_annot

The following function brings together the data loading and data preprocessing, so that the tensors returned by this function can be directly fed into the network.

In [None]:
# Load a batch
def batch(dataset, batch_size=3, height=360, width=480, resized=224):  # Resize to a multiple of 32
    """Create the tensors that deliver one preprocessed batch of the dataset.

    Args:
        dataset: The `slim.dataset.Dataset` to read from.
        batch_size: Number of samples per batch.
        height: Accepted for the callers' sake; not used in the body.
        width: Accepted for the callers' sake; not used in the body.
        resized: Side length of the square crop taken from every sample.

    Returns:
        A tuple `(images, annotations)` of batched tensors.
    """
    crop_height = crop_width = resized

    # The provider reads the samples from the dataset through a common queue
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        common_queue_capacity=24 + 3 * batch_size,
        common_queue_min=24)
    raw_image, raw_annotation = provider.get(['image', 'annotation'])

    # Crop and flip the photo and its annotation in the same way
    image, annotation = imagepreprocessor(
        image=raw_image,
        annot=raw_annotation,
        height=crop_height,
        width=crop_width)

    # Group the single samples into batches
    images, annotations = tf.train.batch(
        [image, annotation],
        batch_size=batch_size,
        num_threads=4,
        capacity=4 * batch_size,
        allow_smaller_final_batch=True)
    return images, annotations

## The Network
The network contains the following building blocks:
* Dense blocks
    - Skipped
    - Non-skipped
* Transition layers
    - Transition up
    - Transition down
 

First we define the hyperparameters along with the batch normalization and the creation of a standard layer.

In [None]:
# L2 regularizer passed as weights_regularizer to the convolution layers below
l2_reg = slim.l2_regularizer(0.0001)
# Regularizers for the batch-norm parameters; only referenced by the commented-out slim.batch_norm call in transition_down
regularizers = {"beta" : slim.l2_regularizer(0.0001), "gamma": slim.l2_regularizer(0.0001)}
# He-uniform initializer passed as weights_initializer to the convolution layers below
weight_initializer = tf.contrib.keras.initializers.he_uniform()

def batch_wise_batch_norm(x, scope):
    """Normalize `x` with the statistics of the current batch and apply a ReLU.

    Mean and variance are computed over axes 0, 1 and 2, so one value remains
    per entry of the last axis. No learnable parameters are involved.

    Args:
        x: Input tensor.
        scope: Name scope in which the operations are created.

    Returns:
        The normalized and rectified tensor.
    """
    with tf.name_scope(scope):
        batch_mean, batch_var = tf.nn.moments(x, axes=[0, 1, 2])
        centered = tf.subtract(x, batch_mean)
        # The small constant is added to the standard deviation to avoid dividing by zero
        batch_std = tf.sqrt(batch_var)
        normalized = tf.div(centered, batch_std + 1e-6)
        return tf.nn.relu(normalized, "BatchNormRelu")
def create_layer(input, num_features, scope, kernel_size=3, p=0.2, is_training=True):
    """Create a standard layer: batch norm with ReLU, convolution, dropout.

    Args:
        input: Input tensor of the layer.
        num_features: Number of feature maps produced by the convolution.
        scope: Scope prefix for the three sub-operations.
        kernel_size: Kernel size of the convolution.
        p: Dropout probability; the keep probability is `1 - p`.
        is_training: Forwarded to the dropout layer.

    Returns:
        The output tensor of the dropout layer.
    """
    activated = batch_wise_batch_norm(input, scope + "/batchnorm")
    convolved = slim.conv2d(
        activated,
        num_features,
        kernel_size,
        weights_initializer=weight_initializer,
        weights_regularizer=l2_reg,
        scope=(scope + "/conv"),
        activation_fn=None)
    return slim.dropout(
        convolved,
        keep_prob=1-p,
        scope=(scope + "/dropout"),
        is_training=is_training)

In [None]:
def skipped_dense_block(x, num_layers, num_features, scope, p=0.2, is_training=True):
    """Dense block whose output contains the block input and all layer outputs.

    After every layer its output is concatenated to the running tensor along
    the last axis, and that running tensor feeds the next layer.

    Args:
        x: Input tensor of the block.
        num_layers: Number of layers in the block.
        num_features: Number of feature maps produced by each layer.
        scope: Scope prefix for the layers and concatenations.
        p: Dropout probability of each layer.
        is_training: Forwarded to each layer.

    Returns:
        The input concatenated with the outputs of all layers.
    """
    stacked = x
    for idx in range(num_layers):
        suffix = str(idx)
        new_features = create_layer(
            stacked, num_features, p=p,
            scope=(scope + "/layer" + suffix), is_training=is_training)
        stacked = tf.concat(
            axis=-1, values=[stacked, new_features],
            name=(scope + "/skip" + suffix))
    return stacked

def nonskipped_dense_block(x, num_layers, num_features, scope, p=0.2, is_training=True):
    """Dense block whose output contains only the outputs of its layers.

    Inside the block every layer still receives the block input concatenated
    with all earlier layer outputs; the block input is just left out of the
    returned tensor.

    Args:
        x: Input tensor of the block.
        num_layers: Number of layers in the block.
        num_features: Number of feature maps produced by each layer.
        scope: Scope prefix for the layers and concatenations.
        p: Dropout probability of each layer.
        is_training: Forwarded to each layer.

    Returns:
        The outputs of all layers concatenated along the last axis.
    """
    produced = []
    last_idx = num_layers - 1
    for idx in range(num_layers):
        new_features = create_layer(
            x, num_features, p=p,
            scope=(scope + "/layer" + str(idx)), is_training=is_training)
        produced.append(new_features)
        # No layer follows the last one, so its output is not concatenated to the running input
        if idx != last_idx:
            x = tf.concat(
                axis=-1, values=[x, new_features],
                name=(scope + "/skip" + str(idx)))
    return tf.concat(axis=-1, values=produced, name=(scope + '/output'))


In [None]:
def transition_down(input, scope, kernel_size=1, pool_size=2, p=0.2, is_training=True):
    """Transition-down layer: batch norm with ReLU, convolution, dropout, max pooling.

    The convolution keeps the number of feature maps of `input`; the max
    pooling uses a stride of 2.

    Args:
        input: Input tensor of the layer.
        scope: Scope prefix for the sub-operations.
        kernel_size: Kernel size of the convolution.
        pool_size: Window size of the max pooling.
        p: Dropout probability; the keep probability is `1 - p`.
        is_training: Forwarded to the dropout layer.

    Returns:
        The output tensor of the max pooling.
    """
    activated = batch_wise_batch_norm(input, scope + "/batchnorm")
    num_feature_maps = input.shape[-1]
    convolved = slim.conv2d(
        activated,
        num_feature_maps,
        kernel_size,
        weights_initializer=weight_initializer,
        weights_regularizer=l2_reg,
        scope=(scope + "/conv"),
        activation_fn=None)
    dropped = slim.dropout(
        convolved,
        keep_prob=1-p,
        scope=(scope + "/dropout"),
        is_training=is_training)
    return slim.max_pool2d(dropped, pool_size, stride=2, scope=(scope + "/maxpool"))


def transition_up(input, scope, kernel_size=3, stride=2):
    """Transition-up layer: a transposed convolution without activation.

    The number of feature maps of `input` is kept.

    Args:
        input: Input tensor of the layer.
        scope: Scope of the transposed convolution.
        kernel_size: Kernel size of the transposed convolution.
        stride: Stride of the transposed convolution.

    Returns:
        The output tensor of the transposed convolution.
    """
    num_feature_maps = input.shape[-1]
    upsampled = slim.conv2d_transpose(
        input,
        num_feature_maps,
        kernel_size,
        weights_initializer=weight_initializer,
        weights_regularizer=l2_reg,
        stride=stride,
        scope=scope,
        activation_fn=None)
    return upsampled

The network starts with a 3 by 3 convolution of the input and this is followed by a downsampling path of 5 DenseBlocks with skip connections. Then we get a bottleneck,  followed by an upsampling path with 4 non-skipped DenseBlocks and one final DenseBlock which does include a skip connection. Before the final output, the network has one last convolution layer.

In [None]:
import tensorflow as tf
from building_blocks import *

def net(input, PARAMS, is_training=True):
    """Assemble the fully convolutional DenseNet.

    Args:
        input: Batch of input images.
        PARAMS: Dict with 'input_num_features', 'num_features',
            'output_classes' and a 'num_layers' entry for each of 'dense_1'
            to 'dense_5', 'dense_bottleneck' and 'dense_1_up' to 'dense_5_up'.
        is_training: Forwarded to the dense blocks and transition-down layers.

    Returns:
        The output of the final 1x1 convolution, with
        `PARAMS['output_classes']` feature maps and no activation.
    """
    features = slim.conv2d(
        input, PARAMS['input_num_features'], 3,
        weights_initializer=weight_initializer,
        weights_regularizer=l2_reg,
        scope='inputConv', activation_fn=None)

    # Downsampling path: remember the output of every dense block for the skip connections
    skip_tensors = []
    for level in range(1, 6):
        block_out = skipped_dense_block(
            features, PARAMS['dense_{}'.format(level)]['num_layers'],
            PARAMS['num_features'], 'dense{}'.format(level), is_training=is_training)
        features = transition_down(block_out, 'td{}'.format(level), is_training=is_training)
        skip_tensors.append(block_out)

    features = nonskipped_dense_block(
        features, PARAMS['dense_bottleneck']['num_layers'],
        PARAMS['num_features'], 'denseBottleneck', is_training=is_training)

    # Upsampling path: the skip tensors are consumed from the deepest to the shallowest
    for level in range(1, 6):
        features = transition_up(features, 'tu{}'.format(level))
        features = tf.concat(
            axis=-1, values=[skip_tensors[-level], features],
            name=('skip{}_up'.format(level)))
        # Only the last upsampling dense block has a skip connection
        dense_block = nonskipped_dense_block if level < 5 else skipped_dense_block
        features = dense_block(
            features, PARAMS['dense_{}_up'.format(level)]['num_layers'],
            PARAMS['num_features'], 'dense{}up'.format(level), is_training=is_training)

    return slim.conv2d(
        features, PARAMS['output_classes'], 1,
        weights_initializer=weight_initializer,
        weights_regularizer=l2_reg,
        scope='outputConv', activation_fn=None)


## Evaluation loop
Below, the weights of the trained network are reloaded and the dataset is passed through the network. The results will be visible in TensorBoard from the `test` folder. In there the image tab will show the original photo in the upper left corner, the ground truth in the lower left corner, the difference in the upper right corner (white depicts a wrong prediction), and the predicted segmentation in the lower right corner of each sample.

In [None]:
gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
# Evaluation loop
with tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts)) as sess:
    log_dir = 'test'  # Save directory for the logs
    # We load a batch and reshape to tensor
    xbatch, ybatch = batch(
        tfsdataset, batch_size=batch_size, height=height, width=width, resized=final_resized)
    input_batch = tf.reshape(xbatch, shape=(batch_size, final_resized, final_resized, 3))
    ground_truth_batch = tf.reshape(ybatch, shape=(batch_size, final_resized, final_resized, 1))

    # Obtain the prediction: the class with the highest probability per pixel
    predictions = net(input_batch, params_dict, is_training=False)
    predim = tf.nn.softmax(predictions)
    predimmax = tf.expand_dims(
        tf.cast(tf.argmax(predim, axis=3), tf.float32), -1)

    # Class ids scaled by the highest class id (11) for the difference image
    yb = tf.cast(tf.divide(ground_truth_batch, 11), tf.float32)
    predimmaxdiv = tf.divide(tf.cast(predimmax, tf.float32), 11)

    # One-hot labels for the loss. Only the channel axis is squeezed, so that
    # the batch axis survives when a single image is evaluated.
    one_hot_labels = slim.one_hot_encoding(
        tf.squeeze(ground_truth_batch, axis=[-1]),
        params_dict['output_classes'])

    # The last class is the void class: its pixels get weight 0, all others weight 1
    masked_weights = 1 - tf.unstack(one_hot_labels, axis=-1)[-1]

    # Error mask: 1 where a non-void pixel is predicted wrongly, 0 everywhere else
    ediff = tf.minimum(tf.abs(tf.subtract(yb, predimmaxdiv)), tf.expand_dims(masked_weights, axis=-1))
    norm_ediff = tf.ceil(ediff)

    # Concatenate all four images into one grid
    annots = tf.concat([colorize(ground_truth_batch), colorize(predimmax)], 2)
    # Multiply by 255, because it actually outputs 0.0 to 1.0
    img_and_err = tf.multiply(tf.concat([input_batch, tf.image.grayscale_to_rgb(norm_ediff)], 2), 255)
    aio = tf.concat([img_and_err, annots], 1)
    tf.summary.image("All_in_one", aio, max_outputs=n_images)
    tf.summary.image("Legend", legend, max_outputs=1)

    # We calculate the loss; void pixels do not contribute
    slim.losses.softmax_cross_entropy(
        predictions,
        one_hot_labels,
        weights=masked_weights)
    total_loss = slim.losses.get_total_loss()
    tf.summary.scalar('loss', total_loss)

    # norm_ediff marks the WRONG pixels, so its mean is an error rate. The accuracy
    # is the fraction of non-void pixels that is not marked as wrong.
    num_valid_pixels = tf.maximum(tf.reduce_sum(masked_weights), 1.0)
    accuracy = 1.0 - tf.reduce_sum(tf.cast(norm_ediff, tf.float32)) / num_valid_pixels
    tf.summary.scalar('accuracy', accuracy)

    iou_array, mean_iou = intersection_over_union(ground_truth_batch, predimmax, params_dict['output_classes'], masked_weights)
    tf.summary.scalar('mean_IoU', mean_iou)
    # Order the labels by class id explicitly instead of relying on the dict ordering
    ordered_labels = [_mask_labels[class_id] for class_id in sorted(_mask_labels)]
    class_labels = tf.convert_to_tensor(np.array(ordered_labels), tf.string)
    iou_per_class = tf.stack([class_labels, tf.as_string(iou_array, precision=2)], axis=1)
    tf.summary.text('IoU per class', iou_per_class)

    slim.evaluation.evaluate_once(
        '',
        'train_aws_nbs/model.ckpt-10963', # Model weights
        log_dir                           # Save directory for the logs
    )
