# Segmantic Segmentation with VGG16

The initial attempt with using a GAN hit a wall, specifically with figuring out how to do proper layer sizing for the transposed convolution layers for the generator. On top of that, some research I did during my "off" time showed that the approach semantic segmentation using either an FCN (Fully Convolutional Network) approach or SegNet would work best.

Since SegNet is completely new, whereas transfer learning with VGG16 is something I'm familiar with, I'm going to attempt a VGG16 network here.

In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
from random import randint
from glob import glob
import os
import os.path

%matplotlib inline

import tensorflow as tf
#Check GPU
print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
print('TensorFlow Version: {}'.format(tf.__version__))

Default GPU Device: /device:GPU:0
TensorFlow Version: 1.6.0


In [2]:
#useful variables
num_classes = 13 # none and 12 options, 0-12
image_shape = (160, 576)
weights_initializer_stddev = 0.01
weights_regularized_l2 = 1e-3

## Load VGG16


First we're going to load VGG16 with pretrained weights (so it maintains its feature detectors, which can be useful for our smaller dataset).

In [3]:
#Download VGG16 if it is not already
from urllib.request import urlretrieve
import zipfile

if not os.path.exists("vgg16.zip"):
    urlretrieve(
        'https://s3-us-west-1.amazonaws.com/udacity-selfdrivingcar/vgg.zip',
        "./vgg16.zip")
    print("Downloaded VGG16 model weights")
else:
    print("Already exists, skipping download")

Already exists, skipping download


In [4]:
#Extract if needed
if not os.path.exists("./vgg"):
    unzip = zipfile.ZipFile("./vgg16.zip", "r")
    unzip.extractall("./")
    unzip.close()
    print("Extracted VGG16 model weights")
else:
    print("Already exists, skipping extraction")

Already exists, skipping extraction


In [5]:
#Make a map more colorful
def convertToColor(value):
    colors = [
        (255, 255, 255),   #0
        (255, 0, 0),      #1
        (0, 255, 0),      #2
        (0, 0, 255),      #3
        (255, 255, 0),    #4
        (127, 0, 255),    #5
        (51, 255, 51),    #6
        (255, 0, 127),    #7
        (127, 127, 127),  #8
        (0, 0, 0),        #9
        (0, 255, 255),  #10
        (0, 0, 100),      #11
        (100, 0, 0),      #12
    ]
    return colors[value[0]]

def colorizeMap(img):
    return [list( map(convertToColor, row) ) for row in img]

In [6]:
#We need to be able to convert the image map to a 13 channel ground truth map, and vice versa
def pixelToTruth(value):
    truths = [
        (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)
    ]
    
    return truths[value[0]]

def truthToPixel(value):
    return (value.tolist().index(1), 0, 0)

def imageToTruth(img):
    return [list(map(pixelToTruth, row)) for row in img]

def truthToImage(truth):
    return [list(map(truthToPixel, row)) for row in truth]

In [7]:
# Function to load the image data and the label for it
def get_training_data(batch_size):
    #Both inputs and ground truth maps have the same name - easy!
    image_paths = glob(os.path.join("./data/Train/CameraRGB", "*.png"))
    label_paths = glob(os.path.join("./data/Train/CameraSeg", "*.png"))
    
    for batch in range(0, len(image_paths), batch_size):
        images = []
        maps = []
    
    for batch in range(0, len(image_paths), batch_size):
        images = []
        maps = []
        
        for index, image_file in enumerate(image_paths[batch:batch + batch_size]):
            map_file = os.path.join(label_paths[index])
            
            image = cv2.imread(image_file)
#             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, image_shape)
#             image = image - np.array([123.68, 116.779, 103.939], dtype=np.float32)
            map_image = cv2.imread(map_file)
            map_image = cv2.cvtColor(map_image, cv2.COLOR_BGR2RGB)
            map_image = cv2.resize(map_image, image_shape)
            map_image = imageToTruth(map_image)
            
            images.append(image)
            maps.append(map_image)

        yield np.array(images), np.array(maps)
            
            

In [8]:
def get_placeholders():
    #Placeholders
    label = tf.placeholder(tf.int32, (None, None, None, num_classes), name='label')
    learning_rate = tf.placeholder(tf.int32, name='learning_rate')
    
    return label, learning_rate

In [9]:
def load_vgg():
    #Grab layers from pretrained VGG
    tf.saved_model.loader.load(sess, ["vgg16"], "./vgg/")
    
    graph = tf.get_default_graph()
    
    #define key layers for us to work with, so we can take pieces of VGG16
    #for our own use
    input_layer = graph.get_tensor_by_name("image_input:0")
    keep_prob = graph.get_tensor_by_name("keep_prob:0") #Dropout settings
    
    #more layer grabbing
    layer_3 = graph.get_tensor_by_name("layer3_out:0")
    layer_4 = graph.get_tensor_by_name("layer4_out:0")
    layer_7 = graph.get_tensor_by_name("layer7_out:0")
    
    return graph, input_layer, keep_prob, layer_3, layer_4, layer_7

In [10]:
def vgg_fcn(graph, input_layer, keep_prob, layer_3, layer_4, layer_7):
    # Skip connections for later
    skip_conv_3 = tf.layers.conv2d(layer_3, num_classes, 1, padding='same',
            kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
            kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    skip_conv_4 = tf.layers.conv2d(layer_4, num_classes, 1, padding='same',
            kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
            kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #Layer 7 isn't skipped, it's passed right to transpose    
    fully_connected_convs = tf.layers.conv2d(layer_7, num_classes, 1, padding='same',
            kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
            kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #From layer 7 we need to transpose up
    transpose_1 = tf.layers.conv2d_transpose(fully_connected_convs, num_classes, 4, 2, padding='same',
            kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
            kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    # Add the skip layer from layer 4
    skip_1 = tf.add(transpose_1, skip_conv_4)
    
    #Tranpose up from resultant layer
    transpose_2 = tf.layers.conv2d_transpose(skip_1, num_classes, 4, 2, padding='same',
            kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
            kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #Create skip layer from layer 3
    skip_2 = tf.add(skip_conv_3, transpose_2)
    
    #Final output layer
    output_layer = tf.layers.conv2d_transpose(skip_2, num_classes, 16, 8, padding='same',
            kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
            kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2),
            activation=tf.sigmoid)
    
    return output_layer

In [11]:
def get_logits(label, output_layer):
    #Create the optimzer
    logits = tf.reshape(output_layer, (-1, num_classes))
    correct_label = tf.reshape(label, (-1, num_classes))
    
    return logits, correct_label

In [12]:
def get_loss(logits, correct_labels):
    cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=correct_labels))
    optimizer= tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(cross_entropy_loss)
    return cross_entropy_loss, optimizer, train_op

In [13]:
epochs = 1
batch_size = 10
keep_probability = 0.5 
learning_rate_alpha = 0.0001

In [14]:
def train(sess, input_layer, label, keep_prob, learning_rate):
    
    saver = tf.train.Saver()
    
    sess.run(tf.global_variables_initializer())
    
    print("Graph has been built - launching training")
    print("====== :-) ======")
    print()
    
    for epoch in range(epochs):
        print("Launching Epoch {}".format(epoch))
        loss_log = []
        batch_count = 0
        
        #get the images
        for image, truth in get_training_data(batch_size):
            batch_count += 1
            _, loss = sess.run(
                    [train_op, cross_entropy_loss],
                    feed_dict = {
                        input_layer: image,
                        label: truth,
                        keep_prob: keep_probability,
                        learning_rate: learning_rate_alpha
                    }
                )
            loss_log.append('{:3f}'.format(loss))
            if(batch_count % 10 == 0):
                print("Batch {} - loss of {}".format(batch_count, loss))
        print("Training for epoch finished - ", loss_log[-1])
        print()
    print("Training finished")

In [31]:
def execute_on_image(sess, logits, input_layer, label, keep_prob, learning_rate):
    sess.run(tf.global_variables_initializer())

    chosen_image = "165"
    image_file = "./data/Train/CameraRGB/{}.png".format(chosen_image)
    truth_file = "./data/Train/CameraSeg/{}.png".format(chosen_image)
    
    image = cv2.imread(image_file)
    image = cv2.resize(image, image_shape)
    
    truth = cv2.imread(truth_file)
    truth = cv2.cvtColor(truth, cv2.COLOR_BGR2RGB)
    truth = cv2.resize(truth, image_shape)
    truth = imageToTruth(truth)
    
    return sess.run([output_layer], feed_dict={
            input_layer: np.array([image]),
            label: np.array([truth]),
            keep_prob: keep_probability,
            learning_rate: learning_rate_alpha
        })

In [32]:
output_test = None

In [33]:
tf.reset_default_graph()
with tf.Session() as sess:
    label, learning_rate = get_placeholders()
    
    graph, input_layer, keep_prob, layer_3, layer_4, layer_7 = load_vgg()
    
    output_layer = vgg_fcn(graph, input_layer, keep_prob, layer_3, layer_4, layer_7)

    logits, correct_labels = get_logits(label, output_layer)
    
    cross_entropy_loss, optimizer, train_op = get_loss(logits, correct_labels)

    #and now, training!
#     train(sess, input_layer, label, keep_prob, learning_rate)
    
    #execute graph on a singular image
    output_test = execute_on_image(sess, logits, input_layer, label, keep_prob, learning_rate)
    

INFO:tensorflow:Restoring parameters from b'./vgg/variables/variables'


In [43]:
output_test[0][0].shape

(576, 160, 13)

In [44]:
output_test[0][0]

array([[[7.67056823e-01, 3.09352130e-01, 1.31825760e-01, ...,
         2.91122705e-01, 1.60486817e-01, 7.48600543e-01],
        [8.68503228e-02, 1.86929703e-01, 5.14724627e-02, ...,
         9.99738872e-01, 2.25904211e-03, 3.45170265e-04],
        [9.98210669e-01, 1.14997312e-01, 5.11900435e-05, ...,
         9.99963164e-01, 9.93969083e-01, 8.65127367e-04],
        ...,
        [5.74969769e-01, 4.77827191e-01, 2.98987150e-01, ...,
         1.57196954e-01, 9.02837217e-01, 1.33709073e-01],
        [7.45507777e-01, 7.63445735e-01, 6.01666272e-01, ...,
         7.19402909e-01, 5.16307831e-01, 8.78918827e-01],
        [2.98375100e-01, 7.04606473e-01, 3.46926153e-01, ...,
         8.81381750e-01, 1.81084663e-01, 6.07959151e-01]],

       [[9.99996424e-01, 9.15998518e-01, 2.99819499e-01, ...,
         8.07579815e-01, 9.75729823e-01, 1.46905193e-03],
        [1.01733254e-02, 4.47796099e-03, 5.23329824e-02, ...,
         9.98283386e-01, 9.98081923e-01, 9.81153250e-01],
        [6.45237088e-01, 

In [30]:
160*576

92160