# Segmantic Segmentation with VGG16

The initial attempt with using a GAN hit a wall, specifically with figuring out how to do proper layer sizing for the transposed convolution layers for the generator. On top of that, some research I did during my "off" time showed that the approach semantic segmentation using either an FCN (Fully Convolutional Network) approach or SegNet would work best.

Since SegNet is completely new, whereas transfer learning with VGG16 is something I'm familiar with, I'm going to attempt a VGG16 network here.

In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
from random import randint
from glob import glob
import os
import os.path

%matplotlib inline

import tensorflow as tf
#Check GPU
print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
print('TensorFlow Version: {}'.format(tf.__version__))

Default GPU Device: /device:GPU:0
TensorFlow Version: 1.6.0


In [9]:
#useful variables
num_classes = 13 # none and 12 options, 0-12
image_shape = (160, 576)
weights_initializer_stddev = 0.01
weights_regularized_l2 = 1e-3

## Load VGG16


First we're going to load VGG16 with pretrained weights (so it maintains its feature detectors, which can be useful for our smaller dataset).

In [3]:
#Download VGG16 if it is not already
from urllib.request import urlretrieve
import zipfile

if not os.path.exists("vgg16.zip"):
    urlretrieve(
        'https://s3-us-west-1.amazonaws.com/udacity-selfdrivingcar/vgg.zip',
        "./vgg16.zip")
    print("Downloaded VGG16 model weights")
else:
    print("Already exists, skipping download")

Already exists, skipping download


In [4]:
#Extract if needed
if not os.path.exists("./vgg"):
    unzip = zipfile.ZipFile("./vgg16.zip", "r")
    unzip.extractall("./")
    unzip.close()
    print("Extracted VGG16 model weights")
else:
    print("Already exists, skipping extraction")

Already exists, skipping extraction


In [5]:
#Make a map more colorful
def convertToColor(value):
    colors = [
        (255, 255, 255),   #0
        (255, 0, 0),      #1
        (0, 255, 0),      #2
        (0, 0, 255),      #3
        (255, 255, 0),    #4
        (127, 0, 255),    #5
        (51, 255, 51),    #6
        (255, 0, 127),    #7
        (127, 127, 127),  #8
        (0, 0, 0),        #9
        (0, 255, 255),  #10
        (0, 0, 100),      #11
        (100, 0, 0),      #12
    ]
    return colors[value[0]]

def colorizeMap(img):
    return [list( map(convertToColor, row) ) for row in img]

In [6]:
#We need to be able to convert the image map to a 13 channel ground truth map, and vice versa
def pixelToTruth(value):
    truths = [
        (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0),
        (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)
    ]
    
    return truths[value[0]]

def truthToPixel(value):
    return (value.tolist().index(1), 0, 0)

def imageToTruth(img):
    return [list(map(pixelToTruth, row)) for row in img]

def truthToImage(truth):
    return [list(map(truthToPixel, row)) for row in truth]

In [7]:
# Function to load the image data and the label for it
def get_training_data(batch_size):
    #Both inputs and ground truth maps have the same name - easy!
    image_paths = glob(os.path.join("./data/Train/CameraRGB", "*.png"))
    label_paths = glob(os.path.join("./data/Train/CameraSeg", "*.png"))
    
    for batch in range(0, len(image_paths), batch_size):
        images = []
        maps = []
        
        for index, image_file in enumerate(image_paths[batch:batch + batch_size]):
#             map_file = os.path.join("./data/Train/CameraSeg", label_paths[os.path.basename(image_file)])
            map_file = os.path.join(label_paths[index])
            
            image = cv2.imread(image_file)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, image_shape)
            map_image = cv2.imread(map_file)
            map_image = cv2.cvtColor(map_image, cv2.COLOR_BGR2RGB)
            map_image = cv2.resize(map_image, image_shape)
            map_image = imageToTruth(map_image)
            
#             bg = np.all(map_image == np.array([0, 0, 0]), axis=2)
#             bg = bg.reshape(*bg.shape, 1)
#             map_image = np.concatenate((bg, np.invert(bg)), axis=2)
            
            images.append(image)
            maps.append(map_image)

        yield np.array(images), np.array(maps)
            
            

In [10]:
tf.reset_default_graph()
with tf.Session() as sess:
    #Placeholders
    label = tf.placeholder(tf.int32, (None, None, None, num_classes), name='label')
    learning_rate = tf.placeholder(tf.int32, name='learning_rate')
    
    #Grab layers from pretrained VGG
    tf.saved_model.loader.load(sess, ["vgg16"], "./vgg/")
    
    graph = tf.get_default_graph()
    
    #define key layers for us to work with, so we can take pieces of VGG16
    #for our own use
    input_layer = graph.get_tensor_by_name("image_input:0")
    keep_prob = graph.get_tensor_by_name("keep_prob:0") #Dropout settings
    
    #more layer grabbing
    layer3 = graph.get_tensor_by_name("layer3_out:0")
    layer4 = graph.get_tensor_by_name("layer4_out:0")
    layer7 = graph.get_tensor_by_name("layer7_out:0")
    
    #Create new output layers
    #First, a 1x1 convolutional to maintain spacial data
    layer_8_conv = tf.layers.conv2d(layer7, num_classes, 1,
                                padding='same', name='layer_8_conv',
                                kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
                                kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #transpose by 2 for then ext layer
    layer_9_transpose = tf.layers.conv2d_transpose(layer_8_conv, num_classes,
                                4, strides=2, padding='same',
                                name='layer_9_transpose',
                                kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
                                kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #Another convolution
    layer_10_conv = tf.layers.conv2d(layer_9_transpose, num_classes, 1,
                                padding='same', name='layer_10_conv',
                                kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
                                kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #Skip layer - so we dont lose too much positional information during conv/transposeds
    layer_11_skip = tf.add(layer_9_transpose, layer_10_conv, name='layer_11_skip')
    
    #transpose again
    layer_12_transpose = tf.layers.conv2d_transpose(layer_11_skip, num_classes,
                                4, strides=2, padding='same',
                                name='layer_12_transpose',
                                kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
                                kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #and convolve...
    layer_13_conv = tf.layers.conv2d(layer_12_transpose, num_classes, 1,
                                padding='same', name='layer_13_conv',
                                kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
                                kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #skip again
    layer_14_skip = tf.add(layer_12_transpose, layer_13_conv, name='layer_14_skip')
    
    #Transpose
    output_layer = tf.layers.conv2d_transpose(layer_14_skip, num_classes, 16, strides=8,
                                padding='same', name='output_layer',
                                kernel_initializer = tf.random_normal_initializer(stddev=weights_initializer_stddev),
                                kernel_regularizer= tf.contrib.layers.l2_regularizer(weights_regularized_l2))
    
    #layer_15 will be our transposed output!
    
    #Create the optimzer
    logits = tf.reshape(output_layer, (-1, num_classes))
    correct_label = tf.reshape(label, (-1, num_classes))
    cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=correct_label))
    optimizer= tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(cross_entropy_loss)
    
    
    #and now, training!
    epochs = 1
    batch_size = 1
    keep_probability = 0.5
    learning_rate_alpha = 0.001
    
    saver = tf.train.Saver()
    
    sess.run(tf.global_variables_initializer())
    
    print("Graph has been built - launching training")
    print("====== :-) ======")
    print()
    
    for epoch in range(epochs):
        print("Launching Epoch {}".format(epoch))
        loss_log = []
        batch_count = 0
        
        #get the images
        for image, truth in get_training_data(batch_size):
            batch_count += 1
            _, loss = sess.run(
                    [train_op, cross_entropy_loss],
                    feed_dict = {
                        input_layer: image,
                        label: truth,
                        keep_prob: keep_probability,
                        learning_rate: learning_rate_alpha
                    }
                )
            loss_log.append('{:3f}'.format(loss))
            if(batch_count % 10 == 0):
                print("Batch {} - loss of {}".format(batch_count, loss))
        print("Training for epoch finished - ", loss_log[-1])
        print()
    print("Training finished")

INFO:tensorflow:Restoring parameters from b'./vgg/variables/variables'
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

Graph has been built - launching training
Launching Epoch 0
Batch 10 - loss of 2.564948797225952
Batch 20 - loss of 2.5649497509002686
Batch 30 - loss of 2.564948558807373
Batch 40 - loss of 2.5649499893188477
Batch 50 - loss of 2.5649471282958984
Batch 60 - loss of 2.5649476051330566
Batch 70 - loss of 2.5649492740631104
Batch 80 - loss of 2.5649495124816895
Batch 90 - loss of 2.564948797225952
Batch 100 - loss of 2.5649492740631104
Batch 110 - loss of 2.564948081970215
Batch 120 - loss of 2.564948320388794
Batch 130 - loss of 2.564948797225952
Batch 140 - loss of 2.5649492740631104
Batch 150 - loss of 2.5649468898773193
Batch 160 - loss of 2.5649497509002686
Batch 170 - loss of 2.564948320388794
Batch 180 - loss of 2.56494879722

KeyboardInterrupt: 