In [None]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.image as im
%matplotlib inline
import tensorflow as tf
import time
from datetime import timedelta
import math

# Pickled dataset splits; each file is indexed below by 'features' and 'labels'.
training_file = "train.p"
testing_file = "test.p"

# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so these files must come from a trusted source.
with open(training_file, mode='rb') as f:
    train = pickle.load(f)

with open(testing_file, mode='rb') as f:
    test = pickle.load(f)

# Split each loaded dict into its image array and its label array.
X_train = train['features']
y_train = train['labels']
X_test = test['features']
y_test = test['labels']

In [None]:
# Example counts come from the leading axis of each feature array.
n_train, n_test = X_train.shape[0], X_test.shape[0]

# Axes 1 and 2 of the feature array give the per-image shape.
image_shape = X_train.shape[1:3]

# Number of distinct label values present in the training labels.
n_classes = np.unique(y_train).size

print("Number of training examples =", n_train)
print("Number of testing examples =", n_test)
print("Image data shape =", image_shape)
print("Number of classes =", n_classes)

----

## Step 2: Design and Test a Model Architecture

Design and implement a deep learning model that learns to recognize traffic signs. Train and test your model on the [German Traffic Sign Dataset](http://benchmark.ini.rub.de/?section=gtsrb&subsection=dataset).

There are various aspects to consider when thinking about this problem:

- Your model can be derived from a deep feedforward net or a deep convolutional network.
- Play around with preprocessing techniques (normalization, RGB to grayscale, etc.)
- Number of examples per label (some have more than others).
- Generate fake data.

Here is an example of a [published baseline model on this problem](http://yann.lecun.com/exdb/publis/pdf/sermanet-ijcnn-11.pdf). It's not required to be familiar with the approach used in the paper, but it's good practice to try to read papers like these.

### Implementation

Use the code cell (or multiple code cells, if necessary) to implement the first step of your project. Once you have completed your implementation and are satisfied with the results, be sure to thoroughly answer the questions that follow.

In [None]:
#############################################
# Display the installed TensorFlow version (shown as the cell's output).
tf.__version__

In [None]:
def _one_hot(labels, num_examples, num_classes):
    """Return a (num_examples, num_classes) float array with a 1 at each label.

    Args:
        labels: Integer class indices; the first `num_examples` entries are used.
        num_examples: Number of rows in the result.
        num_classes: Number of columns in the result.
    """
    encoded = np.zeros((num_examples, num_classes))
    # Fancy indexing sets one entry per row without a Python-level loop.
    encoded[np.arange(num_examples), labels[:num_examples]] = 1
    return encoded


def _rgb_to_gray(images):
    """Collapse the channel axis of a 4-D image array into weighted grayscale.

    Arrays that are not 4-D are returned unchanged, so re-running this cell
    after the conversion has already happened is a no-op instead of an error.
    """
    if images.ndim != 4:
        return images
    return (0.2989*images[:,:,:,0]+0.5870*images[:,:,:,1]+0.1140*images[:,:,:,2])


# One-hot encode the labels and convert the images to grayscale.
y_train_one_hot = _one_hot(y_train, n_train, n_classes)
X_train = _rgb_to_gray(X_train)

y_test_one_hot = _one_hot(y_test, n_test, n_classes)
X_test = _rgb_to_gray(X_test)

In [None]:
#############################################
# --- Network hyperparameters ---
# Convolutional layer 1: 16 filters, each 5 x 5 pixels.
filter_size1 = 5
num_filters1 = 16

# Convolutional layer 2: 36 filters, each 5 x 5 pixels.
filter_size2 = 5
num_filters2 = 36

# Number of neurons in the first fully-connected layer.
fc_size = 64

# --- Input dimensions ---
# Images are treated as square: the side length is read from axis 1 of X_train.
img_size = X_train.shape[1]
img_size_flat = img_size ** 2
img_shape = (img_size,) * 2
# Single channel, because the images were converted to grayscale.
num_channels = 1
num_classes = n_classes

#############################################
def new_weights(shape):
    """Create a TensorFlow variable of the given shape for layer weights.

    Initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.05.
    """
    initial_values = tf.truncated_normal(shape, stddev=0.05)
    return tf.Variable(initial_values)

def new_biases(length):
    """Create a 1-D TensorFlow variable of `length` biases, each starting at 0.05."""
    initial_values = tf.constant(0.05, shape=[length])
    return tf.Variable(initial_values)

#############################################
def new_conv_layer(input,              # The previous layer.
                   num_input_channels, # Num. channels in prev. layer.
                   filter_size,        # Width and height of each filter.
                   num_filters,        # Number of filters.
                   use_pooling=True):  # Use 2x2 max-pooling.
    """Build a convolutional layer and return it together with its filter weights.

    The convolution uses stride 1 in every dimension and 'SAME' padding. A bias
    is added per filter, the result is optionally down-sampled by 2x2
    max-pooling with stride 2, and ReLU is applied last.

    Returns:
        A tuple (layer, weights): the output tensor and the filter-weight
        variable of shape [filter_size, filter_size, num_input_channels,
        num_filters].
    """
    # Filter bank first, then biases, so variables are created in that order.
    weights = new_weights(shape=[filter_size, filter_size,
                                 num_input_channels, num_filters])
    biases = new_biases(length=num_filters)

    convolved = tf.nn.conv2d(input=input,
                             filter=weights,
                             strides=[1, 1, 1, 1],
                             padding='SAME')
    activations = convolved + biases

    if use_pooling:
        activations = tf.nn.max_pool(value=activations,
                                     ksize=[1, 2, 2, 1],
                                     strides=[1, 2, 2, 1],
                                     padding='SAME')

    # ReLU is applied after the (optional) pooling step.
    return tf.nn.relu(activations), weights

#############################################
def flatten_layer(layer):
    """Reshape a layer to 2-D, keeping the first axis and merging axes 1..3.

    Returns:
        A tuple (layer_flat, num_features), where num_features is the number
        of elements spanned by axes 1 to 3 of the input's static shape.
    """
    num_features = layer.get_shape()[1:4].num_elements()
    # -1 lets TensorFlow infer the size of the leading axis.
    return tf.reshape(layer, [-1, num_features]), num_features

#############################################
def new_fc_layer(input,          # The previous layer.
                 num_inputs,     # Num. inputs from prev. layer.
                 num_outputs,    # Num. outputs.
                 use_relu=True): # Use Rectified Linear Unit (ReLU)?
    """Build a fully-connected layer: input x weights + biases, optionally ReLU.

    Returns:
        The output tensor of the layer; ReLU is applied only when `use_relu`
        is true.
    """
    # Fresh parameters for this layer (weights first, then biases).
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)

    # Affine transform of the input.
    pre_activation = tf.matmul(input, weights) + biases

    return tf.nn.relu(pre_activation) if use_relu else pre_activation

#############################################
# Input placeholder: a batch of img_size x img_size images (no channel axis).
x = tf.placeholder(tf.float32, shape=[None, img_size, img_size], name='x')
# Add an explicit channel axis of size num_channels for the convolutional layers.
x_image = tf.reshape(x, [-1, img_size, img_size, num_channels])
# True labels, one row of num_classes values per image (fed with one-hot arrays).
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
# Class index of each true label: position of the largest entry along axis 1.
y_true_cls = tf.argmax(y_true, dimension=1)

In [None]:
#############################################
# First convolutional layer: consumes the reshaped input image and applies
# num_filters1 filters of size filter_size1, with 2x2 max-pooling.
layer_conv1, weights_conv1 = new_conv_layer(
    input=x_image,
    num_input_channels=num_channels,
    filter_size=filter_size1,
    num_filters=num_filters1,
    use_pooling=True)

# Display the tensor so its static shape can be checked.
layer_conv1

In [None]:
#############################################
# Second convolutional layer: consumes the output of the first one, so its
# input-channel count equals num_filters1.
layer_conv2, weights_conv2 = new_conv_layer(
    input=layer_conv1,
    num_input_channels=num_filters1,
    filter_size=filter_size2,
    num_filters=num_filters2,
    use_pooling=True)

# Display the tensor so its static shape can be checked.
layer_conv2

In [None]:
#############################################
# Flatten the second convolutional layer's output to 2-D so it can feed the
# fully-connected layers; num_features is the resulting width.
layer_flat, num_features = flatten_layer(layer_conv2)
# Display the tensor so its static shape can be checked.
layer_flat

In [None]:
#############################################
# First fully-connected layer: num_features inputs -> fc_size outputs, with ReLU.
layer_fc1 = new_fc_layer(
    input=layer_flat,
    num_inputs=num_features,
    num_outputs=fc_size,
    use_relu=True)

# Display the tensor so its static shape can be checked.
layer_fc1

In [None]:
#############################################
# Output layer: fc_size inputs -> one value per class. No ReLU here, because
# these values are passed on as logits to the softmax below.
layer_fc2 = new_fc_layer(
    input=layer_fc1,
    num_inputs=fc_size,
    num_outputs=num_classes,
    use_relu=False)

# Display the tensor so its static shape can be checked.
layer_fc2

In [None]:
#############################################
# Softmax over the output layer, used only for prediction.
y_pred = tf.nn.softmax(layer_fc2)
# Predicted class index: position of the largest softmax value along axis 1.
y_pred_cls = tf.argmax(y_pred, dimension=1)
# The cross-entropy op takes the raw output-layer values (logits), not y_pred,
# together with the true labels.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2,labels=y_true)
# Scalar training objective: mean cross-entropy over the fed batch.
cost = tf.reduce_mean(cross_entropy)

In [None]:
#############################################
# Alternative optimizer, kept for reference (currently disabled):
#optimizer = tf.train.AdagradOptimizer(learning_rate=0.01).minimize(cost)
# Training op: FTRL with learning rate 0.01 and L2 regularization strength 1.0.
optimizer = tf.train.FtrlOptimizer(learning_rate=0.01,l2_regularization_strength=1.0).minimize(cost)
# Boolean per example: does the predicted class match the true class?
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
# Fraction of correct predictions in the fed batch.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [None]:
#############################################
# Create the session that runs the graph and initialize all variables.
# NOTE(review): later TensorFlow 1.x releases deprecate initialize_all_variables
# in favor of tf.global_variables_initializer -- confirm against the installed version.
session = tf.Session()
session.run(tf.initialize_all_variables())
# Number of random indices drawn per training step (see optimize()).
train_batch_size = 250

# Counter for total number of iterations performed so far.
total_iterations = 0

def optimize(num_iterations):
    """Run up to `num_iterations` training steps on random mini-batches.

    Each step samples a batch of distinct training examples, runs the
    optimizer on it, then measures and prints the accuracy on that same
    batch. Training stops early once a batch accuracy exceeds 0.9999.

    The module-level `total_iterations` counter is advanced by the number of
    steps actually performed, so iteration numbers printed by later calls
    stay correct even after an early stop.

    Args:
        num_iterations: Maximum number of optimization steps to run.
    """
    global total_iterations
    start_time = time.time()

    # Sampling without replacement cannot draw more examples than exist.
    batch_size = min(train_batch_size, n_train)
    iterations_done = 0

    for i in range(total_iterations, total_iterations + num_iterations):
        # Draw distinct indices so every batch has exactly `batch_size` examples
        # (drawing with replacement and de-duplicating gave a varying size).
        batch_idx = np.random.choice(n_train, size=batch_size, replace=False)
        feed_dict_train = {x: X_train[batch_idx],
                           y_true: y_train_one_hot[batch_idx]}

        session.run(optimizer, feed_dict=feed_dict_train)
        # Accuracy is evaluated after the update, on the batch just trained on.
        acc = session.run(accuracy, feed_dict=feed_dict_train)
        iterations_done += 1

        msg = "Optimization Iteration: {0:>6}, Training Accuracy: {1:>6.1%}"
        print(msg.format(i + 1, acc))
        if acc > 0.9999:
            break

    # Count only the iterations that were actually run.
    total_iterations += iterations_done

    time_dif = time.time() - start_time
    print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))

In [None]:
def print_accuracy(feed_dict=None):
    """Evaluate the accuracy op on a feed dictionary and print the result.

    Args:
        feed_dict: Mapping from graph tensors to the data to evaluate on.
            When omitted, the module-level `feed_dict_test` is used, which
            must have been assigned before the call.
    """
    if feed_dict is None:
        # Backward-compatible default: fall back to the notebook-level global.
        feed_dict = feed_dict_test
    acc = session.run(accuracy, feed_dict=feed_dict)
    print("Accuracy on test-set: {0:.1%}".format(acc))

In [None]:
# Smoke test: run a single training iteration to check that the graph and
# the batch feeding work before starting the long run.
optimize(num_iterations=1)

In [None]:
# The test set is evaluated in two parts because the whole set does not fit
# in memory at once. Both slices share the same boundary so that no test
# example is skipped (slicing [0:6315] and [6316:] dropped example 6315).
test_split = 6315

# y_true_cls is also fed explicitly; it is otherwise derived from y_true via argmax.
feed_dict_test = {x: X_test[:test_split],
                  y_true: y_test_one_hot[:test_split],
                  y_true_cls: y_test[:test_split]}
print_accuracy()
feed_dict_test = {x: X_test[test_split:],
                  y_true: y_test_one_hot[test_split:],
                  y_true_cls: y_test[test_split:]}
print_accuracy()

In [None]:
# Main training run: up to 10000 iterations (optimize() may stop earlier
# when a batch accuracy exceeds its threshold).
optimize(num_iterations=10000)

In [None]:
# The test set is evaluated in two parts because the whole set does not fit
# in memory at once. Both slices share the same boundary so that no test
# example is skipped (slicing [0:6315] and [6316:] dropped example 6315).
test_split = 6315

# y_true_cls is also fed explicitly; it is otherwise derived from y_true via argmax.
feed_dict_test = {x: X_test[:test_split],
                  y_true: y_test_one_hot[:test_split],
                  y_true_cls: y_test[:test_split]}
print_accuracy()
feed_dict_test = {x: X_test[test_split:],
                  y_true: y_test_one_hot[test_split:],
                  y_true_cls: y_test[test_split:]}
print_accuracy()

### Question 1 

_Describe the techniques used to preprocess the data._

**Answer:**
I am using the Hvass Labs tutorial/model to set up the CNN. This entire work is based on this tutorial - 
https://www.youtube.com/watch?v=HMcx-zY8JSg

I tried several things to pre-process the data and improve performance.

First I tried each color channel independently - only r, only g, and only b. I had the best performance on the green channel, but red and blue were not very far behind.

I then tried grayscaling the images and that also helped. My first attempt was the OpenCV grayscale, and that worked well, but then I found slightly better performance with the MATLAB grayscale converter, so I used that:
`0.2989*r + 0.5870*g + 0.1140*b`

I did not find any impact of zero-centering the data or scaling it (/128).

I also had to one hot encode the labels to make it easier to feed into the model.

I did this for both the training set and the test set.

### Question 2

_Describe how you set up the training, validation and testing data for your model. If you generated additional data, why?_

**Answer:**

I did not generate any additional data, I simply used the training set and test set as is. Towards the end of the project I did generate some images for testing, and I have results for that in Section 03.

### Question 3

_What does your final architecture look like? (Type of model, layers, sizes, connectivity, etc.)  For reference on how to build a deep neural network using TensorFlow, see [Deep Neural Network in TensorFlow
](https://classroom.udacity.com/nanodegrees/nd013/parts/fbf77062-5703-404e-b60c-95b78b2f3f9e/modules/6df7ae49-c61c-4bb2-a23e-6527e69209ec/lessons/b516a270-8600-4f93-a0a3-20dfeabe5da6/concepts/83a3a2a2-a9bd-4b7b-95b0-eb924ab14432) from the classroom._


**Answer:**

I started with a simple linear model (WX+b) and that actually worked pretty well (80% accuracy on the test set). However, since this assignment is focused on CNNs, I studied and built a CNN. 

Again, I base this work off https://github.com/Hvass-Labs/TensorFlow-Tutorials/blob/master/02_Convolutional_Neural_Network.ipynb

The architecture is as follows:

- 2 convolutional layers
- 2 fully connected layers
- 16 filters on the first convolutional layer, each sized 5x5
- 36 filters on the second convolutional layer, each also sized 5x5
- The first fully connected layer takes a flattened version of the 2nd convolutional layer's output and has 64 outputs (ReLU is used)
- The second fully connected layer has 64 inputs and 43 outputs (= number of classes)

I have been trying to work with Tensorboard to visualize it, but have not been successful so far.


### Question 4

_How did you train your model? (Type of optimizer, batch size, epochs, hyperparameters, etc.)_


**Answer:**

I trained the model using a randomly sampled batch size of 250. I tried many different sizes, below 200 the convergence was poor (often there would be divergence) and above 300 there was not much gain as the model would slow down. 

I tried to sample from each class, but that actually did poorly. It seems a completely random sample works better.

I tried to vary the batch size randomly also (e.g. between 100 and 400) but that did not help

I also tried to inject "noise" into each sampled image - add/subtract a random 32x32 array to/from the original image - but that did not help

I played around with all available optimizers - I found FtrlOptimizer to be the best

I played around with the learning rate - For very small learning rates, the model would take a very long time to train, and for large learning rates the accuracy would bounce around a lot. I found a good medium at 1e-2

I also played around with the regularization - and found the best accuracy on the test set at about 1.0

### Question 5


_What approach did you take in coming up with a solution to this problem?_

**Answer:**

See descriptions above!