# Specialist Convolutional Neural Network Models

In this notebook we build several CNN models, each trained to predict a different set of facial keypoints. The rationale for this approach is that the keypoints have different numbers of labeled observations: some images show the face from the side, so not all keypoints are visible. To make full use of the labels, rather than only the examples in which the full set of keypoints is marked, keypoints that are close to each other and have similar numbers of observations are grouped together and trained with their own CNN. This gives us several CNN models, and the final prediction is made using all of them.

In [15]:
## Architecture

# Hand-written summary table of the network layout (this is not generated from a
# model object). The parameter counts match the product of each "Param Shape"
# (e.g. 32*1*5*5 = 800), and the final dense layer row is written for a
# 4-keypoint output. The footer total equals the sum of the "Param #" column.
architecture_summary = (
    "Layer (type)                  Output Shape              Param #          Param Shape       ",
    "===========================================================================================",
    "conv2d_1 (Conv2D)             (None, 32, 96, 96)        800              (32, 1, 5, 5)     ",
    "maxpooling2d_1 (Pooling)      (None, 32, 48, 48)        0                                  ",
    "dropout_1 (Dropout)           (None, 32, 48, 48)        0                                  ",
    "conv2d_2 (Conv2D)             (None, 64, 48, 48)        51200            (64, 32, 5, 5)    ",
    "maxpooling2d_2 (Pooling)      (None, 64, 24, 24)        0                                  ",
    "dropout_2 (Dropout)           (None, 64, 24, 24)        0                                  ",
    "conv2d_3 (Conv2D)             (None, 128, 24, 24)       204800           (128, 64, 5, 5)   ",
    "maxpooling2d_3 (Pooling)      (None, 128, 12, 12)       0                                  ",
    "dropout_3 (Dropout)           (None, 128, 12, 12)       0                                  ",
    "conv2d_4 (Conv2D)             (None, 256, 12, 12)       819200           (256, 128, 5, 5)  ",
    "maxpooling2d_4 (Pooling)      (None, 256, 6, 6)         0                                  ",
    "dropout_4 (Dropout)           (None, 256, 6, 6)         0                                  ",
    "conv2d_5 (Conv2D)             (None, 512, 6, 6)         3276800          (512, 256, 5, 5)  ",
    "maxpooling2d_5 (Pooling)      (None, 512, 3, 3)         0                                  ",
    "dropout_5 (Dropout)           (None, 512, 3, 3)         0                                  ",
    "flatten_1 (Flatten)           (None, 4608)              0                                  ",
    "dense_1 (Dense)               (None, 600)               2764800          (4608, 600)       ",
    "dense_2 (Dense)               (None, 600)               360000           (600, 600)        ",
    "dense_3 (Dense)               (None, 600)               360000           (600, 600)        ",
    "dense_4 (Dense)               (None, 600)               360000           (600, 600)        ",
    "dense_5 (Dense)               (None, 4 keypoints)       2400             (600, 4 keypoints)",
    "===========================================================================================",
    # Fixed typo: this line used to read "Toal params".
    "Total params: 8,200,000",
    "Trainable params: 8,200,000",
    "Non-trainable params: 0",
)

# Emit the table one row per line, exactly as the individual print calls did.
for summary_line in architecture_summary:
    print(summary_line)

Layer (type)                  Output Shape              Param #          Param Shape       
conv2d_1 (Conv2D)             (None, 32, 96, 96)        800              (32, 1, 5, 5)     
maxpooling2d_1 (Pooling)      (None, 32, 48, 48)        0                                  
dropout_1 (Dropout)           (None, 32, 48, 48)        0                                  
conv2d_2 (Conv2D)             (None, 64, 48, 48)        51200            (64, 32, 5, 5)    
maxpooling2d_2 (Pooling)      (None, 64, 24, 24)        0                                  
dropout_2 (Dropout)           (None, 64, 24, 24)        0                                  
conv2d_3 (Conv2D)             (None, 128, 24, 24)       204800           (128, 64, 5, 5)   
maxpooling2d_3 (Pooling)      (None, 128, 12, 12)       0                                  
dropout_3 (Dropout)           (None, 128, 12, 12)       0                                  
conv2d_4 (Conv2D)             (None, 256, 12, 12)       819200           (256, 1

In [1]:
import os
import re
import sys
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.parsers import read_csv
from sklearn.utils import shuffle
from six.moves import cPickle

from sklearn.metrics import classification_report
import time

import theano
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.pool import pool_2d

import conv_net_helper as helper

# Report the Theano device, the Theano float type and the Python version, one per line.
# NOTE(review): the recorded output shows device "gpu" together with floatX "float64";
# Theano's legacy GPU backend is documented to accelerate float32 only -- confirm the
# intended floatX setting.
for setting in (theano.config.device, theano.config.floatX, sys.version):
    print(setting)

 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GRID K520 (CNMeM is disabled, cuDNN 5103)


gpu
float64
3.6.1 |Anaconda 4.4.0 (64-bit)| (default, May 11 2017, 13:09:58) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


#### Load Training Data and Create Training and Dev sets

In [3]:
# Paths to the training and test CSV files, relative to this notebook.
TRAIN = "../../../data/facial_keypoints/training.csv"
TEST = "../../../data/facial_keypoints/test.csv"

# Read the training file and turn it into inputs X and labels Y.
# (What the helpers do internally is not visible from this notebook.)
rawdata = helper.load_data(TRAIN)
X, Y = helper.loadXY(rawdata)

# Hold out the first seventh of the examples as the dev set; everything after
# that is used for training.
num_dev = len(X) // 7
dev_data, dev_labels = X[:num_dev], Y[:num_dev]
train_data, train_labels = X[num_dev:], Y[num_dev:]

print("Number of training examples:", len(train_data))
print("Number of dev examples:", len(dev_data))

Number of training examples: 6042
Number of dev examples: 1007


#### Column names of keypoint sets

In [4]:
# Names of all columns of the raw data except the last one.
# (Presumably the last column holds the image pixels -- confirm against the CSV.)
all_columns = rawdata.columns.values.tolist()
full_keypoints = all_columns[:-1]
# Label columns handled by each specialist model, keyed by a short set name.
# The insertion order matters: the per-set data lists built from this dict
# follow the same order.
cols_dict = {}

cols_dict["eye_ct"] = [
    "left_eye_center_x", "left_eye_center_y",
    "right_eye_center_x", "right_eye_center_y",
]

cols_dict["eye_cr"] = [
    "left_eye_inner_corner_x", "left_eye_inner_corner_y",
    "left_eye_outer_corner_x", "left_eye_outer_corner_y",
    "right_eye_inner_corner_x", "right_eye_inner_corner_y",
    "right_eye_outer_corner_x", "right_eye_outer_corner_y",
]

cols_dict["eyebrow"] = [
    "left_eyebrow_inner_end_x", "left_eyebrow_inner_end_y",
    "left_eyebrow_outer_end_x", "left_eyebrow_outer_end_y",
    "right_eyebrow_inner_end_x", "right_eyebrow_inner_end_y",
    "right_eyebrow_outer_end_x", "right_eyebrow_outer_end_y",
]

cols_dict["nose"] = ["nose_tip_x", "nose_tip_y"]

cols_dict["mouth_cr"] = [
    "mouth_left_corner_x", "mouth_left_corner_y",
    "mouth_right_corner_x", "mouth_right_corner_y",
]

cols_dict["mouth_ct_top"] = ["mouth_center_top_lip_x", "mouth_center_top_lip_y"]

cols_dict["mouth_ct_bottom"] = ["mouth_center_bottom_lip_x", "mouth_center_bottom_lip_y"]

# Last expression of the cell: display the set names.
cols_dict.keys()

dict_keys(['eye_ct', 'eye_cr', 'eyebrow', 'nose', 'mouth_cr', 'mouth_ct_top', 'mouth_ct_bottom'])

#### Subset Training Data - for keypoint sets

In [5]:
# Build one (data, labels) pair per keypoint set, in the order of cols_dict.
# Presumably helper.subset_data keeps only the examples that have every
# requested keypoint labeled -- confirm in the helper module.
train_data_sep = []
train_labels_sep = []
for key, keypoint_cols in cols_dict.items():
    subset_x, subset_y = helper.subset_data(train_data, train_labels,
                                            full_keypoints, keypoint_cols)
    train_data_sep.append(subset_x)
    train_labels_sep.append(subset_y)

#### Subset Dev Data - for keypoint sets

In [6]:
# Build one (data, labels) pair of dev examples per keypoint set, in the order
# of cols_dict.
# Presumably helper.subset_data keeps only the examples that have every
# requested keypoint labeled -- confirm in the helper module.
dev_data_sep = []
dev_labels_sep = []
for key, keypoint_cols in cols_dict.items():
    subset_x, subset_y = helper.subset_data(dev_data, dev_labels,
                                            full_keypoints, keypoint_cols)
    dev_data_sep.append(subset_x)
    dev_labels_sep.append(subset_y)

#### Reshape training and dev data into 4D tensors

In [7]:
print("training set shape, dev set shape")
# Replace every per-set array, in place, with the result of load_2d_images and
# print the resulting train / dev shapes side by side.
# NOTE(review): because the lists are overwritten in place, running this cell a
# second time passes already-converted data to the helper -- confirm that is safe.
for idx, flat_train in enumerate(train_data_sep):
    train_data_sep[idx] = helper.load_2d_images(flat_train, 96)
    dev_data_sep[idx] = helper.load_2d_images(dev_data_sep[idx], 96)
    print(train_data_sep[idx].shape, dev_data_sep[idx].shape)

training set shape, dev set shape
(6030, 1, 96, 96) (1003, 1, 96, 96)
(1917, 1, 96, 96) (330, 1, 96, 96)
(1865, 1, 96, 96) (325, 1, 96, 96)
(6042, 1, 96, 96) (1007, 1, 96, 96)
(1927, 1, 96, 96) (336, 1, 96, 96)
(1938, 1, 96, 96) (337, 1, 96, 96)
(6017, 1, 96, 96) (999, 1, 96, 96)


### Build separate CNN models

- 0: eye center
- 1: eye corner
- 2: eyebrow
- 3: nose
- 4: mouth corner
- 5: mouth center top
- 6: mouth center bottom

In [11]:
numModels = len(train_data_sep)

# One specialist network per keypoint set, stored in the same order as
# train_data_sep / train_labels_sep.
cnns = []
for set_labels in train_labels_sep:
    # Number of network outputs = number of label values of one example.
    # Read it from the first example (index 0, not 1) so that a subset holding
    # a single example does not raise an IndexError.
    numClasses = set_labels[0].size
    cnns.append(helper.convNetBuilder(numClasses, patchSize = [5,5],
                                      featureMapLayers = [32,64,128,256,512],
                                      numHiddenNodes = 600, numNNLayer = 5,
                                      imageWidth = 96, poolingSize = 2,
                                      train_dropout_rate = [0.2,0.5]))

In [12]:
## Validate the parameters' matrix shape
# For every model print a dashed separator followed by the shape of each of its
# parameters, in the order they are stored in `params`.
for net in cnns:
    print("-----------------")
    for param in net.params:
        print(param.get_value().shape)

-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 5)
(4608, 600)
(600, 600)
(600, 600)
(600, 600)
(600, 4)
-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 5)
(4608, 600)
(600, 600)
(600, 600)
(600, 600)
(600, 8)
-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 5)
(4608, 600)
(600, 600)
(600, 600)
(600, 600)
(600, 8)
-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 5)
(4608, 600)
(600, 600)
(600, 600)
(600, 600)
(600, 2)
-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 5)
(4608, 600)
(600, 600)
(600, 600)
(600, 600)
(600, 4)
-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 5)
(4608, 600)
(600, 600)
(600, 600)
(600, 600)
(600, 2)
-----------------
(32, 1, 5, 5)
(64, 32, 5, 5)
(128, 64, 5, 5)
(256, 128, 5, 5)
(512, 256, 5, 

### Training multiple CNN models

In [13]:
# Display names of the specialist models; the training cells index this list
# with the position of each model in `cnns`.
model_name = ["eye center", "eye corner", "eyebrow", "nose",
              "mouth corner", "mouth center top", "mouth center bottom"]

# Per-model containers for the training and validation results. Every slot
# starts as its own empty list and is overwritten by the training cells.
train_result = [[] for _ in range(numModels)]
val_result = [[] for _ in range(numModels)]

#### first training trial

In [None]:
rounds = 1000
# `rounds` evenly spaced learning-rate values, from 0.3 down to 0.0001.
alpha_schedule = np.linspace(0.3, 0.0001, rounds)

# Settings shared by every model in this trial.
# NOTE(review): both a fixed learning_rate and a schedule are passed; which one
# model.SGD actually uses is not visible from this notebook -- confirm.
sgd_settings = dict(update_rule = "backprop",
                    epochs = rounds,
                    miniBatchSize = 10,
                    learning_rate = 0.05,
                    learningRateSchedule = alpha_schedule)

# Train the models one after another and keep the two values SGD returns.
for i, model in enumerate(cnns):
    print("\n ======================================================")
    print("CNN Model", i + 1, " - ", model_name[i], "keypoint prediction")
    dev_pair = [dev_data_sep[i], dev_labels_sep[i]]
    fit_outcome = model.SGD(train_data_sep[i], train_labels_sep[i],
                            validation = dev_pair, **sgd_settings)
    train_result[i], val_result[i] = fit_outcome


CNN Model 1  -  eye center keypoint prediction

Epoch: 1 / 1000
training time: 880.30395484 s, ----- loss: 0.0158258945051 , RMSE: 6.03844855403
validation loss: 1.21478302394 , val RMSE: 52.9042539609

Epoch: 2 / 1000
training time: 873.586210012 s, ----- loss: 0.0158604053171 , RMSE: 6.0450288544
validation loss: 1.17748362595 , val RMSE: 52.0857204443

Epoch: 3 / 1000
training time: 882.13487339 s, ----- loss: 0.0184171028083 , RMSE: 6.51406208677
validation loss: 1.17156091347 , val RMSE: 51.9545603833

Epoch: 4 / 1000
training time: 760.372305155 s, ----- loss: 0.0107313276227 , RMSE: 4.97242182872
validation loss: 1.10928991295 , val RMSE: 50.5549597906

Epoch: 5 / 1000
training time: 497.775071859 s, ----- loss: 0.0141021642948 , RMSE: 5.70012162459
validation loss: 1.02869498341 , val RMSE: 48.6838088257

Epoch: 6 / 1000
training time: 495.802038193 s, ----- loss: 0.0138427848243 , RMSE: 5.64745750185
validation loss: 1.09384210481 , val RMSE: 50.2017152046

Epoch: 7 / 1000
tr


Epoch: 53 / 1000
training time: 499.498436928 s, ----- loss: 0.0118288135669 , RMSE: 5.22049676353
validation loss: 0.861996218366 , val RMSE: 44.5650006969

Epoch: 54 / 1000
training time: 503.695280552 s, ----- loss: 0.0117094853514 , RMSE: 5.19409802079
validation loss: 0.857425588494 , val RMSE: 44.4466934191

Epoch: 55 / 1000
training time: 497.90859127 s, ----- loss: 0.0105760187908 , RMSE: 4.93630907602
validation loss: 0.847507095229 , val RMSE: 44.1888713073


#### Try again with fewer epochs

- rmsprop

In [14]:
rounds = 20
# `rounds` evenly spaced learning-rate values, from 0.3 down to 0.0001.
alpha_schedule = np.linspace(0.3, 0.0001, rounds)

# Settings shared by every model in this trial (rmsprop update rule).
# NOTE(review): the recorded run of this cell shows the loss growing from
# epoch 1 to epoch 2 -- the schedule's starting value may be too large for
# rmsprop; confirm.
# NOTE(review): `cnns` holds the same model objects the previous training cell
# used, so unless they are rebuilt first this continues from their current
# weights -- confirm that is intended.
sgd_settings = dict(update_rule = "rmsprop",
                    epochs = rounds,
                    miniBatchSize = 10,
                    learning_rate = 0.05,
                    learningRateSchedule = alpha_schedule)

# Train the models one after another and keep the two values SGD returns.
for i, model in enumerate(cnns):
    print("\n ======================================================")
    print("CNN Model", i + 1, " - ", model_name[i], "keypoint prediction")
    dev_pair = [dev_data_sep[i], dev_labels_sep[i]]
    fit_outcome = model.SGD(train_data_sep[i], train_labels_sep[i],
                            validation = dev_pair, **sgd_settings)
    train_result[i], val_result[i] = fit_outcome


CNN Model 1  -  eye center keypoint prediction

Epoch: 1 / 20
training time: 641.937942505 s, ----- loss: 32.7267844569 , RMSE: 274.595177286
validation loss: 23.4298188629 , val RMSE: 232.340919039

Epoch: 2 / 20
training time: 953.477121115 s, ----- loss: 140.578078584 , RMSE: 569.115008638
validation loss: 124.701971006 , val RMSE: 536.016176246


KeyboardInterrupt: 

### Plot performance

In [10]:
# Plot the recorded training / validation results of every model.
# The loop walks the result lists themselves instead of a hard-coded count of
# 7, so it keeps working if the number of models changes.
# This cell needs train_result / val_result to exist: run it after the training
# cells (the recorded NameError comes from running it before they were defined).
for train_history, val_history in zip(train_result, val_result):
    helper.plot_performance(train_history, val_history)

NameError: name 'train_result' is not defined

### Save the model params

In [None]:
# Save every parameter of every model, one call per layer.
# File names are built from the model's cols_dict key (e.g. "eye_ct_layer9")
# instead of its display name in model_name: display names contain spaces
# ("eye center_layer9"), whereas the loading cell reads "eye_ct_layer9_weights".
# Presumably helper.save_layer_params appends a suffix such as "_weights" to
# the name it is given -- confirm in the helper module.
# cols_dict and cnns share the same order, so zip pairs each key with its model.
for set_key, model in zip(cols_dict, cnns):
    # Layers are numbered from 1 in the file names.
    for layer_number, layer_param in enumerate(model.params, start=1):
        filename = set_key + "_layer" + str(layer_number)
        helper.save_layer_params(layer_param, filename)

In [13]:
# Load one saved parameter back from disk and display its values as a sanity check.
saved_param = helper.load_saved_params("eye_ct_layer9_weights")
saved_param.get_value()

array([[-0.0039484 , -0.01108056,  0.00097547, ...,  0.00673776,
        -0.00781311, -0.00604274],
       [-0.01168752,  0.00146486,  0.00517273, ...,  0.00331349,
        -0.01609198,  0.00706688],
       [-0.00588158,  0.00856659, -0.00125814, ..., -0.00328748,
         0.0118611 ,  0.00234903],
       ..., 
       [-0.01497844, -0.00012292, -0.01610636, ...,  0.00627014,
         0.00585229, -0.00109457],
       [-0.01884747, -0.0079815 , -0.01920352, ..., -0.00610651,
        -0.00692251,  0.00126688],
       [-0.00646328, -0.00138179, -0.00709273, ...,  0.02022066,
         0.00733202, -0.00276212]])

----------------------------------------------------------------------