## Model to detect eye state (close/open)

In [1]:
!python --version # Mine - Python 2.7.14 :: Anaconda, Inc.

Python 2.7.14 :: Anaconda, Inc.


In [2]:
from __future__ import print_function, division

In [5]:
#to get the usage of GPU
#!nvidia-smi

Sat Feb 17 14:15:20 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           On   | 00000000:00:1E.0 Off |                    0 |
| N/A   35C    P8    31W / 149W |      1MiB / 11439MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [7]:
% matplotlib inline

In [9]:
import sys, glob, subprocess
from itertools import product

import shutil
#from IPython.display import Image as JImage

import numpy as np
import cv2
import dlib
from PIL import Image
from skimage import io
from matplotlib.gridspec import GridSpec
from matplotlib import pyplot as plt

import tensorflow as tf

import keras
from keras.layers import Dense
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.applications.resnet50 import preprocess_input, decode_predictions

In [10]:
print(tf.__version__) # built using 1.4.1

1.4.1


In [11]:
print(keras.__version__) # Built using 2.1.3

2.1.3


### Data source
- [Website](http://parnec.nuaa.edu.cn/xtan/data/ClosedEyeDatabases.html)
- [Download link](http://parnec.nuaa.edu.cn/xtan/data/datasets/dataset_B_Facial_Images.rar)


### A note on preprocessing input for models trained with Imagenet data
- for vgg and other models - [here](https://github.com/flyyufelix/cnn_finetune)

Structure of the dataset expected by Keras for training:
```
data/dataset4
├── test
│   ├── closed
│   └── opened
└── train
    ├── closed
    └── opened
```

### Prepare data

In [14]:
# number of images per batch yielded by the training and validation generators
batch_size = 32

In [15]:
# ImageDataGenerator from Keras
# Randomly augments every image it yields and then applies the ResNet50
# `preprocess_input` (imported from keras.applications.resnet50) to it.
datagen = ImageDataGenerator(
        rotation_range=20,        # random rotations (range given in degrees)
        width_shift_range=0.2,    # random horizontal shifts (fraction of the width)
        height_shift_range=0.2,   # random vertical shifts (fraction of the height)
        shear_range=0.2,          # random shear
        zoom_range=0.2,           # random zoom
        horizontal_flip=True,     # random left/right mirroring
        fill_mode='nearest',      # how pixels exposed by a shift/rotation are filled in
        #data_format="channels_last",
        preprocessing_function=preprocess_input,  # applied to each image by the generator
)

In [16]:
# root folders of the training split and of the held-out ("test") split used for validation
train_dir = 'data/dataset4/train'
test_dir = 'data/dataset4/test'

In [17]:
# Training batches: images read from `train_dir` (one sub-folder per class),
# passed through `datagen` and paired with one-hot labels
train_generator = datagen.flow_from_directory(
    directory=train_dir,
    target_size=(224, 224),    # every image is resized to 224x224
    batch_size=batch_size,
    class_mode='categorical',  # labels are returned one-hot encoded
    shuffle=True,
)

Found 1937 images belonging to 2 classes.


In [18]:
# generator for validation data
# Validation images must not be randomly augmented: with augmentation the reported
# validation loss/accuracy changes from run to run and does not describe real inputs.
# Only the ResNet50 preprocessing is applied here.
validation_generator = ImageDataGenerator(
        preprocessing_function=preprocess_input,
).flow_from_directory(
        test_dir, #target directory
        target_size=(224, 224),  #all images will be resized to (224,224)
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False  #fixed order, so predictions can be matched to `validation_generator.classes`
)

Found 486 images belonging to 2 classes.


### Model

In [19]:
# ResNet50 with its ImageNet weights and its original classification head
# (input_tensor, input_shape and pooling are simply left at their default of None)
resnet50 = keras.applications.resnet50.ResNet50(include_top=True, weights="imagenet")

In [20]:
resnet50.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        input_1[0][0]                    
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 112, 112, 64) 0           bn_conv1[0][0]                   
__________________________________________________________________________________________________
max_poolin

In [21]:
# Replace the 1000-way ImageNet classifier with a 2-way softmax: the new Dense layer is
# attached to the output of the second-to-last ResNet50 layer
out = Dense(units=2, activation="softmax")(resnet50.layers[-2].output)
model_facex = Model(inputs=resnet50.input, outputs=out, name="EyeInTheSky")
# no layer is frozen, so every layer of the network is trained
model_facex.compile(
    optimizer='rmsprop',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [22]:
model_facex.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        input_1[0][0]                    
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         conv1[0][0]                      
__________________________________________________________________________________________________
activation_1 (Activation)       (None, 112, 112, 64) 0           bn_conv1[0][0]                   
__________________________________________________________________________________________________
max_poolin

In [30]:
# scratch check: number of full batches in the 486 validation images
# (the trailing comma makes the cell display the result as a 1-tuple)
486//batch_size,

(15,)

In [39]:
# train the model for 25 epochs; each epoch makes one full pass over the training
# generator and is followed by one full pass over the validation generator
model_facex.fit_generator(train_generator,
                          steps_per_epoch=len(train_generator),
                          epochs=25,
                          validation_data=validation_generator,
                          validation_steps=len(validation_generator),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7fdbe3bd4510>

In [391]:
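# save model (layers, train config) and weights
# NOTE: the prediction cells below load "FINAL1.h5"; uncomment the next line to (re)create that file after training
#model_facex.save("FINAL1.h5")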

In [38]:
# load complete model
#keras.models.load_model("FINAL1.h5")

### Prediction

#### Predict for video

Video source: https://www.youtube.com/watch?v=ZiGWZRDXCLc

In [None]:
def predict_on_video(model_path, in_video_path, out_video_path):
    """
        Predicts the eye state for every frame of the video at `in_video_path` and saves a copy of the
        video, annotated with the predicted label and its probability, to `out_video_path`.

        `model_path` - path to Keras model data (containing model layer data, model config data and weights)
        `in_video_path` - path of the video to read
        `out_video_path` - path of the annotated video to write (encoded with XVID)

        Returns None.
        Raises IOError if the video cannot be opened or does not report a usable frame rate.
    """

    def label_img(img, label, loc=(3,50)): #annotates the image (in place) with the predicted label (close/open)
        return cv2.putText(img, label, loc, cv2.FONT_HERSHEY_SIMPLEX, 3.4, (0, 255, 0), 2, cv2.LINE_AA)

    def to_model_input(frame):
        # OpenCV decodes frames as BGR, while the Keras generators used for training feed RGB
        # images to `preprocess_input` - convert first so the model sees the same channel order
        rgb=cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        return preprocess_input(cv2.resize(rgb.astype(np.float32),(224,224)))

    classes={1:"open",0:"close"}
    # setup to read video (the constructor already opens the file)
    vid = cv2.VideoCapture(in_video_path)
    vidout = None
    try:
        if not vid.isOpened():
            raise IOError("could not open video: %s" % in_video_path)
        fps=vid.get(cv2.CAP_PROP_FPS)
        if not fps>0: # also catches NaN
            raise IOError("could not read the frame rate of video: %s" % in_video_path)
        model=keras.models.load_model(model_path)
        # setup to write video; the exact (possibly fractional) frame rate is kept so that
        # the output plays at the same speed as the input
        h,w=map(int, [vid.get(cv2.CAP_PROP_FRAME_HEIGHT), vid.get(cv2.CAP_PROP_FRAME_WIDTH)])
        fourcc = cv2.VideoWriter_fourcc(*"XVID")
        vidout = cv2.VideoWriter(out_video_path,fourcc, fps, (w, h), True)
        frames_per_batch=max(int(fps), 1) # predict roughly 1 second of video per batch

        # Read until the decoder reports the end of the video instead of trusting the frame
        # count metadata: no trailing frames are lost and no empty batch is ever built.
        frames=list()
        while True:
            code,frame=vid.read()
            if code:
                frames.append(frame)
            # flush a full batch, or whatever is left once the video has ended
            if frames and (len(frames)==frames_per_batch or not code):
                preds=model.predict(np.stack([to_model_input(f) for f in frames], axis=0))
                for image,pred in zip(frames, preds):
                    best=int(np.argmax(pred)) # index of the most probable class
                    label=classes[best] + " (%0.2f)" %(pred[best])
                    vidout.write(label_img(image, label)) #write annotated original frame to video file
                frames=list()
            if not code: #end of video
                break
    finally:
        #close input and output video files, even if something failed above
        vid.release()
        if vidout is not None:
            vidout.release()
    return

# annotate the sample video using whole-frame predictions
# (expects the trained model to have been saved as "FINAL1.h5")
model_path="FINAL1.h5"
in_video_path="data/coca_cola_eyes_closed.mp4"
out_video_path="coca_cola_labelled.avi"
predict_on_video(model_path, in_video_path, out_video_path)

In [12]:
def detect_faces(image):
    """
        Finds faces in `image` (a numpy image array) with dlib's frontal face detector, as the
        model was trained on faces.
        Returns a list with one `(left, top, right, bottom)` tuple per detected face; the list
        is empty when no face is found.
        It would be better to use `dlib.cnn_face_detection_model_v1` but slower (http://dlib.net/cnn_face_detector.py.html)
    """
    # This function is called once per video frame, so the detector is built on the
    # first call only and then reused.
    face_detector=getattr(detect_faces, "_face_detector", None)
    if face_detector is None:
        face_detector=dlib.get_frontal_face_detector()
        detect_faces._face_detector=face_detector
    detected_faces=face_detector(image,1) # 1 = upsample the image once before detecting
    face_frames=[(x.left(),x.top(),x.right(),x.bottom()) for x in detected_faces]

    #using cnn face detector
    #cnn_face_detector=dlib.cnn_face_detection_model_v1("mmod_human_face_detector.dat")
    # get `mmod_human_face_detector.dat` at http://dlib.net/files/mmod_human_face_detector.dat.bz2
    #detected_faces=cnn_face_detector(image,1)
    #face_frames=[(x.rect.left(),x.rect.top(),x.rect.right(),x.rect.bottom()) for x in detected_faces]

    return face_frames

In [None]:
# More accurate than `predict_on_video`, but slower:
# the face is extracted from each frame and only the face is fed to the model
def predict_on_video2(model_path, in_video_path, out_video_path):
    """
        Predicts the eye state for every frame of the video at `in_video_path` and saves a copy of the
        video, annotated with the prediction, to `out_video_path`.
        For each frame the first face returned by `detect_faces` is cropped and classified; the frame is
        labelled "open"/"close" followed by the predicted probability, or "noface" if no face was found.

        `model_path` - path to Keras model data (containing model layer data, model config data and weights)
        `in_video_path` - path of the video to read
        `out_video_path` - path of the annotated video to write (encoded with XVID)

        Returns None.
        Raises IOError if the video cannot be opened or does not report a usable frame rate.
    """

    def label_img(img, label, loc=(3,50)): #annotates the image (in place) with the predicted label (close/open/noface)
        return cv2.putText(img, label, loc, cv2.FONT_HERSHEY_SIMPLEX, 3.4, (0, 255, 0), 2, cv2.LINE_AA)

    classes={1:"open",0:"close"}
    # setup to read video (the constructor already opens the file)
    vid = cv2.VideoCapture(in_video_path)
    vidout = None
    try:
        if not vid.isOpened():
            raise IOError("could not open video: %s" % in_video_path)
        fps=vid.get(cv2.CAP_PROP_FPS)
        if not fps>0: # also catches NaN
            raise IOError("could not read the frame rate of video: %s" % in_video_path)
        model=keras.models.load_model(model_path)
        # setup to write video; the exact (possibly fractional) frame rate is kept so that
        # the output plays at the same speed as the input
        h,w=map(int, [vid.get(cv2.CAP_PROP_FRAME_HEIGHT), vid.get(cv2.CAP_PROP_FRAME_WIDTH)])
        fourcc = cv2.VideoWriter_fourcc(*"XVID")
        vidout = cv2.VideoWriter(out_video_path,fourcc, fps, (w, h), True)
        frames_per_batch=max(int(fps), 1) # process roughly 1 second of video per batch

        # Read until the decoder reports the end of the video instead of trusting the frame
        # count metadata: no trailing frames are lost and no empty batch is ever built.
        frames=list()
        while True:
            code,frame=vid.read()
            if code:
                frames.append(frame)
            # flush a full batch, or whatever is left once the video has ended
            if frames and (len(frames)==frames_per_batch or not code):
                # The face bookkeeping is rebuilt for every batch, so a frame without a face
                # cannot affect the labels of later batches.
                face_idx=list()    # positions (within `frames`) of the frames that contain a face
                face_inputs=list() # preprocessed face crops, aligned with `face_idx`
                for i,image in enumerate(frames):
                    # OpenCV decodes frames as BGR; the detector and the model are given RGB
                    rgb=cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                    face_rect=detect_faces(rgb)
                    if len(face_rect)>0:
                        # Only the copy sent to the model is cropped: the original frame is what
                        # gets written, so it still matches the writer's frame size.
                        face=np.array(Image.fromarray(rgb).crop(face_rect[0]))
                        face_idx.append(i)
                        face_inputs.append(preprocess_input(cv2.resize(face.astype(np.float32),(224,224))))

                labels=["noface"]*len(frames)
                if face_inputs: # nothing to predict when no frame of the batch shows a face
                    preds=model.predict(np.stack(face_inputs, axis=0))
                    for i,pred in zip(face_idx, preds):
                        best=int(np.argmax(pred)) # index of the most probable class
                        labels[i]=classes[best] + " (%0.2f)" %(pred[best])

                for image,label in zip(frames, labels):
                    vidout.write(label_img(image, label)) #write annotated original frame to video file
                frames=list()
            if not code: #end of video
                break
    finally:
        #close input and output video files, even if something failed above
        vid.release()
        if vidout is not None:
            vidout.release()
    return

# annotate the sample video using predictions made on the detected face of each frame
# (expects the trained model to have been saved as "FINAL1.h5")
model_path="FINAL1.h5"
in_video_path="data/coca_cola_eyes_closed.mp4"
out_video_path="coca_cola_labelled2.avi"
predict_on_video2(model_path, in_video_path, out_video_path)

#### Predict for image

In [80]:
def predict_on_image(model_path, in_image_path):
    """
        Makes prediction for the image at `in_image_path` and returns one of {"open", "close", "noface"}.
        The first face returned by `detect_faces` is cropped, resized to 224x224, preprocessed and
        classified; "noface" is returned (without loading the model) when no face is found.

        `model_path` - path to Keras model data (containing model layer data, model config data and weights)
        `in_image_path` - path of the image file to classify
    """
    classes={1:"open",0:"close"}
    # Force 3-channel RGB so that grayscale and RGBA files also work with the rest of the pipeline
    image=np.array(Image.fromarray(io.imread(in_image_path)).convert("RGB"))
    face_rect=detect_faces(image)
    if len(face_rect)==0:
        return "noface"
    face=np.array(Image.fromarray(image).crop(face_rect[0]))
    # preprocess image - resize and remove mean from RGB channels
    processed_image=np.stack([preprocess_input(cv2.resize(face.astype(np.float32),(224,224)))], axis=0)
    # the model is only loaded once there is a face to classify
    model=keras.models.load_model(model_path)
    pred=model.predict(processed_image)
    return classes[int(np.argmax(pred[0]))] #close/open label for image

# classify a single sample image (expects the trained model to have been saved as "FINAL1.h5")
model_path="FINAL1.h5"
in_image_path="sample_image.jpg"
predict_on_image(model_path, in_image_path)

'close'

### End