In [1]:
# import Library
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential, load_model
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.optimizers import Adam,SGD, RMSprop
from keras.utils import np_utils, generic_utils
import tensorflow as tf
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

Using TensorFlow backend.


In [2]:
# set up label and  config
classes = ['Swiping Left', 'Swiping Right', 'Swiping Down', 'Swiping Up', 'Zooming In', 'Zooming Out', 'No Gesture' ]
img_rows,img_cols,img_depth = 84, 84, 18
step_sample = 2

# data folder
train_folder = 'dataset\\train\\'

In [3]:
# Load training data
X_tr = []     # variable to store entire dataset

#Reading data from each class
for class_name in classes:    
    class_folder = os.path.join(train_folder, class_name )
    listing = os.listdir(class_folder)
    for vid in listing:
        frames = []
        # set number of necessary frames
        num_frames_necessary = img_depth * step_sample 
        vid = os.path.join(class_folder, vid)
        frame_names = os.listdir(vid)
        frame_names = list(sorted(frame_names))
    #     print(frame_names)
        num_frames = len(frame_names)
        # pick frames
        offset = 0
        if num_frames_necessary > num_frames:
            # Pad last frame if video is shorter than necessary
            frame_names += [frame_names[-1]] * \
                (num_frames_necessary - num_frames)        
        elif num_frames_necessary < num_frames:
            # If there are more frames, then sample starting offset.
            diff = (num_frames - num_frames_necessary)
            offset = np.random.randint(0, diff)
        frame_names = frame_names[offset:num_frames_necessary + offset:2]      
        for name in frame_names:
            name = os.path.join(vid,name)
    #         print(name)
            frame= cv2.imread(name)
            frame=cv2.resize(frame,(img_rows,img_cols),interpolation=cv2.INTER_AREA)
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frames.append(gray)      
        ipt = np.array(frames)     
        X_tr.append(ipt)
    print(np.array(X_tr).shape)
    print('load {} data completed!'.format(class_name))



(1000, 18, 84, 84)
load Swiping Left data completed!
(2000, 18, 84, 84)
load Swiping Right data completed!
(3000, 18, 84, 84)
load Swiping Down data completed!
(4000, 18, 84, 84)
load Swiping Up data completed!
(5000, 18, 84, 84)
load Zooming In data completed!
(6000, 18, 84, 84)
load Zooming Out data completed!
(7000, 18, 84, 84)
load No Gesture data completed!


In [4]:
X_tr = np.array(X_tr)
print(X_tr.shape)

(7000, 18, 84, 84)


In [5]:
# Assign Label to each class for train set
num_samples = X_tr.shape[0]
label=np.ones((num_samples,),dtype = int)
label[0:1000]= 0         # swiping left
label[100:2000] = 1      # swiping right
label[2000:3000] = 2     # swiping down
label[3000:4000] = 3     # swiping up
label[4000:5000] = 4     # zooming in
label[5000:6000] = 5     # zooming out
label[6000:7000] = 6     # No gestures


X_train = np.array(X_tr) 
y_train = label

print(X_train.shape)


(7000, 18, 84, 84)


In [6]:
X_train = np.expand_dims(X_train, axis=-1)
print(X_train.shape)

(7000, 18, 84, 84, 1)


In [7]:
# Pre-processing
X_train = X_train .astype('float32')
X_train  -= np.mean(X_train )
X_train  /= np.max(X_train )

nb_classes = 7
# convert class vectors to binary class matrices
Y_train = np_utils.to_categorical(y_train, nb_classes)

In [8]:
from keras.layers.convolutional import Conv3D, MaxPooling3D
"""See: 'https://arxiv.org/pdf/1412.0767.pdf' """
# Tunable parameters
kernel_size = (3, 3, 3)
strides = (1, 1, 1)
extra_conv_blocks = 1

model = Sequential()

# Conv Block 1
model.add(Conv3D(32, kernel_size, strides=strides, activation='relu',
                 padding='same', input_shape= (18,84,84,1)))
model.add(MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2)))

# Conv Block 2
model.add(Conv3D(64, kernel_size, strides=strides, activation='relu',
                 padding='same'))
model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2)))

# Conv Block 3
model.add(Conv3D(128, kernel_size, strides=strides, activation='relu',
                 padding='same'))
model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2)))

# Conv Block 4
model.add(Conv3D(128, kernel_size, strides=strides, activation='relu',
                 padding='same'))
model.add(MaxPooling3D(pool_size=(2, 2, 2), strides=(2, 2, 2)))

# Dense Block
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(nb_classes, activation='softmax'))



In [9]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d_1 (Conv3D)            (None, 18, 84, 84, 32)    896       
_________________________________________________________________
max_pooling3d_1 (MaxPooling3 (None, 18, 42, 42, 32)    0         
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 18, 42, 42, 64)    55360     
_________________________________________________________________
max_pooling3d_2 (MaxPooling3 (None, 9, 21, 21, 64)     0         
_________________________________________________________________
conv3d_3 (Conv3D)            (None, 9, 21, 21, 128)    221312    
_________________________________________________________________
max_pooling3d_3 (MaxPooling3 (None, 4, 10, 10, 128)    0         
_________________________________________________________________
conv3d_4 (Conv3D)            (None, 4, 10, 10, 128)    442496    
__________

In [10]:
optimizer = Adam(lr=0.001)
model.compile(loss='categorical_crossentropy', optimizer= optimizer, metrics=['accuracy'])

In [11]:
# Split the data
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(X_train, Y_train, test_size=0.2, random_state=4)

In [12]:
from keras.callbacks import ModelCheckpoint

filepath = "handgesture.weights.best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose =1, save_best_only = True, mode ='max')
callbacks_list= [checkpoint]


In [13]:
batch_size = 8
nb_epoch = 10

# Train the model

hist = model.fit(
    X_train_new,
    y_train_new,
    validation_data=(X_val_new,y_val_new),
    batch_size = batch_size,
    epochs = nb_epoch,
    shuffle=True,
    callbacks  = callbacks_list,
    verbose = 2
    )





Train on 5600 samples, validate on 1400 samples
Epoch 1/10
 - 50s - loss: 1.1672 - acc: 0.5534 - val_loss: 0.8077 - val_acc: 0.7057

Epoch 00001: val_acc improved from -inf to 0.70571, saving model to handgesture.weights.best.hdf5
Epoch 2/10
 - 47s - loss: 0.7189 - acc: 0.7375 - val_loss: 0.5497 - val_acc: 0.8257

Epoch 00002: val_acc improved from 0.70571 to 0.82571, saving model to handgesture.weights.best.hdf5
Epoch 3/10
 - 47s - loss: 0.5533 - acc: 0.8130 - val_loss: 0.5215 - val_acc: 0.8171

Epoch 00003: val_acc did not improve from 0.82571
Epoch 4/10
 - 47s - loss: 0.4344 - acc: 0.8554 - val_loss: 0.4590 - val_acc: 0.8557

Epoch 00004: val_acc improved from 0.82571 to 0.85571, saving model to handgesture.weights.best.hdf5
Epoch 5/10
 - 47s - loss: 0.3618 - acc: 0.8827 - val_loss: 0.4480 - val_acc: 0.8621

Epoch 00005: val_acc improved from 0.85571 to 0.86214, saving model to handgesture.weights.best.hdf5
Epoch 6/10
 - 47s - loss: 0.2950 - acc: 0.9075 - val_loss: 0.4357 - val_acc:

In [20]:
model.load_weights("handgesture.weights.best.hdf5")

# Evaluate the model
score = model.evaluate(
    X_val_new,
    y_val_new,
    batch_size=batch_size,
    )
print('accuracy:',score[1]*100)

accuracy: 88.21428571428571


In [23]:
test_folder = 'dataset\\test\\'
# Load training data
X_test = []     # variable to store entire dataset

#Reading data from each class
for class_name in classes:    
    class_folder = os.path.join(test_folder, class_name )
    listing = os.listdir(class_folder)
    for vid in listing:
        frames = []
        # set number of necessary frames
        num_frames_necessary = img_depth * step_sample 
        vid = os.path.join(class_folder, vid)
        frame_names = os.listdir(vid)
        frame_names = list(sorted(frame_names))
    #     print(frame_names)
        num_frames = len(frame_names)
        # pick frames
        offset = 0
        if num_frames_necessary > num_frames:
            # Pad last frame if video is shorter than necessary
            frame_names += [frame_names[-1]] * \
                (num_frames_necessary - num_frames)        
        elif num_frames_necessary < num_frames:
            # If there are more frames, then sample starting offset.
            diff = (num_frames - num_frames_necessary)
            offset = np.random.randint(0, diff)
        frame_names = frame_names[offset:num_frames_necessary + offset:2]      
        for name in frame_names:
            name = os.path.join(vid,name)
    #         print(name)
            frame= cv2.imread(name)
            frame=cv2.resize(frame,(img_rows,img_cols),interpolation=cv2.INTER_AREA)
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frames.append(gray)      
        ipt = np.array(frames)     
        X_test.append(ipt)
    print(np.array(X_test).shape)
    print('load {} data completed!'.format(class_name))
    
    

(250, 18, 84, 84)
load Swiping Left data completed!
(500, 18, 84, 84)
load Swiping Right data completed!
(750, 18, 84, 84)
load Swiping Down data completed!
(1000, 18, 84, 84)
load Swiping Up data completed!
(1250, 18, 84, 84)
load Zooming In data completed!
(1500, 18, 84, 84)
load Zooming Out data completed!
(1750, 18, 84, 84)
load No Gesture data completed!


In [27]:
# Assign Label to each class for test set
num_samples = np.array(X_test).shape[0]
label=np.ones((num_samples,),dtype = int)
label[0:250]= 0         # swiping left
label[250:500] = 1      # swiping right
label[500:750] = 2     # swiping down
label[750:1000] = 3     # swiping up
label[1000:1250] = 4     # zooming in
label[1250:1500] = 5     # zooming out
label[1500:1750] = 6     # No gestures

X_test = np.array(X_test)
X_test = np.expand_dims(X_test, axis=-1)

y_test = label
Y_test = np_utils.to_categorical(y_test, nb_classes)
print(X_test.shape)
print(Y_test.shape)


(1750, 18, 84, 84, 1)
(1750, 7)


In [30]:
# Pre-processing
X_testing = X_test.astype('float32')
X_testing  -= np.mean(X_testing )
X_testing  /= np.max(X_testing )

In [31]:
# Evaluate the model
score = model.evaluate(
    X_testing,
    Y_test,
    batch_size=batch_size,
    )
print('accuracy:',score[1]*100)

accuracy: 73.6


In [35]:
### Generate Confusion matrix
import sklearn 
from sklearn.metrics import confusion_matrix
Y_predict = model.predict_classes(X_testing)
confusion_matrix(label,Y_predict)

array([[  0, 243,   0,   2,   3,   2,   0],
       [  0, 246,   1,   0,   1,   2,   0],
       [  0,  14, 199,  19,  10,   7,   1],
       [  0,   4,  37, 189,   9,  11,   0],
       [  0,   4,   7,   6, 198,  32,   3],
       [  0,   4,  10,   2,  17, 212,   5],
       [  0,   2,   1,   1,   0,   2, 244]], dtype=int64)

In [41]:
cv2.imshow('input',X_test[1,1,:,:,:])
cv2.waitKey()

92

In [53]:
test_vid_id = 1 
test_vid = X_test[test_vid_id]
test_vid_t = np.expand_dims(test_vid, axis=0)

predicted = int(model.predict_classes(test_vid_t))

for i in range(18):
    image = test_vid[i]
    image = cv2.resize(image,(800,500), interpolation = cv2.INTER_CUBIC)
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(image,'Predict_gesture: {}'.format(classes[predicted]),(50,50), font, 1 ,(255,0,0),1,cv2.LINE_AA)
    cv2.imshow('input',image)
    cv2.waitKey(500)
    
cv2.destroyAllWindows()

