# Gesture Recognition Project:

### Problem statement: Develop a cool feature in the smart-TV that can recognise five different gestures performed by the user which will help users control the TV without using a remote.


### Importing Libraries

In [1]:
import numpy as np
import os
import imageio
from skimage.io import imread
from skimage.transform import resize as imresize
import datetime
from tensorflow.keras.applications import MobileNet
import cv2

np.random.seed(30)          # for reproducibility
import random as rn
rn.seed(30)
from keras import backend as K
import tensorflow as tf
# Seed TensorFlow as well: the NumPy and `random` seeds above do not control
# TensorFlow's own random generator (used e.g. for weight initialisation and
# dropout), so runs would still differ without this call.
tf.random.set_seed(30)

from keras.models import Sequential, Model
from keras.layers import Dense, GRU, Flatten, TimeDistributed, BatchNormalization, Activation, Dropout,LSTM
from keras.layers.convolutional import Conv3D, MaxPooling3D,Conv2D, MaxPooling2D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers

In [2]:
## Mount Google Drive
# from google.colab import drive
# drive.mount('/content/drive',force_remount=True)

### Read the folder names for training and validation

In [3]:
# Read every row of the train / validation CSV files and shuffle the rows once.
# Rows are semicolon-separated; the generators below use field 0 as the video
# folder name and field 2 as the integer class label.
# `with` closes the file handles, which a bare open(...).readlines() leaves open.

with open('datasets/Project_data/train.csv') as train_file:
    train_doc = np.random.permutation(train_file.readlines())
with open('datasets/Project_data/val.csv') as val_file:
    val_doc = np.random.permutation(val_file.readlines())
batch_size = 32

### Data Generator

##### Data Generator with grayscale color channel

In [4]:
def _load_gray_clip(video_dir, img_idx):
    """Load the selected frames of one video as a grayscale clip.

    Args:
        video_dir: Path of the folder holding the frame images of one video.
        img_idx: Positions of the frames to keep, counted in sorted file-name order.

    Returns:
        A numpy array of shape (len(img_idx), 120, 120, 1) holding each selected
        frame converted to grayscale, resized to 120x120 and divided by 255.

    Raises:
        IOError: If cv2.imread cannot read one of the selected frames.
    """
    # os.listdir returns entries in arbitrary order; sort them so that img_idx
    # picks the same frames, in the same sequence, on every run.
    # NOTE(review): assumes the frame file names sort chronologically
    # (e.g. zero-padded frame numbers) - confirm against the dataset.
    frame_names = sorted(os.listdir(video_dir))

    clip = np.zeros((len(img_idx), 120, 120, 1))
    for idx, item in enumerate(img_idx):
        frame_path = video_dir + '/' + frame_names[item]
        image = cv2.imread(frame_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise IOError('Could not read image: ' + frame_path)

        # Convert the image to grayscale for reducing the model parameters
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.resize(image, (120, 120))

        # Scale pixel values by 1/255 (maps 8-bit images to the range [0, 1])
        clip[idx, :, :, 0] = image / 255.0
    return clip


def generator(source_path, folder_list, batch_size):
    """
    Generator function for creating batches of grayscale data for training and validation.

    Loops forever. On every pass it reshuffles folder_list, yields
    len(folder_list) // batch_size full batches and then, if any rows are left
    over, one final smaller batch containing just those rows.

    Args:
        source_path: The path to the directory containing one sub-folder of frames per video.
        folder_list: Rows read from the CSV file; each row is 'folder;...;label',
            where field 0 is the video folder name and field 2 the integer class label (0-4).
        batch_size: The number of videos in a full batch.

    Yields:
        A tuple of (batch_data, batch_labels), where:
            batch_data: A numpy array of shape (n, len(img_idx), 120, 120, 1) containing the grayscale image data for the batch.
            batch_labels: A numpy array of shape (n, 5) containing the one-hot encoded labels for the batch.
        n equals batch_size, except for the last batch of a pass, which may be smaller.

    Raises:
        IOError: If a selected frame cannot be read.
    """
    print('Source path = ', source_path, '; batch size =', batch_size)
    img_idx = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28]  # Use even-numbered frames of video

    def build_batch(rows):
        # Assemble data and one-hot labels for the given CSV rows.
        batch_data = np.zeros((len(rows), len(img_idx), 120, 120, 1))  # Grayscale images
        batch_labels = np.zeros((len(rows), 5))  # One-hot encoded labels
        for folder, row in enumerate(rows):
            fields = row.strip().split(';')
            batch_data[folder] = _load_gray_clip(source_path + '/' + fields[0], img_idx)
            batch_labels[folder, int(fields[2])] = 1
        return batch_data, batch_labels

    while True:
        t = np.random.permutation(folder_list)  # Randomly shuffle the order of the folders in folder_list
        num_batches = len(folder_list) // batch_size  # Number of full batches

        for batch in range(num_batches):
            yield build_batch(t[batch * batch_size:(batch + 1) * batch_size])

        # Handle remaining data points after full batches
        remaining_samples = len(folder_list) % batch_size
        print('remaining samples found', remaining_samples)
        if remaining_samples > 0:
            yield build_batch(t[num_batches * batch_size:])


##### Data Generator with RGB Color channels (for transfer learning use case)

In [5]:
def _min_max_scale(channel):
    """Rescale one colour channel to [0, 1] with min-max normalisation.

    A constant channel (max == min) is returned as all zeros, because the plain
    formula would divide by zero and fill the batch with NaN values.
    """
    lowest = np.min(channel)
    span = np.max(channel) - lowest
    if span == 0:
        return np.zeros_like(channel)
    return (channel - lowest) / span


def _load_rgb_clip(video_dir, img_idx):
    """Load the selected frames of one video as a min-max normalised RGB clip.

    Args:
        video_dir: Path of the folder holding the frame images of one video.
        img_idx: Positions of the frames to keep, counted in sorted file-name order.

    Returns:
        A numpy array of shape (len(img_idx), 120, 120, 3); each of the first
        three channels of every frame is rescaled independently.
    """
    # os.listdir returns entries in arbitrary order; sort them so that img_idx
    # picks the same frames, in the same sequence, on every run.
    # NOTE(review): assumes the frame file names sort chronologically
    # (e.g. zero-padded frame numbers) - confirm against the dataset.
    frame_names = sorted(os.listdir(video_dir))

    clip = np.zeros((len(img_idx), 120, 120, 3))
    for idx, item in enumerate(img_idx):
        image = imread(video_dir + '/' + frame_names[item]).astype(np.float32)  # read each image
        image = imresize(image, (120, 120))  # resize image to (120, 120)

        # Use Min-Max rescaling for image normalization for each color channel
        for channel in range(3):
            clip[idx, :, :, channel] = _min_max_scale(image[:, :, channel])
    return clip


def generator_2(source_path, folder_list, batch_size):
    """
    Generator function for creating batches of RGB data for training and validation.

    Loops forever. On every pass it reshuffles folder_list, yields
    len(folder_list) // batch_size full batches and then, if any rows are left
    over, one final smaller batch containing just those rows.

    Args:
        source_path: The path to the directory containing one sub-folder of frames per video.
        folder_list: Rows read from the CSV file; each row is 'folder;...;label',
            where field 0 is the video folder name and field 2 the integer class label (0-4).
        batch_size: The number of videos in a full batch.

    Yields:
        A tuple of (batch_data, batch_labels), where:
            batch_data: A numpy array of shape (n, len(img_idx), 120, 120, 3) containing the image data for the batch.
            batch_labels: A numpy array of shape (n, 5) containing the one-hot encoded labels for the batch.
        n equals batch_size, except for the last batch of a pass, which may be smaller.
    """

    print('Source path = ', source_path, '; batch size =', batch_size)
    img_idx = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28]  # we will use even number frames (images) of video for model training
    # img_idx= list(range(30))                             # we will use all images of video

    def build_batch(rows):
        # Assemble data and one-hot labels for the given CSV rows.
        batch_data = np.zeros((len(rows), len(img_idx), 120, 120, 3))
        batch_labels = np.zeros((len(rows), 5))      # one-hot encoded representation of the output
        for folder, row in enumerate(rows):
            fields = row.strip().split(';')
            batch_data[folder] = _load_rgb_clip(source_path + '/' + fields[0], img_idx)
            batch_labels[folder, int(fields[2])] = 1
        return batch_data, batch_labels

    while True:
        t = np.random.permutation(folder_list)            # randomly shuffling the order of the folders in the folder_list
        num_batches = len(folder_list) // batch_size      # number of full batches
        for batch in range(num_batches):
            yield build_batch(t[batch * batch_size:(batch + 1) * batch_size])

        # Handle remaining data points which are left after full batches
        remaining_samples = len(folder_list) % batch_size
        if remaining_samples > 0:
            yield build_batch(t[num_batches * batch_size:])

Define the training and validation source paths (`train_path`, `val_path`) and the total number of epochs

In [6]:
# Start time of this run; it is used further down to name the checkpoint directory.
curr_dt_time = datetime.datetime.now()

# Root folders that contain one sub-folder of frames per video.
train_path = "datasets/Project_data/train"
val_path = "datasets/Project_data/val"

# Number of videos (CSV rows) available for training and validation.
num_train_sequences = len(train_doc)
num_val_sequences = len(val_doc)
print('# training sequences =', num_train_sequences)
print('# validation sequences =', num_val_sequences)

num_epochs = 30

# training sequences = 663
# validation sequences = 100


## Model Architecture

#### 1) Using Conv3D and MaxPooling3D ( using without BatchNormalization and dropout )

In [7]:
# def create_model():
#   model = Sequential()

#   model.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', input_shape=(15,120,120,1)))
#   model.add(MaxPooling3D((2, 2, 2)))

#   model.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
#   model.add(MaxPooling3D((2, 2, 2)))

#   model.add(Flatten())
#   model.add(Dense(128, activation='relu'))
#   model.add(Dense(5, activation='softmax'))

#   return model

#### 2) Using Conv3D and MaxPooling3D along with dropout layers and batch-normalization

In [8]:
# # Using Conv3D and MaxPooling3D along with Dropout layer

# def create_model():
#   model = Sequential()

#   model.add(Conv3D(16, (3, 3, 3), activation='relu', padding='same', input_shape=(15,120,120,1)))
#   model.add(BatchNormalization())
#   model.add(MaxPooling3D((2, 2, 2)))

#   model.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same'))
#   model.add(BatchNormalization())
#   model.add(MaxPooling3D((2, 2, 2)))
#   model.add(Dropout(0.25)) 

#   model.add(Flatten())  
#   model.add(Dense(64, activation='relu'))
#   model.add(Dropout(0.50))
#   model.add(Dense(5, activation='softmax'))

#   return model

#### 3) Using Conv3D and MaxPooling3D with a third convolutional block and two dense layers, along with Dropout layers and Batch-Normalization

In [9]:
# # Using Conv3D and MaxPooling3D along with Dropout layer and Batch-Normalization layer

# def create_model():
#   model = Sequential()

#   model.add(Conv3D(16, (3, 3, 3), activation='relu', padding='same', input_shape=(15,120,120,1)))
#   model.add(BatchNormalization())
#   model.add(MaxPooling3D((2, 2, 2)))

#   model.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same'))
#   model.add(BatchNormalization())
#   model.add(MaxPooling3D((2, 2, 2)))
#   model.add(Dropout(0.25)) 
    
#   model.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
#   model.add(BatchNormalization())
#   model.add(MaxPooling3D((2, 2, 2)))
#   model.add(Dropout(0.25))

#   model.add(Flatten())

#   model.add(Dense(128, activation='relu'))
#   model.add(Dropout(0.50))
#   model.add(Dense(64, activation='relu'))
#   model.add(Dropout(0.50))
#   model.add(Dense(5, activation='softmax'))

#   return model

#### 4) Same as Model-3 but removed Batch-Normalization and Dropout layers from Convolutional layers

In [10]:
# Using Conv3D and MaxPooling3D; Dropout is applied only after the dense layers and no Batch-Normalization is used

def create_model(input_shape=(15, 120, 120, 1), num_classes=5):
    """Build the (uncompiled) Conv3D gesture classifier.

    Architecture: three Conv3D(3x3x3, relu, 'same') + MaxPooling3D(2x2x2) blocks
    with 16, 32 and 64 filters, then Flatten, Dense(128) and Dense(64) each
    followed by Dropout(0.5), and a softmax output layer.

    Args:
        input_shape: Shape of one input clip as (frames, height, width, channels).
            Defaults to the 15-frame 120x120 grayscale clips used in this notebook.
        num_classes: Number of output classes (units of the softmax layer).

    Returns:
        A keras Sequential model; the caller is responsible for compiling it.
    """
    model = Sequential()

    for position, filters in enumerate((16, 32, 64)):
        conv_options = {'activation': 'relu', 'padding': 'same'}
        if position == 0:
            # Only the first layer of a Sequential model declares the input shape.
            conv_options['input_shape'] = input_shape
        model.add(Conv3D(filters, (3, 3, 3), **conv_options))
        model.add(MaxPooling3D((2, 2, 2)))

    model.add(Flatten())

    # Dropout is applied to the dense layers only.
    for units in (128, 64):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(0.50))
    model.add(Dense(num_classes, activation='softmax'))

    return model

#### 5) Pre-trained MobileNet model with RNN

In [11]:
# def create_model():
#     # Load MobileNet pre-trained on ImageNet without the top classification layer
#     base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(120, 120, 3))

#     # Freeze the weights of the base model
#     for layer in base_model.layers:
#         layer.trainable = False

#     # Define the model
#     model = Sequential()

#     # Add the MobileNet base model with TimeDistributed wrapper
#     model.add(TimeDistributed(base_model, input_shape=(15, 120, 120, 3)))

#     # Flatten the output from the TimeDistributed layers
#     model.add(TimeDistributed(Flatten()))

#     # Recurrent layers (LSTM)
#     model.add(LSTM(64))    

#     # Fully connected layers
#     model.add(Dense(64, activation='relu'))
#     model.add(Dense(5, activation='softmax'))
    
#     return model

#### 6) Pre-trained MobileNet model with RNN (LSTM) with Dropout layer

In [12]:
# def create_model():
#     # Load MobileNet pre-trained on ImageNet without the top classification layer
#     base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(120, 120, 3))

#     # Freeze the weights of the base model
#     for layer in base_model.layers:
#         layer.trainable = False

#     # Define the model
#     model = Sequential()

#     # Add the MobileNet base model with TimeDistributed wrapper
#     model.add(TimeDistributed(base_model, input_shape=(15, 120, 120, 3)))

#     # Flatten the output from the TimeDistributed layers
#     model.add(TimeDistributed(Flatten()))

#     # Recurrent layers (LSTM)
#     model.add(LSTM(64))    

#     # Fully connected layers
#     model.add(Dense(64, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(5, activation='softmax'))
    
#     return model

#### 7) Same as Model-6 but applied MaxPooling2D layer and used GRU instead of LSTM in RNN layer

In [13]:
# def create_model():
#     # Load MobileNet pre-trained on ImageNet without the top classification layer
#     base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(120, 120, 3))

#     # Freeze the weights of the base model
#     for layer in base_model.layers:
#         layer.trainable = False

#     # Define the model
#     model = Sequential()

#     # Add the MobileNet base model with TimeDistributed wrapper
#     model.add(TimeDistributed(base_model, input_shape=(15, 120, 120, 3)))
#     model.add(TimeDistributed(MaxPooling2D((2,2))))

#     # Flatten the output from the TimeDistributed layers
#     model.add(TimeDistributed(Flatten()))

#     # Recurrent layers (GRU)
#     model.add(GRU(32))
#     model.add(Dropout(0.25))

#     # Fully connected layers
#     model.add(Dense(64, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(5, activation='softmax'))
    
#     return model

In [14]:
# Build the currently active architecture and compile it for 5-way one-hot classification.
model = create_model()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the cell output.
model.summary()



2023-09-04 17:18:54.530567: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-09-04 17:18:54.530632: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14796 MB memory:  -> device: 0, name: Quadro RTX 5000, pci bus id: 0000:1e:00.0, compute capability: 7.5


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed (TimeDistr  (None, 15, 3, 3, 1024)   3228864   
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, 15, 1, 1, 1024)   0         
 tributed)                                                       
                                                                 
 time_distributed_2 (TimeDis  (None, 15, 1024)         0         
 tributed)                                                       
                                                                 
 gru (GRU)                   (None, 32)                101568    
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                        

Let's create `train_generator` and `val_generator`

In [15]:
# `generator` yields grayscale clips (1 channel), matching the Conv3D models.
# For Model architecture 5, 6 & 7 please use generator_2 (RGB clips) in both calls.
train_generator = generator(source_path=train_path, folder_list=train_doc, batch_size=batch_size)
val_generator = generator(source_path=val_path, folder_list=val_doc, batch_size=batch_size)

Let's define a `ModelCheckpoint` callback with an appropriate model file name, and a `ReduceLROnPlateau` callback that lowers the learning rate when the validation loss plateaus

In [16]:
# One checkpoint directory per run, named after the run's start timestamp.
model_name = 'model_init' + '_' + str(curr_dt_time).replace(' ','').replace(':','_') + '/'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test followed by os.mkdir().
os.makedirs(model_name, exist_ok=True)

# The file name records the epoch number plus training/validation loss and accuracy.
filepath = model_name + 'model-{epoch:05d}-{loss:.5f}-{categorical_accuracy:.5f}-{val_loss:.5f}-{val_categorical_accuracy:.5f}.h5'

# Checked at the end of every epoch; with save_best_only=True the full model is
# written only when val_loss improves on the best value seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', save_freq='epoch')

# Define the ReduceLROnPlateau callback and configure it with the desired parameters, such as the factor, patience, and min_lr. This callback will reduce the learning rate when a monitored metric (e.g., validation loss) plateaus.

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',   # Monitor validation loss
    factor=0.5,           # Reduce learning rate by a factor of 0.5
    patience=2,           # Number of epochs with no improvement after which learning rate will be reduced
    min_lr=1e-6,          # Minimum learning rate
    verbose=1             # Provide verbose output
)

callbacks_list = [checkpoint, reduce_lr]


`steps_per_epoch` and `validation_steps` tell `fit` how many `next()` calls it needs to make on the training and validation generators in each epoch.

In [17]:
# Ceiling division: when the number of sequences is not an exact multiple of
# batch_size, one extra step is needed to consume the final, smaller batch.
steps_per_epoch = num_train_sequences // batch_size + (1 if num_train_sequences % batch_size else 0)
validation_steps = num_val_sequences // batch_size + (1 if num_val_sequences % batch_size else 0)

Let's fit the model; a checkpoint is saved at the end of each epoch in which the validation loss improves

In [18]:
# Train from the generators; the callbacks take care of checkpointing and
# of lowering the learning rate when the validation loss stops improving.
model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=num_epochs,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=val_generator,
    validation_steps=validation_steps,
    class_weight=None,
    workers=1,
    initial_epoch=0,
)

Source path =  datasets/Project_data/train ; batch size = 32
Epoch 1/30


2023-09-04 17:19:02.430877: I tensorflow/stream_executor/cuda/cuda_dnn.cc:377] Loaded cuDNN version 8302



Epoch 00001: val_loss improved from inf to 1.54037, saving model to model_init_2023-09-0417_18_53.750742/model-00001-1.73880-0.23680-1.54037-0.30000.h5
Epoch 2/30
Epoch 00002: val_loss improved from 1.54037 to 1.41272, saving model to model_init_2023-09-0417_18_53.750742/model-00002-1.60462-0.27149-1.41272-0.50000.h5
Epoch 3/30
Epoch 00003: val_loss improved from 1.41272 to 1.28597, saving model to model_init_2023-09-0417_18_53.750742/model-00003-1.47094-0.36350-1.28597-0.70000.h5
Epoch 4/30
Epoch 00004: val_loss improved from 1.28597 to 1.14174, saving model to model_init_2023-09-0417_18_53.750742/model-00004-1.33164-0.44193-1.14174-0.67000.h5
Epoch 5/30
Epoch 00005: val_loss improved from 1.14174 to 1.05052, saving model to model_init_2023-09-0417_18_53.750742/model-00005-1.24231-0.50377-1.05052-0.67000.h5
Epoch 6/30
Epoch 00006: val_loss improved from 1.05052 to 0.90353, saving model to model_init_2023-09-0417_18_53.750742/model-00006-1.13282-0.55053-0.90353-0.70000.h5
Epoch 7/30
E

KeyboardInterrupt: 