In [1]:

import os, glob
import imageio
import itertools
import math

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, TimeDistributed, LSTM, Input, BatchNormalization, Conv2D, MaxPooling2D, Reshape, Conv1D, GlobalAveragePooling1D, MaxPooling1D, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Show the installed TensorFlow version so the run can be reproduced
# (the saved output below was produced with 2.2.0).
tf.__version__

'2.2.0'

In [3]:
# Fail fast when TensorFlow does not report a GPU as device 0; otherwise
# echo the device name so it is recorded in the notebook output.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [4]:
# Reading will be updated for the new dataset
from reading import read_gif

In [5]:

"""
Dataset configuration and the batch generator that feeds video clips to the network.

"""
# Image variables need to be updated for the new dataset
IMAGE_HEIGHT = 200   # frame height used for the batch array shape and the model input
IMAGE_WIDTH = 200    # frame width used for the batch array shape and the model input
IMAGE_CHANNEL = 3    # channels per frame in the batch array
NUM_FRAMES = 50      # frames per clip; every batch array has this many frames per sample
NUM_CLASSES = 2      # width of the one-hot labels and default size of the model output
        
        
class GenerateDataset(object):
    """Endless batch generator over a directory tree of class-labelled GIF clips.

    Clips are decoded one batch at a time, so the whole dataset never has to
    be held in memory.

    Expected layout: ``file_path/<class_folder>/.../<prefix>_<rest>.gif``.
    Class indices are assigned from the sorted names of the folders directly
    under ``file_path``. Clips are grouped by the part of the file name before
    the first underscore, and at most ``n_items`` clips are kept per group.

    :param file_path: Root directory that contains one folder per class.
    :param directory: Stored and forwarded to the helper methods; none of
        them currently reads it.
    :param n_items: Maximum number of clips kept per file-name prefix. A value
        that is never reached (e.g. ``0`` or ``None``) keeps every clip.
    """
    def __init__(self, file_path, directory, n_items):
        self.n_items = n_items
        self.directory = directory
        self.file_path = file_path
        # Counted once up front so callers can size their epochs.
        self.num_samples = len(self.samples(self.get_video_files(self.file_path, self.directory)))

    def load_video(self, filename):
        """Decode one clip by delegating to ``read_gif``.

        :param filename: Path of the GIF file to decode.
        :return: Whatever ``read_gif`` returns. ``_generator`` requires it to
            convert to an array of shape
            ``(NUM_FRAMES, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL)``.
        """
        # This part should be changed after reading.py is updated for the new dataset
        return read_gif(filename)

    def get_sample_size(self):
        """Return the number of clips selected when the object was created."""
        return self.num_samples

    def create_df(self, file_path):
        """Build the class-folder -> class-index table.

        Only real, non-hidden sub-directories count as classes (so stray files
        or ``.ipynb_checkpoints`` do not shift the indices), and they are
        sorted so the mapping is identical on every run and platform.

        :param file_path: Root directory that contains one folder per class.
        :return: DataFrame with columns ``directory`` and ``class``.
        """
        class_folders = sorted(
            entry for entry in os.listdir(file_path)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(file_path, entry))
        )
        d = {
            'directory': class_folders,
            'class': list(range(len(class_folders))),
        }
        print(d)
        return pd.DataFrame(d)

    def get_video_files(self, file_path, directory=None):
        """Collect every ``.gif`` under ``file_path``, grouped by name prefix.

        :param file_path: Directory tree to scan recursively.
        :param directory: Unused; kept for interface compatibility.
        :return: Dict mapping the text before the first ``_`` in each file
            name to the list of full paths sharing that prefix.
        """
        grouped = {}
        for root, dirs, files in os.walk(file_path):
            # Sorting in place makes os.walk descend in a stable order, so the
            # clips kept by the ``n_items`` cap are the same on every run.
            dirs.sort()
            for name in sorted(files):
                if name.endswith(".gif"):
                    prefix = name.split('_')[0]
                    grouped.setdefault(prefix, []).append(os.path.join(root, name))
        return grouped

    def generator(self):
        """Create the endless batch generator for this dataset.

        :return: Generator yielding ``(x, y)`` batches; see ``_generator``.
        """
        data = self.create_df(self.file_path)
        video_files = self.get_video_files(self.file_path, self.directory)
        return self._generator(data, directory = self.directory, video_files = video_files)

    def samples(self, video_files):
        """Flatten the grouped files, keeping at most ``n_items`` per group.

        :param video_files: Dict as returned by ``get_video_files``.
        :return: List of selected file paths.
        """
        selected = []
        for paths in video_files.values():
            for count, path in enumerate(paths, start=1):
                selected.append(path)
                if count == self.n_items:
                    break
        return selected

    def _class_folder(self, path):
        """Return the name of the folder directly under ``file_path`` holding ``path``.

        Uses ``os.path`` instead of splitting on a hard-coded separator so the
        lookup works with both POSIX and Windows style paths.
        """
        relative = os.path.relpath(path, self.file_path)
        return relative.split(os.sep)[0]

    def _generator(self, data, directory=None, video_files=None, BATCH_SIZE = 1):
        """Yield shuffled ``(x_train, y_train)`` batches forever.

        :param data: DataFrame from ``create_df`` (columns ``directory``, ``class``).
        :param directory: Unused; kept for interface compatibility.
        :param video_files: Dict from ``get_video_files``; scanned from
            ``self.file_path`` when omitted.
        :param BATCH_SIZE: Number of clips per yielded batch.
        :return: Generator of ``(x_train, y_train)`` where ``x_train`` is a
            float32 array of shape ``(batch, NUM_FRAMES, IMAGE_HEIGHT,
            IMAGE_WIDTH, IMAGE_CHANNEL)`` scaled by 1/255 and ``y_train`` is
            the one-hot label array with ``NUM_CLASSES`` columns.
        :raises ValueError: If no clips were found, a clip does not have the
            expected shape, or a clip is not inside a known class folder.
        """
        if video_files is None:
            video_files = self.get_video_files(self.file_path, directory)

        train = self.samples(video_files)
        if not train:
            # Without this guard the loop below would spin forever yielding nothing.
            raise ValueError('No .gif files found under {!r}'.format(self.file_path))

        label_of = dict(zip(data['directory'], data['class']))
        expected_shape = (NUM_FRAMES, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL)

        while True:
            # Reshuffle the sample order at the start of every pass.
            indices_arr = np.random.permutation(len(train))

            for batch in range(0, len(indices_arr), BATCH_SIZE):
                current_batch = indices_arr[batch:(batch + BATCH_SIZE)]

                frames = []
                labels = []
                for i in current_batch:
                    path = train[i]

                    folder = self._class_folder(path)
                    if folder not in label_of:
                        raise ValueError(
                            'Clip {!r} is not inside a known class folder {}'.format(
                                path, sorted(label_of)))

                    clip = np.asarray(self.load_video(path), dtype=np.float32)
                    if clip.shape != expected_shape:
                        raise ValueError(
                            'Clip {!r} has shape {}, expected {}'.format(
                                path, clip.shape, expected_shape))

                    # Scale pixel values; float32 keeps a batch half the size
                    # of the float64 array that integer input would produce.
                    frames.append(clip / 255)
                    labels.append(label_of[folder])

                # Assemble the batch once instead of re-copying it per clip.
                x_train = np.stack(frames)
                y_train = to_categorical(np.asarray(labels, dtype=np.int32), num_classes=NUM_CLASSES)

                yield(x_train, y_train)
        

In [6]:
class LibiumNet(object):
    """Video clip classifier: a per-frame CNN followed by 1-D convolutions over time.

    Every frame is reduced to one channel, passed through a small
    convolutional feature extractor and flattened. Two ``Conv1D`` layers then
    run along the frame axis, and a softmax ``Dense`` layer produces the class
    probabilities. The Keras model is available as ``self.model``.

    :param img_c: Number of channels per input frame (default 3).
    :param img_w: Width of an input frame (default ``IMAGE_WIDTH``).
    :param img_h: Height of an input frame (default ``IMAGE_HEIGHT``).
    :param frames_n: Number of frames per input clip (default ``NUM_FRAMES``).
    :param output_size: Number of output classes (default ``NUM_CLASSES``).
    """
    def __init__(self, img_c=3, img_w=IMAGE_WIDTH, img_h=IMAGE_HEIGHT, frames_n=NUM_FRAMES, output_size=NUM_CLASSES):
        self.img_c = img_c
        self.img_w = img_w
        self.img_h = img_h
        self.frames_n = frames_n
        self.output_size = output_size
        self.history = None  # set by train(); read by the visualize_* methods
        self.build()

    def build(self):
        """Assemble the network and store it in ``self.model``.

        The input is ``(frames_n, img_h, img_w, img_c)``: height before width,
        matching the batches produced by ``GenerateDataset`` and Keras'
        channels-last (rows, cols, channels) convention.
        """
        frame_shape = (self.img_h, self.img_w, self.img_c)
        input_shape = (self.frames_n,) + frame_shape
        chanDim = -1  # channels-last: normalise over the final axis

        feature_extractor = Sequential()

        # Average the colour channels into a single one, then halve the resolution.
        feature_extractor.add(Lambda(lambda x: tf.keras.backend.mean(x, axis=3, keepdims=True), input_shape=frame_shape))
        feature_extractor.add(MaxPooling2D(pool_size=(2, 2)))

        # first CONV => RELU => CONV => RELU => POOL layer set
        feature_extractor.add(Conv2D(32, (3, 3)))
        feature_extractor.add(Activation("relu"))
        feature_extractor.add(BatchNormalization(axis=chanDim))
        feature_extractor.add(Conv2D(32, (3, 3)))
        feature_extractor.add(Activation("relu"))
        feature_extractor.add(BatchNormalization(axis=chanDim))
        feature_extractor.add(MaxPooling2D(pool_size=(2, 2)))
        feature_extractor.add(Dropout(0.25))

        # second CONV => RELU => CONV => RELU => POOL layer set
        feature_extractor.add(Conv2D(64, (3, 3)))
        feature_extractor.add(Activation("relu"))
        feature_extractor.add(BatchNormalization(axis=chanDim))
        feature_extractor.add(Conv2D(64, (3, 3)))
        feature_extractor.add(Activation("relu"))
        feature_extractor.add(BatchNormalization(axis=chanDim))
        feature_extractor.add(MaxPooling2D(pool_size=(2, 2)))
        feature_extractor.add(Dropout(0.25))

        # Temporal model: two width-2 convolutions along the frame axis.
        model_m = Sequential()
        model_m.add(Conv1D(10, 2, activation='relu'))
        model_m.add(Conv1D(10, 2, activation='relu'))

        self.input_data = Input(name='the_input', shape=input_shape, dtype='float32')
        # Apply the same feature extractor to every frame of the clip.
        self.image_frame_features = TimeDistributed(feature_extractor)(self.input_data)

        # One flat feature vector per frame, as required by Conv1D.
        self.flat = TimeDistributed(Flatten())(self.image_frame_features)

        self.sequence = Flatten()(model_m(self.flat))

        self.dense = Dense(self.output_size, activation='softmax')(self.sequence)

        self.model = Model(inputs = self.input_data, outputs=self.dense)

    def summary(self):
        """Summarizes the architecture of the model.

        :return: The result of ``self.model.summary()``.
        """
        return self.model.summary()

    def train(self, generator,steps_per_epoch=None, epochs=1,validation_data=None, validation_steps=None):
        """Compile the model and fit it; the Keras history is kept in ``self.history``.

        Training stops early once the monitored loss has not improved for
        three epochs. The validation loss is monitored when validation data is
        supplied, otherwise the training loss.

        :param generator: Training data passed to ``Model.fit``.
        :param steps_per_epoch: Batches drawn per epoch. Must be given when
            the generator is endless, as ``GenerateDataset`` generators are.
        :param epochs: Maximum number of epochs.
        :param validation_data: Optional validation data passed to ``Model.fit``.
        :param validation_steps: Validation batches drawn per epoch.
        """
        # Callbacks
        monitored = 'val_loss' if validation_data is not None else 'loss'
        early_stopping_monitor = EarlyStopping(monitor=monitored, patience=3)
        callbacks_list = [early_stopping_monitor]

        print('Training...')

        # Compiling here means every call to train() starts with a fresh optimizer.
        self.model.compile(
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy']
        )

        self.history = self.model.fit(
            generator,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=validation_data,
            validation_steps=validation_steps,
            callbacks=callbacks_list,
        )

    def predict(self, input_batch):
        """Predicts a batch of clips.

        :param input_batch: A batch of frame sequences matching the model input shape.
        :return: The model output: the predicted class probabilities.
        """
        # training=False keeps Dropout and BatchNormalization in inference mode.
        return self.model(input_batch, training=False)

    def _plot_history(self, metric, title):
        """Plot one recorded metric per epoch; does nothing before train() has run."""
        if self.history is None:
            return
        recorded = self.history.history
        # A fresh figure per call keeps accuracy and loss from sharing one axes.
        fig, ax = plt.subplots()
        ax.plot(recorded[metric], label='training ' + metric)
        validation_key = 'val_' + metric
        if validation_key in recorded:
            # Only present when train() was given validation data.
            ax.plot(recorded[validation_key], label='testing ' + metric)
        ax.set_title(title)
        ax.set_xlabel('epochs')
        ax.set_ylabel(metric)
        ax.legend()

    def visualize_accuracy(self):
        """Visualize model accuracy per epoch for the last training run."""
        self._plot_history('accuracy', 'Accuracy')

    def visualize_loss(self):
        """Visualizes model loss per epoch for the last training run."""
        self._plot_history('loss', 'Loss')

In [7]:
# Size the output layer from NUM_CLASSES, the same constant the dataset
# generator uses for its one-hot labels, so the two cannot drift apart.
model = LibiumNet(output_size=NUM_CLASSES)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, 50, 200, 200, 3)] 0         
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 22, 22, 64)    65760     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 50, 30976)         0         
_________________________________________________________________
sequential_1 (Sequential)    (None, 48, 10)            619740    
_________________________________________________________________
flatten_1 (Flatten)          (None, 480)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 962       
Total params: 686,462
Trainable params: 686,078
Non-trainable params: 384
_____________________________________________________

In [8]:
# This part will be updated for the new dataset

# Training data: clips under data/train/, capped at 40 per file-name prefix.
gen = GenerateDataset('data/train/', '', 40)
datasets = gen.generator()
num_samples = gen.get_sample_size()
# NOTE(review): the generator yields one clip per step (BATCH_SIZE defaults
# to 1), so 5 steps draw only 5 clips per epoch regardless of num_samples.
# Confirm this is intended.
steps_per_epoch = 5

# Validation data: clips under data/val/, capped at 10 per file-name prefix.
val_gen = GenerateDataset('data/val/', '', 10)
val_datasets = val_gen.generator()
num_valid_samples = val_gen.get_sample_size()
# NOTE(review): likewise, each validation pass scores 5 randomly drawn clips.
steps_per_valid_epoch = 5

# Display the number of selected training clips.
num_samples

{'directory': ['kick', 'punch'], 'class': [0, 1]}
{'directory': ['kick', 'punch'], 'class': [0, 1]}


20

In [None]:
 # training 
epochs = 40
model.train(datasets, steps_per_epoch = steps_per_epoch, epochs=epochs,validation_data=val_datasets, validation_steps=steps_per_valid_epoch)

Training...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100

In [None]:
# Plot training and validation accuracy per epoch from the last model.train() run.
model.visualize_accuracy()

In [None]:
# Plot training and validation loss per epoch from the last model.train() run.
model.visualize_loss()

In [None]:
#model.model.save('libium.h5')