#LibiumNet: Lip-Reading using RCNN

In [0]:

import os, glob
import imageio
import itertools
import math

from tqdm import tqdm

import numpy as np
import pandas as pd
import torch
import torchvision
from torchvision import datasets, models, transforms
import torch.nn as nn
import torch.nn.functional as F

from collections import OrderedDict

import matplotlib.pyplot as plt

imageio.plugins.ffmpeg.download()

import warnings
warnings.filterwarnings('ignore')

Imageio: 'ffmpeg-linux64-v3.3.1' was not found on your computer; downloading it now.
Try 1. Download from https://github.com/imageio/imageio-binaries/raw/master/ffmpeg/ffmpeg-linux64-v3.3.1 (43.8 MB)
Downloading: 8192/45929032 bytes (0.0%)3416064/45929032 bytes (7.4%)7618560/45929032 bytes (16.6%)11804672/45929032 bytes (25.7%)15949824/45929032 bytes (34.7%)20004864/45929032 bytes (43.6%)24158208/45929032 bytes (52.6%)28295168/45929032 bytes (61.6%)32522240/45929032 bytes (70.8%)36683776/45929032 bytes (79.9%)40370176/45929032 bytes (87.9%)44605440/45929032 bytes (97.1%)45929032/45929032 bytes (100.0%)
  Done
File saved as /root

In [0]:
# check if CUDA is available
train_on_gpu = torch.cuda.is_available()
device = 'cpu'
if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')
    device = 'cuda'

CUDA is available!  Training on GPU ...


In [0]:
# mounting notebook to google drive
from google.colab import drive
drive.mount('/gdrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive/


In [0]:
from tensorflow.keras.utils import to_categorical

"""
This model generates generator of the datasets for the Network. 

@authors : Mustapha Tidoo Yussif, Samuel Atule, Jean Sabastien Dovonon
         and Nutifafa Amedior. 
"""
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256
IMAGE_CHANNEL = 3
NUM_FRAMES = 29
NUM_CLASSES = 4
        
        
class GenerateDataset(object):
    """Generates generator for the datasets
    
    This model generates a generator for the datasets. This done to efficiently 
    manage space.
    
    :param: file_path: path to files/videos.
    :param directory: Path to the main directory.
    """
    def __init__(self, file_path, directory, n_items):
        self.n_items = n_items
        self.directory = directory
        self.file_path = file_path
        self.num_samples = len(self.samples(self.get_video_files(self.file_path, self.directory)))
        

    def load_video(self, filename):
        """Loads the specified video using ffmpeg.

        Returns:
            List[FloatTensor]: the frames of the video as a list of 3D tensors
                (channels, width, height)"""
        
        reader = imageio.get_reader(filename,  'ffmpeg')
        
        return np.array(list(reader), dtype=np.float32)
    
    def resize_frames(self, frames):
        """
        Crops the frames of the videos around the mouth region.
        This is the part that is most important part and relevant
        to the model (where we can get the relevant features)

        :param frames: The frames in the video. 
        :return: returns the croped frames.
        """
        tf.image.resize_images(X, (IMAGE_SIZE, IMAGE_SIZE), 
                                    tf.image.ResizeMethod.NEAREST_NEIGHBOR) 

    def get_sample_size(self):
      return self.num_samples
    
    
    def create_df(self, file_path):
        '''
        creates pandas dataframe of labels and words directories
        '''
        
        d = {}
        y_labels = []
        class_folders = []
        for ind, clss in enumerate(os.listdir(file_path)):
            y_labels.append(ind)
            class_folders.append(clss)
        
        d['directory'] = class_folders
        d['class'] = y_labels
        return pd.DataFrame(d)


    def get_video_files(self, file_path, directory=None):
        '''
        get video files from word class directories
        '''
        d = {}
        f = []
        
        for root, dirs, files in os.walk(file_path):
            if root.split('/')[-1] == directory:
                for file in files:
                    if file.endswith(".mp4"):
                        target_file = file.split('_')[0]
                        f.append(target_file)
                        if target_file not in d:
                            d[target_file] = []
                        d[target_file].append(os.path.join(root, file))
                    
        return d
        
    def generator(self, batch = 1):
        """Interfaces the private generator method

        :param num_items_per_class: The number of items in a categority. 
        :param batch: The batch size.
        """
        data = self.create_df(self.file_path)
        video_files = self.get_video_files(self.file_path, self.directory)
        return self._generator(data, directory = self.directory, video_files = video_files, BATCH_SIZE = batch)

    def samples(self, video_files):
      train = []
      for key, value in video_files.items():
        ind = 0
        for file in value:
          train.append(file)
          ind+=1
          if ind == self.n_items:
            break
          
      return train
    
    def _generator(self, data, directory=None, video_files=None, BATCH_SIZE = 64):
        
        '''
        retrieves the training batch for each iteration
        '''
        
        train = []
        for key, value in video_files.items():
            ind = 0
            for file in value:
                train.append(file)
                ind+=1
                if ind == self.n_items:
                  break
                
                  
                
        while True:
            # Randomize the indices to make an array
            indices_arr = np.random.permutation(len(train))
            
            for batch in range(0, len(indices_arr), BATCH_SIZE):
                # slice out the current batch according to batch-size
                current_batch = indices_arr[batch:(batch + BATCH_SIZE)]

                # initializing the arrays, x_train and y_train
                x_train = np.empty([0, NUM_FRAMES, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL], dtype=np.float32)
            
                y_train = np.empty([0], dtype=np.int32)

                for i in current_batch:
                    # get an image and its corresponding color for an traffic light
                    video_frames = self.load_video(train[i])
                    
                    
                    #preprocess frames from videos
#                     video_frames = tf.image.resize_nearest_neighbor(video_frames,(IMAGE_HEIGHT, IMAGE_WIDTH), )
                    #video_frames = tf.image.rgb_to_grayscale(video_frames)
#                     video_frames = tf.reshape(video_frames, (NUM_FRAMES, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL))

                    # Appending them to existing batch
                    x_train = np.append(x_train, [video_frames/255], axis=0)
                    y_train = np.append(y_train, [ data.loc[ data['directory'] == train[i].split('/')[-1].split('_')[-2] ].values[0][1] ])
                    #print(data.loc[ data['directory'] == train[i].split('/')[-1].split('_')[-2] ].values[0][1])
                    
                
                y_train = to_categorical(y_train, num_classes=NUM_CLASSES)

                yield torch.autograd.Variable(torch.from_numpy(x_train)), torch.autograd.Variable(torch.from_numpy(y_train))
        

In [0]:
train_loader = GenerateDataset('/gdrive/My Drive/LibiumNet/lipread_mp4/', 'train', 2)
datasets = train_loader.generator()

In [0]:
next(datasets)[1]

tensor([[0., 0., 1., 0.]])

In [0]:
class C3D(nn.Module):
    def __init__(self):
        super(C3D, self).__init__()
        self.group1 = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=5, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        
        self.group2 = nn.Sequential(
            nn.Conv3d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2)))
        
        

        self._features = nn.Sequential(
            self.group1,
            self.group2
        )


    def forward(self, x):
        x = x.view((1, 3, 256, 256, 29))
        return self._features(x)

In [0]:
class Bottleneck(nn.Module):
    def __init__(self, nChannels, growthRate):
        super(Bottleneck, self).__init__()
        interChannels = 4*growthRate
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, interChannels, kernel_size=1,
                               bias=False)
        self.bn2 = nn.BatchNorm2d(interChannels)
        self.conv2 = nn.Conv2d(interChannels, growthRate, kernel_size=3,
                               padding=1, bias=False)

    def forward(self, x):
        out = self.conv1(F.relu(self.bn1(x)))
        out = self.conv2(F.relu(self.bn2(out)))
        out = torch.cat((x, out), 1)
        return out

class SingleLayer(nn.Module):
    def __init__(self, nChannels, growthRate):
        super(SingleLayer, self).__init__()
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, growthRate, kernel_size=3,
                               padding=1, bias=False)

    def forward(self, x):
        out = self.conv1(F.relu(self.bn1(x)))
        out = torch.cat((x, out), 1)
        return out

class Transition(nn.Module):
    def __init__(self, nChannels, nOutChannels):
        super(Transition, self).__init__()
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, nOutChannels, kernel_size=1,
                               bias=False)

    def forward(self, x):
        out = self.conv1(F.relu(self.bn1(x)))
        out = F.avg_pool2d(out, 2)
        return out


class DenseNet(nn.Module):
    def __init__(self, growthRate=40, depth=10, reduction=1, bottleneck=True):
        super(DenseNet, self).__init__()

        nDenseBlocks = (depth-4) // 3
        if bottleneck:
            nDenseBlocks //= 2

        nChannels = 2*growthRate
        self.conv1 = nn.Conv2d(128, nChannels, kernel_size=3, padding=1,
                               bias=False)
        self.dense1 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks*growthRate
        nOutChannels = int(math.floor(nChannels*reduction))
        self.trans1 = Transition(nChannels, nOutChannels)

        nChannels = nOutChannels
        self.dense2 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks*growthRate
        nOutChannels = int(math.floor(nChannels*reduction))
        self.trans2 = Transition(nChannels, nOutChannels)

        nChannels = nOutChannels
        self.dense3 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks*growthRate

        self.bn1 = nn.BatchNorm2d(nChannels)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.bias.data.zero_()

    def _make_dense(self, nChannels, growthRate, nDenseBlocks, bottleneck):
        layers = []
        for i in range(int(nDenseBlocks)):
            if bottleneck:
                layers.append(Bottleneck(nChannels, growthRate))
            else:
                layers.append(SingleLayer(nChannels, growthRate))
            nChannels += growthRate
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.dense3(out)
        out = F.avg_pool2d(F.relu(self.bn1(out)), kernel_size=3)
        return out

In [0]:
class block(nn.Module):
    def __init__(self,ni):
        super(block, self).__init__()
        self.conv1 = nn.Conv1d(ni, ni, 1)
        self.conv2 = nn.Conv1d(ni, ni, 3, 1, 1)

    def forward(self,x):
        residual = x
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        
        
        
        out += residual
        
        
        return out

In [0]:
class SEQ(nn.Module):
    def __init__(self, input_size, hidden_size1):
        super(SEQ, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size1

        self.c1 = block(input_size)
        self.p1 = nn.AvgPool2d(2)
        self.c2 = block(hidden_size1)
        self.p2 = nn.AvgPool1d(2)
        self.c3 = block(hidden_size1)
        self.p3 = nn.AvgPool1d(2)
        self.c4 = block(hidden_size1)
        self.p4 = nn.AvgPool1d(2)

    def forward(self, inputs):
      
        #print(inputs.shape)

        # Run through Conv1d and Pool1d layers
        c = self.c1(inputs)
        p = self.p1(c)
        c = self.c2(p)
        p = self.p2(c)
        c = self.c3(p)
        p = self.p3(c)
        c = self.c4(p)
        p = self.p4(c)
        
        
        
        out = F.relu(p)
        out = out.view(out.size(0), out.size(1))
        return out

In [0]:
def flatten(t):
    t = t.reshape(1, -1)
    t = t.squeeze()
    return t
  
class TimeDistributed(nn.Module):
    def __init__(self, module, batch_first=True):
        super(TimeDistributed, self).__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):

        tList = [flatten(self.module(m)) for m in torch.unbind(x, dim=4) ]
        y = torch.stack(tList, dim=0)
        # We have to reshape Y
        if self.batch_first:
            y = y.contiguous().view(x.size(0), y.size(-1), -1)  # (samples, timesteps, output_size)
        else:
            y = y.view(-1, x.size(1), y.size(-1))  # (timesteps, samples, output_size)
        return y

In [0]:
n_classes = 4

model = nn.Sequential(OrderedDict([
    ('frontend', C3D()),
    ('features', TimeDistributed(DenseNet())),
    ('backend', SEQ(input_size=5000, hidden_size1=2500)),
    ('fc', nn.Sequential( nn.Dropout(p=0.5), nn.Linear(2500, n_classes) ))
]))

In [0]:
## function to train the network
def train(model, device, train_loader, optimizer, criterion, epoch):
    model.train()
    train_loss = 0
    correct = 0
    
    train_steps = train_loader.get_sample_size()
    
    gen = train_loader.generator()
    
    for batch_idx in tqdm(range(train_steps)): 
        data, target = next(gen)
        data, target = data.to(device), torch.max(target.long().to(device), 1)[1]
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()
        
    train_loss /= train_loader.get_sample_size()
    print('Epoch: {} , Training Accuracy: {}/{} ({:.0f}%) Training Loss: {:.6f}'.format(
                epoch, correct, train_loader.get_sample_size(),
                100. * correct / train_loader.get_sample_size(), train_loss))

## function to train the network
def test(model, device, criterion, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    steps = test_loader.get_sample_size()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader.generator()):
            data, target = data.to(device), torch.max(target.long().to(device), 1)[1]
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
            if batch_idx >= steps-1:
              break

    test_loss /= test_loader.get_sample_size()

    print('Test loss: {:.4f}, Test Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, test_loader.get_sample_size(),
        100. * correct / test_loader.get_sample_size()))

In [0]:
import torch.optim as optim
# specify loss function (categorical cross-entropy)
criterion = nn.CrossEntropyLoss()
# specify optimizer and learning rate
optimizer = optim.SGD(
  [
        {"params": model.fc.parameters(), "lr": 1e-3},
        {"params": model.backend.parameters(), "lr": 1e-5},
        {"params": model.features.parameters(), "lr": 1e-4},
        {"params": model.frontend.parameters(), "lr": 1e-4},
  ],
  momentum = 0.9
)

NameError: ignored

In [0]:
train_loader = GenerateDataset('/gdrive/My Drive/LibiumNet/lipread_mp4/', 'train',200)
test_loader = GenerateDataset('/gdrive/My Drive/LibiumNet/lipread_mp4/', 'val',20)

In [0]:
# number of epochs to train the model
n_epochs = 30

if train_on_gpu:
    model.cuda()

print("Start training \n\n")

for epoch in range(1, n_epochs+1):
  
    
    train (model, device, train_loader, optimizer, criterion, epoch)
    test (model, device, criterion, test_loader)
    
    print ('\n')
            
print ("Done training")

  0%|          | 0/800 [00:00<?, ?it/s]

Start training 




100%|██████████| 800/800 [17:53<00:00,  1.44s/it]


Epoch: 1 , Training Accuracy: 189/800 (24%) Training Loss: 3.064293


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 15.5779, Test Accuracy: 20/80 (25%)





100%|██████████| 800/800 [13:48<00:00,  1.05s/it]


Epoch: 2 , Training Accuracy: 210/800 (26%) Training Loss: 2.731881


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 7.1115, Test Accuracy: 18/80 (22%)





100%|██████████| 800/800 [13:47<00:00,  1.03s/it]


Epoch: 3 , Training Accuracy: 190/800 (24%) Training Loss: 2.410185


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 7.4882, Test Accuracy: 21/80 (26%)





100%|██████████| 800/800 [13:47<00:00,  1.03s/it]


Epoch: 4 , Training Accuracy: 188/800 (24%) Training Loss: 2.212992


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 8.4877, Test Accuracy: 20/80 (25%)





100%|██████████| 800/800 [13:49<00:00,  1.04s/it]


Epoch: 5 , Training Accuracy: 200/800 (25%) Training Loss: 2.078209


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 8.1017, Test Accuracy: 20/80 (25%)





100%|██████████| 800/800 [13:47<00:00,  1.04s/it]


Epoch: 6 , Training Accuracy: 220/800 (28%) Training Loss: 1.848193


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 3.6153, Test Accuracy: 21/80 (26%)





100%|██████████| 800/800 [13:48<00:00,  1.03s/it]


Epoch: 7 , Training Accuracy: 219/800 (27%) Training Loss: 1.833462


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 3.4465, Test Accuracy: 18/80 (22%)





100%|██████████| 800/800 [13:47<00:00,  1.03s/it]


Epoch: 8 , Training Accuracy: 238/800 (30%) Training Loss: 1.777003


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 2.0547, Test Accuracy: 21/80 (26%)





100%|██████████| 800/800 [13:46<00:00,  1.03s/it]


Epoch: 9 , Training Accuracy: 282/800 (35%) Training Loss: 1.649221


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 3.2229, Test Accuracy: 19/80 (24%)





100%|██████████| 800/800 [13:45<00:00,  1.03s/it]


Epoch: 10 , Training Accuracy: 284/800 (36%) Training Loss: 1.546823


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 3.8496, Test Accuracy: 24/80 (30%)





100%|██████████| 800/800 [13:46<00:00,  1.04s/it]


Epoch: 11 , Training Accuracy: 305/800 (38%) Training Loss: 1.531270


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 1.8881, Test Accuracy: 19/80 (24%)





100%|██████████| 800/800 [13:46<00:00,  1.02s/it]


Epoch: 12 , Training Accuracy: 337/800 (42%) Training Loss: 1.388208


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 2.7919, Test Accuracy: 22/80 (28%)





100%|██████████| 800/800 [13:48<00:00,  1.05s/it]


Epoch: 13 , Training Accuracy: 367/800 (46%) Training Loss: 1.340048


  0%|          | 0/800 [00:00<?, ?it/s]

Test loss: 8.6739, Test Accuracy: 18/80 (22%)





 56%|█████▌    | 448/800 [07:43<06:07,  1.04s/it]

In [0]:
torch.save(model.state_dict(), 'densefullmodel2.pwf')