#### **Lip Reading Application**

##### Install/Import dependencies

In [None]:
%%capture
# Install the notebook's dependencies, then list the installed packages.
# NOTE: %%capture suppresses this cell's output, so the `pip list` result is
# not displayed; remove %%capture (or capture into a named variable) to see it.
!pip install opencv-python matplotlib imageio gdown tensorflow-macos
!pip list

In [None]:
#Import dependencies
import os
import cv2
#import tensorflow as tf
import numpy as np
from typing import List
from matplotlib import pyplot as plt
import imageio
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import gdown

#Neural Network dependencies
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler

##### Load Data

In [None]:
# %%capture
# # Download the dataset built for training lip-reading models.
# # (Uncomment this whole cell for the first run.)
# url = 'https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL'
# output = 'data.zip'
# gdown.download(url, output, quiet = False)
# gdown.extractall('data.zip')

In [None]:
def load_video(path: str) -> 'tf.Tensor':
    '''
    Load a video, crop every frame to the mouth region and standardise the clip.

    Each decoded frame is converted to grayscale and cropped to rows 190:236
    and columns 80:220. The stacked clip is then normalised with the mean and
    standard deviation taken over all of its pixels.

    The normalisation is done entirely in float32. Subtracting the mean while
    the frames are still uint8 would wrap around for pixels darker than the
    mean, and an integer mean would be truncated.
    NOTE(review): weights trained on the previous (uint8) normalisation would
    see a different input distribution - retrain if any exist.

        Args:
            path: str --> path to video that will be passed into model
        Returns:
            float32 tensor of normalised frames; (num_frames, 46, 140, 1) for
            source frames of at least 236x220 pixels
        Raises:
            IOError: if the video cannot be opened or no frame can be decoded
    '''
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        if not cap.isOpened():
            raise IOError(f'Could not open video: {path}')
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            success, frame = cap.read()
            if not success:
                # The reported frame count can exceed what is decodable;
                # stop instead of handing None to TensorFlow.
                break
            # NOTE(review): OpenCV decodes to BGR by default while
            # rgb_to_grayscale weights channels as RGB - kept as in the original.
            frame = tf.image.rgb_to_grayscale(frame)
            #isolates the mouth --> we can also use a lip detector to isolate the mouth
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Always free the capture handle, even if decoding raised.
        cap.release()

    if not frames:
        raise IOError(f'No frames could be read from video: {path}')

    # Cast once, up front, so mean/std/subtraction all run in float32.
    frames = tf.cast(tf.stack(frames), tf.float32)
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(frames)
    return (frames - mean) / std
    

In [None]:
# Single-character vocabulary: lowercase letters, apostrophe, '?', '!',
# the digits 1-9 and the space character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

In [None]:
# Forward lookup: character -> integer id (the empty string is the OOV token).
char_num = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    oov_token="",
)
# Inverse lookup built from the same vocabulary: integer id -> character.
num_char = tf.keras.layers.StringLookup(
    vocabulary=char_num.get_vocabulary(),
    oov_token="",
    invert=True,
)
print(
    f'The vocab is: {char_num.get_vocabulary()}'
    f'(size={char_num.vocabulary_size()})'
)

The vocab is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' '](size=40)


In [None]:
def load_alignments(path: str) -> 'tf.Tensor':
    '''
    Read an alignment file and encode its words as character ids.

    Every line is split on whitespace and its third field is taken as the
    word (presumably "<start> <end> <word>" - confirm against the dataset).
    Entries equal to 'sil' are dropped, as are blank or malformed lines with
    fewer than three fields. The remaining words are separated by single
    spaces, split into characters and mapped through `char_num`.

        Args:
            path: str --> path to the alignment file
        Returns:
            1-D tensor of character ids produced by `char_num`
    '''
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Skip lines that cannot hold a word, and silence markers.
            if len(fields) < 3 or fields[2] == 'sil':
                continue
            # extend() appends in place instead of rebuilding the list per line.
            tokens.extend((' ', fields[2]))
    # [1:] drops the separator space that precedes the first word.
    return char_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

In [None]:
def load_data(path):
    '''
    Load the frames and encoded alignments that belong to one video file.

    Only the base name of `path` (without directory and extension) is used;
    the video is read from data/s1/<name>.mpg and the alignment from
    data/alignments/s1/<name>.align.

        Args:
            path: scalar string tensor holding the video path (must support
                  `.numpy()`, i.e. be evaluated eagerly)
        Returns:
            (frames, alignments) as returned by `load_video` and `load_alignments`
    '''
    path = path.numpy().decode('utf-8')
    # os.path handles the platform's separators and keeps any extra dots in
    # the file name, unlike splitting on '/' and taking the text before the first '.'.
    file_name = os.path.splitext(os.path.basename(path))[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    alignments = load_alignments(alignment_path)
    return frames, alignments

In [None]:
def mappable_function(path):
    '''
    Wrap `load_data` in `tf.py_function` so it can be used in `Dataset.map`.

        Args:
            path: string tensor with the path of one video file
        Returns:
            The two outputs of `load_data`, declared as a float32 tensor
            (frames) and an int64 tensor (alignments)
    '''
    return tf.py_function(load_data, [path], (tf.float32, tf.int64))

##### TensorFlow Data Pipeline

In [None]:
# Input pipeline: list the videos, shuffle, load each (frames, alignments)
# pair, then batch in groups of 2 with frames padded to 75 along the first
# axis and alignments padded to 40 tokens.
data = (
    tf.data.Dataset.list_files('./data/s1/*.mpg')
    .shuffle(500)
    .map(mappable_function)
    .padded_batch(2, padded_shapes=([75, None, None, None], [40]))
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Pull a single batch from the pipeline: 2 clips plus their 2 encoded alignments.
frames, alignments = next(data.as_numpy_iterator())
alignments, len(frames)

(array([[19,  5, 20, 39, 18,  5,  4, 39,  2, 25, 39, 21, 39, 14,  9, 14,
          5, 39,  1,  7,  1,  9, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0],
        [12,  1, 25, 39, 18,  5,  4, 39, 23,  9, 20,  8, 39, 19, 39, 26,
          5, 18, 15, 39, 16, 12,  5,  1, 19,  5,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0]]),
 2)

##### TensorFlow Neural Network

In [None]:

# Architecture: three Conv3D + spatial max-pool stages, a per-frame flatten,
# two bidirectional LSTMs with dropout, and a per-frame softmax classifier.
model = Sequential([
    # Input: 75 frames of 46x140 single-channel images.
    Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same', activation='relu'),
    MaxPool3D((1, 2, 2)),  # pool height/width only; the frame axis is kept

    Conv3D(256, 3, padding='same', activation='relu'),
    MaxPool3D((1, 2, 2)),

    Conv3D(75, 3, padding='same', activation='relu'),
    MaxPool3D((1, 2, 2)),

    # Flatten each frame's feature map so the LSTMs see one vector per frame.
    TimeDistributed(Flatten()),
    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),
    Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)),
    Dropout(.5),

    # One extra class beyond the lookup vocabulary
    # (presumably a CTC blank token - TODO confirm against the loss used).
    Dense(char_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'),
])

2023-07-18 15:21:00.440034: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-18 15:21:00.441084: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-18 15:21:00.441723: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv3d_50 (Conv3D)          (None, 75, 46, 140, 128)  3584      
                                                                 
 max_pooling3d_50 (MaxPoolin  (None, 75, 23, 70, 128)  0         
 g3D)                                                            
                                                                 
 conv3d_51 (Conv3D)          (None, 75, 23, 70, 256)   884992    
                                                                 
 max_pooling3d_51 (MaxPoolin  (None, 75, 11, 35, 256)  0         
 g3D)                                                            
                                                                 
 conv3d_52 (Conv3D)          (None, 75, 11, 35, 75)    518475    
                                                                 
 max_pooling3d_52 (MaxPoolin  (None, 75, 5, 17, 75)  

In [None]:
#check what our model returns
# Use the `frames` batch drawn from `data` above; the previously referenced
# `val` is not defined in any cell above, so it fails on a fresh kernel.
yhat = model.predict(frames)



array([[[0.02289667, 0.02361521, 0.02488543, ..., 0.0276392 ,
         0.01984844, 0.02383464],
        [0.02218219, 0.02346753, 0.02441526, ..., 0.02791576,
         0.01969178, 0.02348711],
        [0.02165942, 0.02334739, 0.02352175, ..., 0.02846433,
         0.01952133, 0.02338246],
        ...,
        [0.02074331, 0.02085769, 0.01731861, ..., 0.02948437,
         0.01839092, 0.02197212],
        [0.02120205, 0.02090364, 0.01751425, ..., 0.02872388,
         0.01903566, 0.02199452],
        [0.02171118, 0.02099093, 0.01798891, ..., 0.0277431 ,
         0.01958937, 0.0223166 ]],

       [[0.0223127 , 0.02238715, 0.02306183, ..., 0.02518002,
         0.01944063, 0.02421109],
        [0.02208225, 0.02214982, 0.0228009 , ..., 0.02528704,
         0.01898592, 0.02399273],
        [0.02189158, 0.02182242, 0.02237349, ..., 0.02540128,
         0.01873199, 0.02405256],
        ...,
        [0.02155303, 0.02226604, 0.01852947, ..., 0.02656786,
         0.01912714, 0.02214107],
        [0.0

In [None]:
#Prediction by the model before training
# Take the most likely class for every frame of the first clip in one call,
# map the ids back to characters and join them into a single string.
tf.strings.reduce_join(num_char(tf.argmax(yhat[0], axis=1)))

<tf.Tensor: shape=(), dtype=string, numpy=b'qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqffffffff'>