In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Conv3D, MaxPooling3D,MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, Input, \
    ReLU, GlobalAveragePooling3D, add
from tensorflow.keras.models import Model


# Number of target classes; used as the width of the final softmax layer.
output_shape = 6
# One sample is a clip of 11 single-channel 60x100 frames: (frames, height, width, channels).
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the kernel session.
input = Input(shape=(11, 60, 100, 1))

# Shape comments below exclude the batch axis. With padding='same' each strided
# dimension becomes ceil(dim / stride); unstrided convolutions keep the size.
# Every stage is ordered Conv3D -> ReLU -> BatchNormalization, and no stride is
# ever applied to the frame axis, so all 11 time steps reach the recurrent head.
'''block_0'''
b0_conv3d_1 = Conv3D(64, kernel_size=(2, 3, 3),strides=(1,2,3), padding='same', use_bias=False,
                     name='b0_conv3d_1', kernel_initializer='he_normal')(input)
b0_relu_1 = ReLU(name='b0_relu_1')(b0_conv3d_1)
b0_bn_1 = BatchNormalization(name='b0_bn_1')(b0_relu_1)  # -> (11, 30, 34, 64)


'''block_1'''
b1_cnv3d_1 = Conv3D(filters=16, kernel_size=(3, 3, 3) ,padding='same',
                    use_bias=False, name='b1_cnv3d_1', kernel_initializer='he_normal')(b0_bn_1)
b1_relu_1 = ReLU(name='b1_relu_1')(b1_cnv3d_1)
b1_bn_1 = BatchNormalization(name='b1_bn_1')(b1_relu_1)  # -> (11, 30, 34, 16)

# 1x1x1 convolution: only changes the channel count (16 -> 32).
b1_cnv3d_2 = Conv3D(filters=32, kernel_size=(1, 1, 1), padding='same',
                    use_bias=False, name='b1_cnv3d_2', kernel_initializer='he_normal')(b1_bn_1)
b1_relu_2 = ReLU(name='b1_relu_2')(b1_cnv3d_2)
b1_out = BatchNormalization(name='b1_out')(b1_relu_2)  # -> (11, 30, 34, 32)

'''block 2'''
b2_cnv3d_1 = Conv3D(filters=32, kernel_size=(1, 1, 1), strides=(1, 1, 1), padding='same',
                    use_bias=False, name='b2_cnv3d_1', kernel_initializer='he_normal')(b1_out)
b2_relu_1 = ReLU(name='b2_relu_1')(b2_cnv3d_1)
b2_bn_1 = BatchNormalization(name='b2_bn_1')(b2_relu_1)  # -> (11, 30, 34, 32)

b2_add = add([b1_out, b2_bn_1])  # residual sum of the block input and its 1x1x1 branch

# Strided convolution downsamples height by 3 and width by 4.
b2_cnv3d_2 = Conv3D(filters=64, kernel_size=(3, 3, 3), strides=(1, 3, 4),padding='same',
                    use_bias=False, name='b2_cnv3d_2', kernel_initializer='he_normal')(b2_add)
b2_relu_2 = ReLU(name='b2_relu_2')(b2_cnv3d_2)
b2_out = BatchNormalization(name='b2_out')(b2_relu_2)  # -> (11, 10, 9, 64)

'''block 3'''
b3_cnv3d_1 = Conv3D(filters=64, kernel_size=(1, 1, 1), strides=(1, 1, 1), padding='same',
                    use_bias=False, name='b3_cnv3d_1', kernel_initializer='he_normal')(b2_out)
b3_relu_1 = ReLU(name='b3_relu_1')(b3_cnv3d_1)
b3_bn_1 = BatchNormalization(name='b3_bn_1')(b3_relu_1)  # -> (11, 10, 9, 64)

b3_add = add([b2_out, b3_bn_1])  # residual sum of the block input and its 1x1x1 branch

# No strides here, so this block keeps the spatial size unchanged.
b3_cnv3d_2 = Conv3D(filters=64, kernel_size=(3, 3, 3), padding='same',
                    use_bias=False, name='b3_cnv3d_2', kernel_initializer='he_normal')(b3_add)
b3_relu_2 = ReLU(name='b3_relu_2')(b3_cnv3d_2)
b3_out = BatchNormalization(name='b3_out')(b3_relu_2)  # -> (11, 10, 9, 64)

'''block 4'''

b4_cnv3d_1 = Conv3D(filters=64, kernel_size=(1, 1, 1), strides=(1, 1, 1), padding='same',
                    use_bias=False, name='b4_cnv3d_1', kernel_initializer='he_normal')(b3_out)
b4_relu_1 = ReLU(name='b4_relu_1')(b4_cnv3d_1)
b4_bn_1 = BatchNormalization(name='b4_bn_1')(b4_relu_1)  # -> (11, 10, 9, 64)

b4_add = add([b3_out, b4_bn_1])  # residual sum of the block input and its 1x1x1 branch

# Strided convolution halves height and width (rounding up) and doubles the channels.
b4_cnv3d_2 = Conv3D(filters=128, kernel_size=(3, 3, 3), strides=(1, 2, 2), padding='same',
                    use_bias=False, name='b4_cnv3d_2', kernel_initializer='he_normal')(b4_add)
b4_relu_2 = ReLU(name='b4_relu_2')(b4_cnv3d_2)
b4_out = BatchNormalization(name='b4_out')(b4_relu_2)  # -> (11, 5, 5, 128)

"""BiLSTM"""

# Flatten each frame's 5x5x128 feature map into one vector so the clip becomes a
# sequence of 11 time steps with 25*128 = 3200 features each.
reshaped2 = tf.keras.layers.Reshape((11,25* 128))(b4_out)
# First recurrent layer returns the full sequence so a second one can be stacked on it.
bi_lstm =tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True))(reshaped2)
dropout = tf.keras.layers.Dropout(0.5)(bi_lstm)
# Second recurrent layer returns only its final output, collapsing the time axis.
bi_lstm2 =tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(dropout)
dropout2 = tf.keras.layers.Dropout(0.5)(bi_lstm2)
last_layer = Dense(128, activation='relu')(dropout2)
# Softmax over the `output_shape` classes.
output = Dense(output_shape, name='model_output', activation='softmax',
               kernel_initializer='he_uniform')(last_layer)
model = Model(input, output)
# categorical_crossentropy is the loss paired with the softmax output above.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Restore trained weights into `model`; the architecture cell that builds `model`
# must have been executed in this kernel before this line runs.
# NOTE(review): absolute, machine-specific path - this cell fails anywhere else
# until the path is edited to point at a local copy of the weights file.
model.load_weights('/Users/koksziszdave/Egyetem/AIT/LipReadingModel/model.bilstm.weights.h5')

In [None]:
import gradio as gr
import tensorflow as tf


import cv2

def getFrames(video):
    """Decode every frame of a video into a list.

    Args:
        video: Source handed straight to ``cv2.VideoCapture``
            (presumably a file path - confirm against the caller).

    Returns:
        list: The frames in decoding order, exactly as ``read()`` yields
        them. Empty when the first read already reports failure.
    """
    capture = cv2.VideoCapture(video)
    collected = []

    # read() returns (success_flag, frame); keep pulling until the flag drops.
    ok, image = capture.read()
    while ok:
        collected.append(image)
        ok, image = capture.read()

    capture.release()
    return collected

def to_gray(frames):
    """Convert a sequence of BGR images to single-channel grayscale.

    The earlier version also resized every converted image to its own
    width and height, which only produced another identical copy; that
    redundant step is gone.

    Args:
        frames: Iterable of BGR images accepted by ``cv2.cvtColor``.

    Returns:
        list: One grayscale image per input frame, in the same order.
        Empty when ``frames`` is empty.
    """
    return [cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) for frame in frames]

def face_detect(face_classifier, images):
    """Locate the face in each image, keeping only unambiguous detections.

    Side effect: a green rectangle (thickness 4) is drawn onto every image
    in which exactly one face was found, so the inputs are modified.

    Args:
        face_classifier: Object exposing ``detectMultiScale`` (e.g. an
            OpenCV cascade classifier).
        images: Sequence of images to scan.

    Returns:
        list: ``(x, y, w, h)`` boxes, one per image that contained exactly
        one face. Images with zero or several detections contribute nothing,
        so the result can be shorter than ``images`` and positions in the
        result do not necessarily match positions in ``images``.
    """
    boxes = []
    for image in images:
        found = face_classifier.detectMultiScale(
            image, scaleFactor=1.1, minNeighbors=5, minSize=(40, 40)
        )
        # Anything other than a single hit is treated as unreliable and skipped.
        if len(found) != 1:
            continue

        x, y, w, h = found[0]
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 4)
        boxes.append((x, y, w, h))

    return boxes

from imutils import face_utils
import dlib
import numpy as np
def lip_detect(images, og_images, predictor):
    """Crop the mouth region out of each frame and resize it to 100x60.

    For every face box the landmark predictor is run on the matching frame,
    landmarks 48-67 are enclosed in a bounding rectangle, the rectangle is
    grown by a 10 pixel margin and the crop is resized to width 100,
    height 60.

    Args:
        images: Sequence of face boxes ``(x, y, w, h)``.
        og_images: Sequence of frames; ``og_images[j]`` must be the frame
            that ``images[j]`` was detected in.
        predictor: Callable ``predictor(frame, dlib.rectangle)`` returning a
            landmark shape accepted by ``face_utils.shape_to_np``.

    Returns:
        list: One resized lip crop per entry of ``images``.
    """
    margin = 10  # extra pixels kept on every side of the lip bounding box

    lips_list = []
    for j, (face_x, face_y, face_w, face_h) in enumerate(images):
        frame = og_images[j]
        face_box = dlib.rectangle(left=face_x, top=face_y,
                                  right=face_x + face_w, bottom=face_y + face_h)
        shape = face_utils.shape_to_np(predictor(frame, face_box))

        (x, y, w, h) = cv2.boundingRect(np.array([shape[48:68]]))

        # Clamp the lower bounds: a negative slice start would wrap around to
        # the far edge of the frame and yield an empty or wrong crop when the
        # mouth sits within `margin` pixels of the top/left border. The upper
        # bounds need no clamp because slicing past the end is truncated.
        top = max(y - margin, 0)
        left = max(x - margin, 0)
        lips = frame[top:y + h + margin, left:x + w + margin]

        lips = cv2.resize(lips, (100, 60))
        lips_list.append(lips)

    return lips_list

def get_middle_frames(lips, frame_num):
    """Return the ``frame_num`` consecutive items centred in ``lips``.

    The window start is computed from the actual length of ``lips`` rather
    than from a fixed clip length of 29, so clips of any length yield their
    true middle section. For a 29-item input the result is unchanged.

    Args:
        lips: Sequence of frames (anything supporting ``len`` and slicing).
        frame_num: Number of frames to keep.

    Returns:
        The centred slice of ``lips``. When ``lips`` holds fewer than
        ``frame_num`` items, all of them are returned.
    """
    # Clamp at 0 so a short clip does not produce a negative start index,
    # which would slice relative to the end of the sequence.
    separator = max((len(lips) - frame_num) // 2, 0)
    return lips[separator:separator + frame_num]

import csv
def load_labels():
    """Read the class labels from ``labels.csv`` in the working directory.

    Only the first CSV record is consumed; each of its fields is one label.

    Returns:
        list[str]: Fields of the first row, in file order.

    Raises:
        FileNotFoundError: ``labels.csv`` does not exist.
        StopIteration: The file contains no rows.
    """
    with open('labels.csv', mode='r') as handle:
        return next(csv.reader(handle))

def predict(frames, model):
    """Classify one clip of lip crops and return the predicted label.

    The frames are stacked, divided by 255.0, reshaped into a single-sample
    batch of shape ``(1, 11, 60, 100, 1)`` and passed to ``model.predict``.
    The index of the highest score selects an entry from ``load_labels()``.

    Args:
        frames: Sequence of 11 grayscale images of 60 rows by 100 columns
            (presumably 8-bit, given the division by 255 - confirm).
        model: Object exposing ``predict(batch)`` that returns one score
            vector per sample.

    Returns:
        The label at the position of the highest score.

    Raises:
        ValueError: ``frames`` does not hold exactly 11 * 60 * 100 values,
            e.g. because too few frames survived face/lip detection.
    """
    batch_shape = (1, 11, 60, 100, 1)
    stacked = np.array(frames)

    # Fail with an actionable message instead of numpy's generic reshape error.
    if stacked.size != 11 * 60 * 100:
        raise ValueError(
            "predict expects 11 grayscale frames of 60x100 pixels "
            f"(66000 values), got an array of shape {stacked.shape}"
        )

    batch = (stacked / 255.0).reshape(batch_shape)
    prediction = model.predict(batch)
    idx = np.argmax(prediction[0])

    labels = load_labels()
    return labels[idx]

def getWord(video):
    """Run the full lip-reading pipeline on a video and return the word.

    Steps: decode frames, detect the face in each frame, crop the lips,
    convert the crops to grayscale, keep the 11 middle crops and classify
    them with the global ``model``.

    Args:
        video: Video source forwarded to ``getFrames``.

    Returns:
        The label predicted for the clip.

    Raises:
        ValueError: No frame contained exactly one detectable face.
    """
    frames = getFrames(video)

    face_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # face_detect drops frames without exactly one face, so running it on the
    # whole list returns boxes whose positions no longer match frame positions.
    # Detecting frame by frame keeps every box paired with the frame it came
    # from, which is what lip_detect requires.
    face_frames = []
    face_boxes = []
    for frame in frames:
        boxes = face_detect(face_classifier, [frame])
        if boxes:
            face_frames.append(frame)
            face_boxes.append(boxes[0])

    if not face_boxes:
        raise ValueError("no frame with exactly one detectable face was found in the video")

    # NOTE(review): absolute, machine-specific path to the dlib landmark model.
    predictor_path = r'/Users/koksziszdave/Egyetem/AIT/Models/shape_predictor_68_face_landmarks.dat'
    predictor = dlib.shape_predictor(predictor_path)
    lips = lip_detect(face_boxes, face_frames, predictor)
    gray_lips = to_gray(lips)

    final_frames = get_middle_frames(gray_lips, 11)
    word = predict(final_frames, model)

    return word
    


In [14]:
# Gradio front end: the uploaded video is handed to getWord and the returned
# word is displayed as plain text.
demo = gr.Interface(
    fn=getWord,
    inputs="video",
    outputs="text",
    title="Lip Reading Model",
    description="This model can predict the word from a video of a person speaking.",
)

# share=True additionally requests a temporary public URL for the local app.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://f5a11ce8ab91c7991c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


