In [36]:
import gradio as gr
import tensorflow as tf


modelpath='/Users/koksziszdave/Egyetem/AIT/LipReadingModel/model.dilatatedcovnet.h5'
model = tf.keras.models.load_model(modelpath)
import cv2

def getFrames(video):
    cap = cv2.VideoCapture(video)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

def to_gray(frames):
    gray_frames = []
    for j in range(len(frames)):
        gray_img = cv2.cvtColor(frames[j], cv2.COLOR_BGR2GRAY)
        gray_img = cv2.resize(gray_img, (gray_img.shape[1] ,  gray_img.shape[0]))
        gray_frames.append(gray_img)
        
    return gray_frames

def face_detect(face_classifier, images):
   
    detected_faces = []
    for j in range(len(images)):
        face = face_classifier.detectMultiScale(
                images[j], scaleFactor=1.1, minNeighbors=5, minSize=(40, 40)
        )
        if len(face) == 1:
            for (x, y, w, h) in face:
                cv2.rectangle(images[j], (x, y), (x + w, y + h), (0, 255, 0), 4)
                detected_faces.append((x, y, w, h))
        
    
    
    return detected_faces

from imutils import face_utils
import dlib
import numpy as np
def lip_detect(images, og_images, predictor):
    
    lips_list = []
    for j in range(len(images)):
        frame = og_images[j]
        x, y, w, h = images[j] 
        face_box = dlib.rectangle(left=x, top=y, right=x + w, bottom=y + h)
        shape = predictor(frame, face_box) 
        shape = face_utils.shape_to_np(shape)
        
        (x, y, w, h) = cv2.boundingRect(np.array([shape[48:68]]))
        margin = 10
        lips = frame[y-margin:y+h+margin, x-margin:x+w+margin]
        lips = cv2.resize(lips,(100,60))
        lips_list.append(lips)
        
    return lips_list

def get_middle_frames(lips, frame_num):
    separator = (29 - frame_num) // 2
    middle_frames = lips[separator:separator + frame_num]
    return middle_frames

import csv
def load_labels():
    with open('labels.csv', mode='r') as file:
        reader = csv.reader(file)
        labels = list(reader)
    return labels

def predict(frames, model):
    frames = np.array(frames)
    print(frames.shape)
    frames = frames / 255.0
    frames = frames.reshape(11, 60, 100, 1)
    prediction = model.predict(frames)
    word = np.argmax(prediction)
    labels=load_labels()
    
    return labels[word][0]

def getWord(video):
    frames=getFrames(video)
    grayframes=to_gray(frames)
    grayframes=np.array(grayframes)
    print(grayframes.shape) 
    face_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    detectedfacebb=face_detect(face_classifier,grayframes)
    
    
    predictor_path = r'/Users/koksziszdave/Egyetem/AIT/Models/shape_predictor_68_face_landmarks.dat'
    predictor = dlib.shape_predictor(predictor_path)
    lips=lip_detect(detectedfacebb,frames,predictor)
    print(np.array(lips).shape)
    final_frames=get_middle_frames(lips,11)
    word=predict(final_frames,model)
    
    return  word
    




In [37]:
videopath='/Users/koksziszdave/Downloads/lipread_mp4/ABOUT/test/ABOUT_00044.mp4'
word=getWord(videopath)
print(word)

(29, 256, 256)
(29, 60, 100, 3)
(11, 60, 100, 3)


ValueError: cannot reshape array of size 198000 into shape (11,60,100,1)

In [None]:


demo = gr.Interface(fn=getWord, inputs="video", outputs="image", title="Lip Reading Model", description="This model can predict the word from a video of a person speaking.")

demo.launch(share=True)  # Share your demo with just 1 extra parameter 🚀