In [2]:
import os
import cv2
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from typing import List 
import imageio # convert numpy arr to gif

In [5]:
tf.config.list_physical_devices('CPU')

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [6]:
physical_devices=tf.config.list_physical_devices('CPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    pass

# Build Data Loading Functions

In [8]:
# import zipfile

# extract data.zip file

# with zipfile.ZipFile("data.zip","r") as zip_ref:
#     zip_ref.extractall("data")

In [9]:
def load_video(path:str)->List[float]:
    """
    Loads a video from a given path and returns a list of frames
    """
    cap = cv2.VideoCapture(path) 
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame=cap.read()
        frame=tf.image.rgb_to_grayscale(frame)
        frames.append(frame[190:236,80:220,:]) # isolating the lip region
        # we can also use delib to extract the lip
    cap.release()

    mean= tf.math.reduce_mean(frames) # scale the data
    std=tf.math.reduce_std(tf.ast(frames,tf.float32))
    frames=tf.cast((frames-mean),tf.float32)/std
    return frames

In [10]:
vocab= [x for x in 'abcdefghijklmnopqrstuvwxyz\'?!123456789 ']

In [12]:
print(vocab)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' ']


In [13]:
# https://keras.io/examples/audio/ctc_asr/#model

# lookup functions to convert and reconvert text to encoding
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab,oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),invert=True
)
print(f"The vocabulary is : {char_to_num.get_vocabulary()}"
      f"(size ={char_to_num.vocabulary_size()})")

The vocabulary is : ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", '?', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', ' '](size =40)


In [14]:
char_to_num(['a','b','c'])

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([1, 2, 3], dtype=int64)>

In [16]:
char_to_num(['n','s','t','e']) # tokenizing the data and returning the tokens

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([14, 19, 20,  5], dtype=int64)>

In [18]:
num_to_char([14,19,20,5]) # returns byte encoded value

<tf.Tensor: shape=(4,), dtype=string, numpy=array([b'm', b'r', b's', b'd'], dtype=object)>

In [19]:
# load alignments

def load_alignments(path:str)->List[int]:
    """
    Loads the alignments from a given path and returns a list of integers
    """
    with open(path,"r") as f:
        lines=f.readlines()
    tokens=[]
    for line in lines:
        line=line.split()
        # data preprocessing
        if(line[2]!='sil'):
            tokens=[*tokens,' ',line[2]]
    # preprocessed alignments
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens,input_encoding='UTF-8'),(-1)))[1:]

