In [9]:
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt
import numpy as np
import cv2
import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras

In [14]:
frames = np.load('full_videos_read.npy')
frames.shape

(1000, 75, 64, 64, 3)

In [15]:
frames = frames.astype(np.float32)

In [16]:
new_frames = []
for i in frames:  # Loop over videos
    frame = []
    for j in i:  # Loop over frames
        frame.append(cv2.cvtColor(j, cv2.COLOR_BGR2GRAY))

    new_frames.append(np.asarray(frame))

new_frames = np.asarray(new_frames)
print(new_frames.shape)

(1000, 75, 64, 64)


In [17]:
new_frames = np.expand_dims(new_frames, axis = -1)
new_frames.shape

(1000, 75, 64, 64, 1)

In [18]:
path = '/mnt/g/projects/lip_reading_project/data/alignments/s1/*'
texts = []
for i in glob.glob(path):
    with open(file = i,mode='r') as f:
        texts.append(f.read().split())

In [19]:
words = []
start = []
end   = []

for text in texts:
    start.append(text[0::3])
    end.append(text[1::3])
    words.append(text[2::3])

In [20]:
words[1], start[1],end[1]

(['sil', 'bin', 'blue', 'at', 'f', 'three', 'soon', 'sil'],
 ['0', '17750', '22500', '27000', '28000', '31000', '36250', '46750'],
 ['17750', '22500', '27000', '28000', '31000', '36250', '46750', '74500'])

In [21]:
def prepare_word_sequences(word_lists, max_words=54, max_vocabulary=100, max_frames=9):
    # Flatten the list of word lists into a single list for tokenization
    all_words = [word for sublist in word_lists for word in sublist]
    
    # Create Tokenizer
    tokenizer = Tokenizer(num_words=max_vocabulary, oov_token='<OOV>')
    tokenizer.fit_on_texts(all_words)
    
    # Convert words to sequences of indices
    encoded_sequences = tokenizer.texts_to_sequences(word_lists)
    
    # Pad each sequence to ensure it has exactly `max_words` (54) per time step
    padded_sequences = pad_sequences(
        encoded_sequences, 
        maxlen=max_words,  # Each time step should have 54 words
        padding='post',  # Add zeros at the end
        truncating='post'  # Truncate from the end if too long
    )
    
    # Pad the entire sequence to ensure there are `max_frames` (9) time steps for each video
    padded_sequences = pad_sequences(
        padded_sequences, 
        maxlen=max_frames,  # Ensure there are 9 time steps
        padding='post',  # Add zeros at the end if fewer than 9 time steps
        truncating='post'  # Truncate from the end if more than 9 time steps
    )
    
    words_test_new =[]
    for line in padded_sequences:
        words_test_new.append(tf.keras.utils.to_categorical(x = line, num_classes=55))
    
    return {
        'padded_sequences': np.asarray(words_test_new),
        'tokenizer': tokenizer,
        'word_index': tokenizer.word_index
    }
    
    
result = prepare_word_sequences(words)


In [22]:
result["padded_sequences"].shape

(1000, 9, 55)

In [23]:
def pad_sequence(sequence, max_length):
    # Pad with zeros (or other value) if the sequence is shorter than max_length
    if len(sequence) < max_length:
        return np.pad(sequence, (0, max_length - len(sequence)), 'constant', constant_values=0)
    else:
        return sequence[:max_length]  # Truncate if the sequence is longer than max_length

# Set the fixed length you want (9 elements)
max_length = 9

# Pad the sequences in `start` (assuming `start` is a list of sequences with varying lengths)
start_padded = np.array([pad_sequence(s, max_length) for s in start])

# Similarly, pad the sequences in `end` if needed
end_padded = np.array([pad_sequence(e, max_length) for e in end])

In [24]:
new_frames.shape, result["padded_sequences"].shape, start_padded.shape, end_padded.shape

((1000, 75, 64, 64, 1), (1000, 9, 55), (1000, 9), (1000, 9))

In [25]:
start_padded = start_padded.astype(np.int32)
end_padded = end_padded.astype(np.int32)

In [26]:
start_padded = start_padded / 74500
start_padded = np.asarray(start_padded, dtype=np.float32)


end_padded = end_padded / 74500
end_padded = np.asarray(end_padded, dtype=np.float32)

In [27]:
@keras.saving.register_keras_serializable()
def inception_module(x, filters_1x1, filters_3x3_reduce, filters_3x3, filters_5x5_reduce, filters_5x5, filters_pool_proj):

    # 1x1 convolution branch
    branch1 = keras.layers.Conv3D(filters_1x1, (1,1,1), padding='same', activation='relu')(x)
    
    # 3x3 convolution branch
    branch2 = keras.layers.Conv3D(filters_3x3_reduce, (1,1,1), padding='same', activation='relu')(x)
    branch2 = keras.layers.Conv3D(filters_3x3, (3,3,3), padding='same', activation='relu')(branch2)
    
    # 5x5 convolution branch
    branch3 = keras.layers.Conv3D(filters_5x5_reduce, (1,1,1), padding='same', activation='relu')(x)
    branch3 = keras.layers.Conv3D(filters_5x5, (5,5,5), padding='same', activation='relu')(branch3)
    
    # Pool projection branch
    branch4 = keras.layers.MaxPooling3D((3,3,3), strides=(1,1,1), padding='same')(x)
    branch4 = keras.layers.Conv3D(filters_pool_proj, (1,1,1), padding='same', activation='relu')(branch4)
    
    
    
    # print(branch1.shape)
    # print(branch2.shape)
    # print(branch3.shape)
    # print(branch4.shape)
    # print('_____________')
    
    
    # Concatenate all branches
    return keras.layers.Concatenate()([branch1, branch2, branch3, branch4])

In [28]:
model = keras.models.load_model('google_net_full_video(1).h5', custom_objects={
        'inception_module': inception_module,
        'mse': tf.keras.losses.MeanSquaredError()
    })



In [29]:
y_pred = model.evaluate(new_frames, [result["padded_sequences"], start_padded, end_padded] )

y_pred 

2024-12-13 19:03:25.044814: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1228800000 exceeds 10% of free system memory.
2024-12-13 19:03:25.906858: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1228800000 exceeds 10% of free system memory.
2024-12-13 19:03:28.474322: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
W0000 00:00:1734109408.722718   96794 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109408.989212   96794 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109409.202514   96794 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109409.206058   96794 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109409.208293   96794 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy w

[1m31/32[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 339ms/step - end_time_output_loss: 10.1247 - end_time_output_mean_absolute_error: 0.5507 - loss: 10.1320 - start_time_output_loss: 0.0022 - start_time_output_mean_absolute_error: 0.0404 - word_output_accuracy: 0.4542 - word_output_loss: 0.0052

W0000 00:00:1734109421.414186   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.427684   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.440638   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.451001   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.452621   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.454113   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.455819   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.457337   96796 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1734109421.458839   96796 gp

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 350ms/step - end_time_output_loss: 10.1148 - end_time_output_mean_absolute_error: 0.5510 - loss: 10.1199 - start_time_output_loss: 0.0022 - start_time_output_mean_absolute_error: 0.0404 - word_output_accuracy: 0.4542 - word_output_loss: 0.0052


[9.932774543762207,
 0.0053385403007268906,
 0.0021728086285293102,
 9.96225357055664,
 0.5554441809654236,
 0.04060196131467819,
 0.45333331823349]

In [30]:
model.save('final_model.keras')

In [67]:
y_pred = model.predict(new_frames)

y_pred

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 330ms/step


[array([[[0.        , 0.        , 0.9544962 , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.9452368 , ..., 0.        ,
          0.        , 0.        ],
         [0.9289584 , 0.        , 0.01164295, ..., 0.        ,
          0.        , 0.        ]],
 
        [[0.        , 0.        , 0.9501702 , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.        ],
         ...,
         [0.        , 0.        , 0.        , ..., 0.        ,
          0.        , 0.

In [68]:
y_pred[0].shape,y_pred[1].shape,y_pred[2].shape

((1000, 9, 55), (1000, 9), (1000, 9))

In [78]:
y_pred[1][0]

array([0.0111043 , 0.19207971, 0.29415822, 0.37194118, 0.41160592,
       0.47448584, 0.5639604 , 0.67770576, 0.02360147], dtype=float32)

In [99]:
m = []
for true, pred in zip(start_padded, y_pred[1]):
    m.append(true*75000 - pred*75000)

In [115]:
np.mean(m, axis=0)

array([-832.82661655, -481.42044637,   62.08574418,  -31.52758218,
        -33.53876844, -313.77729625, -299.76973459,   28.17348316,
       -480.52893388])

In [117]:
np.asarray((y_pred[1][0]*75000 )+ np.mean(m, axis=0), dtype=np.int32)

array([    0, 13924, 22123, 27864, 30836, 35272, 41997, 50856,  1289],
      dtype=int32)

In [108]:
y_pred[1][0]*75000

array([  832.82261621, 14405.97809851, 22061.86652184, 27895.58842778,
       30870.44432759, 35586.43832803, 42297.02800512, 50827.93235779,
        1770.11014894])

In [105]:
start_padded[0]*75000

array([    0.        , 23909.39667821, 29697.98594713, 34228.18779945,
       35738.25582862, 41275.16895533, 47567.11274385, 53355.70424795,
           0.        ])

In [98]:
y_pred[1][3][0]*75000 - y_pred[1][3][-1]*75000

-937.2746804729104

In [120]:
# with open('tokenizer.pkl', 'wb') as f:
#     pickle.dump(result['tokenizer'], f)

In [2]:
with open('tokenizer.pkl', 'rb') as f:
    loaded_tokenizer = pickle.load(f)

In [122]:
# with open('word_index.pkl', 'wb') as f:
#     pickle.dump(result['word_index'], f

In [3]:
with open('word_index.pkl', 'rb') as f:
    loaded_word_index = pickle.load(f)