In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile as io_wav
import os
import glob
import pickle
import cv2
from cv2 import VideoWriter, VideoWriter_fourcc
import librosa
from keras.models import model_from_json
from keras import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, CSVLogger, ModelCheckpoint
from keras.metrics import MeanSquaredError
import tensorflow as tf
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.saving import register_keras_serializable
from tensorflow.keras import backend as K
import subprocess  # Add this import
from subprocess import run


In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)


In [3]:

# from LipReading with slight modifications
# https://github.com/hassanhub/LipReading/blob/master/codes/data_integration.py
################## VIDEO INPUT ##################
def load_video_3D(path, framesPerSec):

    cap = cv2.VideoCapture(path)
    frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT ))
    frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH ))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # make sure that all the videos are the same FPS
    if (np.abs(fps - framesPerSec) > 0.01):
        print('fps:', fps, '(' + path + ')')
        buf = np.empty((frameHeight, frameWidth, frameCount), np.dtype('float32'))

    buf = np.empty((frameHeight, frameWidth, frameCount), np.dtype('float32'))
    fc = 0
    ret = True

    while (fc < frameCount  and ret):
        ret, frame = cap.read()
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        frame = frame.astype('float32')
        # min-max scaling to [0-1]
        frame = frame-np.amin(frame)
        # make sure not to divide by zero
        if np.amax(frame) != 0:
            frame = frame/np.amax(frame)
        buf[:,:,fc]=frame
        fc += 1
    cap.release()

    return buf

# load vocoder features,
# or calculate, if they are not available
def get_mgc_lsp_coeff(basefilename):
    if os.path.isfile(basefilename + '.mgclsp'):
        mgc_lsp_coeff = np.fromfile(basefilename + '.mgclsp', dtype=np.float32).reshape(-1, order + 1)
        lf0 = np.fromfile(basefilename + '.lf0', dtype=np.float32)
    else:
        (mgc_lsp_coeff, lf0) = vocoder_LSP_sptk.encode(basefilename, samplingFrequency, frameLength, frameShift, order, alpha, stage)
    return (mgc_lsp_coeff, lf0)

# convert an array of values into a dataset matrix
# code with modifications from
# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
def create_dataset_img(data_in_X, data_in_Y, look_back=1):
    (dim1_X, dim2_X, dim3_X, dim4_X) = data_in_X.shape
    (dim1_Y, dim2_Y) = data_in_Y.shape
    data_out_X = np.empty((dim1_X - look_back - 1, look_back, dim2_X, dim3_X, dim4_X))
    data_out_Y = np.empty((dim1_Y - look_back - 1, dim2_Y))

    for i in range(dim1_X - look_back - 1):
        for j in range(look_back):
            data_out_X[i, j] = data_in_X[i + j]
        data_out_Y[i] = data_in_Y[i + j]
    return data_out_X, data_out_Y

# convert an array of values into a dataset matrix
# code with modifications from
# https://machinelearningmastery.com/time-series-prediction-lstm-recurrent-neural-networks-python-keras/
def create_dataset_img_inverse(data_in_X, data_in_Y, look_back=1):
    (dim1_X, dim2_X) = data_in_X.shape
    (dim1_Y, dim2_Y, dim3_Y, dim4_Y) = data_in_Y.shape
    data_out_X = np.empty((dim1_X - look_back - 1, look_back, dim2_X))
    data_out_Y = np.empty((dim1_Y - look_back - 1, dim2_Y, dim3_Y, dim4_Y))

    for i in range(dim1_X - look_back - 1):
        for j in range(look_back):
            data_out_X[i, j] = data_in_X[i + j]
        data_out_Y[i] = data_in_Y[i + j]
    return data_out_X, data_out_Y

# mri2vid converts raw MRI data to .mp4 video
def mri2vid(mri_data, dir_file, filename_no_ext, n_width, n_height, FramesPerSec):

    print(filename_no_ext + ' - MRI video started')

    output_file_no_ext = dir_file + filename_no_ext
    n_frames = len(mri_data)

    # compressed
    # fourcc = VideoWriter_fourcc(*'MP4V')

    # uncompressed 8-bit
    fourcc = VideoWriter_fourcc(*'Y800')
    video = VideoWriter(output_file_no_ext + '.avi', fourcc, float(FramesPerSec), (n_width, n_height), 0)

    for n in range(n_frames):
        frame = np.uint8(255 * mri_data[n]).reshape(n_width, n_height, 1)

        video.write(frame)
        print('frame ', n, ' done', end='\r')

    video.release()

    print(filename_no_ext + ' - MRI video finished')

def mrividwav2demo(dir_mri, file_mri, dir_wav, file_wav):
    # "-codec copy " + \
    command = "ffmpeg " + \
           "-y " + \
           "-i " + dir_mri + file_mri + " " + \
           "-i " + dir_wav + file_wav + " " + \
           "-shortest " + \
           "-acodec copy -vcodec copy " + \
            dir_mri + file_mri[:-4] + "_with_audio.avi"
           # "-c:v h264 -crf 20 -c:a aac -strict -2 " + \
           # "-filter:v \"crop=820:496:215:48\" " + \

    print(command)
    run(command, shell=True)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# speakerlist = ['F1']
# model_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/models/Models_for_Text2MRI/'
# # output_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/generated_image_sequence/'
# output_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/generated_image_sequence_original_audio/'


# # Input_audio_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/audio_files_for_input/'
# Input_audio_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/original_audio_files/'



speakerlist = ['M2']
model_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/models_speech2mri_2024/'
# output_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/testing_models/'
output_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/temp_image_sequence/'

# Input_audio_path = '/content/drive/MyDrive/GAN_based_models/wav_files/'
Input_audio_path = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/original_audio_files/'


speakerInd = 0
# for speaker in ['f1']: # ['f1', 'f2', 'm1', 'm2']:
for speaker in speakerlist:
    # TODO: modify this according to your data path
    # dir_mri = '/content/drive/MyDrive/backup_PhD/database/MRI_USC/data/' + speaker + '/avi/'
    dir_mri_test = '/content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/temp_image_sequence/' + speaker + '/'

    if not os.path.exists(dir_mri_test):
        os.makedirs(dir_mri_test)

    # Parameters of vocoder
    samplingFrequency = 20000
    frameLength = 1024 #
    frameShift = 863 # 43.14 ms at 20000 Hz sampling, correspondong to 23.18 fps (MRI video)
    order = 24
    alpha = 0.42
    stage = 3
    n_mgc = order + 1

    # context window of LSTM
    n_sequence = 10

    # properties of MRI videos
    framesPerSec = 23.18
    n_width = 68
    n_height = 68

    # modelbasenames = [ 'SPEECH2MRI_LSTM_baseline_Text2MRI_M2_2023-04-18_10-55-58',
    #           'SPEECH2MRI_LSTM_baseline_Text2MRI_M3_2023-04-18_12-28-25',
    #           'SPEECH2MRI_LSTM_baseline_Text2MRI_F1_2023-04-18_13-44-16',
    #           'SPEECH2MRI_LSTM_baseline_F2_2024-10-04_10-04-46']

    modelbasenames = [ 'SPEECH2MRI_LSTM_baseline_F1_2024-10-04_09-57-40',
                      'SPEECH2MRI_LSTM_baseline_F2_2024-10-04_10-04-46',
                       'SPEECH2MRI_LSTM_baseline_M2_2024-10-04_09-33-21',
                       'SPEECH2MRI_LSTM_baseline_M3_2024-10-04_09-52-15']


    # DNN_types = ['FC-DNN_baseline', 'CNN', 'LSTM']
    #DNN_types = ['FC-DNN_baseline']
    # DNN_types = ['LSTM-CNN']
    DNN_types = ['LSTM']
    # basefilenames_mri_test = ['usctimit_mri_' + speaker.lower() + '_146_150', 'usctimit_mri_' + speaker.lower() + '_441_445']
    basefilenames_mri_test = glob.glob(Input_audio_path +"/"+ speaker +'/*.wav')

    for DNN_type in DNN_types:
        # e.g. MRI2SPEECH_CNN_f1_2020-01-16_10-36-35
        # csv_files = glob.glob('/content/drive/MyDrive/GAN_based_models/speech2mri/models/SPEECH2MRI_' + DNN_type + '_baseline_Text2MRI_' + speaker + '*.csv')
        csv_files = glob.glob(model_path + modelbasenames[speakerInd] +'*.csv')
        model_name = csv_files[-1][:-4]

        speakerInd = speakerInd + 1
        # load model
        print('loading model', model_name)
        with open(model_name + '_model.json', "r") as json_file:
            loaded_model_json = json_file.read()
        # model = model_from_json(loaded_model_json)

        model = model_from_json(loaded_model_json, custom_objects={'Sequential': Sequential})
        model.load_weights(model_name + '_weights.keras')
        print("Loaded model from disk")
        # load weights into new model
        # model.load_weights(model_name + '_weights.h5')
        # load scalers
        mgc_scalers = pickle.load(open(model_name + '_mgc_scalers.sav', 'rb'))

        for basefilename in basefilenames_mri_test:
            print('Predicting output for: ', basefilename)

            # load data for sentence
            # mri_data = load_video_3D(dir_mri + basefilename + '.avi', framesPerSec)
            # mri_len = mri_data.shape[2]
            # mri_test = np.empty((mri_len, n_width, n_height))
            # (mgc_lsp_coeff, lf0) = get_mgc_lsp_coeff(dir_mri + basefilename)
            # dir_mri_wav_only = dir_mri.replace('/avi/','/wav/')
            # dir_mri_wav = dir_mri_wav_only + basefilename+'.wav'
            dir_mri_wav = basefilename
            basefilename_name_only = basefilename.split('/')
            basefilename_name_only = basefilename_name_only[-1]
            basefilename_name_only = basefilename_name_only.replace('.wav', '')

            print(basefilename_name_only)
            x, sr = librosa.load(dir_mri_wav, sr = samplingFrequency)
            n_fft = frameLength   # window length: 0.02 s
            hop_length = frameShift  #
            mgc_lsp_coeff = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=25, hop_length=hop_length, n_fft=n_fft)
            mgc_lsp_coeff = mgc_lsp_coeff.transpose()



            # for i in range(mri_len):
            #     mri_test[i] = mri_data[:, :, i] # original, 68x68

            # transform of input parameters
            for i in range(n_mgc):
                mgc_lsp_coeff[:, i] = mgc_scalers[i].transform(mgc_lsp_coeff[:, i].reshape(-1, 1)).ravel()

            # reshape for LSTM
            if DNN_type == 'LSTM' or DNN_type == 'LSTM-CNN':
                mgc_len = len(mgc_lsp_coeff)
                mri0 = np.empty((mgc_len, n_width, n_height, 1))

                mgc_test0, mri0 = create_dataset_img_inverse(mgc_lsp_coeff, mri0, look_back = n_sequence)
                mri0 = mri0.reshape(-1, n_width * n_height)
                mgc_test = np.empty((mgc_len, n_sequence, n_mgc))

                # # add first n_sequence values
                # for i in range(mgc_len - 2):
                #     if i < n_sequence - 0:
                #         mgc_test[i] = mgc_test0[0]
                #     else:
                #         mgc_test[i] = mgc_test0[i - n_sequence + 1]

                # mgc_lsp_coeff = mgc_test

            mgc_lsp_coeff = mgc_test0
            # predict MR image sequence using the trained model
            mri_predicted = model.predict(mgc_lsp_coeff)
            print('Prediction done')
            # clip extreme values
            mri_predicted = np.clip(mri_predicted, 0, 1)

            print(mri_predicted.shape)

            y_pred = mri_predicted
            y_true = mri0

            # # Calculating the error
            # FrameErr = np.zeros((y_true.shape[0],1))
            # for i in range(y_true.shape[0]):
            #   t1 = y_pred[i,:]
            #   t2 = y_true[i,:]
            #   terr =np.mean((np.square(t1-t2)))
            #   FrameErr[i]=terr

            # MSErr = np.mean(FrameErr)
            # print(MSErr)

            # MSErr_fn = mean_squared_error(y_pred, y_true)
            # print(np.mean(MSErr_fn))


            # save image sequence to video (without audio)
            mri2vid(mri_predicted, dir_mri_test, basefilename_name_only + '_' + DNN_type, n_width, n_height, framesPerSec)

            dir_mri_wav_only = Input_audio_path + speaker +'/'
            # put together video and audio
            mrividwav2demo(dir_mri_test, basefilename_name_only + '_' + DNN_type + '.avi', \
                dir_mri_wav_only, basefilename_name_only + '.wav')




loading model /content/drive/MyDrive/GAN_based_models/speech2mri/models_speech2mri_2024/SPEECH2MRI_LSTM_baseline_F1_2024-10-04_09-57-40
Loaded model from disk
Predicting output for:  /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/original_audio_files/F1/usctimit_mri_f1_441_445.wav
usctimit_mri_f1_441_445
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m403s[0m 21s/step
Prediction done
(591, 4624)
usctimit_mri_f1_441_445_LSTM - MRI video started
usctimit_mri_f1_441_445_LSTM - MRI video finished
ffmpeg -y -i /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/temp_image_sequence/F1/usctimit_mri_f1_441_445_LSTM.avi -i /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/original_audio_files/F1/usctimit_mri_f1_441_445.wav -shortest -acodec copy -vcodec copy /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/temp_image_sequence/F1/usctimit

NameError: name 'run' is not defined

In [None]:

import subprocess
from subprocess import run
mrividwav2demo(dir_mri_test, basefilename_name_only + '_' + DNN_type + '.avi', \
                dir_mri_wav_only, basefilename_name_only + '.wav')


ffmpeg -y -i /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/temp_image_sequence/F1/usctimit_mri_f1_441_445_LSTM.avi -i /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/original_audio_files/F1/usctimit_mri_f1_441_445.wav -shortest -acodec copy -vcodec copy /content/drive/MyDrive/GAN_based_models/speech2mri/Video_generation_code_for_NCC_paper/temp_image_sequence/F1/usctimit_mri_f1_441_445_LSTM_with_audio.avi
