In [3]:
import tensorflow as tf
import librosa
import numpy as np
import os, shutil, subprocess
from keras import backend as K
from keras.layers import Input, LSTM, Dense, Reshape, Activation, Dropout, Flatten
from keras.models import Model
from tqdm import tqdm
from keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop,Adam
#from keras.optimizers import RMSprop, Adam
import h5py
from keras.callbacks import TensorBoard
import argparse, fnmatch
import pickle
import random
import time, datetime

In [4]:

parser = argparse.ArgumentParser(description=__doc__)

# Each entry pairs the option flags with the keyword arguments handed to
# add_argument; entries are registered in the order listed.
for _flags, _opts in (
    (
        ("-i", "--in-file"),
        dict(
            default="/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/output/eric_output.hdf5",
            type=str,
            help="Input file containing train data",
        ),
    ),
    (
        ("-u", "--hid-unit"),
        dict(default=512, type=int, help="hidden units"),
    ),
    # The amount of delay we introduce
    # is between 1 (40 ms) and 5 frames (200 ms).
    (
        ("-d", "--delay"),
        dict(default=1, type=int, help="Delay in terms of number of frames"),
    ),
    # can find "3" in generator.py
    (
        ("-c", "--ctx"),
        dict(default=3, type=int, help="context window size"),
    ),
    (
        ("-o", "--out-fold"),
        dict(
            default="/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/train_output",
            type=str,
            help="output folder",
        ),
    ),
):
    parser.add_argument(*_flags, **_opts)
del _flags, _opts

# Parse an explicit empty argument list so only the defaults above are used.
args = parser.parse_args([])

# Reference copy of an equivalent argument list, kept as a bare string so it is
# echoed as the cell output.
'''
"--i",
"/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/output/eric_output.hdf5",
"--hid-unit",
"512",
"--d",
"1",
"--c",
"3",
"--o",
"/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/train_output"
'''


'\n"--i",\n"/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/output/eric_output.hdf5",\n"--hid-unit",\n"512",\n"--d",\n"1",\n"--c",\n"3",\n"--o",\n"/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/train_output"\n'

In [5]:
args.delay

1

In [6]:
args.out_fold

'/mnt/hdd/eric/.tmp_ipy_d/make/TalkingFaceGeneration_Pytorch/train_output'

In [4]:
# Per-configuration output directory, suffixed with the hidden-unit count.
# It always starts out empty: an existing directory is wiped and recreated.
output_path = f"{args.out_fold}_{args.hid_unit}/"

if os.path.exists(output_path):
    # Drop whatever an earlier run left behind, then recreate the directory.
    shutil.rmtree(output_path)
    os.mkdir(output_path)
else:
    os.makedirs(output_path)

In [5]:
# Hyper-parameters and sizes used by the data pipeline and model below.
ctxWin = args.ctx
num_features_X = 128 * (ctxWin+1)# input feature size: 128 per frame * (ctxWin + 1) stacked frames; 128 * 4 = 512 for the default ctxWin = 3
num_features_Y = 136 # output feature size --> (68, 2)
num_frames = 75 # time-steps
batchsize = 128
h_dim = args.hid_unit
lr = 1e-3
drpRate = 0.2 # Dropout rate
recDrpRate = 0.2 # Recurrent Dropout rate
frameDelay = args.delay # Time delay, in frames (see the --delay argument)

numEpochs = 200

In [6]:
# Data
# Open the HDF5 training set read-only. The handle is kept open because the
# dataset class below indexes into it lazily, one sample at a time.
dset = h5py.File(args.in_file, 'r')

# Batches per epoch as a ceiling division. The previous `n // batchsize + 1`
# counted one batch too many whenever n was an exact multiple of batchsize.
numIt = (dset['flmark'].shape[0] + batchsize - 1) // batchsize
metrics = ['MSE', 'MAE']

In [7]:
dset.keys()

<KeysViewHDF5 ['MelFeatures', 'flmark']>

In [73]:
# Number of top-level entries in the HDF5 file (built-in len() instead of the dunder call).
len(dset)

2

In [74]:
from torch.utils.data import Dataset, DataLoader
import torch 

class Custom_Dataset(Dataset):
    '''Torch data pipeline pairing context-stacked mel features with delayed landmarks.

    Args:
        data: Mapping with 'MelFeatures' and 'flmark' entries, each indexable as
            ``entry[idx, :, :]`` and yielding a 2-D NumPy array (frames x features).
            In this notebook it is the open h5py file.
        ctx_win: Number of past frames stacked next to each mel frame.
            Defaults to the notebook-level ``ctxWin``.
        frame_delay: Number of frames the landmarks are delayed by.
            Defaults to the notebook-level ``frameDelay``.
        max_frames: Length the delayed landmark sequence is cut back to.
            Defaults to the notebook-level ``num_frames``.

    The notebook-level defaults are read once, when the dataset is constructed.

    NOTE(review): when ``data`` is an h5py handle opened in the parent process and
    the DataLoader uses worker processes, the handle is shared across workers;
    confirm this is safe for the h5py build in use.
    '''

    def __init__(self, data, ctx_win=None, frame_delay=None, max_frames=None):
        self.data = data
        # Only fall back to the notebook globals for values the caller left out.
        self.ctx_win = ctxWin if ctx_win is None else ctx_win
        self.frame_delay = frameDelay if frame_delay is None else frame_delay
        self.max_frames = num_frames if max_frames is None else max_frames

    def __len__(self):
        '''Return the number of samples (first dimension of 'MelFeatures').'''
        return len(self.data['MelFeatures'])

    @staticmethod
    def _add_context(mel, ctx_win):
        '''Stack each frame with its ``ctx_win`` predecessors along the feature axis.

        Frames before the start of the sequence are filled with the first frame.
        Returns an array with the same number of rows and ``(ctx_win + 1)`` times
        as many columns, ordered current frame first, then 1 frame back, 2 back, ...
        '''
        first_frame = mel[:1]
        shifted = mel
        stacked = [mel]
        for _ in range(ctx_win):
            # Shift one step into the past: prepend the first frame, drop the last.
            shifted = np.concatenate([first_frame, shifted[:-1]], axis=0)
            stacked.append(shifted)
        return np.concatenate(stacked, axis=1)

    @staticmethod
    def _delay(lmark, delay):
        '''Prepend ``delay`` copies of the first landmark frame to the sequence.'''
        padding = np.repeat(lmark[:1], delay, axis=0)
        return np.concatenate([padding, lmark], axis=0)

    def __getitem__(self, idx):
        '''Return ``{'cur_mel': Tensor, 'cur_lmark': Tensor}`` for sample ``idx``.'''
        cur_lmark = self.data['flmark'][idx, :, :]
        cur_mel = self.data['MelFeatures'][idx, :, :]

        if self.frame_delay > 0:
            # Delay the targets, then cut back to the configured sequence length.
            cur_lmark = self._delay(cur_lmark, self.frame_delay)[:self.max_frames]

        return {
            'cur_mel': torch.from_numpy(self._add_context(cur_mel, self.ctx_win)),
            'cur_lmark': torch.from_numpy(cur_lmark),
        }

In [75]:
lm_dataset = Custom_Dataset(data=dset)

In [76]:
# Inspect one sample via normal indexing rather than calling __getitem__ directly.
lm_dataset[1]

{'cur_mel': tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 3.9939,  2.7080,  2.9608,  ...,  0.0000,  0.0000,  0.0000],
         [-1.5734, -1.0146,  0.6926,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.2623, -0.3484, -0.4165,  ..., -0.1854, -0.1593, -0.1499],
         [-0.6610,  0.5725,  0.0415,  ...,  0.4424,  0.2738,  0.1941],
         [-0.8899, -1.4737,  0.5270,  ..., -0.0752, -0.2463, -0.0744]]),
 'cur_lmark': tensor([[0.1841, 0.3372, 0.1908,  ..., 0.7029, 0.4540, 0.6996],
         [0.1841, 0.3372, 0.1908,  ..., 0.7029, 0.4540, 0.6996],
         [0.1838, 0.3308, 0.1904,  ..., 0.7028, 0.4539, 0.6964],
         ...,
         [0.1869, 0.3370, 0.1908,  ..., 0.6834, 0.4615, 0.6769],
         [0.1836, 0.3277, 0.1877,  ..., 0.6834, 0.4587, 0.6770],
         [0.1838, 0.3308, 0.1877,  ..., 0.6802, 0.4587, 0.6770]])}

In [77]:
# Shape of the landmark target for one sample (indexing instead of the dunder call).
lm_dataset[1]['cur_lmark'].shape

torch.Size([75, 136])

In [78]:
# Available entries and the number of samples in 'MelFeatures'.
dset.keys(), len(dset['MelFeatures'])

(<KeysViewHDF5 ['MelFeatures', 'flmark']>, 978)

In [83]:
# Batch size comes from the shared `batchsize` setting instead of a second
# hard-coded 128, so the two cannot drift apart.
# TODO (translated from the original note): why does this not change with batch_size?
# NOTE(review): lm_dataset wraps an h5py handle opened in this process; confirm
# that reading it from 4 worker processes is safe.
train_dataloader = DataLoader(lm_dataset, batch_size=batchsize,
                        shuffle=True, num_workers=4)

In [100]:
next(iter(train_dataloader))["cur_mel"].shape

torch.Size([128, 75, 512])

In [101]:
next(iter(train_dataloader))["cur_lmark"].shape

torch.Size([128, 75, 136])

In [92]:
#---- build torch model 
import torch.nn as nn

class AirModel(nn.Module):
    """Stacked-LSTM sequence model mapping per-frame input features to per-frame outputs.

    Args:
        input_size: Feature size of each input frame (default 512).
        hidden_size: Hidden size of the stacked LSTM (default 512).
        output_size: Feature size of each output frame (default 136).
        num_layers: Number of layers in the stacked LSTM (default 3).

    The defaults reproduce the original hard-coded architecture. The attribute
    names ``lstm_1`` and ``lstm_4`` are kept so ``state_dict`` keys stay the same.

    NOTE(review): the output layer is itself an LSTM, so predictions are its
    hidden states rather than the result of a linear projection; confirm this
    is intended for the landmark targets.
    """

    def __init__(self, input_size=512, hidden_size=512, output_size=136, num_layers=3):
        super().__init__()
        self.lstm_1 = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                              num_layers=num_layers, batch_first=True)
        self.lstm_4 = nn.LSTM(input_size=hidden_size, hidden_size=output_size,
                              num_layers=1, batch_first=True)

    def forward(self, x):
        """Map ``x`` of shape (batch, frames, input_size) to (batch, frames, output_size)."""
        x, _ = self.lstm_1(x)
        x, _ = self.lstm_4(x)
        return x


In [102]:
model = AirModel()

In [96]:
pred = model(next(iter(train_dataloader))["cur_mel"])

In [106]:
pred.shape

torch.Size([128, 75, 136])

In [109]:
# Each batch is a dict; unpacking it directly would bind x and y to the two key
# strings, so pull the tensors out by key instead.
batch = next(iter(train_dataloader))
x, y = batch["cur_mel"], batch["cur_lmark"]

In [113]:
# Number of batches per epoch (the recorded output of this cell is a count).
len(train_dataloader)

8

In [117]:
# MSE and adam
import torch.optim as optim

# Use the first GPU when one is available; otherwise run on the CPU instead of
# failing on a machine without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AirModel()
model.to(device)

# Learning rate and epoch count come from the config cell (lr = 1e-3, numEpochs = 200),
# which match the values previously implied/hard-coded here.
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = torch.nn.MSELoss()
loader = train_dataloader

# Checkpoints go to the same folder as before; create it up front so the first
# torch.save cannot fail on a missing directory.
# NOTE(review): an `output_path` directory is created earlier in the notebook but
# is not used here - confirm which folder checkpoints are meant to live in.
ckpt_dir = args.out_fold
os.makedirs(ckpt_dir, exist_ok=True)

n_epochs = numEpochs
for epoch in range(n_epochs):
    model.train()

    # train
    for i, data_ in enumerate(loader):
        x_train = data_["cur_mel"].to(device)
        y_train = data_["cur_lmark"].to(device)

        y_pred = model(x_train)
        loss = loss_fn(y_pred, y_train)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 20 == 0:
            # Root of the batch MSE, computed in torch rather than by passing a
            # tensor through np.sqrt.
            print(f"{loss.detach().sqrt().item()}")

    # model save: every 10th epoch, plus the final epoch so the fully trained
    # weights are never lost.
    if epoch % 10 == 0 or epoch == n_epochs - 1:
        torch.save(model.state_dict(), os.path.join(ckpt_dir, f"{epoch}_model.pth"))
        
        
    

0.5377535223960876
0.1307123601436615
0.08724506199359894
0.07066360861063004
0.0640668123960495
0.05995002016425133
0.05684724450111389
0.05367538705468178
0.0507766492664814
0.04689768701791763
0.04279610887169838
0.03740498796105385
0.033158231526613235
0.02838161587715149
0.02539215236902237
0.021832067519426346
0.020485321059823036


Exception ignored in: <function _releaseLock at 0x7f5f2e9388b0>
Traceback (most recent call last):
  File "/mnt/hdd/eric/.conda/envs/9.tmp/lib/python3.9/logging/__init__.py", line 227, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 


0.018367191776633263
0.017556913197040558
0.01702919974923134
0.04962783679366112
0.014022758230566978
0.01431606151163578
0.013599080964922905
0.01349638495594263
0.048621855676174164
0.01275260467082262
0.013838053680956364


0.011175678111612797
0.011615114286541939
