In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time

import tensorflow as tf
import scipy.io.wavfile as wav
import numpy as np

import os
import sys

from six.moves import xrange as range
from python_speech_features import mfcc
from tensorflow.python.ops import ctc_ops

import random
import scipy
import glob

In [4]:
def sparse_tuple_from(sequences, dtype=np.int32):
    """Create a sparse representation of a batch of label sequences.

    Args:
        sequences: a sized iterable of entries. Only the first element of
            each entry (``entry[0]``) is read; it must be the label sequence
            itself (a list or 1-D array of values of type ``dtype``).
        dtype: numpy dtype of the returned ``values`` array.

    Returns:
        A tuple ``(indices, values, shape)``:
            indices: int64 array of shape (num_labels, 2) holding
                (sequence number, position inside the sequence) pairs.
            values: array of ``dtype`` with all labels, concatenated in order.
            shape: int64 array ``[len(sequences), longest sequence length]``.
    """
    indices = []
    values = []
    max_len = 0

    for n, seq in enumerate(sequences):
        labels = seq[0]
        count = len(labels)
        indices.extend(zip([n] * count, range(count)))
        values.extend(labels)
        max_len = max(max_len, count)

    # reshape(-1, 2) keeps the (N, 2) layout even when there are no labels at
    # all; taking the max over an empty index array would raise ValueError.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape

# Label-encoding constants
SPACE_INDEX = 0             # label 0 is reserved for the space character
SPACE_TOKEN = '<space>'     # textual token standing for a space
# Offset between a label and its character code: chr(label + FIRST_INDEX),
# so label 1 maps to 'a'.
FIRST_INDEX = ord('a') - 1




In [5]:

#s = getBatch(1,3,['test/video-2-train.mat','test/audio-2-train.mat','test/align-2-train.mat'])


#print(np.asarray([s[0][0]]).shape)


In [6]:
def weight_variable(shape,name="v"):
    """Create a weight variable drawn from a truncated normal (stddev 0.1).

    Args:
        shape: shape of the variable.
        name: prefix of the variable name; "_weight" is appended.

    Returns:
        A tf.Variable named ``name + "_weight"``.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1),
                       name=name+"_weight")

def bias_variable(shape,name="v"):
    """Create a bias variable with every entry initialised to 0.1.

    Args:
        shape: shape of the variable.
        name: prefix of the variable name; "_bias" is appended.

    Returns:
        A tf.Variable named ``name + "_bias"``.
    """
    return tf.Variable(tf.constant(0.1, shape=shape), name=name+"_bias")

def conv2d(x, W):
    """Apply a stride-1, 'SAME'-padded 3-D convolution of ``x`` with ``W``.

    Despite the name, this wraps ``tf.nn.conv3d``: ``x`` is a 5-D tensor and
    ``W`` a 5-D filter.
    """
    unit_strides = [1] * 5
    return tf.nn.conv3d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool a 5-D tensor with a 2x2 window and stride 2 on axes 2 and 3.

    Wraps ``tf.nn.max_pool3d`` with 'SAME' padding; axes 0, 1 and 4 use a
    window and stride of 1 and are therefore left at their original size.
    """
    window = [1, 1, 2, 2, 1]
    return tf.nn.max_pool3d(x, ksize=window, strides=window, padding='SAME')

# Model configuration

# Output alphabet: the 26 letters 'a'..'z', plus the space label and the
# CTC blank label, giving 28 classes.
num_classes = (ord('z') - ord('a') + 1) + 2

# Hyper-parameters
num_hidden = 100                # units per GRU cell
num_layers = 2                  # stacked recurrent layers per direction
batch_size = 1
initial_learning_rate = 1e-3
momentum = 0.9

# Frame geometry: each frame is height x width pixels with 3 channels,
# flattened into num_features values.
height = 50
width = 80
num_features = 3 * height * width



# THE MAIN CODE!

def _pooled_size(size, num_pools, stride=2):
    """Length of an axis after ``num_pools`` 'SAME'-padded poolings.

    Each 'SAME' pooling with the given stride yields ceil(size / stride).
    Integer ceil-division is used instead of round(): on Python 3 round()
    rounds halves to even (round(12.5) == 12), which under-counts the
    pooled size for odd intermediate lengths such as 25.
    """
    for _ in range(num_pools):
        size = -(-size // stride)
    return size


# Number of frames per clip. The reshapes and the split below are static, so
# every clip fed to the graph must contain exactly this many time steps.
num_steps = 75

graph = tf.Graph()
with graph.as_default():
    # Flattened video frames, reshaped below to
    # [batch, num_steps, height, width, 3].
    videoInputs = tf.placeholder(tf.float32, [None, None, num_features])
    shapeV = tf.shape(videoInputs)

    # Layer 1: 3x5x5 kernels over (frames, height, width), 3 -> 20 channels.
    W_conv1 = weight_variable([3, 5,5, 3, 20],"l1")
    b_conv1 = bias_variable([20],"l1")
    # shape (batch, num_steps, height, width, 3)
    x_image = tf.reshape(videoInputs , [-1,num_steps,height, width,3])
    # shape (batch, num_steps, height, width, 20)
    h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    # spatial axes halved (rounded up); frame axis untouched
    h_pool1 = max_pool_2x2(h_conv1)

    # Layer 2: 20 -> 40 channels, then another spatial 2x2 pooling.
    W_conv2 = weight_variable([3, 5,5, 20, 40],"l2")
    b_conv2 = bias_variable([40],"l2")
    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
    h_pool2 = max_pool_2x2(h_conv2)

    # Layer 3: 40 -> 60 channels, then another spatial 2x2 pooling.
    W_conv3 = weight_variable([3, 5,5, 40, 60],"l3")
    b_conv3 = bias_variable([60],"l3")
    h_conv3 = tf.nn.relu(conv2d(h_pool2, W_conv3) + b_conv3)
    h_pool3 = max_pool_2x2(h_conv3)

    # Spatial size left after the three poolings above.
    hh = _pooled_size(height, 3)
    ww = _pooled_size(width, 3)

    # One feature vector per frame: (batch, num_steps, hh*ww*60).
    # (The name is historical; the tensor is the flattened third pooling.)
    h_pool2_flat = tf.reshape(h_pool3, [-1,num_steps, hh*ww*60])

    # Dropout keep probability; feed 1.0 to disable dropout for evaluation.
    keep_prob = tf.placeholder(tf.float32)

    # Here we use sparse_placeholder that will generate a
    # SparseTensor required by ctc_loss op.
    targets = tf.sparse_placeholder(tf.int32)

    # 1d array of size [batch_size]
    seq_len = tf.placeholder(tf.int32, [None])

    # Defining the recurrent cell. The dropout on the cell outputs is driven
    # by the keep_prob placeholder (previously a hard-coded 0.9) so that it
    # can be switched off at evaluation time; feeding 0.9 reproduces the
    # earlier training behaviour.
    cell = tf.nn.rnn_cell.GRUCell(num_hidden)
    cell = tf.nn.rnn_cell.DropoutWrapper(cell=cell, output_keep_prob=keep_prob)

    # Stacking rnn cells: one stack per direction. The same cell object is
    # reused for every layer and for both stacks.
    # NOTE(review): this idiom matches the TensorFlow API generation used in
    # this notebook; newer releases may require distinct cell instances.
    stackf = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers,
                                        state_is_tuple=True)
    stackb = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers,
                                        state_is_tuple=True)

    # Permuting batch_size and n_steps
    x = tf.transpose(h_pool2_flat, [1, 0, 2])
    # Reshape to (n_steps*batch_size, n_input)
    x = tf.reshape(x, [-1, hh*ww*60])
    # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.split(0, num_steps, x)

    # The second and third outputs are the final states and are not used.
    outputs1, _, _ = tf.nn.bidirectional_rnn(stackf, stackb, x, sequence_length=seq_len,
                                              dtype=tf.float32)

    batch_s, max_timesteps = shapeV[0],shapeV[1]

    # Reshaping to apply the same weights over the timesteps:
    # (n_steps*batch_size, 2*num_hidden), forward and backward concatenated.
    outputs = tf.reshape(outputs1, [-1, 2*num_hidden])

    # Truncated normal with mean 0 and stdev=0.1
    W = tf.Variable(tf.truncated_normal([num_hidden*2,
                                         num_classes],
                                        stddev=0.1))
    # Zero initialization
    b = tf.Variable(tf.constant(0., shape=[num_classes]))

    # Doing the affine projection
    logits = tf.matmul(outputs, W) +  b

    logits = tf.nn.dropout(logits, keep_prob)

    # Reshaping back to time-major layout (n_steps, batch_size, num_classes),
    # which is what is passed to the CTC ops below.
    logits = tf.reshape(logits, [ -1, batch_s, num_classes])

    loss = ctc_ops.ctc_loss(logits, targets, seq_len)
    cost = tf.reduce_mean(loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=initial_learning_rate).minimize(cost)
    #optimizer = tf.train.MomentumOptimizer(initial_learning_rate,  0.9).minimize(cost)

    # Beam search decoding; the greedy decoder is the cheaper alternative:
    #decoded, log_prob = ctc_ops.ctc_greedy_decoder(logits, seq_len)
    decoded, log_prob = ctc_ops.ctc_beam_search_decoder(logits, seq_len)

    # Inaccuracy: label error rate
    ler = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                          targets))

In [29]:
def makeSameLengthO(ar):
    """Zero-pad every array in ``ar`` along axis 0 to the longest length.

    Shorter arrays are extended with zeros at the end of axis 0. The padded
    arrays are also written back into ``ar`` (the input is modified in
    place).

    Args:
        ar: indexable, mutable sequence of numpy arrays.

    Returns:
        ``np.asarray`` of the padded arrays.
    """
    longest = 0
    for item in ar:
        longest = max(longest, item.shape[0])

    padded = []
    for pos, item in enumerate(ar):
        deficit = longest - item.shape[0]
        if deficit:
            # Inserting at index == current length appends rows of zeros.
            item = np.insert(item, [item.shape[0]] * deficit, 0., axis=0)
            ar[pos] = item
        padded.append(item)
    return np.asarray(padded)

def getBatchO(batchSize, epoch=0, fileNames=False,randomIds=False, database=False,saveAll=False, fromFile=False):
    """Select a batch of examples from .mat files or an in-memory database.

    Args:
        batchSize: number of examples to select; must be smaller than the
            number of examples available.
        epoch: batch counter; with ``randomIds`` off, the batch starts at
            ``(batchSize * epoch) % num_examples`` and wraps around.
        fileNames: optional ``[video_file, audio_file, align_file]`` list of
            .mat paths. When given, it replaces ``database``.
        randomIds: when true, draw ``batchSize`` distinct random examples.
        database: ``[audio, align, video, lengthA, lengthV]`` as built below;
            used when ``fileNames`` is not given.
        saveAll, fromFile: unused.

    Returns:
        ``[False, sparse_targets, video, lengthA, lengthV]`` restricted to the
        selected examples (in ascending index order). The first slot is a
        placeholder; no audio is returned.
    """
    if fileNames:
        # Parse every file once; the video and audio files each provide two
        # entries of the database.
        video_mat = scipy.io.loadmat(fileNames[0])
        audio_mat = scipy.io.loadmat(fileNames[1])
        align_mat = scipy.io.loadmat(fileNames[2])
        database = [audio_mat['audio'][0], align_mat['align'][0],
                    video_mat['video'],
                    audio_mat['lengths'][0], video_mat['lengths'][0]]

    total = len(database[0])
    assert batchSize < total

    if randomIds:
        b = random.sample(range(0, total), batchSize)
    else:
        # Consecutive indices starting at the offset, wrapping past the end.
        offset = (batchSize*epoch) % total
        b = (offset + np.arange(batchSize)) % total

    mask = np.ones(total, dtype=bool)
    mask[b] = False
    selected = ~mask
    data = [False, sparse_tuple_from(database[1][selected]),
            database[2][selected], database[3][selected], database[4][selected]]
    return data

def getVideoBatch(batchSize, nr, path):
    """Load one video .mat file from ``path`` and return a batch from it.

    Video files are the ``*.mat`` files in ``path`` whose file name contains
    'video' but not 'words'; they are taken in sorted order and file number
    ``nr`` (modulo the number of video files) is used. The matching label
    file has the same name with 'video' replaced by 'align'.

    Args:
        batchSize: maximum number of clips to return. When the file holds
            more clips, ``batchSize`` of them are drawn at random (returned
            in ascending index order); otherwise all clips are returned.
        nr: index of the video file to use, wrapped around the file count.
        path: directory containing the .mat files.

    Returns:
        ``[video, lengthV, sparse_targets]`` where ``video`` and ``lengthV``
        come from the 'video' and 'lengths' entries of the video file and
        ``sparse_targets`` is ``sparse_tuple_from`` applied to the 'align'
        entry of the label file.

    Raises:
        ValueError: if ``path`` contains no video .mat file.
    """
    video_files = []
    for candidate in sorted(glob.glob(os.path.join(path, "*.mat"))):
        # Match on the file name only, so that directory names containing
        # 'video' or 'words' do not affect the selection.
        base = os.path.basename(candidate)
        if 'video' in base and 'words' not in base:
            video_files.append(candidate)
    if not video_files:
        raise ValueError("no video .mat files found in %r" % (path,))

    video_path = video_files[nr % len(video_files)]
    folder, base = os.path.split(video_path)
    align_path = os.path.join(folder, base.replace('video', 'align'))

    # Each file is parsed once.
    video_mat = scipy.io.loadmat(video_path)
    video = video_mat['video']
    lengthV = video_mat['lengths'][0]
    align = scipy.io.loadmat(align_path)['align'][0]

    if len(video) <= batchSize:
        return [video, lengthV, sparse_tuple_from(align)]

    chosen = np.zeros(len(video), dtype=bool)
    chosen[random.sample(range(0, len(video)), batchSize)] = True
    return [video[chosen], lengthV[chosen], sparse_tuple_from(align[chosen])]
#s = getBatchO(2,8,['test/video-2-train5.mat','test/audio-2-train5.mat','test/align-2-train.mat'],randomIds=True)

# Smoke test of the loader: fetch a batch of at most two clips from video
# file number 2 (wrapped around the number of video files in the directory)
# and print the second element of the result, the per-clip lengths.
s  = getVideoBatch(2,2,"test/batch-3-50/")
print(s[1])

[75 75]


In [None]:
num_epochs = 600
display_step = 10
batch_size = 2
num_examples = 2
num_batches_per_epoch = int(num_examples/batch_size)
data_dir = "test/batch-3-50/"
# Checkpoint paths. Both are disabled (False) by default. To resume from or
# write a checkpoint, set them to paths such as
#   restore = "models/images-100ep-10bs-100ex-2CL.ckpt"
#   save    = "models/images-200ep-10bs-100ex-2CL.ckpt"
#   save    = "models/images-bgru-2ep-10bs-100ex-2CLstd-direct-3d.ckpt"
restore, save = False,False


def _evaluation_feed():
    """Sample an evaluation batch and build its feed with dropout disabled.

    The batch is drawn from the same directory as the training batches.
    """
    batch = getVideoBatch(2, 0, data_dir)
    return {videoInputs: batch[0],
            targets: batch[2],
            seq_len: batch[1],
            keep_prob: 1.0}  # keep everything: no dropout while evaluating


with tf.Session(graph=graph) as session:
    # Initialize the weights and biases, or restore them from a checkpoint
    saver = tf.train.Saver()
    if restore:
        saver.restore(session, restore)
        print("Model restored.",restore)
    else:
        tf.global_variables_initializer().run()

    # Feed of the most recent evaluation batch; reused for decoding below.
    val_feed = None

    for curr_epoch in range(num_epochs):
        train_cost = train_ler = 0
        start = time.time()

        for batch in range(num_batches_per_epoch):
            s = getVideoBatch(batch_size,batch,
                         data_dir)
            feed = {videoInputs: s[0],
                    targets: s[2],
                    seq_len: s[1],keep_prob:0.9}
            batch_cost, _ = session.run([cost, optimizer], feed)
            train_cost += batch_cost*batch_size
            train_ler += session.run(ler, feed_dict=feed)*batch_size
        if curr_epoch % display_step == 0:
            train_cost /= num_examples
            train_ler /= num_examples
            val_feed = _evaluation_feed()

            val_cost, val_ler = session.run([cost, ler], feed_dict=val_feed)

            log = "Epoch {}/{}, train_cost = {:.3f}, train_ler = {:.3f}, val_cost = {:.3f}, val_ler = {:.3f}, time = {:.3f}"
            print(log.format(curr_epoch+1, num_epochs, train_cost, train_ler,
                             val_cost, val_ler, time.time() - start))

            if save and curr_epoch%100==0 and curr_epoch>0:
                save_path = saver.save(session, save)
                print("model saved:",save_path)

    # Decoding. If the loop never reached a display step (e.g. num_epochs
    # is 0) there is no evaluation batch yet, so sample one now.
    if val_feed is None:
        val_feed = _evaluation_feed()
    d = session.run(decoded[0], feed_dict=val_feed)
    # d[1] holds the decoded label values of the whole batch, concatenated.
    str_decoded = ''.join([chr(x) for x in np.asarray(d[1]) + FIRST_INDEX])
    # Replacing blank label to none
    str_decoded = str_decoded.replace(chr(ord('z') + 1), '')
    # Replacing space label to space
    str_decoded = str_decoded.replace(chr(ord('a') - 1), ' ')

    if save:
        save_path = saver.save(session, save)
        print("model saved:",save_path)

    #print('Original:\n%s' % original)
    print('Decoded:\n%s' % str_decoded)