Within a single batch, you must have the same number of timesteps (this is typically where you see 0-padding and masking). But between batches there is no such restriction. During inference, you can have any length.  
(https://datascience.stackexchange.com/questions/26366/training-an-rnn-with-examples-of-different-lengths-in-keras)

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

def rnn_model(num_tokens, num_units):
    # encoder
    enc_input = Input(shape = (None, num_tokens))
    enc_output, enc_state = tf.keras.layers.GRU(num_units, return_state=True)(enc_input) # input = [batch, timesteps, feature]
    
    # decoder
    dec_input = Input(shape = (None, num_tokens))
    dec_output = tf.keras.layers.GRU(num_units, return_sequences=True)(dec_input, initial_state = enc_state)
    
    # match input and ouput size
    fin_output = Dense(num_tokens, activation='softmax')(dec_output)

    sequence_autoencoder = Model([enc_input, dec_input], fin_output)
    
    encoder = Model(enc_input, enc_output)
    
    return sequence_autoencoder, encoder

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# create the model
num_features = 26 # number of features per sample (Mel)
num_units = 3 # GRU units in encoder and decoder
autoenc, enc = rnn_model(num_features, num_units)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [3]:
# produce Mel features
from python_speech_features import mfcc
import scipy.io.wavfile as wav

# creating a fake trainset set with 10 samples, just repeating one sample wav file
X_train = []
for i in range (0, 10):
    (rate, sig) = wav.read("wav_example.wav")
    mfcc_feat = mfcc(sig, samplerate=rate,winlen=0.025,winstep=0.01,numcep=num_features,
                     nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
                     ceplifter=22,appendEnergy=True)
    X_train.append(mfcc_feat)
X_train = np.array(X_train)

# decoder input, as shifted encoder input
X_train_shifted = np.zeros(X_train.shape)
# loop in timesteps
for sample in range(0, X_train.shape[0]):
    for timestep in range(0, X_train.shape[1] - 1):
        X_train_shifted[sample, timestep + 1, :] = X_train[sample, timestep, :]

# train the model
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
autoenc.compile(optimizer = opt, loss = tf.keras.losses.MeanSquaredError(), metrics = ["accuracy"])
autoenc.fit(x = [X_train, X_train_shifted], y = X_train, epochs = 100, batch_size = 256)

ModuleNotFoundError: No module named 'python_speech_features'

In [19]:
import numpy as np
import os
import shutil

# read a custom audio dataset, creating a generator

# count number of samples in the dataset
num_samples = 0
for subdirs, dirs, files in os.walk('../debug_dataset_020620/train'):
    num_samples += len(files)
print('Files in the dataset: ' + str(num_samples))

filenames = []
labels = np.zeros((num_samples, 1))

train_dir = '../debug_dataset_020620/train'
filenames_counter = 0
labels_counter = -1
    
for subdir, dirs, files in os.walk(train_dir):
    for file in files:
        filepath = os.path.join(subdir, file)
        
        if filepath.endswith(".wav"):
            #print(filepath)
            filenames.append(filepath)
            labels[filenames_counter, 0] = labels_counter
            filenames_counter = filenames_counter + 1
            
    # assign numeric index based on the directory
    labels_counter = labels_counter+1

print(len(filenames))
print(labels.shape)


Files in the dataset: 200
200
(200, 1)


In [20]:
from tensorflow.keras.utils import to_categorical
from sklearn.utils import shuffle

labels_one_hot = to_categorical(labels)
filenames_shuffled, labels_one_hot_shuffled = shuffle(filenames, labels_one_hot)

In [21]:
from sklearn.model_selection import train_test_split

filenames_shuffled_numpy = np.array(filenames_shuffled)

X_train_filenames, X_val_filenames, Y_train, Y_val = train_test_split(
    filenames_shuffled_numpy, labels_one_hot_shuffled, test_size=0.1, random_state=1)

print(X_train_filenames.shape)
print(Y_train.shape)

print(X_val_filenames.shape)
print(Y_val.shape)

(180,)
(180, 2)
(20,)
(20, 2)


In [36]:
class Autoenc_Audio_Generator(tf.keras.utils.Sequence):
  
    def __init__(self, filenames, labels, batch_size):
        self.filenames = filenames
        self.labels = labels
        self.batch_size = batch_size
    
    def __len__(self) :
        return (np.ceil(len(self.filenames) / float(self.batch_size))).astype(np.int)

    def __getitem__(self, idx):
        # get batch of filenames and labels
        batch_filename = self.filenames[idx * self.batch_size : (idx + 1) * self.batch_size]
        batch_y = self.labels[idx * self.batch_size : (idx + 1) * self.batch_size]
        
        batch_x = []
        # generate batch of encoder data
        for file_name in batch_filename:
            rate, sig = wav.read(file_name)
            mfcc_feat = mfcc(sig, samplerate=rate,winlen=0.025,winstep=0.01,numcep=num_features,
                     nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
                     ceplifter=22,appendEnergy=True)
            print(mfcc_feat.shape)
            batch_x.append(mfcc_feat)
            
        # obtain input data for decoder train as shifted encoder data
        batch_x = np.array(batch_x)
        print(batch_x.shape)
        batch_x_shifted = calculate_dec_input(batch_x)
        batch_y = np.array(batch_y)
        
        # TODO: the element generated is only suitable for autoencoder train, we must change it for audio classification
        return [batch_x, batch_x_shifted], batch_y

In [37]:
batch_size = 32

my_training_batch_generator = Autoenc_Audio_Generator(X_train_filenames, Y_train, batch_size)
my_validation_batch_generator = Autoenc_Audio_Generator(X_val_filenames, Y_val, batch_size)

autoenc.fit(x = my_training_batch_generator, epochs = 2)

Epoch 1/2
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(97, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(69, 26)
(99, 26)
(72, 26)
(73, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(97, 26)
(80, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(32,)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(89, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(87, 26)
(99, 26)
(99, 26)
(84, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)
(99, 26)


IndexError: too many indices for array

(99, 26)
(99, 26)
(99, 26)
(80, 26)
(99, 26)
(99, 26)
(32,)
