In [1]:
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.platform import gfile
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
from matplotlib import pyplot as plt
from itertools import count
import ipdb
import os
%matplotlib 

Using matplotlib backend: Qt5Agg


In [2]:
# Uncomment the reset below when ops/summaries from an earlier run of this
# notebook are still present in the default graph and should be discarded.
#tf.reset_default_graph()
# Interactive session: the bare `.eval()` / `.run()` calls later in the notebook
# rely on a default session being installed.
isess = tf.InteractiveSession()

In [47]:
# Notebook-wide configuration. Alternative dataset locations used previously:
#   r"/mnt/6A0850980850655B/Datasets/commnads/audio"
#   r"/tmp/speech_dataset"
FLAGS = {
    'dataset_dir': r"dev_audio",
    'wanted_words': ['nine', 'seven'],
    'background_dir': "_background_noise_",
    'clip_time_ms': 1000,
    'window_time_ms': 30,
    'window_stride_ms': 10,
    'batch_size': 2,
    'sampling_rate': 16000,
    'mel_num_bins': 40,
    'mel_f_min': 40,
}
# Upper mel edge is tied to the sampling rate (half of it), so it is derived
# rather than written as a literal.
FLAGS['mel_f_max'] = FLAGS['sampling_rate']/2
FLAGS.update({
    'lr': 1e-4,
    'valid_percent': 30,
    'train_log': 'train_log',
    'valid_log': 'valid_log',
    'checkpoint_dir': 'checkpoints/conv.ckpt',
    'background_volume': 0.1,
})

In [48]:
def play_audio(fs, data):
    """Play an audio buffer on the default output device, blocking until done.

    Args:
        fs: sample rate in Hz; written into the in-memory WAV header.
        data: numpy array of samples. `np.int16` is played as-is; `np.float32`
            must lie in [-1, 1] and is scaled to int16.

    Raises:
        RuntimeError: if float samples exceed [-1, 1] or the dtype is neither
            float32 nor int16.
    """
    # Validate/convert first so bad input is rejected before any audio backend
    # is imported or any device is opened.
    if data.dtype == np.float32:
        if np.max(np.abs(data)) > 1.0:
            raise RuntimeError("Float shoud be in range[-1,1]")
        # +1.0 * 2**15 == 32768 is one past the int16 maximum and would wrap to
        # -32768 on the cast, so clip to the representable range first.
        data = np.clip(data * (1 << 15), -(1 << 15), (1 << 15) - 1).astype(np.int16)
    elif data.dtype == np.int16:
        pass
    else:
        raise RuntimeError("Data type unsupported")

    # Imported lazily: playback is optional and these packages may be missing.
    from io import BytesIO
    from scipy.io import wavfile
    import pyaudio
    import wave

    chunk = 1024  # frames handed to the output stream per write

    with BytesIO() as buffer:
        # Round-trip through an in-memory WAV so `wave` can report the sample
        # width / channel count / rate needed to open the output stream.
        wavfile.write(buffer, fs, data)
        buffer.seek(0)
        with wave.open(buffer, "rb") as f:
            p = pyaudio.PyAudio()
            try:
                stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                                channels=f.getnchannels(),
                                rate=f.getframerate(),
                                output=True)
                try:
                    audio_data = f.readframes(chunk)
                    while audio_data:
                        stream.write(audio_data)
                        audio_data = f.readframes(chunk)
                    stream.stop_stream()
                finally:
                    # Release the device even if playback was interrupted.
                    stream.close()
            finally:
                p.terminate()

In [49]:
def get_model_settings(clip_time_ms, sampling_rate, window_time_ms, window_stride_time_ms, mel_f_min, mel_f_max, mel_num_bins, 
                       background_volume):
    """Derive sample-domain sizes from the time-domain configuration.

    Args:
        clip_time_ms: length of one audio clip in milliseconds.
        sampling_rate: samples per second.
        window_time_ms: analysis window length in milliseconds.
        window_stride_time_ms: hop between consecutive windows in milliseconds.
        mel_f_min, mel_f_max, mel_num_bins: mel filterbank parameters, stored unchanged.
        background_volume: stored unchanged.

    Returns:
        dict with the derived sizes (`clip_samples`, `window_samples`,
        `window_stride_samples`, `fft_size`, number of frames, `fingerprint_size`)
        plus the pass-through values. The frame count is available under both
        `'lenght_in_samples'` (historical spelling, kept for existing callers)
        and `'length_in_samples'`.
    """
    clip_samples = int(clip_time_ms * sampling_rate / 1000)
    window_samples = int(window_time_ms * sampling_rate / 1000)
    window_stride_samples = int(window_stride_time_ms * sampling_rate / 1000)

    # Number of full windows that fit into the clip. A clip shorter than one
    # window holds no frames at all; without this guard the truncating division
    # of a negative span would produce 1 or a negative count.
    samples_after_first_window = clip_samples - window_samples
    if samples_after_first_window < 0:
        length_in_samples = 0
    else:
        length_in_samples = 1 + samples_after_first_window // window_stride_samples

    # Smallest power of two that is >= the window length.
    fft_size = int(2**np.ceil(np.log2(window_samples)))
    settings = {
        'clip_samples':          clip_samples,
        'window_samples':        window_samples,
        'window_stride_samples': window_stride_samples,
        'fingerprint_size':      length_in_samples*mel_num_bins,
        'sampling_rate':         sampling_rate,
        'mel_f_min':             mel_f_min,
        'mel_f_max':             mel_f_max,
        'mel_num_bins':          mel_num_bins,
        'lenght_in_samples':     length_in_samples,
        'length_in_samples':     length_in_samples,
        'fft_size':              fft_size,
        'background_volume':     background_volume
    }
    return settings

def get_training_settings(batch_size, valid_percent):
    """Bundle the training hyper-parameters into a dict keyed by parameter name.

    Args:
        batch_size: stored under 'batch_size'.
        valid_percent: stored under 'valid_percent'.

    Returns:
        dict with exactly those two entries.
    """
    return dict(batch_size=batch_size, valid_percent=valid_percent)
    

In [50]:
class AudioProcessor:
    """Indexes a word-per-folder WAV dataset and builds the TF input pipeline for it.

    On construction it splits the recordings of the wanted words into a training
    and a validation set, loads the background recordings into memory, and
    creates an initializable iterator over batches of (audio clip, label id).

    Public attributes:
        id_to_label: the `wanted_words` sequence; position is the label id.
        label_to_id: dict mapping word -> label id.
        train_set, valid_set: `[paths, labels]` pairs to feed into `x_src` / `y_src`.
        x_src, y_src: placeholders for WAV paths and label ids.
        iter: initializable iterator; run `iter.initializer` with both
            placeholders fed before pulling data.
        mel_matrix: constant tensor holding the linear-to-mel weight matrix.
    """

    def __init__(self, dataset_dir, background_dir, wanted_words, model_settings, training_settings):
        self._model_settings = model_settings
        self._mel_matrix = tf.constant(self.prepare_to_mel_matrix())

        self.id_to_label = wanted_words
        # Must exist before load_audio(), which translates folder names to ids.
        self.label_to_id = {label: i for i, label in enumerate(wanted_words)}
        self.train_set, self.valid_set, background_paths = self.load_audio(
            wanted_words, dataset_dir, background_dir, training_settings)
        self._next_background = self.process_backgrounds(background_paths)
        self.x_src, self.y_src, self.iter = self.prepare_processing_graph(len(wanted_words), training_settings)
        self._signals, self._labels = self.iter.get_next()

    @property
    def mel_matrix(self):
        """Constant tensor with the linear-to-mel weight matrix (read-only)."""
        return self._mel_matrix

    def prepare_to_mel_matrix(self):
        """Compute the linear-spectrogram -> mel weight matrix as a numpy array."""
        print("Preparing mel matrix")
        mel_num_bins = self._model_settings['mel_num_bins']
        mel_f_min = self._model_settings['mel_f_min']
        mel_f_max = self._model_settings['mel_f_max']
        num_spectrogram_bins = self._model_settings['fft_size']//2+1
        fs = self._model_settings['sampling_rate']

        # Build and evaluate in a throw-away graph so these one-off ops do not
        # accumulate in the default graph every time a processor is created.
        with tf.Graph().as_default():
            linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
                mel_num_bins, num_spectrogram_bins, fs, mel_f_min, mel_f_max)
            with tf.Session() as sess:
                out = sess.run(linear_to_mel_weight_matrix)
        print("Preparing mel matrix finished")
        return out

    @staticmethod
    def _unzip_examples(examples):
        """Turn [(path, label), ...] into [paths, labels]; an empty input gives [(), ()]."""
        if not examples:
            # zip(*[]) yields nothing, which would make `result[0]` fail later.
            return [(), ()]
        return list(zip(*examples))

    def load_audio(self, wanted_words, dataset_dir, background_dir, training_settings):
        """Find all WAV files one directory level below `dataset_dir` and split them.

        Files in the folder named `background_dir` are collected as backgrounds;
        files in folders named after a wanted word become labelled examples;
        everything else is ignored. Examples are shuffled and the first
        `valid_percent` percent become the validation set.

        Returns:
            (train, valid, background_paths) where train/valid are
            `[paths, labels]` pairs and background_paths is a list of paths.
        """
        print("Indexing audio files")
        wanted_words = set(wanted_words)
        all_paths = []
        all_labels = []
        background_paths = []
        search_path = os.path.join(dataset_dir, '*', '*.wav')
        for wav_path in gfile.Glob(search_path):
            # The name of the containing folder is the spoken word.
            _, word = os.path.split(os.path.dirname(wav_path))
            if word == background_dir:
                background_paths.append(wav_path)
            elif word in wanted_words:
                all_paths.append(wav_path)
                all_labels.append(self.label_to_id[word])

        # Shuffle paths and labels together so they stay aligned.
        combined = list(zip(all_paths, all_labels))
        random.shuffle(combined)
        valid_size = int(len(combined)*training_settings['valid_percent']/100)
        valid_combined = self._unzip_examples(combined[:valid_size])
        train_combined = self._unzip_examples(combined[valid_size:])

        print("Indexing audio files finished")
        return train_combined, valid_combined, background_paths

    def load_wav(self, path, desired_samples=-1):
        """Build ops that read a WAV file as a single channel and check its sample rate.

        Args:
            path: string tensor (or Python string) with the file path.
            desired_samples: forwarded to `decode_wav`; -1 keeps the file's own length.

        Returns:
            The decoded audio tensor with size-1 dimensions squeezed away.
        """
        wav_loader = io_ops.read_file(path)
        audio, fs = contrib_audio.decode_wav(wav_loader, desired_channels=1, desired_samples=desired_samples)
        rate_check = tf.Assert(tf.equal(fs, self._model_settings['sampling_rate']), [fs])
        # In graph mode an Assert only runs if something depends on it, so tie
        # the returned tensor to the check.
        with tf.control_dependencies([rate_check]):
            audio = tf.squeeze(audio)
        return audio

    def process_backgrounds(self, background_paths):
        """Load all background recordings and return a tensor yielding random excerpts.

        Each evaluation of the returned tensor produces `clip_samples` samples cut
        at a random offset from a randomly chosen background recording. If no
        recording is long enough (or none exist), silence is produced instead.
        """
        #TODO normalize background loudness
        print("Loading backgrounds")
        clip_samples = self._model_settings['clip_samples']

        # Decode in a throw-away graph; these loader ops are only needed once.
        with tf.Graph().as_default():
            path_input = tf.placeholder(tf.string)
            wav_out = self.load_wav(path_input)
            with tf.Session() as sess:
                wavs = [sess.run(wav_out, feed_dict={path_input: path}) for path in background_paths]

        # A recording shorter than one clip cannot supply an excerpt.
        usable_wavs = [wav for wav in wavs if wav.size >= clip_samples]
        if len(usable_wavs) < len(wavs):
            print("Skipping {} background file(s) shorter than one clip".format(len(wavs) - len(usable_wavs)))
        if not usable_wavs:
            print("No usable background audio found, mixing in silence")
        silence = np.zeros(clip_samples, dtype=np.float32)

        # To limit RAM usage a generator is used instead of
        # Dataset.from_tensor_slices, which (per the original author's
        # measurements) copies the audio several times over. The arrays
        # referenced by the generator stay alive as long as the dataset does,
        # which the original author observed as roughly 35 MB retained per
        # AudioProcessor created.
        def gen():
            while True:
                if not usable_wavs:
                    yield silence
                    continue
                wav_data = random.choice(usable_wavs)
                # randint is inclusive on both ends; the last valid start is size - clip_samples.
                random_offset = random.randint(0, wav_data.size - clip_samples)
                yield wav_data[random_offset:random_offset+clip_samples]

        next_background = tf.data.Dataset.from_generator(gen, tf.float32, output_shapes=(clip_samples,))\
                          .make_one_shot_iterator().get_next()
        print("Loading backgrounds finished")
        return next_background

    def prepare_processing_graph(self, num_labels, training_settings):
        """Create the path/label placeholders and the batched dataset iterator.

        Args:
            num_labels: currently unused; kept for interface compatibility.
            training_settings: dict providing 'batch_size'.

        Returns:
            (x_src, y_src, iterator): placeholders for paths and label ids, and an
            initializable iterator over (audio batch, label batch).
        """
        clip_samples = self._model_settings['clip_samples']

        x_src = tf.placeholder(tf.string, (None,))
        y_src = tf.placeholder(tf.int32,  (None,))
        x_data = tf.data.Dataset.from_tensor_slices(x_src).map(lambda path: self.load_wav(path, clip_samples))
        y_data = tf.data.Dataset.from_tensor_slices(y_src)
        dataset = tf.data.Dataset.zip((x_data, y_data)).batch(training_settings['batch_size'])

        return x_src, y_src, dataset.make_initializable_iterator()

    def get_data(self, debug_stuff = False):
        """Build ops that mix a background excerpt into the batch and compute mel spectrograms.

        One background excerpt is evaluated per batch and added to every clip in it.
        Foreground and background are weighted so the two weights sum to 1.

        Args:
            debug_stuff: if True, also return the intermediate tensors.

        Returns:
            (mel_spect, labels), or with `debug_stuff`
            (signals, background, signals_mixed, spect, mel_spect, labels).
        """
        frame_len = self._model_settings['window_samples']
        frame_step = self._model_settings['window_stride_samples']
        fft_size = self._model_settings['fft_size']
        background_volume = self._model_settings['background_volume']/(1+self._model_settings['background_volume'])
        foreground_volume = 1/(1+self._model_settings['background_volume'])

        signals, labels = self._signals, self._labels
        background = self._next_background[0:signals.shape[1]]

        signals_mixed = foreground_volume * signals + background_volume * background

        spect = tf.contrib.signal.stft(signals_mixed, frame_len, frame_step, fft_size)
        spect_mag = tf.abs(spect)
        # Contract the frequency axis of the magnitude spectrogram with the mel matrix.
        mel_spect = tf.tensordot(spect_mag, self._mel_matrix, 1)

        if debug_stuff:
            return signals, background, signals_mixed, spect, mel_spect, labels
        else:
            return mel_spect, labels

# Derive the sample-domain settings from FLAGS and build the input pipeline.
model_settings = get_model_settings(
    clip_time_ms=FLAGS['clip_time_ms'],
    sampling_rate=FLAGS['sampling_rate'],
    window_time_ms=FLAGS['window_time_ms'],
    window_stride_time_ms=FLAGS['window_stride_ms'],
    mel_f_min=FLAGS['mel_f_min'],
    mel_f_max=FLAGS['mel_f_max'],
    mel_num_bins=FLAGS['mel_num_bins'],
    background_volume=FLAGS['background_volume'],
)
training_settings = get_training_settings(
    batch_size=FLAGS['batch_size'],
    valid_percent=FLAGS['valid_percent'],
)
ap = AudioProcessor(
    dataset_dir=FLAGS['dataset_dir'],
    background_dir=FLAGS['background_dir'],
    wanted_words=FLAGS['wanted_words'],
    model_settings=model_settings,
    training_settings=training_settings,
)

Preparing mel matrix
Preparing mel matrix finished
Indexing audio files
Indexing audio files finished
Loading backgrounds
Loading backgrounds finished


In [51]:
#play_audio(FLAGS['sampling_rate'], ap._next_background.eval())

In [52]:
def reconstruct_audio_from_mel(mel_spect, model_settings, true_spect = None):
    """Approximately invert a mel spectrogram back to a waveform.

    The mel matrix of the module-level `ap` processor is pseudo-inverted to
    recover a linear magnitude spectrogram, a phase is attached, and the result
    is passed through the inverse STFT. Requires a default session (uses `.eval()`).

    Args:
        mel_spect: numpy array holding the mel spectrogram to invert.
        model_settings: dict providing 'window_samples', 'window_stride_samples'
            and 'fft_size'.
        true_spect: optional complex spectrogram whose phase is reused; when
            omitted a uniformly random phase in [0, 2*pi) is used.

    Returns:
        The reconstructed audio as a numpy array.
    """
    # The processor stores the matrix under `_mel_matrix`.
    mel_inv = np.linalg.pinv(ap._mel_matrix.eval())
    spect_mag_reconstructed = (mel_spect @ mel_inv)

    if true_spect is not None:
        phase = tf.angle(tf.constant(true_spect))
    else:
        # The second positional argument of random_uniform is `minval`, so the
        # upper bound has to be passed by keyword.
        phase = tf.random_uniform(spect_mag_reconstructed.shape, minval=0.0, maxval=2 * np.pi)

    # Polar -> rectangular: magnitude * (cos(phase) + i*sin(phase)).
    spect_reconstructed = tf.complex(spect_mag_reconstructed * tf.cos(phase), spect_mag_reconstructed * tf.sin(phase))
    audio = tf.contrib.signal.inverse_stft(spect_reconstructed, model_settings['window_samples'],
                                           model_settings['window_stride_samples'], model_settings['fft_size'])
    return audio.eval()

In [53]:
# Build the processing ops once, keeping every intermediate tensor for inspection.
(signals, background, signals_mixed,
 spect, mel_spectras, labels) = ap.get_data(debug_stuff=True)

In [54]:
# Sanity check: the background excerpt should have a static shape of one clip.
print(background.shape)

(16000,)


In [55]:
# Flip to True to listen to the first mixed (signal + background) clip of a
# freshly initialised training batch.
play_example = False
if play_example:
    train_paths, train_labels = ap.train_set[0], ap.train_set[1]
    ap.iter.initializer.run(feed_dict={ap.x_src: train_paths, ap.y_src: train_labels})
    # Earlier experiment, kept for reference (its `spectras` names are not defined here):
    # play_audio(FLAGS['sampling_rate'],reconstruct_audio_from_mel(spectras[0,:,:], model_settings, spectras_true[0,:,:]))
    first_clip = signals_mixed.eval()[0]
    play_audio(FLAGS['sampling_rate'], first_clip)

In [56]:
def conv_model(model_settings, fingerprint, num_labels):
    """Build a small two-convolution classifier on top of a spectrogram batch.

    Args:
        model_settings: dict providing 'lenght_in_samples' and 'mel_num_bins',
            the expected sizes of dimensions 1 and 2 of `fingerprint`.
        fingerprint: spectrogram tensor; a trailing channel axis is added here.
        num_labels: number of output classes.

    Returns:
        Tensor of unnormalised class scores (logits) produced by the last dense layer.

    Raises:
        RuntimeError: if dimension 1 or 2 of `fingerprint` differs from the
            sizes in `model_settings`.
    """
    if (fingerprint.shape[1] != model_settings['lenght_in_samples']
            or fingerprint.shape[2] != model_settings['mel_num_bins']):
        raise RuntimeError("Unexpected input: " + str(fingerprint.shape))

    # Append a channel axis so the 2-D convolution layers accept the input.
    with_channels = tf.expand_dims(fingerprint, -1)
    # Kernel sizes are given as [time span, freq span].
    conv_a = tf.layers.conv2d(with_channels, filters=64, kernel_size=[20, 8], activation=tf.nn.relu)
    # Pool along the second (frequency) axis only.
    pooled = tf.layers.max_pooling2d(conv_a, pool_size=[1, 3], strides=[1, 3])
    conv_b = tf.layers.conv2d(pooled, filters=64, kernel_size=[10, 4], activation=tf.nn.relu)
    flat = tf.layers.flatten(conv_b)
    hidden = tf.layers.dense(flat, units=128, activation=tf.nn.relu)
    return tf.layers.dense(hidden, units=num_labels)
        
# Classifier logits for the current batch of mel spectrograms.
scores = conv_model(model_settings, mel_spectras, len(FLAGS['wanted_words']))
# Per-example cross-entropy against integer label ids, averaged into the loss.
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=scores)
loss = tf.reduce_mean(xent)
loss_summary = tf.summary.scalar("loss", loss)
# Predicted label id = index of the highest score; int32 to match `labels`.
pred = tf.argmax(scores, 1, output_type=tf.int32)
# Fraction of the batch that was classified correctly.
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred,labels), tf.float32))
accuracy_summary = tf.summary.scalar("accuracy", accuracy)
# Single op that emits both scalars; fetched in the training/validation loops.
merged_summary = tf.summary.merge([loss_summary, accuracy_summary])

In [57]:
# Optimiser and the op that performs one update of the model weights.
optim = tf.train.AdamOptimizer(FLAGS['lr'])
step = optim.minimize(loss)
# The step counter is advanced by its own op; the training loop runs it
# alongside `step`.
global_step = tf.train.get_or_create_global_step()
increment_step = tf.assign(global_step, global_step+1)

# Saver covers every global variable that exists at this point.
saver = tf.train.Saver(tf.global_variables())
# Log directories come from FLAGS so the configuration cell stays the single
# source of truth. Only the training writer records the graph.
train_writer = tf.summary.FileWriter(FLAGS['train_log'], isess.graph)
valid_writer = tf.summary.FileWriter(FLAGS['valid_log'])

In [58]:
# Initialise all global variables in the default session. Re-running this cell
# resets them, discarding any training progress held in the session.
tf.global_variables_initializer().run()

In [59]:
#saver.restore(isess, "checkpoints/conv.ckpt-27")

In [60]:
epochs = 10


def _run_epoch(examples, writer, epoch, training):
    """Run one full pass over `examples` and return (mean_loss, mean_accuracy).

    Args:
        examples: `[paths, labels]` pair fed to the processor's placeholders.
        writer: summary FileWriter receiving one summary per batch.
        epoch: epoch index; used as the summary step when not training.
        training: if True the optimiser and the step counter are run as well and
            summaries are recorded against the global step.

    Returns:
        Means over batches of the batch loss and batch accuracy, or (nan, nan)
        if the dataset produced no batch.
    """
    ap.iter.initializer.run(feed_dict={ap.x_src: examples[0], ap.y_src: examples[1]})

    fetches = [merged_summary, loss, accuracy]
    if training:
        fetches = fetches + [step, increment_step]

    loss_total = 0.0
    acc_total = 0.0
    num_batches = 0
    while True:
        # Only the session call can signal the end of the data, so keep the
        # try block limited to it.
        try:
            results = isess.run(fetches)
        except tf.errors.OutOfRangeError:
            break
        summary, loss_val, acc_val = results[:3]
        loss_total += loss_val
        acc_total += acc_val
        num_batches += 1
        # `increment_step` evaluates to the updated counter, so no separate
        # session call is needed to learn the current global step.
        writer.add_summary(summary, results[4] if training else epoch)

    # Push buffered events to disk so an interrupted run keeps its logs.
    writer.flush()

    if num_batches == 0:
        # Nothing was processed; avoid dividing by zero.
        return float('nan'), float('nan')
    return loss_total / num_batches, acc_total / num_batches


# Make sure the folder that will hold the checkpoints exists before saving.
checkpoint_parent = os.path.dirname(FLAGS['checkpoint_dir'])
if checkpoint_parent:
    os.makedirs(checkpoint_parent, exist_ok=True)

for epoch in range(epochs):
    #train
    train_loss, train_acc = _run_epoch(ap.train_set, train_writer, epoch, training=True)
    print("Epoch {}:{}loss {:.3f} acc {:.3f}".format(epoch," "*30, train_loss, train_acc))
    print("Saving to {}".format(FLAGS['checkpoint_dir']))
    saver.save(isess, FLAGS['checkpoint_dir'], global_step=global_step.eval())

    #valid
    valid_loss, valid_acc = _run_epoch(ap.valid_set, valid_writer, epoch, training=False)
    print("Validation {}:{}loss {:.3f} acc {:.3f}".format(epoch," "*50, valid_loss, valid_acc))


Epoch 0:                              loss 1.654 acc 0.577
Saving to checkpoints/conv.ckpt
Validation 0:                                                  loss 0.566 acc 0.636
Epoch 1:                              loss 0.472 acc 0.788
Saving to checkpoints/conv.ckpt
Validation 1:                                                  loss 0.444 acc 0.864
Epoch 2:                              loss 0.247 acc 0.942
Saving to checkpoints/conv.ckpt
Validation 2:                                                  loss 0.412 acc 0.818
Epoch 3:                              loss 0.177 acc 0.923
Saving to checkpoints/conv.ckpt
Validation 3:                                                  loss 0.587 acc 0.773
Epoch 4:                              loss 0.128 acc 0.962
Saving to checkpoints/conv.ckpt
Validation 4:                                                  loss 0.285 acc 0.864
Epoch 5:                              loss 0.052 acc 1.000
Saving to checkpoints/conv.ckpt
Validation 5:                     

### Random tests

In [None]:
# Scratch check: append a suffix to every element of a fed string tensor and
# decode the first result back to text.
data = [ ['ą','as'], ['b', 'bas'] ]
p = tf.placeholder(tf.string, [None, 2])
pa = p+'a'
# The notebook's session is `isess`; no `sess` is defined at module level.
pa = isess.run(pa, feed_dict={p:data})
# Fetched string elements are bytes, hence the explicit decode.
pa[0][0].decode("utf-8")

In [32]:
# Scratch check of NumPy broadcasting; this cell is expected to fail.
# Shapes are aligned from the trailing axis, so (5,10) + (5,) compares 10 with 5
# and raises the ValueError shown below. Giving `b` a trailing axis of length 1
# (e.g. `b[:, None]`) would make the shapes compatible.
a = np.ones((5,10))
b = np.ones((5,))
a+b

ValueError: operands could not be broadcast together with shapes (5,10) (5,) 