In [1]:
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.platform import gfile
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops
from matplotlib import pyplot as plt
from itertools import count
import os
%matplotlib 

Using matplotlib backend: Qt5Agg


In [2]:
# Uncomment the reset below if summaries from earlier runs linger in the default graph.
#tf.reset_default_graph()
# Later cells call `.eval()` / `.run()` without passing a session, so they rely on
# this InteractiveSession being the default session.
sess = tf.InteractiveSession()

In [3]:
# Notebook-wide configuration: every tunable value lives in this one dict.
# Alternative dataset locations used during development:
#   r"/mnt/6A0850980850655B/Datasets/commnads/audio"
#   r"/tmp/speech_dataset"
FLAGS = {
    'dataset_dir': r"dev_audio",
    'wanted_words': ['nine', 'seven'],
    'background_dir': "_background_noise_",
    'clip_time_ms': 1000,
    'window_time_ms': 30,
    'window_stride_ms': 10,
    'batch_size': 2,
    'sampling_rate': 16000,
    'mel_num_bins': 40,
    'mel_f_min': 40,
}
# The upper mel edge is half the sampling rate, so it is derived rather than typed in.
FLAGS['mel_f_max'] = FLAGS['sampling_rate'] / 2
FLAGS.update({'lr': 1e-4, 'valid_percent': 10})

In [4]:
def play_audio(fs, data):
    """Play an array of audio samples through a PyAudio output stream.

    The samples are written to an in-memory WAV file with ``scipy.io.wavfile`` and
    then streamed to PyAudio in chunks of 1024 frames.

    Args:
        fs: Sampling rate in Hz, stored in the WAV header.
        data: NumPy array of samples. ``np.int16`` is played as-is; ``np.float32``
            must lie in [-1, 1] and is scaled to 16-bit PCM.

    Raises:
        RuntimeError: If float samples fall outside [-1, 1], or the dtype is
            neither ``np.float32`` nor ``np.int16``.
    """
    # Validate and convert first, so bad input fails fast without touching the
    # audio stack (and without requiring pyaudio to be importable).
    if data.dtype == np.float32:
        # `data.size` guard: np.max raises on an empty array.
        if data.size and np.max(np.abs(data)) > 1.0:
            raise RuntimeError("Float shoud be in range[-1,1]")
        # +1.0 scales to 32768, which does not fit in int16 and would wrap to a
        # full-scale negative sample; clip to the representable range instead.
        int16_min, int16_max = -(1 << 15), (1 << 15) - 1
        data = np.clip(data * (1 << 15), int16_min, int16_max).astype(np.int16)
    elif data.dtype == np.int16:
        pass
    else:
        raise RuntimeError("Data type unsupported")

    # Imported lazily: these are only needed for playback, not by the rest of the notebook.
    from io import BytesIO
    from scipy.io import wavfile
    import pyaudio
    import wave

    chunk = 1024  # frames handed to the output stream per write

    p = pyaudio.PyAudio()
    try:
        with BytesIO() as buffer:
            wavfile.write(buffer, fs, data)
            buffer.seek(0)  # make sure the reader starts at the WAV header
            with wave.open(buffer, "rb") as f:
                stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                                channels=f.getnchannels(),
                                rate=f.getframerate(),
                                output=True)
                try:
                    audio_data = f.readframes(chunk)
                    while audio_data:
                        stream.write(audio_data)
                        audio_data = f.readframes(chunk)
                finally:
                    # Release the device even if playback is interrupted.
                    stream.stop_stream()
                    stream.close()
    finally:
        p.terminate()

In [5]:
def get_model_settings(clip_time_ms, sampling_rate, window_time_ms, window_stride_time_ms, mel_f_min, mel_f_max, mel_num_bins):
    """Convert time-based audio parameters into the sample-based settings dict.

    Args:
        clip_time_ms: Length of every audio clip in milliseconds.
        sampling_rate: Samples per second.
        window_time_ms: Analysis window length in milliseconds.
        window_stride_time_ms: Hop between consecutive windows in milliseconds.
        mel_f_min: Lower frequency edge passed through as 'mel_f_min'.
        mel_f_max: Upper frequency edge passed through as 'mel_f_max'.
        mel_num_bins: Number of mel bins per frame.

    Returns:
        dict with the keys 'clip_samples', 'window_samples', 'window_stride_samples',
        'fingerprint_size', 'sampling_rate', 'mel_f_min', 'mel_f_max', 'mel_num_bins',
        'lenght_in_samples' (historical spelling, kept for existing readers),
        'length_in_samples' (same value, correct spelling) and 'fft_size'.

    Raises:
        ValueError: If the window or the stride is shorter than one sample.
    """
    clip_samples = int(clip_time_ms * sampling_rate / 1000)
    window_samples = int(window_time_ms * sampling_rate / 1000)
    window_stride_samples = int(window_stride_time_ms * sampling_rate / 1000)
    if window_samples <= 0 or window_stride_samples <= 0:
        raise ValueError(
            "window and stride must each span at least one sample, got "
            "window_samples=%d, window_stride_samples=%d"
            % (window_samples, window_stride_samples))

    # Number of full windows that fit in the clip without padding the end;
    # a clip shorter than one window yields zero frames rather than a negative count.
    length_in_samples = max(0, 1 + (clip_samples - window_samples) // window_stride_samples)
    # Smallest power of two >= window_samples, computed exactly in integers.
    fft_size = 1 << (window_samples - 1).bit_length()
    settings = {
        'clip_samples':          clip_samples,
        'window_samples':        window_samples,
        'window_stride_samples': window_stride_samples,
        'fingerprint_size':      length_in_samples * mel_num_bins,
        'sampling_rate':         sampling_rate,
        'mel_f_min':             mel_f_min,
        'mel_f_max':             mel_f_max,
        'mel_num_bins':          mel_num_bins,
        # Misspelled key is part of the existing contract; do not remove.
        'lenght_in_samples':     length_in_samples,
        'length_in_samples':     length_in_samples,
        'fft_size':              fft_size
    }
    return settings

def get_training_settings(batch_size, valid_percent):
    """Bundle the training hyper-parameters into a settings dict.

    Both arguments are stored unchanged under the keys 'batch_size' and
    'valid_percent'.
    """
    return {'batch_size': batch_size, 'valid_percent': valid_percent}
    

In [10]:
class AudioProcessor:
    """Input pipeline for the keyword-spotting model.

    Finds the WAV files of the wanted words, splits them into training and
    validation lists, and exposes the training list as a batched ``tf.data``
    pipeline plus mel-spectrogram features built on top of it.

    Must be constructed while a default TensorFlow session is active, because
    the mel weight matrix is evaluated with ``.eval()`` during construction.
    """

    def __init__(self, dataset_dir, background_dir, wanted_words, model_settings, training_settings):
        """Discover the audio files and build the dataset, iterator and mel matrix.

        Args:
            dataset_dir: Root directory containing one sub-directory per word.
            background_dir: Name of the sub-directory that is always skipped.
            wanted_words: Ordered list of words; the position is the label id.
            model_settings: Dict as produced by ``get_model_settings``.
            training_settings: Dict as produced by ``get_training_settings``.

        Raises:
            ValueError: If no training files were found for the wanted words.
        """
        self.model_settings = model_settings
        self.training_settings = training_settings
        self.id_to_label =  wanted_words
        self.label_to_id = {label:i for i, label in enumerate(wanted_words)}
        train_combined, valid_combined = self.load_audio(wanted_words, dataset_dir, background_dir)
        if not train_combined:
            # zip(*[]) below would otherwise fail with an opaque unpacking error.
            raise ValueError("No training wav files found for words %r under %r"
                             % (list(wanted_words), dataset_dir))
        # Keep the held-out (path, label) pairs so the split is not lost;
        # the pipeline below only consumes the training part.
        self.valid_combined = valid_combined
        #TODO change dataset to accept combined list
        all_paths, all_labels = zip(*train_combined)
        self.dataset = self.prepare_processing_graph(all_paths, all_labels, len(wanted_words))
        self.iter = self.dataset.make_initializable_iterator()
        self.mel_matrix = tf.constant(self.prepare_to_mel_matrix())
        self.signals, self.labels = self.iter.get_next()

    def prepare_to_mel_matrix(self):
        """Return the linear-to-mel weight matrix, evaluated in the default session.

        The matrix maps ``fft_size // 2 + 1`` spectrogram bins to ``mel_num_bins``
        mel bins using the frequency range stored in the model settings.
        """
        mel_num_bins = self.model_settings['mel_num_bins']
        mel_f_min = self.model_settings['mel_f_min']
        mel_f_max = self.model_settings['mel_f_max']
        num_spectrogram_bins = self.model_settings['fft_size']//2+1
        fs = self.model_settings['sampling_rate']

        linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
          mel_num_bins, num_spectrogram_bins, fs, mel_f_min, mel_f_max)
        return linear_to_mel_weight_matrix.eval()

    def prepare_processing_graph(self, all_paths, all_labels, num_labels):
        """Build the batched dataset of (decoded audio, label) pairs.

        Args:
            all_paths: Sequence of WAV file paths.
            all_labels: Sequence of integer label ids, aligned with ``all_paths``.
            num_labels: Currently unused; kept for interface compatibility.

        Returns:
            A ``tf.data.Dataset`` yielding batches of ``batch_size`` elements.
        """
        clip_samples = self.model_settings['clip_samples']
        expected_rate = self.model_settings['sampling_rate']

        def process_single_file(path):
            """Read and decode one WAV file into a 1-D tensor of ``clip_samples`` samples."""
            wav_loader = io_ops.read_file(path)
            audio, fs = contrib_audio.decode_wav(wav_loader, desired_channels=1, desired_samples=clip_samples)
            # In graph mode an Assert op only runs if something depends on it, so
            # tie it to the returned tensor; otherwise the check is silently dropped.
            rate_check = tf.Assert(tf.equal(fs, expected_rate), [fs])
            with tf.control_dependencies([rate_check]):
                # Drop only the channel axis (size 1 because desired_channels=1).
                audio = tf.squeeze(audio, axis=[1])
            return audio
        #dataset_placeholder = tf.placeholder(tf.string, (None, 2))
        dataset = tf.data.Dataset.from_tensor_slices(
                    tf.convert_to_tensor(all_paths))
        dataset = dataset.map(process_single_file)

        labels_dataset = tf.data.Dataset.from_tensor_slices(
            tf.convert_to_tensor(all_labels))

        #add labels to dataset
        dataset = tf.data.Dataset.zip((dataset, labels_dataset))

        dataset = dataset.batch(self.training_settings['batch_size'])
        return dataset

    def load_audio(self, wanted_words, dataset_dir, background_dir):
        """Collect the WAV files of the wanted words and split them into train/validation.

        Files are expected at ``<dataset_dir>/<word>/*.wav``; the directory name is
        the word. Files under ``background_dir`` or under words that are not wanted
        are ignored. The (path, label id) pairs are shuffled with the global
        ``random`` state, so the split differs between runs unless it is seeded.

        Returns:
            ``(train_combined, valid_combined)``: two lists of ``(path, label_id)``
            tuples; the validation list holds ``valid_percent`` percent of the files.
        """
        wanted_words = set(wanted_words)
        all_paths = []
        all_labels = []
        search_path = os.path.join(dataset_dir, '*', '*.wav')
        for wav_path in gfile.Glob(search_path):
            _, word = os.path.split(os.path.dirname(wav_path))
            if word == background_dir:
                continue
            if word in wanted_words:
                all_paths.append(wav_path)
                all_labels.append(self.label_to_id[word])

        # Shuffle paths and labels together so they stay aligned.
        combined = list(zip(all_paths,all_labels))
        random.shuffle(combined)
        valid_size = int(len(combined)*self.training_settings['valid_percent']/100)
        valid_combined = combined[0:valid_size]
        train_combined = combined[valid_size:]

        return train_combined, valid_combined

    def get_data(self, want_raw_spect: bool = False):
        """Build mel-spectrogram features for the current batch of the iterator.

        No augmentation is applied: the batch is framed with the configured
        window/stride, transformed with an STFT, reduced to magnitudes and
        projected onto the mel matrix. Every call adds new ops to the graph, so
        call it once and reuse the returned tensors.

        Args:
            want_raw_spect: When True, also return the complex STFT.

        Returns:
            ``(mel_spect, labels)``, or ``(mel_spect, labels, spect)`` when
            ``want_raw_spect`` is True.
        """
        signals, labels = self.signals, self.labels
        frame_len = self.model_settings['window_samples']
        frame_step = self.model_settings['window_stride_samples']
        fft_size = self.model_settings['fft_size']
        spect = tf.contrib.signal.stft(signals, frame_len, frame_step, fft_size)
        spect_mag = tf.abs(spect)
        mel_spect = tf.tensordot(spect_mag, self.mel_matrix, 1)
        if want_raw_spect:
            return mel_spect, labels, spect
        else:
            return mel_spect, labels

# Derive the settings dicts from FLAGS and build the shared input pipeline.
model_settings = get_model_settings(
    clip_time_ms=FLAGS['clip_time_ms'],
    sampling_rate=FLAGS['sampling_rate'],
    window_time_ms=FLAGS['window_time_ms'],
    window_stride_time_ms=FLAGS['window_stride_ms'],
    mel_f_min=FLAGS['mel_f_min'],
    mel_f_max=FLAGS['mel_f_max'],
    mel_num_bins=FLAGS['mel_num_bins'],
)
training_settings = get_training_settings(
    batch_size=FLAGS['batch_size'],
    valid_percent=FLAGS['valid_percent'],
)
ap = AudioProcessor(
    dataset_dir=FLAGS['dataset_dir'],
    background_dir=FLAGS['background_dir'],
    wanted_words=FLAGS['wanted_words'],
    model_settings=model_settings,
    training_settings=training_settings,
)

In [11]:
def reconstruct_audio_from_mel(mel_spect, model_settings, true_spect = None):
    """Approximately invert a mel spectrogram back to a waveform.

    The linear-frequency magnitude is estimated with the pseudo-inverse of the
    mel matrix of the module-level ``ap`` processor, combined with a phase and
    passed through the inverse STFT. Requires a default session, because both
    the mel matrix and the result are obtained with ``.eval()``.

    Args:
        mel_spect: Mel spectrogram whose last axis has one entry per mel bin.
        model_settings: Dict providing 'window_samples', 'window_stride_samples'
            and 'fft_size'.
        true_spect: Optional complex STFT whose phase is reused. When omitted, a
            random phase is drawn instead.

    Returns:
        The reconstructed audio as returned by evaluating the inverse STFT.
    """
    mel_inv = np.linalg.pinv(ap.mel_matrix.eval())
    # NOTE(review): a pseudo-inverse can yield negative "magnitudes"; they are
    # passed through unchanged here, as before.
    spect_mag_reconstructed = (mel_spect @ mel_inv)

    if true_spect is not None:
        phase = tf.angle(tf.constant(true_spect))
    else:
        # The second positional argument of random_uniform is `minval`, so the
        # bounds are named explicitly to draw the phase from [0, 2*pi).
        phase = tf.random_uniform(spect_mag_reconstructed.shape, minval=0.0, maxval=2 * np.pi)

    spect_reconstructed = tf.complex(spect_mag_reconstructed * tf.cos(phase), spect_mag_reconstructed * tf.sin(phase))
    audio = tf.contrib.signal.inverse_stft(spect_reconstructed, model_settings['window_samples'],
                                           model_settings['window_stride_samples'], model_settings['fft_size'])
    return audio.eval()

In [12]:
# Build the feature tensors once; the complex STFT is requested as well so its
# phase can be reused when reconstructing audio.
spectras, labels, spectras_true = ap.get_data(want_raw_spect=True)
# Uncomment to fetch one batch and listen to the reconstruction of its first clip:
#spectras, labels, spectras_true = sess.run((spectras, labels, spectras_true))
#play_audio(FLAGS['sampling_rate'],reconstruct_audio_from_mel(spectras[0,:,:], model_settings, spectras_true[0,:,:]))

In [13]:
def conv_model(model_settings, fingerprint, num_labels):
    """Build the convolutional classifier and return its unnormalised class scores.

    Args:
        model_settings: Dict providing 'lenght_in_samples' and 'mel_num_bins',
            the expected sizes of axes 1 and 2 of ``fingerprint``.
        fingerprint: Input feature tensor; a trailing channel axis is added here.
        num_labels: Number of output units of the final dense layer.

    Raises:
        RuntimeError: If axis 1 or axis 2 of ``fingerprint`` has an unexpected size.
    """
    shape = fingerprint.shape
    if shape[1] != model_settings['lenght_in_samples'] or shape[2] != model_settings['mel_num_bins']:
        raise RuntimeError("Unexpected input: " + str(shape))

    # Append a channel axis so the 2-D convolutions see a single-channel image.
    net = tf.expand_dims(fingerprint, -1)
    # Kernel sizes are [time span, frequency span].
    net = tf.layers.conv2d(net, filters=64, kernel_size=[20, 8], activation=tf.nn.relu)
    net = tf.layers.max_pooling2d(net, pool_size=[1, 3], strides=[1, 3])
    net = tf.layers.conv2d(net, filters=64, kernel_size=[10, 4], activation=tf.nn.relu)
    net = tf.layers.flatten(net)
    net = tf.layers.dense(net, units=128, activation=tf.nn.relu)
    # Final layer has no activation: the outputs are logits.
    return tf.layers.dense(net, units=num_labels)
        
# Wire the model output into the loss and accuracy ops and their summaries.
scores = conv_model(model_settings, spectras, num_labels=len(FLAGS['wanted_words']))

xent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=scores, labels=labels)
loss = tf.reduce_mean(xent)
loss_summary = tf.summary.scalar("loss", loss)

# Predicted class id per example, as int32 so it can be compared with `labels`.
pred = tf.argmax(scores, axis=1, output_type=tf.int32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(pred, labels), tf.float32))
accuracy_summary = tf.summary.scalar("accuracy", accuracy)

merged_summary = tf.summary.merge([loss_summary, accuracy_summary])

In [14]:
# `step` is the op that applies one Adam update minimising `loss`.
optim = tf.train.AdamOptimizer(learning_rate=FLAGS['lr'])
step = optim.minimize(loss)

In [15]:
# The global step is advanced by a dedicated op that the training loop runs
# alongside the optimizer step.
global_step = tf.train.get_or_create_global_step()
increment_step = tf.assign_add(global_step, 1)

In [16]:
# Event files (graph + training summaries) go to ./train_log.
train_writer = tf.summary.FileWriter(logdir='train_log', graph=sess.graph)

In [17]:
# Initialise all variables created so far (model weights, optimizer slots, global step).
sess.run(tf.global_variables_initializer())

In [19]:
# One pass over the training set: run the optimizer on every batch until the
# iterator is exhausted, then report the mean batch loss and accuracy.
ap.iter.initializer.run()
e_sum = 0
mean_acc = 0
num_batches = 0
while True:
    try:
        # increment_step yields the updated global step, so no separate
        # global_step.eval() round trip is needed for the summary.
        train_summary, loss_val, acc_val, _, step_val = sess.run(
            [merged_summary, loss, accuracy, step, increment_step])
    except tf.errors.OutOfRangeError:
        # Raised by the iterator once every batch has been consumed.
        break
    e_sum += loss_val
    mean_acc += acc_val
    num_batches += 1
    train_writer.add_summary(train_summary, step_val)
# Make the summaries visible to TensorBoard without waiting for the periodic flush.
train_writer.flush()

if num_batches:
    print(e_sum / num_batches)
    print(mean_acc / num_batches)
else:
    # Avoid dividing by zero when the iterator yields nothing.
    print("No training batches were produced")


0.3945672207257964
0.8939393939393939


In [None]:
# Scratch check: tf.equal compares two tensors element by element.
a = tf.constant([1, 2])
b = tf.constant([1, 3])
c = tf.equal(a, b)
print(sess.run(c))

In [None]:
# Scratch check: shuffle two parallel lists while keeping their pairing intact.
a = ["Spears", "Adele", "NDubz", "Nicole", "Cristina"]
b = [1, 2, 3, 4, 5]

# Pair the entries up, shuffle the pairs, then write each half back in place.
combined = list(zip(a, b))
random.shuffle(combined)
print(combined)

a[:] = [name for name, _ in combined]
b[:] = [number for _, number in combined]

In [30]:
# Scratch check: values fetched from a string tensor come back as bytes and
# have to be decoded to get text again.
data = [['ą', 'as'], ['b', 'bas']]
p = tf.placeholder(tf.string, shape=[None, 2])
pa = sess.run(p + 'a', feed_dict={p: data})
pa[0, 0].decode("utf-8")

'ąa'