# Flex Net 2: Neural Network for Sound Parsing

In [28]:
import soundfile as sf
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
from os import listdir, getcwd, rename, remove
from os.path import isfile, join
from IPython.display import Audio


    
class Song(tf.Module):
    """Random composition rendered as both audio and a frame-level piano roll.

    Every sample in the bank is loaded once, truncated or zero-padded to
    ``length_in_samples`` and kept as one row of ``padded_samples``.
    ``create`` draws random note onsets and writes the result into two
    variables:

    * ``output_signal`` -- float32, shape ``[length_in_samples]``.
    * ``piano_roll`` -- float32, shape ``[num_frames, num_inst_samples]``.

    NOTE(review): ``generate``, ``__call__``, ``add`` and ``transform`` read
    attributes (``length_in_beats``, ``sr``, ``bpm``, ``output``, ``notes``,
    ``notes_float``) that ``__init__`` never sets. They look like leftovers
    from a beat-based version of this class and are kept unchanged; only
    ``zero``, ``create`` and ``play`` work with the state built here.
    """

    def __init__(self, sample_bank, sample_rate, length_in_samples, frame_length, frame_step, pulse_width):
        """Load the sample bank and allocate the output buffers.

        Args:
            sample_bank: Object exposing ``soundbank`` (sample file paths),
                ``soundbank_ref`` and ``instruments``.
            sample_rate: Playback rate passed to ``Audio`` in ``play``.
            length_in_samples: Length of the rendered signal, in samples.
            frame_length: Analysis window length used to count frames.
            frame_step: Hop between consecutive frames, in samples.
            pulse_width: Number of consecutive piano-roll frames marked for
                each note onset.
        """
        super().__init__()
        self.soundbank_ref = sample_bank.soundbank_ref
        self.instruments = sample_bank.instruments
        self.num_inst_samples = len(sample_bank.soundbank)
        self.sample_rate = sample_rate
        self.length_in_samples = length_in_samples
        self.frame_length = frame_length
        self.frame_step = frame_step
        # Number of full windows of `frame_length` that fit when hopping by `frame_step`.
        self.num_frames = ((length_in_samples - frame_length) // frame_step) + 1
        self.pulse_width = pulse_width

        padded_samples = np.zeros((self.num_inst_samples, length_in_samples), dtype=np.float32)
        for row, sample_path in enumerate(sample_bank.soundbank):
            # always_2d keeps the channel axis for mono files too, so taking
            # channel 0 works for any channel count; `frames` avoids reading
            # more of the file than the buffer can hold.
            data, _ = sf.read(sample_path, frames=length_in_samples, always_2d=True)
            sample = data[:length_in_samples, 0]
            padded_samples[row, :sample.shape[0]] = sample
        self.padded_samples = tf.constant(padded_samples, dtype=tf.float32)
        self.output_signal = tf.Variable(tf.zeros([length_in_samples,], dtype=tf.float32))
        self.piano_roll = tf.Variable(tf.zeros([self.num_frames, self.num_inst_samples], dtype=tf.float32))

    def play(self):
        """Return an autoplaying IPython ``Audio`` widget for the current output signal."""
        return Audio(self.output_signal.numpy(), rate=self.sample_rate, autoplay=True)

    @tf.function
    def zero(self):
        """Reset ``output_signal`` and ``piano_roll`` to all zeros, in place."""
        # Assign in place without rebinding the attributes: inside a
        # tf.function, `assign` returns a graph-bound object, and storing it
        # on `self` would replace the real variable.
        self.output_signal.assign(
            tf.zeros([self.length_in_samples,], dtype=tf.float32)
        )
        self.piano_roll.assign(
            tf.zeros([self.num_frames, self.num_inst_samples], dtype=tf.float32)
        )

    @tf.function
    def create(self, rate=0.0003):
        """Draw a random composition and render it.

        Args:
            rate: Per-frame, per-sample Bernoulli probability of a note onset
                within the first three quarters of the frames.

        Returns:
            A pair ``(output_signal, piano_roll)`` of tensors holding the
            freshly written values of the two variables.
        """
        # Onsets are only allowed in the first 3/4 of the frames; the
        # remaining frames get probability zero.
        length = 3 * self.num_frames // 4
        probs = tf.concat([rate * tf.ones((length, self.num_inst_samples)),
                           tf.zeros((self.num_frames - length, self.num_inst_samples))],
                          axis=0)
        # Kept local: the distribution wraps a tensor that belongs to this
        # trace and is not usable outside of it.
        note_sampler = tfp.distributions.Bernoulli(probs=probs, dtype=tf.float32)

        # Generate random notes
        random_notes = note_sampler.sample()

        # Reset piano roll and output signal
        self.zero()

        # Add pulses to piano roll for each note
        for i in range(self.pulse_width):
            self.piano_roll.assign_add(tf.roll(random_notes, shift=i, axis=0))

        # Write generated notes to output signal. Use the instance's own
        # sample matrix rather than a global `song` object.
        new_signals = tf.matmul(random_notes, self.padded_samples)
        # Rows at or beyond `length` are all zero (probability 0), so only
        # the first `length` frames can contribute.
        for i in range(length):
            # Frame i starts at sample i * frame_step, the hop that defines
            # `num_frames`; a frame_length stride would exceed the buffer and
            # make tf.roll wrap notes around to the start.
            self.output_signal.assign_add(
                tf.roll(new_signals[i, :], shift=i * self.frame_step, axis=0)
            )
        return self.output_signal.read_value(), self.piano_roll.read_value()

    def generate(self, instruments=None):
        """Legacy: add random notes beat by beat through ``add``.

        NOTE(review): reads ``self.length_in_beats``, which ``__init__`` does
        not define, and calls ``add``, which depends on further undefined
        attributes.
        """
        if instruments is None:
            instruments = self.instruments
        length = 3 * self.length_in_beats // 4
        lst = [k for k in range(self.num_inst_samples) if self.soundbank_ref[k][0] in instruments]
        for i in range(length):
            # One-in-ten chance of placing a note on this beat.
            make_note = np.random.choice([0 for i in range(9)] + [1])
            if make_note:
                j = np.random.choice(lst)
                intensity = 127 #np.random.randint(1,127)
                self.add(i, j, intensity)

    @tf.function
    def __call__(self, notes_float, padded_samples):
        """Legacy: render ``notes_float`` by summing shifted, scaled samples.

        NOTE(review): reads ``self.sr``, ``self.bpm``, ``self.length_in_beats``
        and ``self.output``, none of which ``__init__`` defines.
        """
        samples_per_beat = (self.sr * 60) // self.bpm
        N = samples_per_beat * self.length_in_beats
        self.zero()
        for i in range(self.length_in_beats):
            for j in range(self.num_inst_samples):
                self.output.assign_add(notes_float[i,j] * tf.roll(padded_samples[j], shift=i*samples_per_beat, axis=0))
        return None

    #@tf.function
    def add(self, i, j, intensity):
        """Legacy: set note ``(i, j)`` to ``intensity`` and patch the audio.

        NOTE(review): reads ``self.sr``, ``self.bpm``, ``self.notes``,
        ``self.notes_float`` and ``self.output``, none of which ``__init__``
        defines.
        """
        samples_per_beat = (self.sr * 60) // self.bpm
        old_intensity = self.notes[i,j]
        new_intensity = tf.cast(intensity, dtype=tf.int32)
        float_intensity = tf.cast(intensity, dtype=tf.float32) / 128.0
        # Only the change in intensity is added to the existing audio.
        float_intensity_difference = tf.cast((new_intensity - old_intensity), dtype=tf.float32) / 128.0
        self.output = self.output.assign_add(float_intensity_difference * tf.roll(self.padded_samples[j], shift=i*samples_per_beat, axis=0))
        self.notes_float = self.notes_float[i,j].assign(float_intensity)
        self.notes = self.notes[i,j].assign(new_intensity)

    def transform(self, matrix):
        """Legacy: multiply each row of ``self.notes`` by ``matrix``.

        NOTE(review): ``self.notes`` is not defined by ``__init__``, and the
        item assignment assumes a container that supports it -- confirm
        before use.
        """
        for i in range(self.notes.shape[0]):
            self.notes[i] = tf.matmul(matrix, self.notes[i])

class SampleBank(tf.Module):
    """Index of the sample files found in ``./preprocessed_samples``.

    File names are parsed as ``<instrument>_<midi_number><ext>``, where the
    last four characters are dropped as the extension (e.g. ``.wav``).

    Attributes set by the constructor:
        soundbank: Sample file paths, one per (instrument, note) pair.
        soundbank_ref: ``(instrument, midi_number)`` tuples, parallel to
            ``soundbank``.
        instruments: The instrument names that were included.
    """

    def __init__(self, restricted_instruments = None, name=None):
        """Scan the sample directory and build the soundbank.

        Args:
            restricted_instruments: Instrument names to include. When empty
                or ``None``, every instrument whose notes form a gap-free
                range is included, in sorted name order.
            name: Optional ``tf.Module`` name.

        Raises:
            KeyError: If a requested instrument has no samples on disk.
            ValueError: If a file name does not end in an integer note number.
        """
        super(SampleBank, self).__init__(name=name)
        path = join(getcwd(), 'preprocessed_samples')
        # Sorted so the result does not depend on directory listing order.
        files = sorted(f for f in listdir(path) if isfile(join(path, f)) and f[0] != '.')

        # instrument name -> {midi_number: file path}
        samples_by_instrument = {}
        for f in files:
            instrument, _, tail = f.rpartition('_')
            midi_number = int(tail[:-4])  # strip the 4-character extension
            samples_by_instrument.setdefault(instrument, {})[midi_number] = join(path, f)

        instruments = {
            inst: {'samples' : note_files,
                   'min_note' : min(note_files),
                   'max_note' : max(note_files)}
            for inst, note_files in sorted(samples_by_instrument.items())
        }

        if not restricted_instruments:
            # Default: instruments with no missing note between min and max.
            # The dict above is in sorted order, so the soundbank row order
            # (and therefore the index -> note mapping) is the same on every
            # run instead of following set iteration order.
            restricted_instruments = [
                inst for inst, info in instruments.items()
                if len(info['samples']) == 1 + (info['max_note'] - info['min_note'])
            ]

        missing = [inst for inst in restricted_instruments if inst not in instruments]
        if missing:
            raise KeyError(f'No samples found in {path!r} for instrument(s): {missing}')

        soundbank = []
        soundbank_ref = []
        for inst in restricted_instruments:
            print(inst, ' --- notes ',instruments[inst]['min_note'], ' through ', instruments[inst]['max_note'])
            note_files = instruments[inst]['samples']
            # Only MIDI note numbers 0..127 are indexed, in ascending order.
            for i in range(128):
                if i in note_files:
                    soundbank.append(note_files[i])
                    soundbank_ref.append((inst, i))
        self.soundbank = soundbank
        self.soundbank_ref = soundbank_ref
        self.instruments = restricted_instruments

In [29]:
# Marimba-only song: 176400 samples at 44100 Hz (4 seconds), framed with a
# 1024-sample window and a 256-sample hop; each onset marks 50 piano-roll frames.
song = Song(
    sample_bank=SampleBank(restricted_instruments=['Marimba_cord_ff']),
    sample_rate=44100,
    length_in_samples=176400,
    frame_length=1024,
    frame_step=256,
    pulse_width=50,
)

Marimba_cord_ff  --- notes  36  through  96


In [30]:
output, piano = song.create()

In [31]:
song.play()

In [22]:
from IPython.display import Audio

Audio(output.numpy(), rate=44100, autoplay=True)

In [None]:
# NOTE(review): Song.create() returns an (output_signal, piano_roll) pair, so
# `rn` is a tuple here rather than a single tensor of note draws.
rn = song.create()

In [None]:
tf.roll(rn, shift=0, axis=0)

In [None]:
tf.matmul(tf.cast(rn, tf.float32), song.padded_samples)

In [None]:
song.padded_samples

### Avant Garde Composer

Code to generate batches of random marimba compositions (with piano roll and corresponding audio)

In [None]:
import tensorflow as tf
from sonic2 import *

class Generate_Batch():
    """Produce batches of windowed audio with matching piano-roll rows.

    Each song is cut into ``length_in_beats - 4`` overlapping windows of five
    beats; window ``k`` covers beats ``k`` .. ``k + 4`` and is paired with the
    note row of its middle beat (``k + 2``).
    """

    def __init__(self, batch_size, length_in_beats, sample_rate, bpm):
        """Create the underlying song and the reusable batch buffers."""
        marimba_bank = SampleBank(restricted_instruments=['Marimba_cord_ff'])
        self.song = Song(marimba_bank, length_in_beats, bpm, sample_rate)
        self.batch_size = batch_size
        self.num_inst_samples = self.song.num_inst_samples
        self.length_in_beats = length_in_beats
        self.bpm = bpm
        self.sr = sample_rate
        # Total song length in samples.
        self.N = self.sr * self.length_in_beats * 60 // self.bpm
        self.samples_per_beat = (self.sr * 60) // self.bpm

        windows_per_song = self.length_in_beats - 4
        audio_shape = [self.batch_size, windows_per_song, self.samples_per_beat*5]
        roll_shape = [self.batch_size, windows_per_song, self.num_inst_samples]
        self.batch_audio = tf.Variable(tf.zeros(audio_shape, dtype=tf.float32))
        self.batch_piano_roll = tf.Variable(tf.zeros(roll_shape, dtype=tf.float32))

    #@tf.function
    def __chop__(self, song_output, song_notes_float):
        """Slice one song into five-beat audio windows and per-beat note rows.

        Returns:
            ``(audio, piano_roll)`` stacked along a leading window axis.
        """
        spb = self.samples_per_beat
        window_count = self.length_in_beats - 4
        audio_windows = tf.TensorArray(tf.float32, size=window_count,
                                       element_shape=(spb*5,))
        note_rows = tf.TensorArray(tf.float32, size=window_count,
                                   element_shape=(self.num_inst_samples,))
        # `beat` is the middle beat of each window.
        for beat in tf.range(2, self.song.length_in_beats - 2):
            audio_windows = audio_windows.write(
                beat-2, song_output[(beat-2)*spb:(beat+3)*spb])
            notes_by_beat = tf.reshape(
                song_notes_float,
                [self.song.length_in_beats, self.song.num_inst_samples])
            note_rows = note_rows.write(beat-2, notes_by_beat[beat])
        return audio_windows.stack(), note_rows.stack()

    @tf.function
    def __call2__(self):
        """Fill the batch buffers using ``Song.create`` for every item."""
        for slot in range(self.batch_size):
            song_audio, song_notes = self.song.create()
            windows, rows = self.__chop__(song_audio, song_notes)
            self.batch_audio[slot].assign(windows)
            self.batch_piano_roll[slot].assign(rows)
        return self.batch_audio, self.batch_piano_roll

    #@tf.function
    def __call__(self):
        """Fill the batch buffers with songs built from repeated ``generate`` passes."""
        for slot in range(self.batch_size):
            self.song.zero()
            # Between 3 and 12 generation passes per song.
            num_passes = np.random.choice([3,4,5,6,7,8,9,10,11,12])
            for _ in range(num_passes):
                self.song.generate()
            windows, rows = self.__chop__(self.song.output, self.song.notes_float)
            self.batch_audio[slot].assign(windows)
            self.batch_piano_roll[slot].assign(rows)
        return self.batch_audio, self.batch_piano_roll

In [None]:
# Number of songs generated per batch.
batch_size = 10
# Number of gradient-accumulation steps before the parameters are updated
# in the training loop.
batches_per_epoch = 1

# Positional arguments are (batch_size, length_in_beats, sample_rate, bpm):
# 40 beats at 600 bpm with a 44100 Hz sample rate.
generate_batch = Generate_Batch(batch_size, 4*10, 44100, 60*10)

In [None]:
a, p = generate_batch()

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10,10))

ax.imshow(p[0].numpy(), aspect=(1), cmap='inferno')

In [None]:
# Example of a random composition

from IPython.display import Audio

# Run one generation pass on the batch generator's song, then copy its
# rendered signal out as a NumPy array for playback.
# NOTE(review): generate() adds to the song's current contents; it is not
# preceded by zero() here, so earlier notes presumably remain -- confirm.
generate_batch.song.generate()
example_composition = generate_batch.song.output.numpy()

Audio(example_composition, rate=44100, autoplay=True)

## Neural Network Classes

In [None]:
import tensorflow as tf
from keras.layers import Layer

class STFT(Layer):
    """Stateless Keras layer that turns a signal into a magnitude spectrogram.

    Wraps ``tf.signal.stft`` and returns the absolute value of the result.

    Args:
        frame_length: Window length in samples.
        frame_step: Hop between consecutive windows, in samples.
        fft_length: FFT size applied to each window.
        **kwargs: Standard ``Layer`` arguments such as ``name`` or ``dtype``.
    """

    def __init__(self, frame_length=1024, frame_step=256, fft_length=1024, **kwargs):
        super(STFT, self).__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length

    def build(self, input_shape):  # Create the state of the layer (weights)
        # The layer has no weights, so there is nothing to create.
        pass

    def call(self, inputs):  # Defines the computation from inputs to outputs
        stfts = tf.signal.stft(inputs, frame_length=self.frame_length, frame_step=self.frame_step,
                       fft_length=self.fft_length)
        # Keep only the magnitude of the complex STFT.
        spectrograms = tf.abs(stfts)
        return spectrograms

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created when a model is saved and loaded."""
        config = super(STFT, self).get_config()
        config.update({
            'frame_length': self.frame_length,
            'frame_step': self.frame_step,
            'fft_length': self.fft_length,
        })
        return config

In [None]:
from keras import Model, Input
from keras.layers import Dense, Activation, Dropout, LSTM, Lambda, Reshape
from keras.optimizers import Adam

def compute_fft_length(frame_length):
    """Return the smallest power of two that is >= ``frame_length``.

    Values of 1 or less give 1.
    """
    fft_length = 1
    while fft_length < frame_length:
        fft_length *= 2
    return fft_length

hidden_size = 128

def ConvLSTMModel(hidden_size, num_classes=61, frame_length=1024, frame_step=256, num_samples=176400, sample_rate=44100):
    """Build a model that emits one sigmoid note vector per STFT frame.

    The PCM input is converted to a magnitude spectrogram; a single shared
    LSTM is then stepped over the frames one at a time, carrying its hidden
    and cell state forward, and a shared Dense layer maps each hidden state
    to ``num_classes`` outputs.

    Args:
        hidden_size: Width of the LSTM state.
        num_classes: Size of each per-frame output vector.
        frame_length: STFT window length in samples.
        frame_step: STFT hop in samples.
        num_samples: Length of the PCM input.
        sample_rate: Unused; kept so existing callers keep working.

    Returns:
        A Keras ``Model`` with inputs ``[pcm_input, hidden_state_0,
        cell_state_0]`` and a list of ``num_frames`` outputs.
    """
    # Compute fft_length (smallest power of 2 enclosing frame_length)
    fft_length = compute_fft_length(frame_length)
    # Compute number of fft bins
    fft_unique_bins = fft_length // 2 + 1
    # Compute number of frames
    num_frames = ((num_samples - frame_length) // frame_step) + 1

    LSTM_cell = LSTM(hidden_size, return_state = True)
    Dense_cell = Dense(num_classes, activation='sigmoid')

    PCM = Input(shape=(num_samples,), name='pcm_input')
    hidden_state_0 = Input(shape=(hidden_size,), name='hidden_state_0')
    cell_state_0 = Input(shape=(hidden_size,), name='cell_state_0')

    # Frame the audio with the same parameters used to count frames above, so
    # the spectrogram always has exactly `num_frames` rows.
    spec = STFT(frame_length=frame_length, frame_step=frame_step, fft_length=fft_length)(PCM)
    hidden_state = hidden_state_0
    cell_state = cell_state_0

    outputs = []
    for t in range(num_frames):
        # `t=t` freezes the frame index for this layer; a plain closure would
        # see the final loop value if the lambda were ever called again.
        spec_slice = Lambda(lambda x, t=t: x[:, t:t+1, :], output_shape=(1, fft_unique_bins))(spec)
        # With return_state=True the LSTM returns (output, hidden, cell).
        hidden_state, _, cell_state = LSTM_cell(spec_slice, initial_state=[hidden_state, cell_state])
        output = Dense_cell(hidden_state)
        outputs.append(output)

    model = Model(inputs=[PCM, hidden_state_0, cell_state_0], outputs=outputs)

    return model

model = ConvLSTMModel(hidden_size)

In [None]:
model.summary()

# `learning_rate` is the documented argument name; `lr` is only a deprecated
# alias and is not accepted by every Keras release.
# NOTE(review): `decay` is likewise a legacy argument that newer Keras
# optimizers reject -- confirm against the installed version.
optimizer = Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)

model.compile(optimizer=optimizer, loss='mean_squared_error')

In [None]:
# Add a leading batch axis of size 1 to the generated signal.
test = np.reshape(output.numpy(), [1, -1])
print(test.shape)

# Zero initial LSTM states, sized from `hidden_size` so they stay in sync
# with the width the model was built with.
hidden_state_0 = np.zeros((1, hidden_size))
cell_state_0 = np.zeros((1, hidden_size))

In [None]:
outputs = model.predict([test, hidden_state_0, cell_state_0])

In [None]:
outputs

In [None]:
print(a.shape, b.shape, c.shape)

In [None]:
from neural2 import *

# Input geometry: five 0.1-second frames at 44.1 kHz, i.e. 22050 samples.
sample_rate = 44100                 # 44.1khz sample rate
frame_length = sample_rate // 10    # frame size of 0.1 seconds
num_frames = 5                      # five frames read at a time
input_features = int(frame_length * num_frames)
    
# NOTE(review): DisperseLayer and RFFTLayer are not defined in this notebook;
# presumably they come from the `neural2` star import above -- confirm.
dl = DisperseLayer(input_features, frame_length, 1.0)
rl = RFFTLayer(dl.chunk_size)

In [None]:
from sonic2 import *

length_in_beats = 40
bpm = 600
sample_rate = 44100

# NOTE(review): this call passes four positional arguments, while the Song
# class defined at the top of this notebook takes six. Presumably the star
# import above rebinds Song to a beat-based version from `sonic2` -- confirm.
song = Song(SampleBank(restricted_instruments=['Marimba_cord_ff']),
            length_in_beats, bpm, sample_rate)

In [None]:
output, notes_float = song.create()

In [None]:
notes_float

In [None]:
stft = STFT()

In [None]:
x = tf.identity(output)

In [None]:
spec = stft(x)

In [None]:
spec

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot(range(len(x.numpy())), x.numpy())

In [None]:
from IPython.display import Audio

Audio(x.numpy(), rate=44100, autoplay=True)

In [None]:
spec = tf.abs(tf.signal.stft(x, frame_length=1024, frame_step=256,
                       fft_length=1024))

In [None]:
spec.shape

In [None]:
np.max(spec, axis=0)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

plt.imshow(spec.numpy().T, vmin=0.0, vmax=0.01)

In [None]:
x = dl(a)

In [None]:
x.shape

In [None]:
a.shape

In [None]:
from neural2 import *

class Network(tf.Module):
    """Sequential stack of layers mapping a PCM window to 61 outputs.

    NOTE(review): the layer classes are not defined in this notebook;
    presumably they come from the `neural2` star import -- confirm. Their
    behaviour is not visible here.
    """

    def __init__(self, name=None):
        super(Network, self).__init__(name=name)
        self.layers = []

        sample_rate = 44100                 # 44.1khz sample rate
        frame_length = sample_rate // 10    # frame size of 0.1 seconds
        num_frames = 5                      # five frames read at a time
        input_features = int(frame_length * num_frames)

        self.layers.append(DisperseLayer(input_features, frame_length, 1.0))
        num_chunks = num_frames

        self.layers.append(RFFTLayer(self.layers[-1].chunk_size))

        # Each new layer takes its input width from the previous layer's
        # `output_features`.
        self.layers.append(ReluNormLayer(self.layers[-1].output_features, self.layers[-1].output_features))
        self.layers.append(ReluNormLayer(self.layers[-1].output_features, self.layers[-1].output_features // 64))

        self.layers.append(JoinLayer(self.layers[-1].output_features, num_chunks))

        self.layers.append(ReluNormLayer(self.layers[-1].output_features, self.layers[-1].output_features // 4))
        self.layers.append(SigmoidNormLayer(self.layers[-1].output_features, 61))

    def save(self, path):
        """Save every layer into its own ``layer_<i>`` directory under ``path``.

        Missing directories are created.
        """
        # Imported locally because this notebook never imports `os` itself.
        import os

        os.makedirs(path, exist_ok=True)
        for i, layer in enumerate(self.layers):
            layer_dir = os.path.join(path, f'layer_{i}')
            os.makedirs(layer_dir, exist_ok=True)
            layer.save(layer_dir)

    def load(self, path):
        """Load every layer from its ``layer_<i>`` directory under ``path``."""
        # Imported locally because this notebook never imports `os` itself.
        import os

        for i, layer in enumerate(self.layers):
            layer.load(os.path.join(path, f'layer_{i}'))

    def apply(self, data, layers):
        """Recursively feed ``data`` through ``layers``; ``layers`` must be non-empty."""
        if len(layers) == 1:
            return layers[0](data)
        else:
            return self.apply(layers[0](data), layers[1:])

    #@tf.function
    def __call__(self, data):
        """Run ``data`` through every layer in order and return the result."""
        x = data
        for layer in self.layers:
            x = layer(x)
        return x
        #return self.apply(data, self.layers)

    def reset(self):
        """Call ``reset()`` on every layer."""
        for layer in self.layers:
            layer.reset()

network = Network()

In [None]:
def clean_output(arr):
    """Convert float outputs to whole-number intensities in ``[0, 127]``.

    Each value is multiplied by 128, rounded to the nearest integer (ties go
    to the even neighbour) and clamped to 0..127.

    Args:
        arr: Array-like of numbers, of any shape.

    Returns:
        A float64 NumPy array with the same shape as ``arr``. NaN inputs
        stay NaN.
    """
    scaled = np.rint(128 * np.asarray(arr, dtype=np.float64))
    # Adding 0.0 turns a possible -0.0 from the clamp into a plain 0.0.
    return np.clip(scaled, 0, 127) + 0.0

In [None]:
network.reset()

In [None]:
from IPython.display import display, clear_output, Audio
import seaborn as sns
import sys, signal
import matplotlib.pyplot as plt
import plotting

keep_going = True
def signal_handler(signal, frame):
    """Signal callback that asks the training loop to stop.

    Clears the module-level ``keep_going`` flag so that the ``while
    keep_going`` loop ends after its current step. Neither argument is used.
    """
    # NOTE(review): the `signal` parameter shadows the imported `signal`
    # module inside this function; harmless here because it is never used.
    global keep_going
    keep_going = False
signal.signal(signal.SIGINT, signal_handler)

#@tf.function
def loss(trainable_variables, notes_actual, notes_predicted):
    """Return the mean squared error between actual and predicted notes.

    Args:
        trainable_variables: Currently unused; only the commented-out
            regularization term below refers to it.
        notes_actual: Target values.
        notes_predicted: Model outputs, broadcast-compatible with
            ``notes_actual``.

    Returns:
        The scalar cost. The value is also printed with ``tf.print``.
    """
    note_cost1 = tf.reduce_mean((notes_actual - notes_predicted)**2)
    # Disabled alternatives: a cross-entropy cost and a penalty on positive
    # parameter values.
    #note_cost2 = tf.reduce_mean(-notes_actual * tf.math.log(10**(-6) + notes_predicted) - (1 - notes_actual) * tf.math.log((1 + 10**(-6)) -  notes_predicted))
    #regularization_cost = 0.0
    #for i in range(len(trainable_variables)):
    #    regularization_cost +=  (1/100) * tf.reduce_sum(tf.math.maximum(trainable_variables[i], 0.0))
    cost = note_cost1
    tf.print('Cost:', cost, 'Note Cost:', note_cost1) #, 'Regularization Cost:', regularization_cost)
    return cost
    

@tf.function
def grad(network, input_pcms, notes_actual):
    """Run a forward pass and return the loss, predictions and clipped gradients.

    Returns:
        ``(loss_value, notes_predicted, (clipped_grads, global_norm))`` where
        the last element is the result of ``tf.clip_by_global_norm`` with a
        clip norm of 1000000.0.
    """
    with tf.GradientTape() as tape:
        tape.watch(network.trainable_variables)
        notes_predicted = network(input_pcms)
        loss_value = loss(network.trainable_variables, notes_actual, notes_predicted)
    raw_gradients = tape.gradient(loss_value, network.trainable_variables)
    clipped_with_norm = tf.clip_by_global_norm(raw_gradients, 1000000.0)
    return loss_value, notes_predicted, clipped_with_norm

#@tf.function
def apply_grads(trainable_vars, grads, learning_rate):
    """Take one plain gradient-descent step, in place.

    For every gradient, subtracts ``learning_rate * gradient`` from the
    variable at the same index of ``trainable_vars``.
    """
    for index, gradient in enumerate(grads):
        trainable_vars[index].assign_sub(learning_rate * gradient)

#optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9) #, momentum=0.01) #, momentum=0.1, nesterov=True)
#optimizer=tf.keras.optimizers.SGD(learning_rate=0.001)


# NOTE(review): the five values below are not read anywhere in this cell.
num_inst_samples = 61
length_in_seconds = 4
quantization = 10
length_in_beats = length_in_seconds * quantization


learning_rate = 0.5
beta = 0.9

# `grads` accumulates gradients between parameter updates; `v` holds an
# exponential moving average of those accumulated gradients.
grads = tuple(tf.Variable(tf.zeros_like(var, dtype=tf.float32)) for var in network.trainable_variables)
v = tuple(tf.Variable(tf.zeros_like(var, dtype=tf.float32)) for var in network.trainable_variables)

# Train until the SIGINT handler clears `keep_going`.
i = -1
while keep_going:
    i += 1
    # A fresh batch is generated only every 1000 steps and reused in between.
    if i % 1000 == 0:
        input_pcms, notes_actual = generate_batch()

    loss_value, notes_predicted, (new_grads, global_norm) = grad(network, input_pcms, notes_actual)
    print('Global Norm:', global_norm.numpy())
    # NaN gradient entries are replaced with zeros before being accumulated.
    for j in range(len(grads)):
        grads[j].assign_add(tf.where(tf.math.is_nan(new_grads[j]), tf.zeros_like(new_grads[j]), new_grads[j]) / batches_per_epoch)

    # Clear the notebook output every 100 steps.
    if i % 100 == 99:
        clear_output()

    # Show the prediction-error figure every 10 steps, then close it.
    if i % 10 == 0:
        pass
        fig = plotting.error(notes_predicted, notes_actual)
        display(fig)
        plt.close(fig)
        del fig


    if (i % batches_per_epoch) == (batches_per_epoch - 1):
        # NOTE(review): `v` is updated here but never read; the step below is
        # taken with the raw accumulated `grads`, so `beta` has no effect on
        # the update -- confirm whether `v` was meant to be applied.
        for j in range(len(grads)):
            v[j].assign(beta * v[j] + (1.0 - beta) * grads[j])
        apply_grads(network.trainable_variables, grads, learning_rate)
        #optimizer.apply_gradients(zip(grads, network.trainable_variables))

        # Reset the accumulators for the next update.
        for j in range(len(grads)):
            grads[j].assign(tf.zeros_like(network.trainable_variables[j], dtype=tf.float32))

        print(f'Epoch {i // batches_per_epoch} completed!')

    # Checkpoint every 1000 steps.
    if i % 1000 == 999:
        network.save('checkpoint4')
        print('Network Parameters Saved!')

    print('Step', i, 'complete.')


### Saving and Loading

In [None]:
network.save('checkpoint4')

In [None]:
network.load('checkpoint3')

### To run a single test example

In [None]:
import plotting

# Evaluate the network on one freshly generated batch.
input_pcms, notes_actual = generate_batch()

loss_value, notes_predicted, (new_grads, global_norm) = grad(network, input_pcms, notes_actual)

# Bind the figure to its own name. Calling it `plt` and then deleting it
# would remove the `plt` pyplot alias that other cells (e.g. the training
# loop's `plt.close`) rely on.
fig = plotting.error(notes_predicted, notes_actual)

del fig

In [None]:
import gc

gc.collect()