In [None]:
from os import listdir, getcwd, rename, remove
from os.path import isfile, join

path = join(getcwd(), 'preprocessed_samples')

files = [f for f in listdir(path) if isfile(join(path, f)) and f[0] != '.']

samples = []
for f in files:
    sample = {'instrument' : '_'.join(f.split('_')[:-1]),
              'midi_number' : int(f.split('_')[-1][:-4]),
              'filename' : join(path, f)}
    samples.append(sample)

samples = sorted(samples, key=lambda x: (x['instrument'], x['midi_number']))
    
instrument_names = set([s['instrument'] for s in samples])
print(instrument_names)
instruments = {}
for inst in instrument_names:
    samples_for_inst = {s['midi_number'] : s['filename'] for s in samples if s['instrument'] == inst}
    instruments[inst] = {
        'samples' : samples_for_inst,
        'min_note' : min(samples_for_inst.keys()),
        'max_note' : max(samples_for_inst.keys())
    }

In [None]:
restricted_instruments = ['AltoSax_NoVib_ff',
                          'BassClarinet_ff',
                          'BassFlute_ff',
                          'BassTrombone_ff',
                          'Bass_arco_ff_sulA',
                          'Bass_arco_ff_sulD',
                          'Bass_arco_ff_sulE',
                          'Bass_arco_ff_sulG',
                          'Bass_pizz_ff_sulA',
                          'Bass_pizz_ff_sulD',
                          'Bass_pizz_ff_sulE',
                          'Bass_pizz_ff_sulG',
                          'BbClarinet_ff',
                          'Cello_arco_ff_sulA',
                          'Cello_arco_ff_sulC',
                          'Cello_arco_ff_sulD',
                          'Cello_arco_ff_sulG',
                          'Cello_pizz_ff_sulA',
                          'Cello_pizz_ff_sulC',
                          'Cello_pizz_ff_sulD',
                          'Cello_pizz_ff_sulG',
                          'Crotale_ff',
                          'EbClarinet_ff',
                          'Flute_nonvib_ff',
                          'Horn_ff',
                          'Marimba_cord_ff',
                          'Marimba_roll_ff',
                          'Marimba_rubber_ff',
                          'Oboe_ff',
                          'TenorTrombone_ff',
                          'Trumpet_novib_ff',
                          'Trumpet_vib_ff',
                          'Tuba_ff',
                          'Vibraphone_bow',
                          'Vibraphone_dampen_ff',
                          'Vibraphone_shortsustain_ff',
                          'Viola_arco_ff_sulA',
                          'Viola_arco_ff_sulC',
                          'Viola_arco_ff_sulD',
                          'Viola_arco_ff_sulG',
                          'Viola_pizz_ff_sulA',
                          'Viola_pizz_ff_sulC',
                          'Viola_pizz_ff_sulD',
                          'Viola_pizz_ff_sulG',
                          'Violin_pizz_ff_sulA',
                          'Violin_pizz_ff_sulD',
                          'Violin_pizz_ff_sulE',
                          'Violin_pizz_ff_sulG',
                          'Xylophone_hardrubber_ff',
                          'Xylophone_hardrubber_roll_ff',
                          'Xylophone_rosewood_ff',
                          'bells_brass_ff',
                          'bells_plastic_ff']

In [3]:
soundfont = {}
soundbank_ref = []
k = 0
for inst in restricted_instruments:
    soundfont[inst] = {}
    for i in range(128):
        if instruments[inst]['min_note'] <= i <= instruments[inst]['max_note']:
            soundfont[inst][i] = (k, instruments[inst]['samples'][i])
            soundbank_ref.append((inst, i))
            k += 1
        else:
            soundfont[inst][i] = None
            
soundbank = [soundfont[x[0]][x[1]][1] for x in soundbank_ref]

In [4]:
def random_endo_matrix(size=len(soundbank)):
    matrix = np.diag([1 for i in range(size)])
    idx = np.random.randint(size, size=size)
    return matrix[:,idx]

In [5]:
import soundfile as sf
import numpy as np

class Song:
    def __init__(self, sample_bank, length_in_beats, bpm, sr):
        self.sample_bank = sample_bank
        self.num_samples = len(sample_bank)
        self.length_in_beats = length_in_beats
        self.bpm = bpm
        self.sr = sr
        self.notes = np.zeros([length_in_beats, len(sample_bank)], dtype=bool)
        self.wavs = []
        for i in range(len(sample_bank)):
            self.wavs.append(sf.read(self.sample_bank[i])[0])
    def reset(self):
        self.notes = np.zeros([self.length_in_beats, len(self.sample_bank)], dtype=bool)
    def generate(self):
        samples_per_beat = (self.sr * 60) // self.bpm
        num_samples = samples_per_beat * self.length_in_beats
        self.output = np.zeros(num_samples)
        for i in range(self.length_in_beats):
            for j in range(self.num_samples):
                if self.notes[i][j]:
                    N = self.sr * self.length_in_beats * 60 // self.bpm
                    sample = self.wavs[j][:N,0]
                    padded_sample = np.zeros(num_samples)
                    padded_sample[:sample.shape[0]] = sample
                    self.output += np.roll(padded_sample, i*samples_per_beat)
                    self.output = self.output
    def add(self, i, j):
        self.notes[i][j] = 1
    def transform(self, matrix):
        for i in range(self.notes.shape[0]):
            self.notes[i] = np.matmul(matrix, self.notes[i])

In [6]:
def generate_random_part(instruments, song):
    length = 3 * song.length_in_beats // 4
    lst = [k for k in list(range(len(song.sample_bank))) if soundbank_ref[k][0] in instruments]
    for i in range(length):
        make_note = np.random.choice([0,0,0,1])
        if make_note:
            j = np.random.choice(lst)
            song.add(i, j)
    #song.generate()

In [7]:
import copy

song = Song(soundbank, 60, 240, 44100)

def create_example(song):
    song.reset()
    generate_random_part(restricted_instruments, song)
    song.generate()
    transformation_matrix = random_endo_matrix()
    coded_transformation = np.argmax(transformation_matrix, axis=0)
    features = copy.deepcopy(song.output)
    song.transform(transformation_matrix)
    song.generate()
    target = copy.deepcopy(song.output)
    return features, coded_transformation, target

def create_examples(num_examples, song):
    features_list = []
    coded_transformation_list = []
    target_list = []
    for i in range(num_examples):
        features, coded_transformation, target = create_example(song)
        features_list.append(features)
        coded_transformation_list.append(coded_transformation)
        target_list.append(target)
    return np.array(features_list), np.array(coded_transformation_list), np.array(target_list)

In [8]:
import numpy as np
from scipy.io.wavfile import write as wav_write, read as wav_read

def load_example(index):
    transformation_matrix = np.loadtxt(f'input{index}.csv', delimiter=',')
    coded_transformation = np.argmax(transformation_matrix, axis=0)
    _, input_signal = wav_read(f'input{index}.wav')
    _, output_signal = wav_read(f'output{index}.wav')
    x2 = np.reshape(transformation_matrix, [1721*1721])
    #features = np.concatenate([input_signal, np.reshape(transformation_matrix, 1721*1721)])
    features = np.concatenate([input_signal[:15*44100], coded_transformation])
    target = output_signal[:15*44100]
    return features, target

def load_examples(indices):
    features_list = []
    target_list = []
    for idx in indices:
        features, target = load_example(idx)
        features_list.append(features)
        target_list.append(target)
    return np.array(features_list), np.array(target_list)
    

# Neural Network

In [9]:
def mfccs(pcm):

    # sample_rate = 44100            # 44.1k samples per channel per second
    # num_samples = 60*sample_rate   # 60 seconds of data
    # num_channels = 2               # 2 channels
    # # Input is Tensor of [batch_size, num_samples, num_channels] PCM samples in the range [-1, 1].
    # pcm = tf.compat.v1.placeholder(tf.float32, [None, num_channels, num_samples])

    # A 2048-point STFT with frames of ??? ms and 75% overlap.
    stfts = tf.signal.stft(pcm, frame_length=2048, frame_step=256, fft_length=2048)
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = stfts.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(num_mel_bins,
                                                                        num_spectrogram_bins,
                                                                        sample_rate,
                                                                        lower_edge_hertz,
                                                                        upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))
    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    # Compute MFCCs from log_mel_spectrograms and take the first 13.
    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[..., :13]
    return mfccs

def mfccs_loss(pcm_true, pcm_pred):
    mfccs_true = mfccs(pcm_true)
    mfccs_pred = mfccs(pcm_pred)
    return tf.losses.mean_squared_error(labels=mfccs_true, predictions=mfccs_pred)

In [10]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
import sounddevice as sd


# create the TF neural net
# some hyperparams
training_batches = 2000

n_neurons_per_part_in_h1 = 500
n_neurons_in_h2 = 500
n_neurons_in_h3 = 500
n_neurons_in_h4 = 500
n_neurons_in_h5 = 200
learning_rate = 0.1


sample_rate = 44100            # 44.1k samples per channel per second
num_samples = 15*sample_rate   # 15 seconds of data

num_sounds = 1721

#n_features = num_samples + (num_sounds)**2

num_parts = 30
samples_per_part = num_samples // num_parts

n_features = num_samples
n_targets = num_samples
#############################################

# basic 2 layer dense net (MLP) example adapted from
# https://becominghuman.ai/creating-your-own-neural-network-using-tensorflow-fa8ca7cc4d0e

# these placeholders serve as our input tensors
x = tf.placeholder(tf.float32, [None, n_features], name='input')
t = tf.placeholder(tf.float32, [None, num_sounds], name='tone_transformation')
y = tf.placeholder(tf.float32, [None, n_targets], name='labels')

# TF Variables are our neural net parameter tensors, we initialize them to random (gaussian) values in
# Layer1. Variables are allowed to be persistent across training epochs and updatable bt TF operations

W1s = []
b1s = []
y1s = []

for i in range(2*num_parts-1):

    W1s.append(tf.Variable(tf.truncated_normal([samples_per_part + num_sounds, n_neurons_per_part_in_h1],
                                              mean=0,
                                              stddev=1 / np.sqrt(n_features)),
                          name=f'weights1_{i}'))
    b1s.append(tf.Variable(tf.truncated_normal([n_neurons_per_part_in_h1],
                                              mean=0,
                                              stddev=1 / np.sqrt(n_features)),
                          name=f'biases1_{i}'))
    y1s.append(
        tf.nn.relu(
            tf.matmul(
                tf.concat([tf.slice(x, [0, i*samples_per_part // 2], [-1, samples_per_part]), t], axis=1),
                W1s[i]
            ) + b1s[i],
            name=f'activationLayer1_{i}'
        )
    )

y1 = tf.concat(y1s, axis=1)

# note the output tensor of the 1st layer is the activation applied to a
# linear transform of the layer 1 parameter tensors
# the matmul operation calculates the dot product between the tensors

# network parameters(weights and biases) are set and initialized (Layer2)
W2 = tf.Variable(tf.random_normal([(2*num_parts - 1)*n_neurons_per_part_in_h1, n_neurons_in_h2],
                                  mean=0,
                                  stddev=1),
                 name='weights2')
b2 = tf.Variable(tf.random_normal([n_neurons_in_h2], mean=0, stddev=1), name='biases2')
# activation function(sigmoid)
y2 = tf.nn.relu((tf.matmul(y1, W2) + b2), name='activationLayer2')

# network parameters(weights and biases) are set and initialized (Layer3)
W3 = tf.Variable(tf.random_normal([n_neurons_in_h2, n_neurons_in_h3], mean=0, stddev=1),
                 name='weights2')
b3 = tf.Variable(tf.random_normal([n_neurons_in_h3], mean=0, stddev=1), name='biases3')
# activation function(sigmoid)
y3 = tf.nn.relu((tf.matmul(y2, W3) + b3), name='activationLayer2')

# network parameters(weights and biases) are set and initialized (Layer4)
W4 = tf.Variable(tf.random_normal([n_neurons_in_h3, n_neurons_in_h4], mean=0, stddev=1),
                 name='weights2')
b4 = tf.Variable(tf.random_normal([n_neurons_in_h4], mean=0, stddev=1), name='biases4')
# activation function(sigmoid)
y4 = tf.nn.relu((tf.matmul(y3, W4) + b4), name='activationLayer2')

# network parameters(weights and biases) are set and initialized (Layer5)
W5 = tf.Variable(tf.random_normal([n_neurons_in_h4, n_neurons_in_h5], mean=0, stddev=1),
                 name='weights2')
b5 = tf.Variable(tf.random_normal([n_neurons_in_h5], mean=0, stddev=1), name='biases5')
# activation function(sigmoid)
y5 = tf.nn.relu((tf.matmul(y4, W5) + b5), name='activationLayer2')

# output layer weights and biases
Wo = tf.Variable(tf.random_normal([n_neurons_in_h5, n_targets], mean=0, stddev=1 ),
                 name='weightsOut')
bo = tf.Variable(tf.random_normal([n_targets], mean=0, stddev=1), name='biasesOut')

# the sigmoid (binary softmax) activation is absorbed into TF's sigmoid_cross_entropy_with_logits loss
#logits = (tf.matmul(y2, Wo) + bo)
output = tf.nn.relu(tf.matmul(y5, Wo) + bo)
#loss = tf.nn.sigmoid_cross_entropy_with_logits(labels = y, logits = logits)
loss = mfccs_loss(y, output)

# tap a separate output that applies softmax activation to the output layer
# for training accuracy readout
#a = tf.nn.sigmoid(logits, name='activationOutputLayer')

# optimizer used to compute gradient of loss and apply the parameter updates.
# the train_step object returned is ran by a TF Session to train the net

train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
#train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

# prediction accuracy
# compare predicted value from network with the expected value/target

#correct_prediction = tf.equal(tf.round(a), y)
# accuracy determination
#accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="Accuracy")

#############################################
# ***NOTE global_variables_initializer() must be called before creating a tf.Session()!***
init_op = tf.global_variables_initializer()

# create a session for training and feedforward (prediction). Sessions are TF's way to run
# feed data to placeholders and variables, obtain outputs and update neural net parameters
with tf.Session() as sess:
    # ***initialization of all variables... NOTE this must be done before running any further sessions!***
    sess.run(init_op)

    # training loop over the number of epochs
    batches = 2000

    for batch in range(training_batches):
        losses = 0
        accs = 0
        
        #X, Y = load_examples(np.random.randint(200, size=10))
        print('Generating New Examples')
        X, T, Y = create_examples(2, song)
        print('New Examples Generated')
        frac = 0.5
        train_stop = int(len(X) * frac)
        X_train = X[:train_stop]
        T_train = T[:train_stop]
        Y_train = Y[:train_stop]
        X_test = X[train_stop:]
        T_test = T[train_stop:]
        Y_test = Y[train_stop:]
        
        X_b = X_train
        T_b = T_train
        Y_b = Y_train

        # train the network, note the dictionary of inputs and labels
        sess.run(train_step, feed_dict={x: X_b, t: T_b, y: Y_b})
        # feedforwad the same data and labels, but grab the accuracy and loss as outputs
        l = sess.run([loss], feed_dict={x: X_b, t: T_b, y: Y_b})

        losses = np.sum(l)
        print("Batch %.8d " % batch, "train loss %.4f" % losses)

        # test on the holdout set
        test_output, l = sess.run([output, loss], feed_dict={x: X_test, t: T_test, y: Y_test})
        losses = np.sum(l)
        print("Batch %.8d " % batch, "test loss %.4f" % losses)
        if (batch % 5 == 0):
            sd.play(test_output[0], 44100)

Instructions for updating:
non-resource variables are not supported in the long term
Generating New Examples
New Examples Generated
Batch 00000000  train loss 14691.2910
Batch 00000000  test loss 16199.7363
Generating New Examples
New Examples Generated
Batch 00000001  train loss 14658.5576
Batch 00000001  test loss 17791.4453
Generating New Examples
New Examples Generated
Batch 00000002  train loss 16464.8281
Batch 00000002  test loss 18322.8691
Generating New Examples
New Examples Generated
Batch 00000003  train loss 13874.2109
Batch 00000003  test loss 14384.7227
Generating New Examples
New Examples Generated
Batch 00000004  train loss 17164.4160
Batch 00000004  test loss 19208.6348
Generating New Examples
New Examples Generated
Batch 00000005  train loss 13526.3770
Batch 00000005  test loss 16226.4326
Generating New Examples
New Examples Generated
Batch 00000006  train loss 13853.0908
Batch 00000006  test loss 15265.9668
Generating New Examples
New Examples Generated
Batch 00000007

New Examples Generated
Batch 00000066  train loss 927.4522
Batch 00000066  test loss 497.3214
Generating New Examples
New Examples Generated
Batch 00000067  train loss 602.0576
Batch 00000067  test loss 1138.3055
Generating New Examples
New Examples Generated
Batch 00000068  train loss 476.7871
Batch 00000068  test loss 1032.7307
Generating New Examples
New Examples Generated
Batch 00000069  train loss 1554.1077
Batch 00000069  test loss 1143.3466
Generating New Examples
New Examples Generated
Batch 00000070  train loss 887.3333
Batch 00000070  test loss 1353.4348
Generating New Examples
New Examples Generated
Batch 00000071  train loss 374.5355
Batch 00000071  test loss 507.3273
Generating New Examples
New Examples Generated
Batch 00000072  train loss 770.4086
Batch 00000072  test loss 316.2920
Generating New Examples
New Examples Generated
Batch 00000073  train loss 1688.8949
Batch 00000073  test loss 579.1281
Generating New Examples
New Examples Generated
Batch 00000074  train loss 

PortAudioError: Error opening OutputStream: Internal PortAudio error [PaErrorCode -9986]