In [1]:
%pylab notebook

import librosa
import IPython.display as display

import sklearn.decomposition as decomp
import sklearn.cluster as cluster

import tensorflow as tf
from tensorflow.keras import layers

Populating the interactive namespace from numpy and matplotlib


In [2]:
def l2_loss(y_true, y_pred):
    """Euclidean (L2) distance between target and prediction.

    The squared differences are summed over axis 1 and then square-rooted,
    so the result holds one distance per entry along axis 0.
    """
    residual = y_true - y_pred
    squared_norm = tf.reduce_sum(tf.square(residual), axis=1)
    return tf.sqrt(squared_norm)

In [3]:
# Load the recording as mono audio (sr=5000), then rescale the samples:
# shift so the minimum becomes 0 and divide by the resulting maximum.
frames, rate = librosa.load('./gnossienne.wav', sr=5000, mono=True)
frames = frames - frames.min()
frames = frames / frames.max()

In [4]:
# audio player for the loaded, rescaled source signal
display.Audio(data=frames, rate=rate)

In [6]:
# extract overlapping patches from audio
patch_len = 200
patch_skip = 100
patch_starts = range(0, len(frames)-patch_len, patch_skip)

patches = np.array([
    frames[i:i+patch_len]
    for i in patch_starts
])

In [7]:
# transform patches to PCA basis
n_components = 64
pca = decomp.PCA(n_components)
patches_ = pca.fit_transform(patches)

In [8]:
# quantize patches to cluster IDs
n_clusters = 512
kmeans = cluster.MiniBatchKMeans(n_clusters)
labels = kmeans.fit_predict(patches_)

  init_size=init_size)


In [10]:
# extract sequences of cluster IDs for training
seq_len = 50
seq_skip = 1
seq_starts = range(0, len(labels)-seq_len, seq_skip)

sequences = np.array([
    labels[i:i+seq_len] for i in seq_starts
])

In [12]:
# Training targets: for each input sequence, the PCA-projected patch that
# immediately follows it.
next_patches = np.zeros(shape=[len(sequences), n_components])
# Index rows by sequence position, not by start offset: the two only
# coincide when seq_skip == 1, so enumerate keeps this correct for any stride.
for row, start in enumerate(seq_starts):
    next_patches[row, :] = patches_[start + seq_len, :]

In [13]:
# Sequence model: cluster IDs -> embeddings -> LSTM -> predicted PCA patch.
model = tf.keras.Sequential([
    # 128-dimensional embedding for each of the n_clusters IDs
    layers.Embedding(input_dim=n_clusters, output_dim=128, input_length=seq_len),

    # 256-unit LSTM
    layers.LSTM(units=256),

    # predict next patch
    layers.Dense(units=n_components),
])

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
# Compile with RMSprop and the custom L2 loss, then fit on
# (ID sequence -> following patch) pairs; keep the returned history object.
model.compile(optimizer=tf.keras.optimizers.RMSprop(1e-2), loss=l2_loss)
history = model.fit(x=sequences, y=next_patches, batch_size=128, epochs=50)

Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [21]:
# Autoregressive generation: start from a randomly chosen training sequence
# and repeatedly predict, quantize, and append the next patch.
n_generate = 1000   # number of patches to generate
noise_std = 0.06    # std-dev of the Gaussian noise added before quantizing

seed_row = np.random.choice(len(sequences))
seed = sequences[[seed_row]]  # list index keeps a leading axis of size 1
generated = []

for step in range(n_generate):
    # the model only sees the most recent seq_len IDs
    context = seed[:, -seq_len:]

    # predicted next patch (PCA coefficients)
    out = model.predict(context)
    generated.append(out[0])

    # perturb the prediction, then map it to a cluster ID
    noisy = out + np.random.normal(0, noise_std, size=out.shape)
    out_id = kmeans.predict(noisy).reshape(1, 1)

    # grow the ID sequence by the newly chosen cluster
    seed = np.concatenate([seed, out_id], axis=1)

generated = np.array(generated)

In [22]:
# Map the generated PCA coefficients back to sample space: project through
# the fitted components and add the fitted mean.
recon = np.dot(generated, pca.components_) + pca.mean_

In [23]:
# Patches were extracted with overlap, so keep only the first patch_skip
# samples of every reconstructed patch and join them into one signal.
recon_audio = recon[:, :patch_skip].flatten()
display.Audio(recon_audio, rate=rate)