In [1]:
%pylab notebook

import librosa
import IPython.display as display

import sklearn.decomposition as decomp
import sklearn.cluster as cluster

import tensorflow as tf
from tensorflow.keras import layers

Populating the interactive namespace from numpy and matplotlib


In [2]:
# lifted from https://keras.io/examples/lstm_text_generation/
def sample(preds, temperature=1.0):
    """Draw one index from a probability array, reshaped by a temperature.

    The distribution is re-weighted as ``softmax(log(preds) / temperature)``:
    temperatures below 1 sharpen it toward the most likely index, while
    temperatures above 1 flatten it toward uniform.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Divisor applied to the log-probabilities. ``0`` is
            treated as the greedy limit: the argmax of ``preds`` is returned
            without drawing a random number.

    Returns:
        The chosen index as a NumPy integer scalar (so callers may still
        call ``.reshape`` on it).
    """
    preds = np.asarray(preds).astype('float64')

    # T -> 0 collapses the distribution onto its mode; dividing by zero
    # below would only produce inf/nan
    if temperature == 0:
        return np.argmax(preds)

    # exact zeros map to -inf logits (and back to probability 0 after exp),
    # so the divide-by-zero warning from log is just noise
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # shift so the largest logit is 0: without this, a small temperature
    # makes every exp() underflow to 0 and the normalisation becomes 0/0
    logits -= np.max(logits)

    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [3]:
# read the recording as mono, resampled to 5 kHz, then min-max scale the
# samples: shift so the minimum is 0 and divide by the resulting maximum
frames, rate = librosa.load('./thuille.wav', sr=5000, mono=True)
frames = frames - frames.min()
frames = frames / frames.max()

In [4]:
# listen to the normalized input signal
display.Audio(data=frames, rate=rate)

In [5]:
# cut the waveform into fixed-length windows; the hop (patch_skip) is smaller
# than the window (patch_len), so consecutive patches overlap
patch_len = 200
patch_skip = 100
# NOTE: the stop bound is exclusive, so a patch starting exactly at
# len(frames) - patch_len is not included
patch_starts = range(0, len(frames) - patch_len, patch_skip)

# one row per patch, one column per sample within the patch
patches = np.array([frames[start:start + patch_len] for start in patch_starts])

In [6]:
# transform patches to ICA basis
n_components = 128
# FastICA starts from a random unmixing matrix; pin the seed so the learned
# basis (and every cluster ID derived from it) is the same after a kernel
# restart
ica = decomp.FastICA(n_components=n_components, random_state=0)
# rows of patches_ are the patches expressed in the n_components ICA sources
patches_ = ica.fit_transform(patches)

In [7]:
# quantize patches to cluster IDs
n_clusters = 2048
# centroid initialisation is random; pin the seed so a given patch maps to
# the same cluster ID on every run
kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=0)
# labels[k] is the ID of the centroid assigned to patch k
labels = kmeans.fit_predict(patches_)

In [8]:
# sanity check of the quantization: decode cluster IDs back to audio.
# Replace each patch by its centroid, then map from ICA space back to sample
# space (mixing matrix plus the mean removed during fitting).
quantized_patches = kmeans.cluster_centers_[labels]
recon = quantized_patches @ ica.mixing_.T + ica.mean_

# patches overlap, so play only the first patch_skip samples of each one
display.Audio(recon[:, :patch_skip].flatten(), rate=rate)

In [9]:
# extract sequences of cluster IDs for training
seq_len = 50
seq_skip = 1
# stop at len(labels) - seq_len so that every sequence still has a label
# immediately after it to serve as the prediction target
seq_starts = range(0, len(labels)-seq_len, seq_skip)

sequences = np.array([
    labels[i:i+seq_len] for i in seq_starts
])

# for each sequence, one-hot encode the cluster ID that immediately follows it.
# Rows are addressed by position in `sequences` rather than by start offset:
# the two only coincide when seq_skip == 1, and using the offset as the row
# index would misalign (or overflow) the targets for any larger skip.
next_ids = labels[np.array(seq_starts, dtype=int) + seq_len]
next_labels = np.zeros(shape=[len(sequences), n_clusters])
next_labels[np.arange(len(sequences)), next_ids] = 1

In [10]:
# next-ID predictor, assembled layer by layer
model = tf.keras.Sequential()

# look up a 128-dim vector for each of the seq_len cluster IDs
model.add(layers.Embedding(input_dim=n_clusters, output_dim=128, input_length=seq_len))

# summarise the embedded sequence with a 256-unit LSTM
model.add(layers.LSTM(256))

# probability distribution over the next cluster ID
model.add(layers.Dense(n_clusters, activation='softmax'))

In [11]:
# fit the network: RMSprop optimizer, cross-entropy against the one-hot
# next-ID targets
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(1e-2),
    loss='categorical_crossentropy',
)
history = model.fit(
    x=sequences,
    y=next_labels,
    batch_size=128,
    epochs=50,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [12]:
# seed the generator with one randomly chosen training sequence
# (indexing with a list keeps the leading batch axis: shape (1, seq_len))
seed_row = np.random.choice(len(sequences))
generated = sequences[[seed_row]]

# grow the sequence by 1000 cluster IDs, one prediction at a time
for i in range(1000):
    # the model only ever sees the most recent seq_len IDs
    inp = generated[:, -seq_len:]

    # draw the next ID from the predicted distribution (temperature 1.0)
    next_probs = model.predict(inp)[0]
    out = sample(next_probs, 1.0).reshape(1, 1)

    # append along the time axis
    generated = np.concatenate((generated, out), axis=1)

In [13]:
# decode the generated IDs: centroid lookup, then map from ICA space back to
# sample space (mixing matrix plus the mean removed during fitting)
generated_patches = kmeans.cluster_centers_[generated[0]]
recon = generated_patches @ ica.mixing_.T + ica.mean_

In [14]:
# skip the first seq_len patches (the seed) so only newly generated material
# is played; take the first patch_skip samples of each remaining patch
generated_audio = recon[seq_len:, :patch_skip].flatten()
display.Audio(generated_audio, rate=rate)