In [1]:
import librosa 
import h5py 
import numpy as np

import os, sys

In [2]:
# Directory containing the audio clips to convert, relative to the notebook's
# working directory.
path = 'Music Data/nsynth-valid/audio/'

In [3]:
# os.listdir() returns entries in arbitrary order and lists every directory
# entry. Keep only .wav files and sort them: the position of a file in this
# list is used as its dataset key, so the order has to be reproducible.
audio = sorted(name for name in os.listdir(path) if name.lower().endswith('.wav'))

In [4]:
# Decode every clip and store its samples in music.hdf5 under the clip's list
# index (as a string key).
# The file is opened in append mode, so keys written by an earlier (possibly
# interrupted) run are skipped instead of making create_dataset() fail with
# "name already exists". Resuming assumes `audio` is in the same order as on
# that earlier run.
# NOTE: no `sr` is passed, so librosa.load() resamples to its default rate.
with h5py.File('music.hdf5', 'a') as f:
    for i, wav in enumerate(audio):
        key = str(i)
        if key not in f:
            y, _ = librosa.load(os.path.join(path, wav))
            f.create_dataset(key, data=y)
        if i % 100 == 0:
            print("{} Audio wav Files Processed".format(i))

0 Audio wav Files Processed
100 Audio wav Files Processed
200 Audio wav Files Processed
300 Audio wav Files Processed
400 Audio wav Files Processed
500 Audio wav Files Processed
600 Audio wav Files Processed
700 Audio wav Files Processed
800 Audio wav Files Processed
900 Audio wav Files Processed
1000 Audio wav Files Processed
1100 Audio wav Files Processed
1200 Audio wav Files Processed
1300 Audio wav Files Processed
1400 Audio wav Files Processed
1500 Audio wav Files Processed
1600 Audio wav Files Processed
1700 Audio wav Files Processed
1800 Audio wav Files Processed
1900 Audio wav Files Processed
2000 Audio wav Files Processed
2100 Audio wav Files Processed
2200 Audio wav Files Processed
2300 Audio wav Files Processed
2400 Audio wav Files Processed
2500 Audio wav Files Processed
2600 Audio wav Files Processed
2700 Audio wav Files Processed
2800 Audio wav Files Processed
2900 Audio wav Files Processed
3000 Audio wav Files Processed
3100 Audio wav Files Processed
3200 Audio wav Files

In [5]:
# Sanity check: how many datasets (top-level keys) does music.hdf5 hold?
with h5py.File('music.hdf5', 'r') as archive:
    l = len(archive.keys())
print(l)

12678


In [2]:
def next_batch(batch_size, seq_len, path='music.hdf5', n_items=None):
    """Yield (x, y) next-sample training batches read from an HDF5 file.

    Datasets are looked up by the string keys '0', '1', ... For each clip,
    ``x`` holds its first ``seq_len`` samples and ``y`` holds the same window
    shifted one sample ahead, so ``y[r, t] == x[r, t + 1]``.

    Args:
        batch_size: Number of clips per batch (must be positive).
        seq_len: Number of samples per row (must be positive).
        path: HDF5 file to read.
        n_items: How many datasets to iterate over. When omitted, the number
            of keys in the file is used instead of a hard-coded count.

    Yields:
        Tuple ``(x, y)`` of freshly allocated float64 arrays, each of shape
        ``(batch_size, seq_len)``. A trailing group of fewer than
        ``batch_size`` clips is dropped so that every batch is full and no
        key past the end of the file is requested.

    Raises:
        ValueError: If ``batch_size`` or ``seq_len`` is not positive, or a
            clip has fewer than ``seq_len + 1`` samples. Because this is a
            generator, the error surfaces on the first ``next()`` call.
    """
    if batch_size <= 0 or seq_len <= 0:
        raise ValueError("batch_size and seq_len must be positive")

    with h5py.File(path, 'r') as f:
        total = len(f.keys()) if n_items is None else n_items

        # Stop early enough that start + batch_size never exceeds `total`.
        for start in range(0, total - batch_size + 1, batch_size):
            # Allocate per batch: yielding one shared buffer would make every
            # batch a caller has kept silently change on the next iteration.
            x = np.empty((batch_size, seq_len))
            y = np.empty((batch_size, seq_len))

            for row, index in enumerate(range(start, start + batch_size)):
                # Read seq_len + 1 samples once; x and y are overlapping views.
                clip = f[str(index)][:seq_len + 1]
                if len(clip) < seq_len + 1:
                    raise ValueError(
                        "dataset {} has fewer than {} samples".format(index, seq_len + 1))
                x[row] = clip[:-1]
                y[row] = clip[1:]

            yield (x, y)

In [9]:
# Batch generator: 32 clips per batch, 128 samples per clip.
itr = next_batch(batch_size=32, seq_len=128)

In [10]:
# Pull the first (inputs, targets) pair from the generator.
x, y = next(itr)

In [13]:
# Targets should come back as (batch_size, seq_len), i.e. (32, 128) here.
y.shape

(32, 128)

In [None]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Dense
from keras.optimizers import Adam
from GenreFeatureData import GenreFeatureData  # local python class with Audio feature extraction (librosa)

# Turn off TF verbose logging
# NOTE(review): this assignment runs after the keras imports at the top of the
# cell; TensorFlow may already have read the variable by then. Confirm, and
# move it ahead of the imports if the logs are not actually silenced.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

# Build the feature container and run its preprocessing step. Presumably this
# populates the train_/dev_/test_ X and Y attributes read further down;
# verify against GenreFeatureData.
genre_features = GenreFeatureData()
genre_features.load_preprocess_data()
# genre_features.load_deserialize_data()

# Keras optimizer defaults:
# Adam   : lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8, decay=0.
# RMSprop: lr=0.001, rho=0.9, epsilon=1e-8, decay=0.
# SGD    : lr=0.01, momentum=0., decay=0.
opt = Adam()

# Training hyper-parameters used by the fit/evaluate calls.
batch_size = 35
nb_epochs = 400

# Show the shape of every split before building the model.
print("Training X shape: " + str(genre_features.train_X.shape))
print("Training Y shape: " + str(genre_features.train_Y.shape))
print("Dev X shape: " + str(genre_features.dev_X.shape))
print("Dev Y shape: " + str(genre_features.dev_Y.shape))
print("Test X shape: " + str(genre_features.test_X.shape))
# Report the label array here; this line used to print test_X a second time.
print("Test Y shape: " + str(genre_features.test_Y.shape))

# Per-example input shape: axes 1 and 2 of the training features.
input_shape = (genre_features.train_X.shape[1], genre_features.train_X.shape[2])

# Two stacked LSTMs (the first returns the whole sequence so the second can
# consume it) followed by a softmax with one unit per column of train_Y.
print('Build LSTM RNN model ...')
model = Sequential([
    LSTM(units=128, dropout=0.05, recurrent_dropout=0.35, return_sequences=True, input_shape=input_shape),
    LSTM(units=32, dropout=0.05, recurrent_dropout=0.35, return_sequences=False),
    Dense(units=genre_features.train_Y.shape[1], activation='softmax'),
])

print("Compiling ...")
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

print("Training ...")
model.fit(genre_features.train_X, genre_features.train_Y, epochs=nb_epochs, batch_size=batch_size)

# Report loss and accuracy on the held-out splits: dev first, then test.
for split_banner, split_tag, split_X, split_Y in (
    ("\nValidating ...", "Dev", genre_features.dev_X, genre_features.dev_Y),
    ("\nTesting ...", "Test", genre_features.test_X, genre_features.test_Y),
):
    print(split_banner)
    score, accuracy = model.evaluate(split_X, split_Y, batch_size=batch_size, verbose=1)
    print(split_tag + " loss:  ", score)
    print(split_tag + " accuracy:  ", accuracy)

In [None]:
'''Example script to generate text from Nietzsche's writings.
At least 20 epochs are required before the generated text
starts sounding coherent.
It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.
If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io

# Download (or reuse the cached copy of) the corpus and read it lower-cased.
# NOTE: this rebinds the notebook-level name `path`.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read inside a `with` block so the file handle is closed deterministically.
with io.open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# Vocabulary: every distinct character, sorted so the indices are stable.
chars = sorted(set(text))
print('total chars:', len(chars))
# Lookup tables between characters and their integer indices.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Slide a window of `maxlen` characters over the text, advancing `step`
# characters at a time; each window is paired with the character that
# immediately follows it.
maxlen = 40
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode the windows (x) and their following characters (y).
# The builtin `bool` is used as dtype: the `np.bool` alias has been removed
# from NumPy and raises AttributeError there; the resulting dtype is the same.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# Assemble the network: a single 128-unit LSTM over one-hot characters,
# followed by a dense layer and a softmax over the vocabulary.
print('Build model...')
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars)),
    Activation('softmax'),
])

optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')


def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The log-probabilities are divided by ``temperature``, renormalised, and a
    single multinomial draw picks the returned index.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, logs):
    """Epoch-end hook: print text generated by the current model.

    One random seed window is taken from the corpus and, for each diversity
    value, extended by 400 sampled characters that are streamed to stdout.
    ``logs`` is accepted for the callback signature and not used.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[seed_start: seed_start + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        # Every diversity run restarts from the same seed window.
        window = seed_text
        generated = '' + window
        print('----- Generating with seed: "' + window + '"')
        sys.stdout.write(generated)

        for _ in range(400):
            # One-hot encode the current window in a single indexed assignment.
            x_pred = np.zeros((1, maxlen, len(chars)))
            positions = np.arange(len(window))
            columns = [char_indices[ch] for ch in window]
            x_pred[0, positions, columns] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Wrap on_epoch_end so Keras calls it after every epoch to print sample text.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train on the one-hot windows (x) and next-character targets (y).
model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])