# Prepare audio

This notebook shows how to prepare your audio data for use with Kapre.

In [8]:
import librosa
"""
# You might consider soundfile unless loading mp3 is your concern.
import soundfile as sf
# mac, window: pip install soundfile, 
# linux: pip install soundfile & sudo apt-get install libsndfile1
"""
import keras
import kapre
from keras.models import Sequential
from kapre.time_frequency import Spectrogram
import numpy as np

# Report the library versions and backend settings in effect, so readers can
# compare their environment with the one that produced the recorded outputs.
print('Keras version: {}'.format(keras.__version__))
# Use the public accessor rather than the private `_backend` module attribute,
# which is not part of the documented backend API.
print('Keras backend: {}'.format(keras.backend.backend()))
print('Keras image dim ordering: {}'.format(keras.backend.image_dim_ordering()))
print('Kapre version: {}'.format(kapre.__version__))


Keras version: 1.2.1
Keras backend: tensorflow
Keras image dim ordering: th
Kapre version: 0.0.3


# Loading an mp3 file

In [48]:
# Decode the mp3 with sr=None (no resampling requested) and mono=True.
src, sr = librosa.load('bensound-cute.mp3', mono=True, sr=None)
# Show the waveform shape first, then the sampling rate, one per line.
print('{}\n{}'.format(src.shape, sr))

(453888,)
44100


# Trim it and make it 2D

If your file is mono, `librosa.load` returns a 1D array. Kapre always expects a 2D array per clip, so add a leading axis to make it 2D (here the result has shape `(1, 44100)`).

In [23]:
# Keep only the first `len_second` seconds of audio and make sure the clip is
# 2D, then record its shape for use as the model's input shape.
len_second = 1.0  # 1 second
n_samples = int(sr * len_second)
# Slice along the last axis so the trim is applied to the samples whether
# `src` is still 1D or already 2D; this keeps the cell safe to re-run.
src = src[..., :n_samples]
# Add the leading axis only when it is missing, so running the cell twice
# does not produce a 3D array.
if src.ndim == 1:
    src = src[np.newaxis, :]
input_shape = src.shape
print(input_shape)

(1, 44100)


# Let's assume we have 16 of these

to make it more like a proper dataset. In practice you would have many different files.

In [49]:
# Build a toy batch by repeating the prepared clip; the result has shape
# (n_examples,) + src.shape.
n_examples = 16
# Guard against stale kernel state: if the audio was reloaded without
# re-running the trimming cell, `src` no longer matches `input_shape`.
if src.shape != input_shape:
    raise ValueError(
        'src has shape {} but input_shape is {}; re-run the trimming cell '
        'first'.format(src.shape, input_shape))
x = np.array([src] * n_examples)
print(x.shape)

(16, 1, 44100)


# Now get a keras model using kapre

A simple model for 10-class, single-label classification.

In [41]:
# Describe the network as an ordered list of layers, then add them in order.
network_layers = [
    # Kapre front end: the spectrogram is computed inside the model from the
    # raw audio (decibel output requested, kernels not trainable).
    Spectrogram(n_dft=512, n_hop=256, input_shape=input_shape,
                return_decibel_spectrogram=True, power_spectrogram=2.0,
                trainable_kernel=False, name='static_stft'),
    keras.layers.Convolution2D(32, 3, 3, name='conv1', activation='relu'),
    keras.layers.MaxPooling2D((25, 17)),
    keras.layers.Convolution2D(32, 10, 10, name='conv2', activation='relu'),
    keras.layers.Flatten(),
    # 10-way softmax output, one unit per class.
    keras.layers.Dense(10, activation='softmax'),
]

model = Sequential()
for network_layer in network_layers:
    model.add(network_layer)

# Print a compact, 80-column layer table.
model.summary(line_length=80, positions=[.33, .65, .8, 1.])

________________________________________________________________________________
Layer (type)              Output Shape              Param #     Connected to    
static_stft (Spectrogram) (None, 1, 257, 173)       263168      spectrogram_inpu
________________________________________________________________________________
conv1 (Convolution2D)     (None, 32, 255, 171)      320         static_stft[0][0
________________________________________________________________________________
maxpooling2d_8 (MaxPoolin (None, 32, 10, 10)        0           conv1[0][0]     
________________________________________________________________________________
conv2 (Convolution2D)     (None, 32, 1, 1)          102432      maxpooling2d_8[0
________________________________________________________________________________
flatten_5 (Flatten)       (None, 32)                0           conv2[0][0]     
________________________________________________________________________________
dense_2 (Dense)           (None, 10)                330         flatten_5[0][0]

# Training

With real labels you would train the model here; training is skipped in this notebook.

In [50]:
# model.fit()

# Prediction

The model in this notebook has not been trained, so the predictions below are not meaningful (every clip is assigned the same class).

In [46]:
# Run the model (never fitted in this notebook) on the batch and show the
# highest-scoring class index for each example.
y = model.predict(x)
# Use the print() call form: the bare `print expr` statement is a syntax error
# on Python 3, and the other cells already use the call form.
print(np.argmax(y, axis=1))

[7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
