In [14]:
import librosa as lr
import numpy as np
from keras.layers import Dense, LSTM, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras.models import load_model

In [15]:
# Audio / windowing configuration shared by the cells below.
SR = 16_000   # sample rate (Hz) that recordings are loaded at
FFT = 1_024   # STFT window size in samples (64 ms at 16 kHz)
LENGTH = 16   # number of STFT frames in one network input window
OVERLAP = 8   # stride, in frames, between consecutive windows

In [16]:
def filter_audio(audio, n_fft=2048, smooth_frames=9, threshold=0.35):
  """Return only the samples of `audio` whose STFT frame passes a score threshold.

  A per-frame score is built from the dB spectrogram, min-max scaled, smoothed
  with a box filter, scaled again and thresholded. The resulting per-frame
  boolean mask is stretched to per-sample resolution and used to index `audio`.

  Args:
    audio: 1-D numpy array of samples.
    n_fft: STFT window size in samples (2048 is 128 ms at 16 kHz). The hop
      between frames is librosa's default for `stft`.
    smooth_frames: width, in frames, of the box filter that bridges short gaps.
    threshold: frames whose scaled score is strictly above this value are kept.

  Returns:
    1-D numpy array containing the kept samples, in their original order.
  """
  def _rescale(values):
    # Min-max scale to [0, 1]. A flat (or NaN) input yields all zeros instead
    # of dividing by zero; the thresholded mask is all False either way.
    shifted = values - np.min(values)
    peak = np.max(shifted)
    return shifted / peak if peak > 0 else np.zeros_like(shifted)

  # dB spectrogram relative to the loudest bin, shape (frequency bins, frames).
  apower = lr.amplitude_to_db(np.abs(lr.stft(audio, n_fft=n_fft)), ref=np.max)

  # One score per frame: squared sum over all frequency bins, scaled to [0, 1].
  # NOTE(review): with ref=np.max the dB values are <= 0, so squaring the sum
  # gives the *quietest* frames the highest score. Confirm that the mask keeps
  # the frames that are actually intended before changing or relying on this.
  score = _rescale(np.sum(apower, axis=0) ** 2)

  # Box-filter the score so short dips do not fragment the mask, then rescale.
  score = _rescale(np.convolve(score, np.ones((smooth_frames,)), 'same'))

  keep = score > threshold

  # Stretch the per-frame mask to one flag per sample. np.repeat needs an
  # integer count: the float returned by np.ceil raises TypeError on current
  # NumPy. ceil() guarantees the stretched mask is at least len(audio) long.
  per_frame = int(np.ceil(len(audio) / len(keep)))
  mask = np.repeat(keep, per_frame)[:len(audio)]

  return audio[mask]

In [17]:
def prepare_audio(aname, target=False):
  """Load one recording and cut its magnitude spectrogram into fixed-size windows.

  The file is loaded at SR, passed through filter_audio, converted to an STFT
  magnitude spectrogram with window size FFT, and sliced into windows of
  LENGTH frames taken every OVERLAP frames.

  Args:
    aname: path of the audio file to load.
    target: label for every window of this file; True -> 1.0, False -> 0.0.

  Returns:
    (samples, results): `samples` has shape (n_windows, LENGTH, bins) and
    `results` has shape (n_windows, 1). When the filtered audio is shorter
    than one window, n_windows is 0 but both arrays keep those shapes so they
    can still be concatenated with other recordings.
  """
  print('loading %s' % aname)
  audio, _ = lr.load(aname, sr=SR)
  audio = filter_audio(audio)

  # Magnitude spectrogram as (frames, bins). Taking abs once here avoids
  # recomputing it for every overlapping window.
  spectrum = np.abs(lr.stft(audio, n_fft=FFT)).swapaxes(0, 1)

  # "+ 1" so a window ending exactly on the last frame is included.
  starts = range(0, len(spectrum) - LENGTH + 1, OVERLAP)
  if len(starts) == 0:
    samples = np.empty((0, LENGTH) + spectrum.shape[1:], dtype=spectrum.dtype)
  else:
    samples = np.stack([spectrum[start:start + LENGTH] for start in starts])

  make_labels = np.ones if target else np.zeros
  results = make_labels((len(samples), 1))
  return samples, results


In [18]:
#%cd C:\Users\Igor\Desktop\phyton\SOUND

C:\Users\Igor\Desktop\phyton\SOUND


In [19]:
# Recordings to process: each entry is (audio file path, target flag) and is
# unpacked straight into prepare_audio(aname, target).
voices = [('Mogilko_5.wav', False)]

In [20]:
# Prepare every listed recording, then join the pieces with one concatenate
# per array; re-concatenating inside a loop copies the growing arrays each time.
prepared = [prepare_audio(*voice) for voice in voices]
X = np.concatenate([features for features, _ in prepared], axis=0)
Y = np.concatenate([labels for _, labels in prepared], axis=0)
del prepared

loading Mogilko_5.wav


In [21]:
# Shuffle windows and labels with one shared random ordering so pairs stay aligned
perm = np.random.permutation(len(X))
X, Y = X[perm], Y[perm]

In [22]:
# Sanity check: show the container type of the prepared feature array.
print(type(X))


<class 'numpy.ndarray'>


In [23]:

# Load a previously saved Keras model; the path is relative to the current
# working directory.
model = load_model('Mogilko.hdf5')

In [13]:
# Run the loaded model over every prepared window and report the mean output.
prediction = model.predict(X)
print(prediction.mean())
print(type(prediction))

0.7830922
<class 'numpy.ndarray'>
