In [30]:
import librosa as lr
import numpy as np
import os
from keras.layers import Dense, LSTM, Activation
from keras.models import Sequential
from keras.optimizers import Adam

In [43]:
SR = 16000  # Sampling rate in Hz that every recording is resampled to on load

def process_audio(aname):
  """Reduce an audio file to a normalised vector of summed MFCCs.

  The file is loaded at ``SR`` Hz and 34 MFCCs are computed with a
  2048-sample FFT. The first two coefficients are dropped and the remaining
  32 are summed along the last axis (the frame axis, per the librosa docs).

  Args:
    aname: Path of the audio file, passed straight to ``librosa.load``.

  Returns:
    1-D array of 32 values scaled so that the largest magnitude is 1.
    If every summed coefficient is zero, the all-zero vector is returned
    unscaled instead of dividing by zero.
  """
  audio, _ = lr.load(aname, sr=SR)

  # The signal is passed as ``y=`` because librosa's feature functions take
  # it as a keyword-only argument in current releases; the keyword form is
  # accepted by older releases as well.
  afs = lr.feature.mfcc(y=audio,
                        sr=SR,
                        n_mfcc=34,
                        n_fft=2048)
  afss = np.sum(afs[2:], axis=-1)

  # Normalise by the largest magnitude, skipping the division when it is 0.
  peak = np.max(np.abs(afss))
  if peak > 0:
    afss = afss / peak

  return afss

In [44]:
def confidence(x, y):
  """Return the sum of squared element-wise differences between x and y.

  This is the squared Euclidean distance (no square root is taken), so a
  smaller value means the two inputs are closer.
  """
  squared_gaps = np.square(x - y)
  return np.sum(squared_gaps)


In [45]:
# Load two recordings of each speaker and reduce each one to its MFCC vector.
# The .wav files are looked up relative to the current working directory.
woman11, woman12, woman21, woman22 = (
    process_audio(wav_name)
    for wav_name in ("Mogilko_1.wav", "Mogilko_2.wav",
                     "Maria_1.wav", "Maria_2.wav")
)

C:\Users\Igor\Desktop\phyton\SOUND


In [46]:
# Print the distance for every pair of recordings: 'same' marks two takes of
# one speaker, 'diff' marks takes of two different speakers.
for label, first, second in (('same', woman11, woman12),
                             ('same', woman21, woman22),
                             ('diff', woman11, woman21),
                             ('diff', woman11, woman22),
                             ('diff', woman12, woman21),
                             ('diff', woman12, woman22)):
  print(label, confidence(first, second))

same 2.8557765
same 0.178615
diff 1.9570591
diff 1.6290247
diff 2.5593805
diff 2.1673884


In [48]:
def filter_audio(audio):
  """Return only the samples of ``audio`` whose STFT frame passes an energy mask.

  A per-frame score is derived from the dB spectrogram, smoothed over nine
  frames, rescaled to [0, 1] and thresholded at 0.35. The frame mask is then
  stretched to sample resolution and used to index ``audio``.

  Args:
    audio: 1-D array of audio samples.

  Returns:
    1-D array with the samples selected by the mask, in original order.
  """
  # dB magnitude spectrogram (2048-sample FFT window, librosa's default hop).
  # With ref=np.max the values are relative to the peak, i.e. all <= 0.
  apower = lr.amplitude_to_db(np.abs(lr.stft(audio, n_fft=2048)), ref=np.max)

  # Sum over frequency for each frame, square, and rescale to [0, 1]
  apsums = np.sum(apower, axis=0)**2
  apsums -= np.min(apsums)
  apsums /= np.max(apsums)

  # Smooth over 9 frames so short gaps and spikes do not flip the mask
  apsums = np.convolve(apsums, np.ones((9,)), 'same')
  # Normalize again
  apsums -= np.min(apsums)
  apsums /= np.max(apsums)

  # Keep frames whose normalised score exceeds 35%.
  # NOTE(review): the summed dB values are <= 0, so squaring them gives
  # quieter frames the HIGHER score; this mask therefore appears to keep the
  # quieter frames rather than the voiced ones - confirm the intended
  # direction before relying on it for silence removal.
  apsums = np.array(apsums > 0.35, dtype=bool)

  # Stretch the per-frame mask to one flag per sample. np.repeat documents an
  # integer repeat count, while np.ceil returns a float, so convert explicitly
  # instead of relying on an implicit float-to-int cast.
  samples_per_frame = int(np.ceil(len(audio) / len(apsums)))
  apsums = np.repeat(apsums, samples_per_frame)[:len(audio)]

  return audio[apsums] # We filter

In [49]:
# Sampling rate, in Hz
SR = 16000
# Number of spectrogram frames fed to the network in one sample
LENGTH = 16
# Step, in frames, between the starts of consecutive training samples
OVERLAP = 8
# FFT window length in samples (64 ms at 16 kHz)
FFT = 1024

In [52]:
def prepare_audio(aname, target=False):
  """Turn one recording into fixed-length spectrogram windows with labels.

  The file is loaded at ``SR`` Hz, passed through ``filter_audio``, converted
  to a magnitude spectrogram with an ``FFT``-sample window, and cut into
  windows of ``LENGTH`` frames whose starts are ``OVERLAP`` frames apart.

  Args:
    aname: Path of the audio file, passed to ``librosa.load``.
    target: True if the recording belongs to the speaker to recognise;
      selects label 1 instead of 0 for every window.

  Returns:
    Tuple ``(samples, results)``: ``samples`` has shape
    ``(n_windows, LENGTH, n_bins)`` and ``results`` has shape
    ``(n_windows, 1)``, filled with ones or zeros. A recording with fewer
    than ``LENGTH`` frames yields ``n_windows == 0`` with the same trailing
    dimensions, so the result can still be concatenated with other files.
  """
  print('loading %s' % aname)
  audio, _ = lr.load(aname, sr=SR)
  audio = filter_audio(audio) #Remove silence and spaces between words

  # Magnitude spectrogram with frames on the first axis. The magnitude is
  # taken once here instead of once per (overlapping) window.
  data = np.abs(lr.stft(audio, n_fft=FFT)).swapaxes(0, 1)

  # The "+ 1" keeps the last full window: a window starting at
  # len(data) - LENGTH still fits entirely inside the data.
  starts = range(0, len(data) - LENGTH + 1, OVERLAP)
  if len(starts) > 0:
    samples = np.array([data[i:i + LENGTH] for i in starts])
  else:
    # No full window fits; keep the sample dimensions so callers can concatenate.
    samples = np.zeros((0, LENGTH) + data.shape[1:], dtype=data.dtype)

  results_shape = (len(samples), 1)
  results = np.ones(results_shape) if target else np.zeros(results_shape)
  return samples, results

In [53]:
# Every recording paired with a flag: True for the target speaker's files,
# False for the other speaker's files.
voices = (
    [(wav_name, True)
     for wav_name in ("Mogilko_1.wav", "Mogilko_2.wav", "Mogilko_3.wav")]
    + [(wav_name, False)
       for wav_name in ("Maria_1.wav", "Maria_2.wav", "Maria_3.wav")]
)

In [54]:
# Build the full data set: prepare every recording in order, then join the
# per-file sample and label arrays along the first axis.
features, labels = [], []
for voice in voices:
  file_x, file_y = prepare_audio(*voice)
  features.append(file_x)
  labels.append(file_y)

X = np.concatenate(features, axis=0)
Y = np.concatenate(labels, axis=0)
del features, labels, file_x, file_y

loading Mogilko_1.wav
loading Mogilko_2.wav
loading Mogilko_3.wav
loading Maria_1.wav
loading Maria_2.wav
loading Maria_3.wav


In [55]:
# Shuffle samples and labels with one shared permutation so every row of X
# stays paired with its label in Y. The permutation comes from a privately
# seeded generator, so the order is the same on every run and NumPy's global
# random state is left untouched.
SHUFFLE_SEED = 42
perm = np.random.RandomState(SHUFFLE_SEED).permutation(len(X))
X = X[perm]
Y = Y[perm]

In [56]:
## Build the classifier: two stacked LSTMs followed by three dense layers,
## ending in a single output unit. Each sample has the shape X.shape[1:].
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=X.shape[1:]),
    LSTM(64),
    Dense(64),
    Activation('tanh'),
    Dense(16),
    Activation('sigmoid'),
    Dense(1),
    Activation('hard_sigmoid'),
])





In [57]:
## Compile and train the model
# The learning rate is passed positionally: the keyword was renamed from
# `lr` to `learning_rate` in newer Keras releases, while the first positional
# argument of Adam is the learning rate in both old and new releases.
model.compile(optimizer=Adam(0.005), loss='binary_crossentropy', metrics=['accuracy'])
# NOTE(review): training windows start OVERLAP frames apart but span LENGTH
# frames, and X is shuffled before validation_split carves off 20%, so
# validation windows can share frames with training windows - the validation
# accuracy is likely optimistic; consider splitting by recording instead.
model.fit(X, Y, epochs=15, batch_size=32, validation_split=0.2)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 2728 samples, validate on 683 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x28de84fb6d8>

In [58]:
## Score the trained model on X and Y and print [loss, accuracy].
## These are the same arrays that were passed to model.fit, so this is not
## a measurement on held-out data.
print(model.evaluate(x=X, y=Y))

[0.1714822928364135, 0.9821166814474444]


In [59]:
## Write the trained model to an HDF5 file in the working directory
model.save(filepath='Mogilko.hdf5')