In [26]:
import csv
import numpy as np
import librosa
import matplotlib.pyplot as plt
import keras

In [None]:
# Unpack the dataset archive (needed when running from Colab).
# NOTE(review): the original note referred to "Neumonia.zip", but the command
# below extracts "Perros_y_gatos.zip" - confirm which archive is intended.
# Per the original note, extraction creates two folders:
#     "test"
#     "train"
# and two files in the directory where the archive is extracted:
#     "test.csv"
#     "train.csv"

!unzip "Perros_y_gatos.zip" -d "."

In [None]:
# Load a single audio file and plot its raw waveform

sfile = "train/dog/dog_barking_0.wav"
audio_data, sample_rate = librosa.load(sfile)

# Report how many samples the decoded signal contains
print("Longitud del vector de muestras del archivo: ", len(audio_data), "\n\n", sep="")

# Wide, short canvas so the waveform is readable
fig = plt.figure(figsize=(12, 4))
fig.gca().plot(audio_data)
plt.show()

In [4]:
# Read the full training index.
# "train.csv" is tab-separated; column 0 is parsed as an integer label and
# column 1 is later passed to librosa.load as the audio file path.
# The file is opened in a `with` block so the handle is always closed, and
# newline="" is what the csv module expects for files it parses itself.
with open("train.csv", newline="") as f_train:
    trainfile = csv.reader(f_train, delimiter='\t')
    # Skip completely empty lines: they would otherwise break row[0] / row[1].
    trainrows = [list(row) for row in trainfile if row]

row_count_train = len(trainrows)
T_train = [int(row[0]) for row in trainrows]
# Build the path array from column 1 directly, so an empty or ragged table
# does not fail on 2-D slicing.
wavfiles = np.array([row[1] for row in trainrows])

# Prep 1 <BR>
Creación del dataset dividiendo cada archivo de audio en distintas "porciones" de igual longitud.

In [None]:
longitud_porcion = 10000


def _dividir_en_porciones(senal, longitud_porcion):
    """Split a signal into consecutive, non-overlapping chunks of equal length.

    Args:
        senal: sliceable sequence of samples (here, the array returned by
            librosa.load).
        longitud_porcion: number of samples per chunk; must be positive.

    Returns:
        A list with len(senal) // longitud_porcion chunks. A trailing chunk
        that is exactly longitud_porcion samples long is kept; a shorter
        remainder is discarded.

    Raises:
        ValueError: if longitud_porcion is not positive (a step of zero would
            never advance through the signal).
    """
    if longitud_porcion <= 0:
        raise ValueError("longitud_porcion debe ser un entero positivo")
    n_porciones = len(senal) // longitud_porcion
    return [
        senal[k * longitud_porcion:(k + 1) * longitud_porcion]
        for k in range(n_porciones)
    ]


# Build the dataset: every chunk becomes one sample that inherits the label
# of the file it was cut from.
P = []
T = []
for wav_path, etiqueta in zip(wavfiles, T_train):
    audio_data, sample_rate = librosa.load(wav_path)
    porciones = _dividir_en_porciones(audio_data, longitud_porcion)
    P.extend(porciones)
    T.extend([etiqueta] * len(porciones))

P = np.array(P)
T = np.array(T)

# Prep 2<BR>
Creación del dataset utilizando características espectrales de las señales de audio.


In [32]:
# Encodings to use (try a single one, a subset, or all of them at once).
# Uncomment an entry to enable it.
codificacion = [
    "mfcc",      # Cepstral coefficients
    # "stft",    # Short-Time Fourier Transform and Chroma Features
    # "mels",    # Melspectrogram
    # "cont",    # Spectral contrast
    # "tonn",    # Tonnetz
    # "cent",    # Spectral centroids
    # "band",    # Spectral bandwidth
    # "roll",    # Spectral rolloff
    # "zcrt",    # Zero crossing rate
]

def _features_por_frame(audio_data, sample_rate, codificacion):
    """Compute the selected spectral features of one signal, one row per frame.

    Args:
        audio_data: 1-D sample array as returned by librosa.load.
        sample_rate: sampling rate returned by librosa.load.
        codificacion: collection of encoding names; recognised names are
            "mfcc", "stft", "mels", "cont", "tonn", "cent", "band", "roll"
            and "zcrt".

    Returns:
        2-D array whose rows are frames and whose columns are the selected
        features, concatenated in the fixed order listed above (not in the
        order they appear in `codificacion`).

    Raises:
        ValueError: if `codificacion` selects none of the recognised names.
    """
    # The magnitude spectrogram is only consumed by the encodings that take
    # S=...; skip the transform when none of them is selected.
    necesita_stft = "stft" in codificacion or "cont" in codificacion
    stft = np.abs(librosa.stft(audio_data)) if necesita_stft else None

    # Ordered table: name -> callable returning a (features, frames) matrix.
    # Audio is always passed by keyword (y=...), which is required for
    # melspectrogram in recent librosa releases.
    extractores = (
        ("mfcc", lambda: librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)),
        ("stft", lambda: librosa.feature.chroma_stft(S=stft, sr=sample_rate)),
        ("mels", lambda: librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)),
        ("cont", lambda: librosa.feature.spectral_contrast(S=stft, sr=sample_rate)),
        ("tonn", lambda: librosa.feature.tonnetz(y=librosa.effects.harmonic(y=audio_data), sr=sample_rate)),
        ("cent", lambda: librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)),
        ("band", lambda: librosa.feature.spectral_bandwidth(y=audio_data, sr=sample_rate)),
        ("roll", lambda: librosa.feature.spectral_rolloff(y=audio_data, sr=sample_rate)),
        ("zcrt", lambda: librosa.feature.zero_crossing_rate(y=audio_data)),
    )

    # Transpose so that frames are rows; the blocks are then joined column-wise.
    # NOTE(review): this relies on every selected extractor yielding the same
    # number of frames - confirm when mixing encodings.
    bloques = [extraer().T for nombre, extraer in extractores if nombre in codificacion]
    if not bloques:
        raise ValueError("codificacion no contiene ninguna codificación conocida")
    return np.concatenate(bloques, axis=1)


# Build the dataset: every frame of every file is one sample, labelled with
# the label of its file. Per-file matrices are collected and stacked once at
# the end, so P and T always have exactly one entry per frame.
matrices = []
T = []
for wav_path, etiqueta in zip(wavfiles, T_train):
    audio_data, sample_rate = librosa.load(wav_path)
    features = _features_por_frame(audio_data, sample_rate, codificacion)

    matrices.append(features)
    T.extend([etiqueta] * features.shape[0])

# P stays None when there are no audio files to process.
P = np.concatenate(matrices, axis=0) if matrices else None
T = np.array(T)

# Prep 3<BR>
Creación del dataset utilizando el promedio de las características espectrales de las señales de audio.
Simplifica la tarea de testing, ya que se genera un único vector de features por archivo.

In [None]:
def _features_promedio(audio_data, sample_rate, codificacion):
    """Compute one feature vector per signal by averaging each feature over time.

    Args:
        audio_data: 1-D sample array as returned by librosa.load.
        sample_rate: sampling rate returned by librosa.load.
        codificacion: collection of encoding names; recognised names are
            "mfcc", "stft", "mels", "cont", "tonn", "cent", "band", "roll"
            and "zcrt".

    Returns:
        1-D float64 array with the per-feature means over all frames,
        concatenated in the fixed order listed above (not in the order they
        appear in `codificacion`).

    Raises:
        ValueError: if `codificacion` selects none of the recognised names.
    """
    # The magnitude spectrogram is only consumed by the encodings that take
    # S=...; skip the transform when none of them is selected.
    necesita_stft = "stft" in codificacion or "cont" in codificacion
    stft = np.abs(librosa.stft(audio_data)) if necesita_stft else None

    # Ordered table: name -> callable returning a (features, frames) matrix.
    # Audio is always passed by keyword (y=...), which is required for
    # melspectrogram in recent librosa releases.
    extractores = (
        ("mfcc", lambda: librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)),
        ("stft", lambda: librosa.feature.chroma_stft(S=stft, sr=sample_rate)),
        ("mels", lambda: librosa.feature.melspectrogram(y=audio_data, sr=sample_rate)),
        ("cont", lambda: librosa.feature.spectral_contrast(S=stft, sr=sample_rate)),
        ("tonn", lambda: librosa.feature.tonnetz(y=librosa.effects.harmonic(y=audio_data), sr=sample_rate)),
        ("cent", lambda: librosa.feature.spectral_centroid(y=audio_data, sr=sample_rate)),
        ("band", lambda: librosa.feature.spectral_bandwidth(y=audio_data, sr=sample_rate)),
        ("roll", lambda: librosa.feature.spectral_rolloff(y=audio_data, sr=sample_rate)),
        ("zcrt", lambda: librosa.feature.zero_crossing_rate(y=audio_data)),
    )

    # Transpose to (frames, features) and average over the frame axis.
    medias = [
        np.mean(extraer().T, axis=0)
        for nombre, extraer in extractores
        if nombre in codificacion
    ]
    if not medias:
        raise ValueError("codificacion no contiene ninguna codificación conocida")
    # Promote to float64 so P has one predictable dtype whatever mix of
    # encodings is selected.
    return np.concatenate(medias).astype(np.float64)


# Build the dataset: exactly one averaged feature vector (and one label) per
# audio file. `codificacion` must already be defined by the encoding
# selection cell.
P = []
T = []
for wav_path, etiqueta in zip(wavfiles, T_train):
    audio_data, sample_rate = librosa.load(wav_path)
    features = _features_promedio(audio_data, sample_rate, codificacion)

    P.append(features)
    T.append(etiqueta)

P = np.array(P)
T = np.array(T)
print(P.shape)
print(P.shape)

# Entrenamiento del modelo
Solo usar una preparación de los datos (Prep1 o Prep2 o Prep3)

In [None]:
# Shuffle the samples.
# The training call uses validation_split, so the sample order decides which
# samples end up in the validation set; a fixed seed makes that split
# reproducible across kernel restarts.
SEED = 42

# Fancy indexing with `inds` would silently hide a length mismatch between
# features and labels, so fail loudly instead.
if len(P) != len(T):
    raise ValueError(
        "P y T tienen distinta cantidad de muestras: "
        + str(len(P)) + " vs " + str(len(T))
    )

rng = np.random.default_rng(SEED)
inds = rng.permutation(len(T))
P = P[inds]
T = T[inds]
print(P.shape)
print(T.shape)

In [None]:
# Network dimensions: one input per feature column, one output unit
d_in = P.shape[1]
d_out = 1  # binary classification

# Small fully connected network: a 5-unit tanh hidden layer followed by a
# sigmoid output unit
capa_oculta = keras.layers.Dense(5, input_shape=(d_in,), activation= 'tanh')
capa_salida = keras.layers.Dense(d_out, activation="sigmoid")
modelo = keras.Sequential([capa_oculta, capa_salida])

# Plain SGD with binary cross-entropy; accuracy is reported during training
optimizador = keras.optimizers.SGD(learning_rate=0.01)
modelo.compile(
  loss = 'binary_crossentropy',
  optimizer = optimizador,
  metrics = ['accuracy'],
)

modelo.summary()

In [None]:
# Train for 3 epochs in mini-batches of 16, holding out 20% for validation
history = modelo.fit(
    P,
    T,
    batch_size=16,
    epochs=3,
    validation_split=0.2,
    verbose=True,
)