In [5]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.optimizers import RMSprop
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from keras.initializers import glorot_uniform
import keras.backend as K
import numpy as np
import matplotlib.pyplot as plt
import librosa
import os
import json

In [None]:
# --- Notebook configuration -------------------------------------------------
# Sample rate (Hz) every clip is loaded at; 48_000 == 48 kHz.
SAMPLE_RATE = 48_000

# Root folder of the labelled clips, and the JSON file the extracted features
# are exported to.
DATASET_PATH = "../../ABA-Audio-Data/Labelled/"
JSON_PATH = "../../ABA-Audio-Data/data.json"

# Feature type to extract and train on: "melspect" or "mfcc".
MELSPECT_OR_MFCC = "melspect"

# Value passed as n_mfcc to the MFCC extractor (only used in "mfcc" mode).
N_MFCC = 25

In [None]:
# Build the feature/label dataset from the labelled audio folders.
#
# Each sub-directory of DATASET_PATH is treated as one class; its name becomes
# the class label and its position in data["mapping"] becomes the integer
# target. Every clip is loaded at SAMPLE_RATE, forced to exactly SAMPLE_RATE
# samples, and converted to either a mel spectrogram or an MFCC matrix.

# Validate the feature mode once, up front, instead of discovering a typo only
# after the first file has been loaded. Raising gives a normal traceback in
# the notebook rather than trying to terminate the interpreter via exit().
if MELSPECT_OR_MFCC not in ("melspect", "mfcc"):
    raise ValueError('Set MELSPECT_OR_MFCC to either "melspect" or "mfcc".')

# This structure is used to save and export the data.
data = {
    # Class label names; a label's index in this list is its numeric target.
    "mapping": [],

    # Either the MFCC data or the MELSPECT data, depending on mode.
    "all_audio-whine": [],
    "all_audio-none": [],
    "all_audio": [],

    # The targets. Each value is an index into the "mapping" list.
    "labels-whine": [],
    "labels-none": [],
    "labels": []
}

dataset_root = os.path.normpath(DATASET_PATH)

for dirpath, dirnames, filenames in os.walk(DATASET_PATH):
    # os.walk yields directories in arbitrary order; sorting in place makes
    # the traversal (and therefore the label -> index mapping) deterministic.
    dirnames.sort()

    # Skip the dataset root itself; only its sub-directories are classes.
    # Compare by value -- identity ("is not") on strings is not reliable.
    if os.path.normpath(dirpath) == dataset_root:
        continue

    # This gives us dirpath ~/none and ~/whine.
    label = os.path.basename(os.path.normpath(dirpath))
    data["mapping"].append(label)
    label_index = len(data["mapping"]) - 1
    print(f"Currently processing label: {label}")

    for j, file in enumerate(sorted(filenames)):
        # Load audio file.
        file_path = os.path.join(dirpath, file)
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # The last second of the video is usually not a full second, and
        # therefore produces an array that differs in shape from the rest.
        # All arrays need the same shape, so short clips are zero-padded and
        # over-long clips are truncated to exactly SAMPLE_RATE samples.
        if len(signal) < SAMPLE_RATE:
            signal = np.pad(signal, (0, SAMPLE_RATE - len(signal)))
        elif len(signal) > SAMPLE_RATE:
            signal = signal[:SAMPLE_RATE]

        if MELSPECT_OR_MFCC == "melspect":
            # Pass sr explicitly: the signal was loaded at SAMPLE_RATE, and
            # librosa would otherwise assume its own default sample rate when
            # building the mel filter bank.
            processed_audio = librosa.feature.melspectrogram(y=signal, sr=SAMPLE_RATE)
        else:  # "mfcc" (mode was validated above)
            processed_audio = librosa.feature.mfcc(y=signal, sr=SAMPLE_RATE, n_mfcc=N_MFCC)
            processed_audio = processed_audio.T

        # Convert once and reuse the same list for every bucket it belongs to.
        features = processed_audio.tolist()

        # Per-class buckets store the features and the matching class index.
        if label == "none":
            data["all_audio-none"].append(features)
            data["labels-none"].append(label_index)
        elif label == "whine":
            data["all_audio-whine"].append(features)
            data["labels-whine"].append(label_index)

        data["all_audio"].append(features)
        data["labels"].append(label_index)

        if j % 750 == 0:
            print(f"Working file path: {file_path} {j}")

    # Reported from len() so an empty directory cannot hit an undefined or
    # stale loop variable.
    print(f"Processed {len(filenames)} files for label: {label}")

print("DONE")

In [None]:
# Persist the extracted features and labels as JSON so later runs can reuse
# them instead of re-processing every audio file.
with open(JSON_PATH, mode="w") as json_file:
    json.dump(data, json_file, indent=4)

In [None]:
# Shape of a single example. Use the output of this statement as the
# "input_shape" of the first Conv2D layer in the model below, with an added
# 3rd (channel) dimension, like (x, y, 1).
first_example = np.asarray(data["all_audio"][0])
print(first_example.shape)

In [None]:
# X1 = np.array(data["all_audio-none"])
# y1 = np.array(data["labels-none"])

# X2 = np.array(data["all_audio-whine"])
# y2 = np.array(data["labels-whine"])

# print(X1.shape)
# print(y1.shape)

In [None]:
# Stack all examples and their integer targets into arrays, then shuffle both
# with the same permutation so features and labels stay aligned.
X, y = shuffle(
    np.array(data["all_audio"]),
    np.array(data["labels"]),
)

In [None]:
# X1, y1 = shuffle(X1, y1)

# X1n = X1[:-5594, :]
# y1n = y1[:-5594, :]

# print(X1n.shape, y1n.shape)
# print(X2.shape, y2.shape)

# X = np.concatenate((X1n, X2), axis=0)
# y = np.concatenate((y1n, y2), axis=0)

# X, y = shuffle(X, y)
# print(X.shape, y.shape)

In [None]:
# First split: keep 70% for training and hold out the remaining 30%.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.3)

# Second split: divide the held-out 30% into validation (90% of it) and
# test (10% of it).
# NOTE(review): this leaves roughly 27% of the data for validation and only
# about 3% for testing -- confirm that proportion is intended.
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.10)

# Small CNN for binary classification of the audio feature "images".
#
# The input shape is taken from the training data (per-example shape plus a
# single channel axis) rather than hard-coded, so the model matches whichever
# feature type was extracted. The previously hard-coded value was
# (128, 94, 1), which only fits one feature mode.
input_shape = (*X_train.shape[1:], 1)

model = models.Sequential()

# Convolutional feature extractor: three conv layers with 2x2 max-pooling
# between them.
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))

# Classifier head: a single sigmoid unit, matching the binary_crossentropy
# loss the model is compiled with.
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()


In [None]:
# Re-initialise every layer that has both a kernel and a bias initializer, so
# the model can be retrained from scratch without rebuilding it.
for layer in model.layers:
    has_initializers = (hasattr(layer, 'kernel_initializer')
                        and hasattr(layer, 'bias_initializer'))
    if not has_initializers:
        # Layers without these initializers (e.g. pooling, flatten) are skipped.
        continue

    # The current weights are only needed for their shapes.
    current_kernel, current_bias = layer.get_weights()

    layer.set_weights([
        layer.kernel_initializer(shape=current_kernel.shape),
        layer.bias_initializer(shape=current_bias.shape),
    ])

In [None]:
# Compile for binary classification and train.
model.compile(loss='binary_crossentropy',
              optimizer=RMSprop(learning_rate=0.01),
              metrics=['accuracy'])

# Pass the validation split so Keras records 'val_loss' / 'val_accuracy' in
# history.history; without validation data those keys do not exist and any
# later plot of validation accuracy raises a KeyError.
history = model.fit(X_train, y_train,
                    epochs=10,
                    validation_data=(X_val, y_val))

In [None]:
# Save the trained model to disk so it can be reloaded without retraining.
model.save(filepath='whine-cry.model')

In [None]:
# Plot training vs. validation accuracy per epoch. Each curve is labelled with
# its history key. 'val_accuracy' is only recorded when fit() was given
# validation data.
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_name], label=metric_name)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
# Fixed y-range; curves outside 0.80-0.88 will not be visible.
plt.ylim(0.8, .88)
plt.legend(loc='lower right')

# Final check on the held-out test split.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)