In [0]:
# imports 
# versions used: Keras 2.3.0, Tensorflow-gpu 2.1.0, Librosa 0.7.1, Pysoundfile 0.10.3

%matplotlib inline
from memory_profiler import memory_usage
import gc
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import librosa
import librosa.display
import pylab
import soundfile as sf

import keras
import tensorflow 

from IPython.display import clear_output
import timeit

# Report the versions of the main libraries in use, so a run can be compared
# against the versions listed at the top of this cell.
for _label, _module in (
    ("Librosa version = ", librosa),
    ("Pysoundfile version = ", sf),
    ("Keras version = ", keras),
    ("Tensorflow version = ", tensorflow),
):
    print(_label, _module.__version__)

In [0]:
# --- Dataset description -----------------------------------------------------
# Placeholder paths: edit them to point at the fold definition files.
TrainFile = '/...your directory.../fold1_train.csv'
ValFile = '/...your directory/fold1_evaluate.csv'
sr = 44100 # sampling rate
audio_channels = 1
SampleDuration = 10 # sec

# pre-processing parameters
FreqBins = 128
NumFFTPoints = 2048
HopLength = NumFFTPoints // 2 # 50 % overlap between successive FFT frames
# Number of spectrogram frames per clip. librosa centres its frames by default,
# which yields 1 + floor(n_samples / hop) frames. The previous ceil(n_samples / hop)
# only matched that when n_samples was not a multiple of the hop, and left the
# preallocated feature arrays one frame short otherwise.
TimeBins = 1 + int(SampleDuration * sr) // HopLength

# training parameters
batch_size = 16
epochs = 150
mixup_alpha = 0.2 # for mixup data augmentation

In [0]:
# load filenames and labels

# The fold definition files are tab-separated and provide (at least) the
# 'filename' and 'scene_label' columns read below.
dev_train = pd.read_csv(TrainFile, sep='\t', encoding='ASCII')
dev_val = pd.read_csv(ValFile, sep='\t', encoding='ASCII')
wav_train = dev_train['filename'].tolist()
wav_val = dev_val['filename'].tolist()

ClassNames = np.unique(dev_train['scene_label']) # returns the sorted unique elements of an array
NumClasses = len(ClassNames)

# Encode BOTH splits against the same class list. Encoding each split on its
# own (astype('category') per dataframe) gives inconsistent integer codes as
# soon as the validation split lacks a class or contains an extra one.
y_train_labels = pd.Categorical(dev_train['scene_label'], categories=ClassNames).codes
y_val_labels = pd.Categorical(dev_val['scene_label'], categories=ClassNames).codes

# Labels outside ClassNames are coded as -1; to_categorical would silently map
# them onto the last class, so reject them explicitly.
_unknown_val = y_val_labels < 0
if _unknown_val.any():
    raise ValueError("Validation labels not present in the training set: %s"
                     % dev_val.loc[_unknown_val, 'scene_label'].unique().tolist())

y_train = keras.utils.to_categorical(y_train_labels, NumClasses, dtype='float32') # Converts a class vector (integers) to binary class matrix,
y_val = keras.utils.to_categorical(y_val_labels, NumClasses, dtype='float32') # e.g. for use with categorical_crossentropy.

In [0]:
# PCEN, training
# One (FreqBins x TimeBins) PCEN-normalised mel spectrogram per training file.
PCEN_train = np.zeros((len(wav_train),FreqBins,TimeBins,audio_channels),'float32')

start = timeit.default_timer()

for i in range(len(wav_train)):

    clear_output(wait=True)

    # Read at most SampleDuration seconds, counted in frames at the expected rate.
    y, fs = sf.read('...your directory...' + wav_train[i], stop=SampleDuration*sr)
    if fs != sr:
        # Every call below is told the rate is `sr`; a file recorded at another
        # rate would silently produce mis-scaled features.
        raise ValueError("Unexpected sampling rate %d Hz (expected %d Hz) in %s" % (fs, sr, wav_train[i]))

    # Magnitude (power=1.0) mel spectrogram. The signal is passed by keyword,
    # which is accepted by every librosa release (newer ones reject it positionally).
    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, power=1.0, n_mels=FreqBins, fmin=0.0, fmax=sr/2, htk=True, norm=None)
    # NOTE(review): the 2**31 factor presumably rescales floating-point audio to
    # the 32-bit integer range the PCEN parameters expect - confirm against the librosa docs.
    PCEN_train[i,:,:,0] = librosa.core.pcen(spec * (2**31), sr=sr, hop_length=HopLength, gain=0.8, bias=1000, power=0.25, time_constant=0.06)

    stop = timeit.default_timer()

    # Fraction of the files completed so far; i is zero-based, hence the +1
    # (otherwise progress lags by one file and never reaches 100 %).
    done = (i + 1) / len(wav_train)

    if done*100 < 5:
        # Too early to extrapolate a meaningful total runtime.
        expected_time = "Calculating..."
    else:
        # Linear extrapolation of the elapsed time to the whole set, in minutes.
        expected_time = np.round((stop - start) / done / 60, 2)

    print("Current Progress:", np.round(done*100, 2), "%")
    print("Current runtime:", np.round((stop-start)/60, 2), "mins")
    print("Expected runtime:", expected_time, "mins")

In [0]:
# PCEN, validation
# One (FreqBins x TimeBins) PCEN-normalised mel spectrogram per validation file.
PCEN_val = np.zeros((len(wav_val),FreqBins,TimeBins,audio_channels),'float32')

start = timeit.default_timer()

for i in range(len(wav_val)):

    clear_output(wait=True)

    # Read at most SampleDuration seconds, counted in frames at the expected rate.
    y, fs = sf.read('...your directory...' + wav_val[i], stop=SampleDuration*sr)
    if fs != sr:
        # Every call below is told the rate is `sr`; a file recorded at another
        # rate would silently produce mis-scaled features.
        raise ValueError("Unexpected sampling rate %d Hz (expected %d Hz) in %s" % (fs, sr, wav_val[i]))

    # Magnitude (power=1.0) mel spectrogram. The signal is passed by keyword,
    # which is accepted by every librosa release (newer ones reject it positionally).
    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, power=1.0, n_mels=FreqBins, fmin=0.0, fmax=sr/2, htk=True, norm=None)
    # NOTE(review): the 2**31 factor presumably rescales floating-point audio to
    # the 32-bit integer range the PCEN parameters expect - confirm against the librosa docs.
    PCEN_val[i,:,:,0] = librosa.core.pcen(spec * (2**31), sr=sr, hop_length=HopLength, gain=0.8, bias=1000, power=0.25, time_constant=0.06)

    stop = timeit.default_timer()

    # Fraction of the files completed so far; i is zero-based, hence the +1
    # (otherwise progress lags by one file and never reaches 100 %).
    done = (i + 1) / len(wav_val)

    if done*100 < 5:
        # Too early to extrapolate a meaningful total runtime.
        expected_time = "Calculating..."
    else:
        # Linear extrapolation of the elapsed time to the whole set, in minutes.
        expected_time = np.round((stop - start) / done / 60, 2)

    print("Current Progress:", np.round(done*100, 2), "%")
    print("Current runtime:", np.round((stop-start)/60, 2), "mins")
    print("Expected runtime:", expected_time, "mins")

In [0]:
# log-mel spectrograms, training
# One (FreqBins x TimeBins) log10 mel spectrogram per training file.
Spec_train = np.zeros((len(wav_train),FreqBins,TimeBins,audio_channels),'float32')

start = timeit.default_timer()

for i in range(len(wav_train)):

    clear_output(wait=True)

    # Read at most SampleDuration seconds, counted in frames at the expected rate.
    y, fs = sf.read('...your directory...' + wav_train[i], stop=SampleDuration*sr)
    if fs != sr:
        # Every call below is told the rate is `sr`; a file recorded at another
        # rate would silently produce mis-scaled features.
        raise ValueError("Unexpected sampling rate %d Hz (expected %d Hz) in %s" % (fs, sr, wav_train[i]))

    # Mel spectrogram with librosa's default `power` setting. The signal is passed
    # by keyword, which is accepted by every librosa release (newer ones reject it positionally).
    Spec_train[i,:,:,0] = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, n_mels=FreqBins, fmin=0.0, fmax=sr/2, htk=True, norm=None)
    Spec_train[i,:,:,0] = np.log10(Spec_train[i,:,:,0]+1e-8) # no need for 10*np.log10(...), + eps=1e-8 to avoid zeros

    stop = timeit.default_timer()

    # Fraction of the files completed so far; i is zero-based, hence the +1
    # (otherwise progress lags by one file and never reaches 100 %).
    done = (i + 1) / len(wav_train)

    if done*100 < 5:
        # Too early to extrapolate a meaningful total runtime.
        expected_time = "Calculating..."
    else:
        # Linear extrapolation of the elapsed time to the whole set, in minutes.
        expected_time = np.round((stop - start) / done / 60, 2)

    print("Current Progress:", np.round(done*100, 2), "%")
    print("Current runtime:", np.round((stop-start)/60, 2), "mins")
    print("Expected runtime:", expected_time, "mins")

In [0]:
# log-mel spectrograms, validation
# One (FreqBins x TimeBins) log10 mel spectrogram per validation file.
Spec_val = np.zeros((len(wav_val),FreqBins,TimeBins,audio_channels),'float32')

start = timeit.default_timer()

for i in range(len(wav_val)):

    clear_output(wait=True)

    # Read at most SampleDuration seconds, counted in frames at the expected rate.
    y, fs = sf.read('...your directory...' + wav_val[i], stop=SampleDuration*sr)
    if fs != sr:
        # Every call below is told the rate is `sr`; a file recorded at another
        # rate would silently produce mis-scaled features.
        raise ValueError("Unexpected sampling rate %d Hz (expected %d Hz) in %s" % (fs, sr, wav_val[i]))

    # Mel spectrogram with librosa's default `power` setting. The signal is passed
    # by keyword, which is accepted by every librosa release (newer ones reject it positionally).
    Spec_val[i,:,:,0] = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, n_mels=FreqBins, fmin=0.0, fmax=sr/2, htk=True, norm=None)
    Spec_val[i,:,:,0] = np.log10(Spec_val[i,:,:,0]+1e-8) # + eps=1e-8 to avoid log10(0)

    stop = timeit.default_timer()

    # Fraction of the files completed so far; i is zero-based, hence the +1
    # (otherwise progress lags by one file and never reaches 100 %).
    done = (i + 1) / len(wav_val)

    if done*100 < 5:
        # Too early to extrapolate a meaningful total runtime.
        expected_time = "Calculating..."
    else:
        # Linear extrapolation of the elapsed time to the whole set, in minutes.
        expected_time = np.round((stop - start) / done / 60, 2)

    print("Current Progress:", np.round(done*100, 2), "%")
    print("Current runtime:", np.round((stop-start)/60, 2), "mins")
    print("Expected runtime:", expected_time, "mins")

In [0]:
# check plots
# Visual sanity check: display the PCEN features of one training example.
plt.figure()
librosa.display.specshow(PCEN_train[5,:,:,0], x_axis='time',
                         y_axis='mel', sr=sr,
                         # The features were computed with hop_length=HopLength; without
                         # this, specshow falls back to its own default hop and the
                         # time axis is mis-scaled.
                         hop_length=HopLength,
                         fmax=sr/2)
plt.title('PCEN, training example 5')
plt.show()

In [0]:
# create and compile the model. Different CNN archtectures available in the directory. For example run the following script:
!python FCNN_1.py

In [0]:
# fit model
# Trains `model` on the precomputed features and keeps on disk the model from
# the epoch with the highest validation accuracy.
from keras.callbacks import ModelCheckpoint

# filepath="weights.bestacc.hdf5" # if you want to save weights
checkpoint = ModelCheckpoint(
    filepath="DCASE1bmodel.h5",  # saves model
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

history = model.fit(
    Spec_train, y_train,                    # Spec_train or PCEN_train
    validation_data=(Spec_val, y_val),      # Spec_val or PCEN_val
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpoint],
    verbose=2,
    workers=1,
    max_queue_size=100,
)

In [0]:
# load weights and save the model if necessary
# Restore the best-epoch weights from the file the ModelCheckpoint callback of
# the training cell actually writes ("DCASE1bmodel.h5"). The previously
# referenced 'weights.bestacc.hdf5' is only produced if the commented-out
# filepath in that cell is enabled, so loading it failed on a fresh run.
model.load_weights('DCASE1bmodel.h5')
model.save('DCASE2019_1b.h5')

In [0]:
# fit_generator: Trains the model on data generated batch-by-batch by a Python generator (for data augmentation)
# Note that fit_generator is deprecated starting from tensorflow v2.1.0
# NOTE(review): `MixupGenerator` is neither defined nor imported in the cells
# shown here - it has to be made available before this cell runs (TODO confirm its source).
# NOTE(review): `model` is reused as-is; if the plain `fit` cell above was run
# first, training continues from those weights - re-create the model for a
# training run from scratch.
from keras.callbacks import ModelCheckpoint

TrainDataGen = MixupGenerator(X_train = Spec_train, # Spec_train or PCEN_train
                              y_train = y_train,
                              batch_size=batch_size,
                              alpha=mixup_alpha)()

checkpoint = ModelCheckpoint(filepath="weights.mixupbestacc.hdf5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='max') # or save model immediately

history = model.fit_generator(TrainDataGen,
                              validation_data=(Spec_val, y_val), # Spec_val or PCEN_val
                              epochs=epochs,
                              verbose=2,
                              workers=1,
                              max_queue_size = 100,
                              callbacks=[checkpoint],
                              # Number of batches per epoch: must be a plain int, np.ceil returns a float.
                              steps_per_epoch=int(np.ceil(Spec_train.shape[0] / batch_size)) # or PCEN_train.shape[0]
                              )

In [0]:
# load weights and save the model if necessary
# Restores the weights stored in 'weights.mixupbestacc.hdf5' (the file name given
# to the ModelCheckpoint of the mixup training cell) into `model`, then writes
# the complete model to 'DCASE2019_1b_mixup.h5'.
model.load_weights('weights.mixupbestacc.hdf5')
model.save('DCASE2019_1b_mixup.h5')