In [0]:
#imports 
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import h5py
import pandas as pd
import librosa
import librosa.display
import soundfile as sf

from IPython.display import clear_output
import timeit
import gc

import keras
import tensorflow 

# Report the versions of the main libraries so a run can be reproduced later.
for _label, _module in (("Librosa", librosa),
                        ("Pysoundfile", sf),
                        ("Keras", keras),
                        ("Tensorflow", tensorflow)):
    print(_label + " version = ", _module.__version__)

In [0]:
# --- dataset metadata (read as tab-separated files further down) ---
TrainFile = '/...your directory.../fold1_train.csv'
ValFile = '/...your directory/fold1_evaluate.csv'

# --- audio ---
sr = 44100            # sampling rate used for every feature computation
SampleDuration = 10   # multiplied by sr to get the number of samples read per clip

# --- log-mel spectrogram parameters ---
FreqBins = 128
NumFFTPoints = 2048
HopLength = NumFFTPoints // 2                                  # half-window hop
TimeBins = int(np.ceil((SampleDuration * sr) / HopLength))     # frames per clip

# --- training ---
batch_size = 16
epochs = 100

In [0]:
# load filenames and labels
#
# Both metadata files are read as tab-separated tables; the 'filename' column
# gives the audio path suffix and 'scene_label' the class name of each clip.

dev_train = pd.read_csv(TrainFile, sep='\t', encoding='ASCII')
dev_val = pd.read_csv(ValFile, sep='\t', encoding='ASCII')
wav_train = dev_train['filename'].tolist()
wav_val = dev_val['filename'].tolist()

# The training split defines the label vocabulary.
ClassNames = np.unique(dev_train['scene_label']) # returns the sorted unique elements of an array
NumClasses = len(ClassNames)

# Encode BOTH splits against the same ordered vocabulary. Deriving the categories
# separately per split would give the validation labels different integer codes
# whenever its set of classes differs from the training set.
y_train_labels = pd.Categorical(dev_train['scene_label'], categories=ClassNames).codes
y_val_labels = pd.Categorical(dev_val['scene_label'], categories=ClassNames).codes

# pd.Categorical marks values outside `categories` with code -1; such a code does
# not correspond to any class, so stop here instead of training on bad targets.
if (y_val_labels < 0).any():
    unseen = dev_val.loc[y_val_labels < 0, 'scene_label'].unique().tolist()
    raise ValueError("Validation labels not present in the training split: %s" % unseen)

y_train = keras.utils.to_categorical(y_train_labels, NumClasses, dtype='float32') # Converts a class vector (integers) to binary class matrix,
y_val = keras.utils.to_categorical(y_val_labels, NumClasses, dtype='float32') # e.g. for use with categorical_crossentropy.

In [0]:
# feature extraction stage + early temporal integration, training
#
# For every training clip this cell
#   1. fills X_train_short[i] with 40 short-frame descriptors per analysis frame:
#        row 0      zero-crossing rate
#        row 1      log10 spectral centroid
#        row 2      log10 spectral roll-off (85 %)
#        row 3      spectral flatness
#        row 4      log10 spectral bandwidth
#        rows 5-11  spectral contrast (6 bands + 1)
#        rows 12-25 14 MFCCs computed from the log10 mel spectrogram
#        rows 26-39 first-order deltas of those MFCCs
#   2. summarises each descriptor over time into X_train[i] with 5 statistics:
#        mean, standard deviation, mean sequential difference,
#        mean crossing rate and crest factor (max / mean).

X_train_short = np.zeros((len(wav_train),40,TimeBins),'float32')
X_train = np.zeros((len(wav_train),40,5),'float32')

start = timeit.default_timer()

for i in range(len(wav_train)):

    clear_output(wait=True)

    # Only the first SampleDuration*sr samples are read. The rate stored in the file
    # (fs) is not compared with sr, and the assignments below need exactly TimeBins
    # frames per clip -- assumes mono clips at `sr` of full length; TODO confirm.
    y, fs = sf.read('...your directory...' + wav_train[i], stop=SampleDuration*sr)

    # baseline features, short-frame format
    # The signal is passed as y=... because recent librosa releases accept the
    # audio argument of these feature functions by keyword only.
    X_train_short[i,0,:] = librosa.feature.zero_crossing_rate(y=y, frame_length=NumFFTPoints, hop_length=HopLength) #zcr:np.ndarray [shape=(1, t)]
    X_train_short[i,1,:] = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength) #centroid:np.ndarray [shape=(1, t)]
    X_train_short[i,1,:] = np.log10(X_train_short[i,1,:])
    X_train_short[i,2,:] = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, roll_percent=0.85) #rolloff:np.ndarray [shape=(1, t)]
    X_train_short[i,2,:] = np.log10(X_train_short[i,2,:])
    X_train_short[i,3,:] = librosa.feature.spectral_flatness(y=y, n_fft=NumFFTPoints, hop_length=HopLength, power=2.0) #flatness:np.ndarray [shape=(1, t)]
    X_train_short[i,4,:] = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, p=2) #bandwidth:np.ndarray [shape=(1, t)]
    X_train_short[i,4,:] = np.log10(X_train_short[i,4,:])
    X_train_short[i,5:12,:] = librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, fmin=200.0, n_bands=6, quantile=0.02) #contrast:np.ndarray [shape=(n_bands + 1, t)]
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, htk=True, fmin=0.0, fmax=sr/2, norm=None, n_mels=FreqBins, power=2.0)
    S = np.log10(S + 1e-8)  # small offset keeps the log finite for empty mel bands
    mfcc = librosa.feature.mfcc(sr=sr, S=S, n_mfcc=14) #M:np.ndarray [shape=(n_mfcc, t)
    X_train_short[i,12:26,:] = mfcc
    X_train_short[i,26:40,:] = librosa.feature.delta(mfcc, order=1, axis=-1)


    # Enhanced Temporal Integration, computed for all 40 descriptors at once
    frames = X_train_short[i]                 # view of shape (40, TimeBins)
    frame_mean = np.mean(frames, axis=-1)

    X_train[i,:,0] = frame_mean
    X_train[i,:,1] = np.std(frames, axis=-1)

    # mean sequential difference: average |x[t+1] - x[t]| over all consecutive frames
    # (slices instead of fixed 430/431 indices, so any TimeBins works)
    X_train[i,:,2] = np.mean(np.absolute(frames[:,1:] - frames[:,:-1]), axis=-1)

    # mean crossing rate: fraction of consecutive frame pairs lying on opposite
    # sides of the mean, i.e. the indicator 1[(x[t+1]-m)*(x[t]-m) < 0] averaged over t
    centred = frames - frame_mean[:,np.newaxis]
    X_train[i,:,3] = np.mean((centred[:,1:] * centred[:,:-1]) < 0, axis=-1)

    # crest factor (here: maximum divided by the mean)
    # NOTE(review): a mean close to zero makes this ratio blow up -- confirm the
    # network input can tolerate that, or consider peak / RMS instead.
    X_train[i,:,4] = np.amax(frames, axis=-1) / frame_mean


    stop = timeit.default_timer()
    fraction_done = i/len(wav_train)

    if fraction_done*100 < 5:
        # too early for a meaningful extrapolation (also avoids dividing by 0 at i == 0)
        expected_time = "Calculating..."

    else:
        # total runtime extrapolated from the elapsed time, in minutes, 2 decimals
        expected_time = np.round((stop-start) / fraction_done / 60, 2)

    print("Current Progress:", np.round(fraction_done*100, 2), "%")
    print("Current runtime:", np.round((stop-start)/60, 2), "mins")
    print("Expected runtime:", expected_time, "mins")

In [0]:
# Flatten the per-clip (40, 5) statistics into one 200-long vector: proper input to the network.
X_train = X_train.reshape((len(wav_train), 200))

In [0]:
# same for validation
#
# Feature extraction + early temporal integration for the validation clips:
# X_val_short[i] receives 40 short-frame descriptors per analysis frame
# (row 0 zcr, 1 log10 centroid, 2 log10 roll-off, 3 flatness, 4 log10 bandwidth,
#  5-11 spectral contrast, 12-25 MFCCs, 26-39 delta-MFCCs) and X_val[i] their
# 5 temporal statistics (mean, std, mean sequential difference,
# mean crossing rate, crest factor).
X_val_short = np.zeros((len(wav_val),40,TimeBins),'float32')
X_val = np.zeros((len(wav_val),40,5),'float32')

start = timeit.default_timer()

for i in range(len(wav_val)):

    clear_output(wait=True)

    # Only the first SampleDuration*sr samples are read. The rate stored in the file
    # (fs) is not compared with sr, and the assignments below need exactly TimeBins
    # frames per clip -- assumes mono clips at `sr` of full length; TODO confirm.
    y, fs = sf.read('...your directory...' + wav_val[i], stop=SampleDuration*sr)

    # baseline features, short-frame format
    # The signal is passed as y=... because recent librosa releases accept the
    # audio argument of these feature functions by keyword only.
    X_val_short[i,0,:] = librosa.feature.zero_crossing_rate(y=y, frame_length=NumFFTPoints, hop_length=HopLength) #zcr:np.ndarray [shape=(1, t)]
    X_val_short[i,1,:] = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength) #centroid:np.ndarray [shape=(1, t)]
    X_val_short[i,1,:] = np.log10(X_val_short[i,1,:])
    X_val_short[i,2,:] = librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, roll_percent=0.85) #rolloff:np.ndarray [shape=(1, t)]
    X_val_short[i,2,:] = np.log10(X_val_short[i,2,:])
    X_val_short[i,3,:] = librosa.feature.spectral_flatness(y=y, n_fft=NumFFTPoints, hop_length=HopLength, power=2.0) #flatness:np.ndarray [shape=(1, t)]
    X_val_short[i,4,:] = librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, p=2) #bandwidth:np.ndarray [shape=(1, t)]
    X_val_short[i,4,:] = np.log10(X_val_short[i,4,:])
    X_val_short[i,5:12,:] = librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, fmin=200.0, n_bands=6, quantile=0.02)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=NumFFTPoints, hop_length=HopLength, htk=True, fmin=0.0, fmax=sr/2, norm=None, n_mels=FreqBins, power=2.0)
    S = np.log10(S + 1e-8)  # small offset keeps the log finite for empty mel bands
    mfcc = librosa.feature.mfcc(sr=sr, S=S, n_mfcc=14) #M:np.ndarray [shape=(n_mfcc, t)
    X_val_short[i,12:26,:] = mfcc
    X_val_short[i,26:40,:] = librosa.feature.delta(mfcc, order=1, axis=-1)


    # ETi, computed for all 40 descriptors at once
    frames = X_val_short[i]                   # view of shape (40, TimeBins)
    frame_mean = np.mean(frames, axis=-1)

    X_val[i,:,0] = frame_mean
    X_val[i,:,1] = np.std(frames, axis=-1)

    # mean sequential difference: average |x[t+1] - x[t]| over all consecutive frames
    # (slices instead of fixed 430/431 indices, so any TimeBins works)
    X_val[i,:,2] = np.mean(np.absolute(frames[:,1:] - frames[:,:-1]), axis=-1)

    # mean crossing rate: fraction of consecutive frame pairs lying on opposite
    # sides of the mean, i.e. the indicator 1[(x[t+1]-m)*(x[t]-m) < 0] averaged over t
    centred = frames - frame_mean[:,np.newaxis]
    X_val[i,:,3] = np.mean((centred[:,1:] * centred[:,:-1]) < 0, axis=-1)

    # crest factor (here: maximum divided by the mean)
    # NOTE(review): a mean close to zero makes this ratio blow up -- confirm the
    # network input can tolerate that, or consider peak / RMS instead.
    X_val[i,:,4] = np.amax(frames, axis=-1) / frame_mean


    stop = timeit.default_timer()
    fraction_done = i/len(wav_val)

    if fraction_done*100 < 5:
        # too early for a meaningful extrapolation (also avoids dividing by 0 at i == 0)
        expected_time = "Calculating..."

    else:
        # total runtime extrapolated from the elapsed time, in minutes, 2 decimals
        expected_time = np.round((stop-start) / fraction_done / 60, 2)

    print("Current Progress:", np.round(fraction_done*100, 2), "%")
    print("Current runtime:", np.round((stop-start)/60, 2), "mins")
    print("Expected runtime:", expected_time, "mins")

In [0]:
# Flatten the per-clip (40, 5) statistics into one 200-long vector, as done for training.
X_val = X_val.reshape((len(wav_val), 200))

In [0]:
# MLP architecture
#
# 200 temporally-integrated features -> BatchNorm -> Dense(60) -> Dense(40) -> softmax.
from keras.models import Model 
from keras.layers import Dense, Dropout, Input, BatchNormalization
from keras.initializers import he_uniform, he_normal, lecun_uniform, lecun_normal, glorot_uniform, glorot_normal
from keras.regularizers import l2, l1_l2, l1
from keras.optimizers import Adam, SGD
from keras.utils import plot_model

input1 = Input(batch_shape=(None,200))   # one flattened 40 x 5 feature vector per clip
a = BatchNormalization()(input1)         # normalises the raw statistics before the first Dense layer

# NOTE(review): l1_l2(0.0002) passes a single positional factor; the other factor
# presumably keeps the library default -- confirm this is the intended penalty.
a = Dense(units=60, activation='relu', kernel_initializer=glorot_normal(seed=42), kernel_regularizer=l1_l2(0.0002))(a) 
a = Dense(units=40, activation='relu', kernel_initializer=glorot_normal(seed=42), kernel_regularizer=l1_l2(0.0002))(a) 

# One output per scene class. The one-hot targets are built with NumClasses columns,
# so the layer width follows NumClasses instead of a hard-coded 10.
out = Dense(units=NumClasses, activation='softmax', kernel_initializer=glorot_normal(seed=42))(a)

model = Model(inputs=input1, outputs=out)
model.compile(loss='categorical_crossentropy',
              optimizer = SGD(learning_rate=0.001, momentum=0.9, nesterov=True),  
              metrics=['accuracy'])

model.summary()
# plot graph
plot_model(model, to_file='MLP_ETi.png')

In [0]:
import tensorflow as tf
import datetime, os

# Load the TensorBoard notebook extension
%load_ext tensorboard

In [0]:
from keras.callbacks import TensorBoard

# Each run logs into its own timestamped sub-directory of "logs".
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_stamp)
tens = TensorBoard(log_dir=logdir, histogram_freq=1)

In [0]:
%tensorboard --logdir logs

In [0]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint = ModelCheckpoint(
    filepath="weights.mlpbestacc.hdf5",
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Stop once the validation loss has not improved for 30 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=30)

history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[checkpoint, es, tens],
    verbose=2,
)

In [0]:
# Restore the checkpointed weights, then store the complete model in one file.
best_weights_path = 'weights.mlpbestacc.hdf5'
model.load_weights(best_weights_path)
model.save('DCASE_MLP_development.h5')