In [1]:
import numpy as np
import librosa 
from tqdm import tqdm
import glob
import gzip
import pandas as pd
import h5py
from __future__ import division
import os
from sklearn import preprocessing
import pickle

In [2]:
# ---- Inputs: UrbanSound8K audio folds and the accompanying metadata table ----
dataspace = '/vol/vssp/datasets/audio01/UrbanSound8K/audio/'
metadatafile = '/vol/vssp/datasets/audio01/UrbanSound8K/metadata/UrbanSound8K.csv'

# ---- Outputs: everything this notebook generates is written below projectspace ----
projectspace = '/vol/vssp/AcousticEventsDetection/DLGdansk/UrbanSound/'
# hdf5_path:   the HDF5 file that collects the train/validation/test arrays
# scaler_path: the pickled scaler fitted on the training data
hdf5_path, scaler_path = (os.path.join(projectspace, filename)
                          for filename in ('dataset.hdf5', 'scaler.pkl'))

# Metadata table; it is queried by slice_file_name for 'class' and 'classID' further down
metadata = pd.read_csv(metadatafile)

In [3]:
# AUDIO ANALYSIS PARAMETERS
SR = 44100              # sample rate the audio is loaded at (Hz)
N_FFT = 1024            # FFT size of the STFT
WIN_SIZE = 1024         # analysis window length (samples)
HOP_SIZE = 1024         # distance between consecutive frames (samples)
WINDOW_TYPE = 'hann'
FEATURE = 'mel'         # feature name; also used as a folder name for the feature files

# Mel band parameters
N_MELS = 40

# MAXIMUM LENGTH OF AN AUDIO FILE IN SECONDS
MAX_LENGTH_S = 4

# Maximum clip length expressed in STFT frames. Frames are spaced HOP_SIZE samples
# apart, so the hop (not the window length) determines the frame count. With
# WIN_SIZE == HOP_SIZE the value is unchanged, but it stays correct if the hop is tuned.
# float() keeps this a true division even without `from __future__ import division`.
# NOTE: the misspelled name is kept on purpose because later cells reference it.
MAX_LENTGH_SAMP = int(np.ceil(MAX_LENGTH_S * SR / float(HOP_SIZE)))

In [4]:
# Peek at the first ten rows of the metadata table
metadata.head(10)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing
5,100263-2-0-143.wav,100263,71.5,75.5,1,5,2,children_playing
6,100263-2-0-161.wav,100263,80.5,84.5,1,5,2,children_playing
7,100263-2-0-3.wav,100263,1.5,5.5,1,5,2,children_playing
8,100263-2-0-36.wav,100263,18.0,22.0,1,5,2,children_playing
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn


In [5]:
# Distinct class names in alphabetical order; the length of this list is used
# as the number of classes when the labels are one-hot encoded later on.
label_list = sorted(metadata['class'].drop_duplicates())
print(label_list)

['air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling', 'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music']


In [6]:
# Open the HDF5 output file for writing; the cells below add one dataset per
# split and a later cell closes it.
hdf5_file = h5py.File(hdf5_path, 'w')

In [7]:
def extract_logmelspec_librosa(audiofile, outputfile):
    """Extracts the log mel spectrogram and saves it to outputfile.

    The audio is loaded as mono at SR, a power STFT is computed with the
    notebook-level analysis parameters (N_FFT, WIN_SIZE, HOP_SIZE, WINDOW_TYPE),
    mapped onto N_MELS mel bands and converted to decibels.

    Args:
      audiofile:  string, path to a .wav file
      outputfile: string, path to write out the log mel spectrogram
        (gzip-compressed numpy dump). Missing parent folders are created.

    Returns:
        A float32 numpy array holding the log mel spectrogram, with the mel
        bands on the first axis and the frames on the second.
    """
    audio, _ = librosa.load(audiofile, sr=SR, mono=True)

    # Power spectrogram: squared magnitude of the STFT
    stft = librosa.stft(audio, n_fft=N_FFT, win_length=WIN_SIZE,
                        hop_length=HOP_SIZE, window=WINDOW_TYPE)
    power_spec = np.abs(stft) ** 2

    melspec = librosa.feature.melspectrogram(
        y=None, S=power_spec, sr=SR, n_fft=N_FFT, hop_length=HOP_SIZE,
        n_mels=N_MELS, htk=True, fmin=0.0, fmax=SR/2.0)

    # power_to_db is the replacement for the deprecated librosa.core.logamplitude
    logmelspec = librosa.power_to_db(melspec, ref=1.0)
    logmelspec = logmelspec.astype(np.float32)  # downcast to float32

    # makedirs (rather than mkdir) also copes with several missing levels;
    # an empty dirname means "current folder", which needs no creation.
    outdir = os.path.dirname(outputfile)
    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)

    # The context manager closes the gzip stream even if the dump fails
    with gzip.open(outputfile, 'wb') as f:
        logmelspec.dump(f)

    return logmelspec

In [8]:
def to_categorical(y, num_classes):
    """One-hot encodes a vector of integer class ids.

    Args:
      y: class id(s) to encode; a scalar or any array-like of integers
        in the range 0 .. num_classes - 1. The input is flattened first.
      num_classes: total number of classes (width of the output).

    Returns:
      A float32 matrix with one row per entry of the flattened input and
      one column per class; each row has a single 1 at its class index.
    """
    class_ids = np.asarray(y, dtype='int').reshape(-1)
    n_samples = class_ids.size

    one_hot = np.zeros((n_samples, num_classes), dtype=np.float32)
    # Row i gets its 1 in column class_ids[i]
    one_hot[np.arange(n_samples), class_ids] = 1
    return one_hot

## Creating a training set

In [9]:
folds = [1, 2, 3, 4, 5, 6, 7, 8]
splitname = 'train'

# Per-file feature dumps of this split go to <projectspace>/features/<FEATURE>/<splitname>/
featurefolder = os.path.join(projectspace, 'features', FEATURE, splitname)
# Create folder
if not os.path.isdir(featurefolder):
    os.makedirs(featurefolder)

# Number of clips seen per class, for the report printed at the end
label_count = {label: 0 for label in label_list}

X_train = []
y_train = []
for fold in folds:

    print('FOLD {:d}'.format(fold))
    audiofolder = os.path.join(dataspace, 'fold{:d}/'.format(fold))
    audiofiles = glob.glob(os.path.join(audiofolder, '*.wav'))

    for af in tqdm(audiofiles):
        filename = os.path.basename(af)

        # save the statistics of the dataset
        # (the row mask is computed once and reused for both lookups)
        is_this_clip = metadata.slice_file_name == filename
        label = metadata.loc[is_this_clip, 'class'].values[0]
        classID = metadata.loc[is_this_clip, 'classID'].values[0]
        label_count[label] += 1

        # extract the features; the dump uses the same '.npy.gz' suffix as the
        # validation and test splits
        outfile = os.path.join(featurefolder, filename.replace(".wav", ".npy.gz"))
        logmelspec = extract_logmelspec_librosa(af, outfile)

        # pad with zeros or cut along the frame axis so that every clip ends up
        # with exactly MAX_LENTGH_SAMP frames
        # NOTE: np.zeros defaults to float64, so padded clips are float64 while
        # the extracted features are float32.
        n_frames = logmelspec.shape[1]
        if n_frames < MAX_LENTGH_SAMP:
            pad_tmp = np.zeros((N_MELS, MAX_LENTGH_SAMP))
            pad_tmp[:, 0:n_frames] = logmelspec
            logmelspec = pad_tmp
        elif n_frames > MAX_LENTGH_SAMP:
            logmelspec = logmelspec[:, :MAX_LENTGH_SAMP]

        X_train.append(logmelspec)
        # one-hot label vector of length len(label_list)
        y_train.append(np.squeeze(to_categorical(classID, len(label_list))))

hdf5_file.create_dataset('X_train', data=X_train)
hdf5_file.create_dataset('y_train', data=y_train)

# Print overall report
print('\n\nOVERALL labels:')
for label in label_list:
    print('{:s}:\t{:d}'.format(label, label_count[label]))


  0%|          | 0/873 [00:00<?, ?it/s]

FOLD 1


100%|██████████| 873/873 [03:14<00:00,  4.49it/s]
  0%|          | 0/888 [00:00<?, ?it/s]

FOLD 2


100%|██████████| 888/888 [02:37<00:00,  5.64it/s]
  0%|          | 3/925 [00:00<00:33, 27.60it/s]

FOLD 3


100%|██████████| 925/925 [02:02<00:00,  7.53it/s]
  0%|          | 0/990 [00:00<?, ?it/s]

FOLD 4


100%|██████████| 990/990 [03:44<00:00,  4.41it/s]
  0%|          | 2/936 [00:00<00:55, 16.88it/s]

FOLD 5


100%|██████████| 936/936 [02:38<00:00,  5.91it/s]
  0%|          | 3/823 [00:00<00:38, 21.19it/s]

FOLD 6


100%|██████████| 823/823 [01:37<00:00,  8.42it/s]
  0%|          | 0/838 [00:00<?, ?it/s]

FOLD 7


100%|██████████| 838/838 [02:27<00:00,  5.68it/s]
  0%|          | 2/806 [00:00<01:02, 12.89it/s]

FOLD 8


100%|██████████| 806/806 [02:20<00:00,  5.75it/s]




OVERALL labels:
air_conditioner:	800
car_horn:	364
children_playing:	800
dog_bark:	800
drilling:	800
engine_idling:	818
gun_shot:	311
jackhammer:	822
siren:	764
street_music:	800


## Calculating a scaler from the training data

In [10]:
# Put all training frames side by side and transpose, so that each row is one
# frame and each column one mel band. Zero-padded frames are part of X_train
# and therefore contribute to the statistics as well.
X_train_flat = np.hstack(np.array(X_train)).T
print(X_train_flat.shape)

# Per-band standardisation statistics, estimated on the training split only
scaler = preprocessing.StandardScaler().fit(X_train_flat)

# The context manager makes sure the pickle file is flushed and closed
with open(scaler_path, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

(1224667, 40)


## Creating a validation set


In [11]:
folds = [9]
splitname = 'validation'

# Destination folder for the per-file feature dumps of this split
featurefolder = os.path.join(projectspace, 'features', FEATURE, splitname)
if not os.path.isdir(featurefolder):
    os.makedirs(featurefolder)

# Per-class clip counter, reported at the end of the cell
label_count = dict.fromkeys(label_list, 0)

X_val = []
y_val = []
for fold in folds:

    print('FOLD {:d}'.format(fold))
    audiofolder = os.path.join(dataspace, 'fold{:d}/'.format(fold))
    audiofiles = glob.glob(os.path.join(audiofolder, '*.wav'))

    for wav_path in tqdm(audiofiles):
        wav_name = os.path.basename(wav_path)

        # Metadata rows describing this clip; the first match supplies the labels
        clip_rows = metadata.loc[metadata.slice_file_name == wav_name]
        label = clip_rows['class'].values[0]
        classID = clip_rows['classID'].values[0]
        label_count[label] += 1

        # Compute the features (also written to disk as a side effect)
        outfile = os.path.join(featurefolder, wav_name.replace(".wav", ".npy.gz"))
        logmelspec = extract_logmelspec_librosa(wav_path, outfile)

        # Bring every clip to exactly MAX_LENTGH_SAMP frames:
        # short clips are zero-padded at the end, long ones are truncated
        n_frames = logmelspec.shape[1]
        if n_frames > MAX_LENTGH_SAMP:
            logmelspec = logmelspec[:, :MAX_LENTGH_SAMP]
        elif n_frames < MAX_LENTGH_SAMP:
            padded = np.zeros((N_MELS, MAX_LENTGH_SAMP))
            padded[:, :n_frames] = logmelspec
            logmelspec = padded

        X_val.append(logmelspec)
        one_hot = to_categorical(classID, len(label_list))
        y_val.append(np.squeeze(one_hot))

hdf5_file.create_dataset('X_val', data=X_val)
hdf5_file.create_dataset('y_val', data=y_val)

# Summary: number of clips per class in this split
print('\n\nOVERALL labels:')
for label in label_list:
    print('{:s}:\t{:d}'.format(label, label_count[label]))

  0%|          | 2/816 [00:00<01:02, 12.97it/s]

FOLD 9


100%|██████████| 816/816 [02:52<00:00,  4.73it/s]



OVERALL labels:
air_conditioner:	100
car_horn:	32
children_playing:	100
dog_bark:	100
drilling:	100
engine_idling:	89
gun_shot:	31
jackhammer:	82
siren:	82
street_music:	100





## Creating a test set

In [12]:
folds = [10]
splitname = 'test'

# Destination folder for the per-file feature dumps of this split
featurefolder = os.path.join(projectspace, 'features', FEATURE, splitname)
if not os.path.isdir(featurefolder):
    os.makedirs(featurefolder)

# Per-class clip counter, reported at the end of the cell
label_count = dict.fromkeys(label_list, 0)

X_test = []
y_test = []
for fold in folds:

    print('FOLD {:d}'.format(fold))
    audiofolder = os.path.join(dataspace, 'fold{:d}/'.format(fold))
    audiofiles = glob.glob(os.path.join(audiofolder, '*.wav'))

    for wav_path in tqdm(audiofiles):
        wav_name = os.path.basename(wav_path)

        # Metadata rows describing this clip; the first match supplies the labels
        clip_rows = metadata.loc[metadata.slice_file_name == wav_name]
        label = clip_rows['class'].values[0]
        classID = clip_rows['classID'].values[0]
        label_count[label] += 1

        # Compute the features (also written to disk as a side effect)
        outfile = os.path.join(featurefolder, wav_name.replace(".wav", ".npy.gz"))
        logmelspec = extract_logmelspec_librosa(wav_path, outfile)

        # Bring every clip to exactly MAX_LENTGH_SAMP frames:
        # short clips are zero-padded at the end, long ones are truncated
        n_frames = logmelspec.shape[1]
        if n_frames > MAX_LENTGH_SAMP:
            logmelspec = logmelspec[:, :MAX_LENTGH_SAMP]
        elif n_frames < MAX_LENTGH_SAMP:
            padded = np.zeros((N_MELS, MAX_LENTGH_SAMP))
            padded[:, :n_frames] = logmelspec
            logmelspec = padded

        X_test.append(logmelspec)
        one_hot = to_categorical(classID, len(label_list))
        y_test.append(np.squeeze(one_hot))

hdf5_file.create_dataset('X_test', data=X_test)
hdf5_file.create_dataset('y_test', data=y_test)

# Summary: number of clips per class in this split
print('\n\nOVERALL labels:')
for label in label_list:
    print('{:s}:\t{:d}'.format(label, label_count[label]))

  0%|          | 0/837 [00:00<?, ?it/s]

FOLD 10


100%|██████████| 837/837 [03:02<00:00,  4.59it/s]



OVERALL labels:
air_conditioner:	100
car_horn:	33
children_playing:	100
dog_bark:	100
drilling:	100
engine_idling:	93
gun_shot:	32
jackhammer:	96
siren:	83
street_music:	100





In [13]:
# Store the settings a consumer needs to interpret the arrays, next to the data
hdf5_file.create_dataset('feature', data=FEATURE)

# Integer settings are written as 64-bit integers
for setting_name, setting_value in (('n_features', N_MELS),
                                    ('max_length_samp', MAX_LENTGH_SAMP)):
    hdf5_file.create_dataset(setting_name, data=setting_value, dtype='i8')

hdf5_file.create_dataset('label_list', data=label_list)

# All datasets are written; close the file
hdf5_file.close()

In [14]:
# Sanity check: reopen the file read-only and list the datasets it contains.
# The context manager closes the handle even if listing the keys fails.
with h5py.File(hdf5_path, 'r') as hf:
    print(hf.keys())

[u'X_test', u'X_train', u'X_val', u'feature', u'label_list', u'max_length_samp', u'n_features', u'y_test', u'y_train', u'y_val']
