# UrbanSound8K pre-processing

It creates three files: `train.h5`, `valid.h5`, and `test.h5`.

Split: folds 1-8: train, fold 9: valid, fold 10: test

By Keunwoo Choi. 29 Nov 2016.

### Load things

In [40]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import h5py
import librosa
import os, sys
import time
import pandas as pd 

### Setup path

**Change these to your setup**

* **`PATH_US`**: audio folder path
* **`path_csv`**: csv file path
* **`PATH_HDF`**: output HDF folder path 


In [4]:
# Root folder of the UrbanSound8K audio; the `fold1/` ... `fold10/` sub-folders
# are looked up below it. Keep the trailing '/': file paths are built by plain
# string concatenation.
PATH_US = '/misc/kcgscratch1/ChoGroup/keunwoo/UrbanSound8K/audio/'
# Metadata csv with one row per audio slice (file name, fold, class id, ...).
path_csv = '/misc/kcgscratch1/ChoGroup/keunwoo/UrbanSound8K/metadata/UrbanSound8K.csv'
# Output folder for train.h5 / valid.h5 / test.h5. Keep the trailing '/' here too.
PATH_HDF = '/misc/kcgscratch1/ChoGroup/keunwoo/urbansound8k_hdf/'
# Column names of the csv followed by a few example rows, for reference:
# slice_file_name	fsID	start	end	salience	fold	classID	class
# 100032-3-0-0.wav	100032	0	0.317551	1	5	3	dog_bark
# 100263-2-0-117.wav	100263	58.5	62.5	1	5	2	children_playing
# 100263-2-0-121.wav	100263	60.5	64.5	1	5	2	children_playing

In [5]:
# Sub-folder name of every fold; fold number k lives at index k-1.
fold_folders = ['fold{}/'.format(fold_no) for fold_no in range(1, 11)]
# Number of classes; the class ids run from 0 to 9.
n_label = 10

### Audio stuff
* **Modify these if you want.**

In [6]:
# audio
SR = 12000  # [Hz] sampling rate requested when loading every clip
max_len = 4.0  # [Seconds]. should be <= 4.0. I recommend not to change it.
n_mels = 96  # number of mel bins of the mel-spectrogram
n_fft = 512  # FFT size [samples]
n_hop = 256  # hop between successive frames [samples]
len_raw = int(SR * max_len)  # number of raw samples stored per clip
# Number of STFT frequency bins. Floor division keeps this an int under
# Python 3 / `from __future__ import division` as well; it is used as a
# dimension of the 'stft' HDF dataset, which must be an integer.
n_freq = n_fft // 2 + 1

In [7]:
# Run both transforms once on a silent clip of the maximum length to find out
# how many frames each of them produces; these frame counts fix the shapes of
# the 'melgram' and 'stft' HDF datasets.
# `len_raw` (an int) is used as the array size: `SR*max_len` is a float, which
# recent numpy versions refuse as a shape.
_silence = np.zeros(len_raw)
# Keyword arguments are used because recent librosa versions no longer accept
# `y`, `sr`, `n_fft` and `hop_length` positionally.
mel_shape = librosa.feature.melspectrogram(y=_silence, sr=SR, n_fft=n_fft,
                                           hop_length=n_hop, n_mels=n_mels).shape
print(mel_shape)
n_mel_fr = mel_shape[1]
stft_shape = librosa.stft(_silence, n_fft=n_fft, hop_length=n_hop).shape
print(stft_shape)
n_stft_fr = stft_shape[1]

(96, 188)
(257, 188)


  if __name__ == '__main__':


## Load csv

In [8]:
# Read the metadata table and count how many clips fall into each split:
# fold 9 is validation, fold 10 is test, everything else is training.
df = pd.read_csv(path_csv, header=0)
print(df.shape)
n_data_all = len(df)
is_valid_fold = df['fold'] == 9
is_test_fold = df['fold'] == 10
n_valid = int(is_valid_fold.sum())
n_test = int(is_test_fold.sum())
n_train = n_data_all - (n_valid + n_test)
print('%d %d %d' % (n_train, n_valid, n_test))

(8732, 8)
7079 816 837


## Shuffling
Remember that shuffling should be WITHIN each dataset. 

In [9]:
# Reuse the saved permutations when they exist, so that every run (and every
# feature written later) sees the same sample order; otherwise draw new
# permutations and save them for the next run.
if os.path.exists('shuffled_idxs.npy'):
    # The file holds three index arrays of different lengths, i.e. an object
    # array, which numpy only loads when pickles are explicitly allowed.
    train_shfl_idxs, valid_shfl_idxs, test_shfl_idxs = np.load('shuffled_idxs.npy',
                                                               allow_pickle=True)
    print('Loaded from the previous ones')
else:
    np.random.seed(1337)  # for reproducibility
    train_shfl_idxs = np.random.permutation(n_train)
    valid_shfl_idxs = np.random.permutation(n_valid)
    test_shfl_idxs = np.random.permutation(n_test)
    # dtype=object: the three arrays have different lengths and cannot be
    # stacked into one regular array.
    np.save('shuffled_idxs.npy',
            np.array([train_shfl_idxs, valid_shfl_idxs, test_shfl_idxs], dtype=object))
    print('Generated')

Loaded from the previous ones


### A function to load audio, compute melgram, and store into HDF

If you want to edit or add a feature...
1. Add the new feature to `create_dataset_for()`
2. Add a branch for it in `row_to()` that calls `row_to_something()`
3. Implement the new function `row_to_something()`


In [10]:
def create_dataset_for(f_hdf, ds_name, num_data):
    '''Create the HDF dataset that will hold the feature `ds_name`.

    f_hdf: an open, writable hdf file
    ds_name: 'melgram', 'y', 'raw' or 'stft'
    num_data: number of samples, i.e. size of the first axis

    Returns the new dataset. For any other `ds_name` a message is printed and
    None is returned.
    '''
    if ds_name == 'melgram':
        shape, dtype = (num_data, n_mels, n_mel_fr), 'float32'
    elif ds_name == 'y':
        shape, dtype = (num_data, n_label), 'bool'
    elif ds_name == 'raw':
        shape, dtype = (num_data, len_raw), 'float32'
    elif ds_name == 'stft':
        shape, dtype = (num_data, n_freq, n_stft_fr), 'float32'
    else:
        print('ha? %s?' % ds_name)
        return None
    return f_hdf.create_dataset(ds_name, shape, dtype=dtype)

def row_to(ds_name, row_idx, row, dataset):
    '''Write the feature `ds_name` of one metadata row into `dataset[row_idx]`.

    Forwards to the matching `row_to_<ds_name>()` function; an unknown
    `ds_name` is silently ignored.
    '''
    if ds_name == 'melgram':
        row_to_melgram(row_idx, row, dataset)
        return
    if ds_name == 'y':
        row_to_y(row_idx, row, dataset)
        return
    if ds_name == 'raw':
        row_to_raw(row_idx, row, dataset)
        return
    if ds_name == 'stft':
        row_to_stft(row_idx, row, dataset)
        return
        

In [11]:
def row_to_stft(row_idx, row, dataset):
    '''
    Load one audio clip and store its power spectrogram in `dataset[row_idx]`.

    row_idx: index of the sample inside `dataset`
    row: row of dataframe of pandas, as yielded by `itertuples()`
         (row[1] is the file name, row[6] the fold number)
    dataset: a dataset of hdf file, indexed as [sample, frequency bin, frame]
    '''
    fname, fold = row[1], row[6]
    folder = fold_folders[fold - 1]  # fold numbers start at 1
    # Keyword arguments: recent librosa versions no longer accept `sr`,
    # `n_fft` and `hop_length` positionally.
    src, _ = librosa.load(os.path.join(PATH_US, folder, fname), sr=SR)
    stft = np.abs(librosa.stft(src, n_fft=n_fft, hop_length=n_hop)) ** 2
    # Clips longer than n_stft_fr frames are cut; shorter ones fill only the
    # leading frames and the remaining frames are left untouched.
    n_frames = min(n_stft_fr, stft.shape[1])
    dataset[row_idx, :, :n_frames] = stft[:, :n_frames]

In [12]:
def row_to_melgram(row_idx, row, dataset):
    '''
    Load one audio clip and store its mel-spectrogram in `dataset[row_idx]`.

    row_idx: index of the sample inside `dataset`
    row: row of dataframe of pandas, as yielded by `itertuples()`
         (row[1] is the file name, row[6] the fold number)
    dataset: a dataset of hdf file, indexed as [sample, mel bin, frame]
    '''
    fname, fold = row[1], row[6]
    folder = fold_folders[fold - 1]  # fold numbers start at 1
    # Keyword arguments: recent librosa versions no longer accept `sr` (load)
    # or `y` / `sr` (melspectrogram) positionally.
    src, sr = librosa.load(os.path.join(PATH_US, folder, fname), sr=SR)

    melgram = librosa.feature.melspectrogram(y=src, sr=sr, n_fft=n_fft,
                                             hop_length=n_hop, n_mels=n_mels)
    # Clips longer than n_mel_fr frames are cut; shorter ones fill only the
    # leading frames and the remaining frames are left untouched.
    n_frames = min(n_mel_fr, melgram.shape[1])
    dataset[row_idx, :, :n_frames] = melgram[:, :n_frames]

In [13]:
def row_to_y(row_idx, row, dataset):
    '''
    Mark the class of one sample in the label dataset.

    row_idx: index of the sample inside `dataset`
    row: row of dataframe of pandas; row[7] is used as the class index
    dataset: a dataset of hdf file, indexed as [sample, class]
    '''
    class_id = row[7]
    dataset[row_idx, class_id] = True

In [14]:
def row_to_raw(row_idx, row, dataset):
    '''
    Load one audio clip and store its raw samples in `dataset[row_idx]`.

    row_idx: index of the sample inside `dataset`
    row: row of dataframe of pandas, as yielded by `itertuples()`
         (row[1] is the file name, row[6] the fold number)
    dataset: a dataset of hdf file, indexed as [sample, time]
    '''
    fname, fold = row[1], row[6]
    folder = fold_folders[fold - 1]  # fold numbers start at 1
    # Keyword argument: recent librosa versions no longer accept `sr` positionally.
    src, _ = librosa.load(os.path.join(PATH_US, folder, fname), sr=SR)
    # Clips longer than len_raw samples are cut; shorter ones fill only the
    # leading samples and the remaining samples are left untouched.
    n_samples = min(len_raw, len(src))
    dataset[row_idx, :n_samples] = src[:n_samples]

## function to save in hdf

In [35]:
def set_to_hdf(hdf_filepath, df_subset, shfl_idxs, ds_name):
    '''
    Compute one feature for every row of a data split and store it in a hdf file.

    The hdf file is created if it does not exist yet; otherwise the new
    dataset is added to the existing file.

    hdf_filepath: string, full file path to store hdf
    df_subset: pandas data frame, of the set to store
    shfl_idxs: numpy integer array, shuffled index; the rows are written in
               this order
    ds_name: 'melgram', 'y', 'raw', ...

    Raises AssertionError if `df_subset` and `shfl_idxs` differ in length.
    '''
    assert len(df_subset) == len(shfl_idxs), 'data frame length != indices list'
    start_time = time.time()
    num_data = len(df_subset)
    # Mode 'a' already means "read/write if the file exists, create it
    # otherwise", so no separate existence check is needed.
    with h5py.File(hdf_filepath, 'a') as f_hdf:
        dataset = create_dataset_for(f_hdf, ds_name, num_data)
        shuffled_rows = df_subset.iloc[shfl_idxs].itertuples()
        for row_idx, row in enumerate(shuffled_rows):
            row_to(ds_name, row_idx, row, dataset)
            if row_idx % 20 == 0:
                sys.stdout.write('\r%d/%d-th sample (%s) was written.' % (row_idx+1, num_data, ds_name))
                # The progress line has no newline, so flush it explicitly to
                # make it show up right away.
                sys.stdout.flush()
    elapsed = int(time.time() - start_time)
    print('\n--- Done: It took %d seconds for %s, %s ---' %
          (elapsed, ds_name, os.path.basename(hdf_filepath)))


### Do it!

In [None]:
# (output file, rows of the split, shuffled order of the split)
splits = [
    ('train.h5', df[df['fold'] < 9], train_shfl_idxs),
    ('valid.h5', df[df['fold'] == 9], valid_shfl_idxs),
    ('test.h5', df[df['fold'] == 10], test_shfl_idxs),
]
# Write one feature at a time, for train, valid and test in turn.
for ds_name in ['stft', 'raw', 'melgram', 'y']:
    for h5_name, split_df, split_idxs in splits:
        set_to_hdf(PATH_HDF + h5_name, split_df, split_idxs, ds_name)

421/7079-th sample (stft) was written.

## Done. Wanna standardise them?

In [19]:
# Standardise the spectrogram datasets in place: subtract the mean and divide
# by the standard deviation, both taken over the whole dataset.
# NOTE(review): valid.h5 and test.h5 are standardised with their own
# statistics rather than those of train.h5 - confirm this is intended.
for fname in ['train.h5', 'valid.h5', 'test.h5']:
    with h5py.File(PATH_HDF + fname, 'a') as f:
        for dname in ['melgram', 'stft']:
            # Read the dataset from disk once and reuse it for the mean, the
            # std and the rescaling.
            data = f[dname][:]
            mean = np.mean(data)
            std = np.std(data)
            # eps keeps the division defined when the std is zero.
            f[dname][:] = (data - mean)/(std + np.finfo(np.float32).eps)