In [1]:
# General Imports
import glob
import os
import sys
import numpy as np
import pandas as pd
from os import listdir
from os.path import isfile, join
from scipy.signal import spectrogram

# Audio Preprocessing
import librosa
from librosa.display import specshow

# Machine Learning Preprocessing and Evaluation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# for Plotting
import matplotlib.pyplot as plt
%matplotlib inline



In [3]:
# Root of the UrbanSound8K download. Set the URBANSOUND8K_DIR environment
# variable to point somewhere else instead of editing this cell; the fallback
# is the location this notebook was originally run from.
dataset_root = os.environ.get(
    'URBANSOUND8K_DIR',
    '/home/enterprise.internal.city.ac.uk/acvn476/UrbanSound8K',
)
# Metadata CSV shipped with the dataset.
metadata = pd.read_csv(os.path.join(dataset_root, 'metadata', 'UrbanSound8K.csv'))
# Directory containing the fold1 ... fold10 sub-directories of .wav clips.
parent_dir = os.path.join(dataset_root, 'audio')
# Output directory (relative to the working directory) for the saved .npy arrays.
save_dir = 'folds_2channel'
# Path of one example clip; not referenced by the other cells shown here.
file = parent_dir + '/fold1/7061-6-0-0.wav'

In [4]:
# ---- Labels ---------------------------------------------------------------
# Width of the one-hot label vectors; also reused as the joblib worker count.
num_total_classes = 10

# ---- Shape of one feature map ---------------------------------------------
n_bands = 60     # mel bands
n_frames = 101   # time frames (x axis); original note read "345 346"

# ---- Spectrogram / FFT settings -------------------------------------------
sample_rate = 22050
# Window length chosen so that a 4 second clip analysed with 50% overlap
# yields n_frames frames; the "- 8" is a small safety offset.
n_window = int(2 * (sample_rate * 4. / n_frames)) - 8
fft_overlap = 0.5
window_size = 1024           # defaults to n_fft
hop_size = window_size // 2  # half the window, i.e. int(window_size*(1-fft_overlap))

In [5]:
def read_audio(audio_path, target_fs=None, duration=4):
    """Load the start of an audio file as a mono signal.

    Parameters
    ----------
    audio_path : str
        Path handed to ``librosa.load``.
    target_fs : int or None
        Desired sampling rate. When given and different from the file's
        native rate, the signal is resampled to it.
    duration : float
        Maximum number of seconds to read from the beginning of the file.

    Returns
    -------
    (audio, fs) : tuple
        The 1-D sample array and the sampling rate it is expressed in.
    """
    # sr=None keeps the file's native sampling rate; resampling is done below
    # only when the caller asked for a specific rate.
    (audio, fs) = librosa.load(audio_path, sr=None, duration=duration)
    # librosa.load down-mixes to mono by default, so this is only a safeguard.
    # librosa lays multi-channel audio out as (channels, samples), hence the
    # down-mix has to average over axis 0 (axis 1 would average over time).
    if audio.ndim > 1:
        audio = np.mean(audio, axis=0)
    if target_fs is not None and fs != target_fs:
        audio = librosa.resample(audio, orig_sr=fs, target_sr=target_fs)
        fs = target_fs
    return audio, fs
def pad_trunc_seq_rewrite(x, max_len):
    """Force a 2-D array to have exactly ``max_len`` columns.

    Arrays with fewer columns are right-padded with ``log(1e-8)``, the value
    a silent frame takes in a log spectrogram computed as ``log(x + 1e-8)``.
    Arrays with ``max_len`` columns or more are truncated.

    Parameters
    ----------
    x : np.ndarray
        2-D array; columns are the axis being padded or truncated.
    max_len : int
        Number of columns in the result.

    Returns
    -------
    np.ndarray
        Array of shape ``(x.shape[0], max_len)``. Floating-point inputs keep
        their dtype in both branches; other dtypes are padded with float64.
    """
    n_cols = x.shape[1]
    # no pad necessary - truncate
    if n_cols >= max_len:
        return x[:, 0:max_len]
    # Build the pad in the input's own floating dtype so a float32 spectrogram
    # is not silently up-cast to float64 only when it happens to be short.
    pad_dtype = x.dtype if np.issubdtype(x.dtype, np.floating) else np.float64
    pad = np.full((x.shape[0], max_len - n_cols), np.log(1e-8), dtype=pad_dtype)
    return np.hstack((x, pad))

In [6]:
def extract_features(parent_dir, sub_dirs, bands, frames, file_ext="*.wav"):
    """Compute log-mel spectrogram features for every clip in the given folds.

    Parameters
    ----------
    parent_dir : str
        Directory that contains the fold sub-directories.
    sub_dirs : iterable of str
        Sub-directory names (e.g. ``['fold1']``) to scan.
    bands : int
        Number of mel bands per feature map.
    frames : int
        Number of time frames per feature map; shorter clips are padded and
        longer ones truncated.
    file_ext : str
        Glob pattern selecting the audio files inside each sub-directory.

    Returns
    -------
    features : np.ndarray, shape (n_clips, bands, frames, 3)
        Channel 0 is the log-mel spectrogram, channel 1 its delta.
        NOTE(review): channel 2 is never written and stays all-zero; confirm
        whether a third channel (e.g. a second-order delta) was intended or
        whether it should be dropped.
    labels : np.ndarray of int, shape (n_clips,)
        Class id parsed from each file name.

    Clips that are empty or shorter than one analysis window are skipped with
    a printed message. Relies on the module-level ``sample_rate``.
    """
    # 4 second clip with 50% window overlap with small offset to guarantee frames
    n_window = int(sample_rate * 4. / frames * 2) - 4 * 2
    # 50% overlap
    n_overlap = int(n_window / 2.)
    # Mel filter bank
    melW = librosa.filters.mel(sr=sample_rate, n_fft=n_window, n_mels=bands, fmin=0., fmax=8000.)
    # Hamming window
    ham_win = np.hamming(n_window)
    log_specgrams_list = []
    labels = []
    for sub_dir in sub_dirs:
        # glob order depends on the filesystem; sort so repeated runs produce
        # the rows of the saved arrays in the same order.
        for fn in sorted(glob.glob(os.path.join(parent_dir, sub_dir, file_ext))):
            sound_clip, fn_fs = read_audio(fn, target_fs=sample_rate)
            assert (int(fn_fs) == sample_rate)

            # Skip corrupted wavs. This has to come before the length check,
            # otherwise an empty clip is reported as merely "too short".
            if sound_clip.shape[0] == 0:
                print("File %s is corrupted!" % fn)
                continue

            if sound_clip.shape[0] < n_window:
                print("File %s is shorter than window size - DISCARDING - look into making the window larger." % fn)
                continue

            # The class id is the second dash-separated field of the file name
            # (e.g. '7061-6-0-0.wav' -> 6). Parsing the basename keeps this
            # correct even when a parent directory name contains 'fold'.
            label = int(os.path.basename(fn).split('-')[1])

            # Compute spectrogram
            [f, t, x] = spectrogram(
                x=sound_clip,
                window=ham_win,
                nperseg=n_window,
                noverlap=n_overlap,
                detrend=False,
                return_onesided=True,
                mode='magnitude')
            # (time, freq) x (freq, bands) -> (time, bands)
            x = np.dot(x.T, melW.T)
            # Small epsilon avoids log(0) on silent bins.
            x = np.log(x + 1e-8)
            # Back to (bands, time) before fixing the number of frames.
            x = x.astype(np.float32).T
            x = pad_trunc_seq_rewrite(x, frames)

            log_specgrams_list.append(x)
            labels.append(label)

    n_clips = len(log_specgrams_list)
    log_specgrams = np.asarray(log_specgrams_list).reshape(n_clips, bands, frames, 1)
    # Three float64 channels per clip: spectrogram, delta, and one left at zero.
    features = np.zeros((n_clips, bands, frames, 3))
    features[:, :, :, 0:1] = log_specgrams
    for i in range(n_clips):
        # first order difference, computed over 9-step window
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    # np.int was removed in NumPy 1.24; the builtin int is the same dtype.
    return features, np.array(labels, dtype=int)

# convert labels to one-hot encoding
def one_hot_encode(labels, n_classes=None):
    """Turn integer class ids into one-hot rows.

    Parameters
    ----------
    labels : array-like of int, shape (n_labels,)
        Class ids used as column indices.
    n_classes : int or None
        Number of columns in the result. Defaults to the module-level
        ``num_total_classes`` so existing callers behave as before.

    Returns
    -------
    np.ndarray, shape (n_labels, n_classes)
        float64 array with a single 1 per row at the label's column.

    Raises
    ------
    IndexError
        If a label is >= ``n_classes``.
    """
    if n_classes is None:
        n_classes = num_total_classes
    # Coerce once so plain lists (including an empty one) index correctly.
    labels = np.asarray(labels, dtype=int)
    n_labels = len(labels)
    # Named `encoded` rather than reusing the function's own name.
    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), labels] = 1
    return encoded

In [7]:
def save_folds(data_dir, k, bands, frames):
    """Extract the features of fold ``k`` and store them as two .npy files.

    Writes ``fold<k>_x.npy`` (features) and ``fold<k>_y.npy`` (one-hot labels)
    into ``data_dir`` and prints progress along the way. The audio is read
    from the module-level ``parent_dir``. Returns nothing.
    """
    fold_name = 'fold' + str(k)
    print("Saving " + fold_name)

    fold_x, class_ids = extract_features(parent_dir, [fold_name], bands=bands, frames=frames)
    fold_y = one_hot_encode(class_ids)

    print("Features of", fold_name, " = ", fold_x.shape)
    print("Labels of", fold_name, " = ", fold_y.shape)

    # Features first, then labels; each file is announced once it is written.
    for suffix, array in (('_x.npy', fold_x), ('_y.npy', fold_y)):
        target = os.path.join(data_dir, fold_name + suffix)
        np.save(target, array)
        print("Saved " + target)

def assure_path_exists(path):
    """Create directory ``path`` (and any missing parents) if it is absent.

    ``path`` is resolved against the current working directory; an absolute
    path is used as is. Calling it for a directory that already exists is a
    no-op.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    mydir = os.path.join(os.getcwd(), path)
    # exist_ok=True removes the window between an "exists?" check and the
    # creation, so concurrent callers cannot fail on an already-made directory.
    os.makedirs(mydir, exist_ok=True)

In [8]:
# Create the output directory up front, before the fold workers save into it.
assure_path_exists(save_dir)

In [9]:
from joblib import Parallel, delayed

# One deferred save_folds call per fold (fold1 ... fold10).
fold_tasks = [
    delayed(save_folds)(save_dir, fold_id, bands=n_bands, frames=n_frames)
    for fold_id in range(1, 11)
]
# The worker count reuses num_total_classes (10), which equals the fold count here.
fold_runner = Parallel(n_jobs=num_total_classes)
fold_runner(fold_tasks)

Saving fold3
Saving fold5
Saving fold2
Saving fold4
Saving fold6
Saving fold7
Saving fold8
Saving fold10
Saving fold9
Saving fold1
File /home/enterprise.internal.city.ac.uk/acvn476/UrbanSound8K/audio/fold1/87275-1-2-0.wav is shorter than window size - DISCARDING - look into making the window larger.
File /home/enterprise.internal.city.ac.uk/acvn476/UrbanSound8K/audio/fold2/17307-1-0-0.wav is shorter than window size - DISCARDING - look into making the window larger.
File /home/enterprise.internal.city.ac.uk/acvn476/UrbanSound8K/audio/fold1/87275-1-4-0.wav is shorter than window size - DISCARDING - look into making the window larger.
File /home/enterprise.internal.city.ac.uk/acvn476/UrbanSound8K/audio/fold1/87275-1-1-0.wav is shorter than window size - DISCARDING - look into making the window larger.
File /home/enterprise.internal.city.ac.uk/acvn476/UrbanSound8K/audio/fold1/87275-1-3-0.wav is shorter than window size - DISCARDING - look into making the window larger.
Features of fold6  

[None, None, None, None, None, None, None, None, None, None]