In [1]:
%%html
<style>@import url('style.css')</style><script>IPython.OutputArea.prototype._should_scroll = function(){return false}</script>

Common methods for dataset loading and preprocessing

In [2]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import multiprocessing
from joblib import Parallel, delayed
import glob
import os
import sys
import gc

import numpy as np
import seaborn as sb
import pandas as pd
import librosa
import pydub

import IPython.display
import cPickle
import logging
import time
import datetime

# Global plotting style for every figure produced by notebooks using these helpers.
sb.set(style="white", palette="muted")
# Number of worker processes handed to joblib.Parallel in extract_features().
CPU_COUNT = multiprocessing.cpu_count()

# Handle to the C library, used only for libc.malloc_trim(0) calls after
# memory-heavy batches in the loaders and the segment extractor.
# NOTE(review): "libc.so.6" is the glibc soname, so this presumably only loads
# on Linux - confirm before running on another platform.
from ctypes import cdll, CDLL
cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")    

In [3]:
def _load_audio(path, duration):
    """Decode an audio file into a fixed-length mono float signal.

    The file is converted to 22050 Hz mono, overlaid on `duration`
    milliseconds of silence (so shorter recordings are padded with silence)
    and cut to the first `duration` milliseconds.

    Args:
        path: Path of an audio file that pydub can decode.
        duration: Target clip length in milliseconds.

    Returns:
        1-D float array with the samples mapped from the int16 range onto
        [-1, 1].  Assumes the decoded segment holds 16-bit samples.
    """
    padding = pydub.AudioSegment.silent(duration=duration)
    recording = pydub.AudioSegment.from_file(path).set_frame_rate(22050).set_channels(1)
    audio = padding.overlay(recording)[0:duration]

    # np.frombuffer replaces the deprecated binary mode of np.fromstring.  It
    # returns a read-only view, which is fine because the arithmetic below
    # allocates a new float array.
    samples = np.frombuffer(audio._data, dtype="int16")
    return (samples + 0.5) / (0x7FFF + 0.5)   # convert to float

In [4]:
def load_urbansound():
    """Load raw audio and metadata content from the UrbanSound8K dataset.

    On the first call every clip listed in the dataset CSV is decoded with
    _load_audio() (4000 ms per clip) and the result is cached under URBAN_PATH
    as 'urban_meta.pkl' (metadata) and 'urban_audio.dat' (float32 memmap).
    Later calls return the cached files without touching the audio again.

    Returns:
        Tuple (rows_meta, rows_audio): a DataFrame with the columns
        'filename', 'fold', 'category' and 'category_name', and a
        (8732, 88200) float32 memmap holding one clip per row.
    """
    meta_path = URBAN_PATH + 'urban_meta.pkl'
    audio_path = URBAN_PATH + 'urban_audio.dat'
    # Hard-coded dataset size: 8732 clips x 88200 samples (4000 ms @ 22050 Hz).
    audio_shape = (8732, 88200)

    if os.path.isfile(meta_path) and os.path.isfile(audio_path):
        rows_meta = pd.read_pickle(meta_path)
        rows_audio = np.memmap(audio_path, dtype='float32', mode='r', shape=audio_shape)
        return rows_meta, rows_audio

    metadata = pd.read_csv(URBAN_PATH + 'UrbanSound8K/metadata/UrbanSound8K.csv')
    total = len(metadata)

    batch_size = 1000
    rows_meta = []
    rows_audio = []
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        for _, row in metadata[start:stop].iterrows():
            filename = row['slice_file_name']
            fold = row['fold']
            category = row['classID']
            category_name = row['class']
            rows_meta.append(pd.DataFrame({'filename': filename, 'fold': fold, 'category': category, 'category_name': category_name}, index=[0]))
            rows_audio.append(_load_audio(URBAN_PATH + 'UrbanSound8K/audio/fold{}/{}'.format(fold, filename), 4000))
        libc.malloc_trim(0)
        # Collapse everything loaded so far into a single frame / array so the
        # per-clip temporaries can be released after every batch.
        rows_meta = [pd.concat(rows_meta, ignore_index=True)]
        rows_audio = [np.vstack(rows_audio)]
        IPython.display.clear_output(wait=True)
        # Report the number of clips actually loaded; the last batch is
        # usually shorter than batch_size.
        print('Loaded batch {} ({} / {})'.format(start // batch_size + 1, stop, total))

    rows_meta = rows_meta[0]
    rows_meta[['category', 'fold']] = rows_meta[['category', 'fold']].astype(int)

    rows_meta.to_pickle(meta_path)
    mm = np.memmap(audio_path, dtype='float32', mode='w+', shape=audio_shape)
    mm[:] = rows_audio[0][:]
    mm.flush()
    del rows_audio
    return rows_meta, mm

In [5]:
def load_esc(variant):
    """Load raw audio and metadata content from the ESC-10/ESC-50 dataset.

    On the first call every '.ogg' clip found in the class directories of the
    dataset is decoded with _load_audio() (5000 ms per clip) and the result is
    cached under ESC_PATH as '<esc10|esc50>_meta.pkl' (metadata) and
    '<esc10|esc50>_audio.dat' (float32 memmap).  Later calls return the cached
    files directly.

    Args:
        variant: 10 selects ESC-10 (400 clips); any other value selects
            ESC-50 (2000 clips).

    Returns:
        Tuple (rows_meta, rows_audio): a DataFrame with the columns
        'filename', 'fold', 'category' and 'category_name', and a
        (clips, 110250) float32 memmap holding one clip per row.
    """
    if variant == 10:
        prefix, clip_count, path = 'esc10', 400, ESC_PATH + 'ESC-10'
    else:
        prefix, clip_count, path = 'esc50', 2000, ESC_PATH + 'ESC-50'

    meta_path = ESC_PATH + prefix + '_meta.pkl'
    audio_path = ESC_PATH + prefix + '_audio.dat'
    # Hard-coded dataset size: clips x 110250 samples (5000 ms @ 22050 Hz).
    audio_shape = (clip_count, 110250)

    if os.path.isfile(meta_path) and os.path.isfile(audio_path):
        rows_meta = pd.read_pickle(meta_path)
        rows_audio = np.memmap(audio_path, dtype='float32', mode='r', shape=audio_shape)
        return rows_meta, rows_audio

    rows_meta = []
    rows_audio = []
    category_counter = 0

    for name in sorted(os.listdir(path)):
        directory = '{0}/{1}'.format(path, name)
        # Class directories are recognised by a three-digit name prefix.
        if not (os.path.isdir(directory) and name[0:3].isdigit()):
            continue
        print('Parsing ' + directory)
        for clip in sorted(os.listdir(directory)):
            if clip[-3:] != 'ogg':
                continue
            filepath = '{0}/{1}'.format(directory, clip)
            # The fold is taken from the first character of the file name and
            # converted to int once all clips are loaded.
            rows_meta.append(pd.DataFrame({'filename': clip, 'fold': clip[0], 'category': category_counter, 'category_name': name}, index=[0]))
            rows_audio.append(_load_audio(filepath, 5000))
        libc.malloc_trim(0)
        # Collapse everything loaded so far into a single frame / array so the
        # per-clip temporaries can be released after every directory.
        rows_meta = [pd.concat(rows_meta, ignore_index=True)]
        rows_audio = [np.vstack(rows_audio)]
        category_counter = category_counter + 1

    rows_meta = rows_meta[0]
    rows_meta[['category', 'fold']] = rows_meta[['category', 'fold']].astype(int)

    rows_meta.to_pickle(meta_path)
    mm = np.memmap(audio_path, dtype='float32', mode='w+', shape=audio_shape)
    mm[:] = rows_audio[0][:]
    mm.flush()
    del rows_audio
    return rows_meta, mm

In [6]:
def _extract_segments(args):
    """Cut one clip into overlapping windows and compute log-mel features.

    The clip is peak-normalized, split into windows of 512 * (frames - 1)
    samples with a step of half a window, and each window is turned into a
    flattened log-amplitude mel spectrogram with 60 bands.  Windows whose mean
    log-amplitude is not above -70 are treated as silent and dropped.

    Args:
        args: Tuple (clip, filename, fold, category, category_name,
            augmented, frames).  Packed into a single argument so the
            function can be dispatched through joblib's `delayed`.

    Returns:
        DataFrame with one row per kept window: the metadata columns
        'filename', 'fold', 'category', 'category_name', 's_begin', 's_end'
        and 'augmented', followed by 'logspec_b<band>_f<frame>' feature
        columns.  An empty DataFrame is returned when no window is kept.
    """
    clip, filename, fold, category, category_name, augmented, frames = args

    # Due to an off-by-one bug which was not caught earlier, both variants
    # (long and short) actually use the same overlap setting (half of the
    # window size) - whereas different settings were mentioned in the paper.
    #
    # The code below has already been cleaned up to reflect those changes.
    #
    # Apart from that, for reproducibility purposes it is required that
    # librosa v0.3.1 is used, as later versions drastically change
    # the delta computations.

    FRAMES_PER_SEGMENT = frames - 1  # 41 frames ~= 950 ms segment length @ 22050 Hz
    WINDOW_SIZE = 512 * FRAMES_PER_SEGMENT   # 23 ms per frame @ 22050 Hz
    STEP_SIZE = 512 * FRAMES_PER_SEGMENT // 2
    BANDS = 60

    # Peak-normalize; an all-zero clip is left untouched instead of being
    # turned into NaNs by a division by zero.
    peak = np.max(np.abs(clip))
    if peak > 0:
        normalization_factor = 1 / peak
        clip = clip * normalization_factor

    segments = []
    columns = None
    s = 0
    while s * STEP_SIZE + WINDOW_SIZE <= len(clip):
        begin = s * STEP_SIZE
        end = begin + WINDOW_SIZE
        signal = clip[begin:end]

        melspec = librosa.feature.melspectrogram(y=signal, sr=22050, n_fft=1024, hop_length=512, n_mels=BANDS)
        logspec = librosa.logamplitude(melspec)
        # Flatten frame by frame into a single row: index i holds band
        # i % BANDS of frame i // BANDS.
        logspec = logspec.T.flatten()[:, np.newaxis].T
        if columns is None:
            # Floor division keeps the frame number an integer in the column
            # label regardless of the division semantics in effect.
            columns = ['logspec_b{}_f{}'.format(i % BANDS, i // BANDS) for i in range(np.shape(logspec)[1])]
        logspec = pd.DataFrame(data=logspec, dtype='float32', index=[0], columns=columns)

        if np.mean(logspec.values) > -70.0:   # drop silent frames
            segment_meta = pd.DataFrame({'filename': filename, 'fold': fold, 'category': category, 'category_name': category_name,
                                        's_begin': begin, 's_end': end, 'augmented': augmented}, index=[0])
            segments.append(pd.concat((segment_meta, logspec), axis=1))
        s = s + 1

    # pd.concat() raises on an empty list, which happens when the clip is
    # shorter than one window or every window was dropped as silent.
    if segments:
        segments = pd.concat(segments, ignore_index=True)
    else:
        segments = pd.DataFrame()
    libc.malloc_trim(0)
    return segments
    
def _augment(audio, category_name):
    """Return a randomly pitch-shifted, time-stretched and delayed copy of a clip.

    Per-category limits bound the random pitch shift (in half-steps, both ends
    inclusive) and the time-stretch rate; categories without an entry are
    neither shifted nor stretched.  The result is prefixed with a random
    amount of silence of up to 22049 samples.

    Random numbers come from the global NumPy generator, so the output is
    reproducible only if the caller seeds it.

    NOTE: the pitch shift used to be drawn from
    `randint(upper, upper + 1)`, i.e. it was always the upper limit.  It is
    now drawn from the whole [lower, upper] range, so augmented clips differ
    from those produced by earlier versions of this function.

    Args:
        audio: 1-D float signal sampled at 22050 Hz.
        category_name: Class label used to look up the augmentation limits.

    Returns:
        1-D float array containing the silence prefix followed by the
        transformed signal.
    """
    # ((min, max) pitch shift in half-steps, (min, max) time stretch rate)
    default_limits = ((0, 0), (1.0, 1.0))
    limits_by_category = {
        # ESC-10:
        '001 - Dog bark': ((-4, 4), (0.95, 1.1)),
        '003 - Sea waves': ((0, 0), (0.9, 1.2)),
        '004 - Baby cry': ((-3, 6), (0.8, 1.3)),
        '006 - Person sneeze': ((-4, 4), (0.9, 1.2)),
        '007 - Helicopter': ((0, 0), (0.9, 1.2)),
        '008 - Chainsaw': ((-4, 2), (0.9, 1.2)),
        '009 - Rooster': ((-3, 2), (0.95, 1.1)),
        # UrbanSound8K:
        'car_horn': ((-2, 2), (0.9, 1.1)),
        'children_playing': ((-2, 3), (0.9, 1.2)),
        'dog_bark': ((-4, 4), (0.95, 1.1)),
        'drilling': ((-1, 1), (0.9, 1.1)),
        'gun_shot': ((-1, 1), (0.9, 1.1)),
        'siren': ((-2, 3), (0.9, 1.2)),
        'street_music': ((-6, 6), (0.9, 1.2)),
    }
    (pitch_low, pitch_high), (stretch_low, stretch_high) = limits_by_category.get(category_name, default_limits)

    # randint() excludes its upper bound, hence the + 1.  The three draws are
    # kept in this order so a seeded run stays deterministic.
    pitch_shift = np.random.randint(pitch_low, pitch_high + 1)
    time_stretch = np.random.random() * (stretch_high - stretch_low) + stretch_low
    time_shift = np.random.randint(22050)

    shifted = librosa.effects.pitch_shift(audio, sr=22050, n_steps=pitch_shift)
    stretched = librosa.effects.time_stretch(shifted, rate=time_stretch)
    return np.hstack((np.zeros(time_shift), stretched))
        

def extract_features(meta, audio, augmentations=0, frames=41):
    """Extract log-mel segment features for every clip, optionally augmented.

    Clips are processed in batches of 100.  Within a batch the original clips
    are passed to _extract_segments() in parallel, followed by
    `augmentations` additional passes over the same clips after they have
    been transformed by _augment().  The global NumPy random generator is
    re-seeded on every call so the augmentations are reproducible.

    Args:
        meta: DataFrame with the columns 'filename', 'fold', 'category' and
            'category_name'.  Rows are looked up with `meta.loc[i, ...]` for
            i in range(len(audio)), so the index labels must be 0..n-1.
        audio: 2-D array with one clip per row, aligned with `meta`.
        augmentations: Number of augmented copies generated per clip.
        frames: Forwarded to _extract_segments() to set the segment length.

    Returns:
        DataFrame with one row per extracted segment; 'augmented' is 0 for
        segments of original clips and 1 for segments of augmented clips.
        An empty DataFrame is returned when `audio` holds no clips.
    """
    np.random.seed(20150520)
    batch_size = 100
    total = len(audio)

    def _tasks(start, end, augmented):
        # Lazily build the joblib tasks for clips start..end-1.  _augment()
        # runs here, in the parent process and in clip order, so it draws from
        # the seeded global random state deterministically.
        for i in range(start, end):
            clip = audio[i, :]
            if augmented:
                clip = _augment(clip, meta.loc[i, 'category_name'])
            yield delayed(_extract_segments)((
                clip,
                meta.loc[i, 'filename'],
                meta.loc[i, 'fold'],
                meta.loc[i, 'category'],
                meta.loc[i, 'category_name'],
                augmented,
                frames
            ))

    segments = []
    # Stepping by batch_size avoids an extra, empty batch when the number of
    # clips is an exact multiple of the batch size.
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)

        segments.extend(Parallel(n_jobs=CPU_COUNT)(_tasks(start, end, 0)))
        for _ in range(augmentations):
            segments.extend(Parallel(n_jobs=CPU_COUNT)(_tasks(start, end, 1)))
        # Merge into a single frame after every batch to keep the number of
        # live DataFrames small.
        segments = [pd.concat(segments, ignore_index=True)]
        IPython.display.clear_output(wait=True)
        print('{} / {}'.format(end, total))

    if not segments:
        return pd.DataFrame()
    return segments[0]

In [7]:
def to_percentage(number):
    """Convert a fraction to a percentage truncated to one decimal place.

    The value is scaled to tenths of a percent, truncated toward zero with
    int(), and scaled back, e.g. 0.1234 -> 12.3.
    """
    tenths_of_percent = int(number * 1000)
    return tenths_of_percent / 10.0