In [1]:
import sys
import numpy as np 
import os
import pandas as pd 
from scipy.io import wavfile

import librosa
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from keras import regularizers, activations
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Activation, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.utils import np_utils, to_categorical

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from datetime import datetime 

from matplotlib import pyplot as plt

from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
import wave

ModuleNotFoundError: No module named 'librosa'

In [71]:
# Absolute locations of the UrbanSound8K audio folders and of the metadata CSV,
# resolved from paths relative to the current working directory.
US8K_AUDIO_PATH = os.path.abspath('../UrbanSound8K/audio/')
US8K_METADATA_PATH = os.path.abspath('../UrbanSound8K/metadata/UrbanSound8K.csv')
# Unfiltered read of the metadata CSV (all columns, pandas' default dtypes).
metadata = pd.read_csv(US8K_METADATA_PATH)

In [72]:
# Read the metadata CSV, keeping only the file name, fold number and class id
# of each clip; the two numeric columns are stored as unsigned 8-bit integers.
us8k_metadata_df = pd.read_csv(
    US8K_METADATA_PATH,
    dtype={"classID": "uint8", "fold": "uint8"},
    usecols=["slice_file_name", "fold", "classID"],
)

# Bare last expression so the notebook renders the frame.
us8k_metadata_df

Unnamed: 0,slice_file_name,fold,classID
0,100032-3-0-0.wav,5,3
1,100263-2-0-117.wav,5,2
2,100263-2-0-121.wav,5,2
3,100263-2-0-126.wav,5,2
4,100263-2-0-137.wav,5,2
...,...,...,...
8726,99812-1-2-0.wav,7,1
8727,99812-1-3-0.wav,7,1
8728,99812-1-4-0.wav,7,1
8729,99812-1-5-0.wav,7,1


In [73]:
# Spectrogram settings read by compute_melspectrogram_with_fixed_length, which
# forwards them to librosa.feature.melspectrogram as hop_length, win_length
# and n_mels respectively.
HOP_LENGTH = 512        # number of samples between successive frames
WINDOW_LENGTH = 512     # length of the window in samples
N_MEL = 128             # number of Mel bands to generate



def compute_melspectrogram_with_fixed_length(audio, sampling_rate, num_of_samples=128):
    """Return a log-mel spectrogram of `audio` with a fixed number of frames.

    Parameters
    ----------
    audio
        Waveform handed to ``librosa.feature.melspectrogram`` as ``y``.
    sampling_rate
        Sampling rate of `audio`, handed to librosa as ``sr``.
    num_of_samples
        Required length of the spectrogram's second axis (axis 1).

    Returns
    -------
    The decibel-scaled mel spectrogram (referenced to its own maximum) whose
    axis 1 has length `num_of_samples`, or ``None`` when any step raised an
    exception (the error is printed, not re-raised).
    """
    try:
        # Mel-scaled power spectrogram using the notebook-wide settings.
        power_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=sampling_rate,
            n_mels=N_MEL,
            hop_length=HOP_LENGTH,
            win_length=WINDOW_LENGTH,
        )

        # Power -> decibels, with the largest value as the 0 dB reference.
        log_spec = librosa.power_to_db(power_spec, ref=np.max)

        # Only touch the time axis when it differs from the target length;
        # fix_length is given -80.0 as the fill value for trailing padding.
        frame_count = log_spec.shape[1]
        if frame_count != num_of_samples:
            log_spec = librosa.util.fix_length(
                log_spec,
                size=num_of_samples,
                axis=1,
                constant_values=(0, -80.0),
            )
    except Exception as e:
        print("\nError encountered while parsing files\n>>", e)
        return None
    else:
        return log_spec

In [74]:
# Three single-transform augmentation pipelines. Each transform is built with
# p=1, i.e. it is configured to be applied on every call.

# Pitch shift bounded to the range [-4, +4] semitones.
augmente_PitchShift = Compose([PitchShift(min_semitones=-4, max_semitones=4, p=1)])

# Additive Gaussian noise with amplitude bounded to [0.001, 0.015].
augmente_GaussianNoise = Compose(
    [AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=1)]
)

# Time stretch with rate bounded to [0.8, 1.25].
augmente_TimeStretch = Compose([TimeStretch(min_rate=0.8, max_rate=1.25, p=1)])


In [75]:
def augmentation(sound, label, sample_rate):
    """Build the augmented variants of `sound` that its class label calls for.

    Which transforms run depends only on `label`:

    * 0, 5      -> pitch shift
    * 2         -> Gaussian noise
    * 1, 6, 7   -> pitch shift, Gaussian noise, time stretch (in that order)
    * any other -> nothing

    Parameters
    ----------
    sound
        Waveform handed to the augmentation pipelines.
    label
        Class id of the clip.
    sample_rate
        Sampling rate of `sound`, handed to the pipelines.

    Returns
    -------
    list
        One augmented waveform per applied transform; empty when the label
        has no augmentation assigned.
    """
    pitch_shift_labels = (0, 1, 5, 6, 7)
    gaussian_noise_labels = (1, 2, 6, 7)
    time_stretch_labels = (1, 6, 7)

    variants = []
    # The checks run in the fixed order pitch -> noise -> stretch so that the
    # returned list is ordered the same way for every label.
    if label in pitch_shift_labels:
        variants.append(augmente_PitchShift(sound, sample_rate))
    if label in gaussian_noise_labels:
        variants.append(augmente_GaussianNoise(sound, sample_rate))
    if label in time_stretch_labels:
        variants.append(augmente_TimeStretch(sound, sample_rate))
    return variants
        
    

In [76]:
# Length (in seconds) of the excerpt loaded from every clip. The name was used
# below but not defined anywhere in the visible notebook, so it is pinned here.
# NOTE(review): 2.95 s is chosen because, at librosa's default 22050 Hz load
# rate and HOP_LENGTH = 512, it yields 128 frames -- the default target length
# of compute_melspectrogram_with_fixed_length. Confirm against the original run.
SOUND_DURATION = 2.95

features = []

# iterate through all dataset examples and compute log-mel spectrograms
for index, row in tqdm(us8k_metadata_df.iterrows(), total=len(us8k_metadata_df)):
    label = row["classID"]
    fold = row["fold"]

    file_path = f'{US8K_AUDIO_PATH}/fold{fold}/{row["slice_file_name"]}'
    audio, sample_rate = librosa.load(file_path, duration=SOUND_DURATION, res_type='kaiser_fast')

    # class-dependent augmented copies of the waveform (may be empty)
    augmented_samples = augmentation(audio, label, sample_rate)

    # features of the original clip; compute_melspectrogram_with_fixed_length
    # returns None on failure, so a None spectrogram can end up in the table
    melspectrogram = compute_melspectrogram_with_fixed_length(audio, sample_rate)
    features.append([melspectrogram, label, fold])

    # features of every augmented copy, tagged with the same label and fold
    for augmented_audio in augmented_samples:
        melspectrogram_a = compute_melspectrogram_with_fixed_length(augmented_audio, sample_rate)
        features.append([melspectrogram_a, label, fold])

# convert into a Pandas DataFrame
us8k_df = pd.DataFrame(features, columns=["melspectrogram", "label", "fold"])

100%|██████████| 8731/8731 [15:10<00:00,  9.59it/s]


In [77]:
# (rows, columns) of the feature table: one row per original or augmented
# clip; the columns are "melspectrogram", "label" and "fold".
us8k_df.shape

(17137, 3)

In [78]:
# Persist the feature table as a pickle file in the working directory.
# Set WRITE_DATA to False to skip the write when re-running the notebook.
WRITE_DATA = True

if WRITE_DATA:
    us8k_df.to_pickle("us8k_augmented_df.pkl")