In [41]:
import sys
import os
import IPython as IP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pickle
import helpers
import glob
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from IPython.display import clear_output, display
from scipy.stats import kurtosis, skew
from tensorflow.keras.utils import to_categorical
import warnings
warnings.filterwarnings('ignore')

# Loading Meta data

In [10]:
import pandas as pd
dataset = pd.read_csv('C:/Users/ishaa/OneDrive/Desktop/dataset/UrbanSound8K.csv')
audio_dataset_path=('C:/Users/ishaa/OneDrive/Desktop/dataset/audio/')
print(dataset.shape)
dataset.head(10) 

(8732, 8)


Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing
5,100263-2-0-143.wav,100263,71.5,75.5,1,5,2,children_playing
6,100263-2-0-161.wav,100263,80.5,84.5,1,5,2,children_playing
7,100263-2-0-3.wav,100263,1.5,5.5,1,5,2,children_playing
8,100263-2-0-36.wav,100263,18.0,22.0,1,5,2,children_playing
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn


# Method of MFCC coefficients extraction

In [15]:
# Iterate through all audio files and extract MFCC
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(dataset)
n_mfcc = 40

for index, row in dataset.iterrows():
    file_path = os.path.join(os.path.abspath(audio_dataset_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]

    # Extract MFCCs (do not add padding)
    mfccs = helpers.get_mfcc(file_path, 0, n_mfcc)
    
    # Save current frame count
    num_frames = mfccs.shape[1]
    
    # Add row (feature / label)
    features.append(mfccs)
    labels.append(class_label)

    # Update frames maximum
    if (num_frames > frames_max):
        frames_max = num_frames

    # Notify update every N files
    if (counter == 500):
        print("Status: {}/{}".format(index+1, total_samples))
        counter = 0

    counter += 1
    
print("Finished: {}/{}".format(index, total_samples))


Status: 501/8732
Status: 1001/8732
Status: 1501/8732
Status: 2001/8732
Status: 2501/8732
Status: 3001/8732
Status: 3501/8732
Status: 4001/8732
Status: 4501/8732
Status: 5001/8732
Status: 5501/8732
Status: 6001/8732
Status: 6501/8732
Status: 7001/8732
Status: 7501/8732
Status: 8001/8732
Status: 8501/8732
Finished: 8731/8732


In [16]:
padded = []

# Add padding
mels_max_padding = frames_max
for i in range(len(features)):
    size = len(features[i][0])
    if (size < mels_max_padding):
        pad_width = mels_max_padding - size
        px = np.pad(features[i], 
                    pad_width=((0, 0), (0, pad_width)), 
                    mode='constant', 
                    constant_values=(0,))
    
    padded.append(px)
    
# Add padding to features with less than frames than frames_max
padded_features = helpers.add_padding(features, frames_max)

# Save MFCC features

In [17]:
# Convert features (X) and labels (y) to Numpy arrays
X = np.array(padded)
y = np.array(labels)

# Optionally save the features to disk
np.save("C:/Users/ishaa/OneDrive/Desktop/Urban Sound_Final/Features/x-mfcc", X)
np.save("C:/Users/ishaa/OneDrive/Desktop/Urban Sound_Final/Features/y-mfcc", y)


# Feature Extraction : Mel Spectogram

In [59]:
import helpers

In [75]:
def get_mel_spectrogram(file_path, padding_frames, n_mels=40):
    # Load audio file
    y, sr = librosa.load(file_path)
    
    # Compute the Mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    
    # If padding_frames is greater than 0, pad the spectrogram
    if padding_frames > 0:
        mel_spectrogram = librosa.util.pad_center(mel_spectrogram, pad_width=(0, padding_frames))
    
    # Convert to log scale
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)
    
    return log_mel_spectrogram

# Iterate through all audio files and extract mel spectogram
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(dataset)
n_mels=40

for index, row in dataset.iterrows():
    file_path = os.path.join(os.path.abspath(audio_dataset_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]

    # Extract Log-Mel Spectrograms (do not add padding)
    mels = get_mel_spectrogram(file_path, 0, n_mels=n_mels)
    
    # Save current frame count
    num_frames = mels.shape[1]
    
    # Add row (feature / label)
    features.append(mels)
    labels.append(class_label)

    # Update frames maximum
    if (num_frames > frames_max):
        frames_max = num_frames

    # Notify update every N files
    if (counter == 500):
        print("Status: {}/{}".format(index+1, total_samples))
        counter = 0

    counter += 1
    
print("Finished: {}/{}".format(index, total_samples))

Status: 501/8732
Status: 1001/8732
Status: 1501/8732
Status: 2001/8732
Status: 2501/8732
Status: 3001/8732
Status: 3501/8732
Status: 4001/8732
Status: 4501/8732
Status: 5001/8732
Status: 5501/8732
Status: 6001/8732
Status: 6501/8732
Status: 7001/8732
Status: 7501/8732
Status: 8001/8732
Status: 8501/8732
Finished: 8731/8732


In [76]:
padded = []

# Add padding
mels_max_padding = frames_max
for i in range(len(features)):
    size = len(features[i][0])
    if (size < mels_max_padding):
        pad_width = mels_max_padding - size
        px = np.pad(features[i], 
                    pad_width=((0, 0), (0, pad_width)), 
                    mode='constant', 
                    constant_values=(0,))
    
    padded.append(px)
    
# Add padding to features with less than frames than frames_max
padded_features = helpers.add_padding(features, frames_max)

In [77]:
## Convert features (X) and labels (y) to Numpy arrays

X = np.array(padded)
y = np.array(labels)

# Optionally save the features to disk
np.save("C:/Users/ishaa/OneDrive/Desktop/Urban Sound_Final/Features/x-melSpec", X)
np.save("C:/Users/ishaa/OneDrive/Desktop/Urban Sound_Final/Features/y-melSpec", y)

# Feature Extraction : Chromagram

In [79]:
# Iterate through all audio files and extract chromagram
features = []
labels = []
frames_max = 0
counter = 0
total_samples = len(dataset)
n_chroma = 40


#Iterating through all audio files and extracting chromagram
for index, row in dataset.iterrows():
    file_path = os.path.join(os.path.abspath('audio_dataset_path'), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]
    try:
        y, sr = librosa.load(file_path)
        normalized_y = librosa.util.normalize(y)
        chromagram = librosa.feature.chroma_stft(y=normalized_y,
                                        sr=sr)
        
        normalized_chroma = librosa.util.normalize(chromagram)
        shape = normalized_chroma.shape[1]
        chroma = normalized_chroma
        num_frames = chroma.shape[1]
        features.append(chroma)
        labels.append(class_label)
        if (num_frames > frames_max):
            frames_max = num_frames
        if (counter == 1):
            print("Status: {}/{}".format(index+1, total_samples))
            counter = 0
        counter += 1
    except Exception:
        pass
print("Finished: {}/{}".format(index, total_samples))

Finished: 8731/8732


# Padding the feature variable

In [80]:
# Given an numpy array of features, zero-pads each ocurrence to max_padding
def add_padding(features, chroma_max_padding=174):
    padded = []
    for i in range(len(features)):
        px = features[i]
        size = len(px[0])
        # Add padding if required
        if (size < chroma_max_padding):
            xDiff = chroma_max_padding - size
            xLeft = xDiff//2
            xRight = xDiff-xLeft
            px = np.pad(px, pad_width=((0,0), (xLeft, xRight)), mode='constant')
        
        padded.append(px)

    return padded

# Add padding to features with less than frames than frames_max
padded_features = add_padding(features, frames_max)


In [81]:
# Convert features (X) and labels (y) to Numpy arrays

X = np.array(padded) #Padded Feature are converted to numpy array & stored in X
y = np.array(labels) #Labels are coverted to numpy array & stored in y

# Optionally save the features to disk
np.save("C:/Users/ishaa/OneDrive/Desktop/Urban Sound_Final/Features/x-chroma", X)
np.save("C:/Users/ishaa/OneDrive/Desktop/Urban Sound_Final/Features/y-chroma", y)

# Other feature extraction

In [93]:
os.path.abspath(audio_dataset_path)

'C:\\Users\\ishaa\\OneDrive\\Desktop\\dataset\\audio'

In [94]:
for index, row in dataset.iterrows():
    file_path = os.path.join(os.path.abspath(audio_dataset_path), 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]
    
print(file_path)

C:\Users\ishaa\OneDrive\Desktop\dataset\audio\fold7\99812-1-6-0.wav


In [92]:
y,sr=librosa.load(file_path)
melspectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
mel1 = np.max(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
mel2 = np.min(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
mel3 = np.var(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
mel4 = np.median(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
mfccs_min = np.min(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  
mfccs_min.resize((40,),refcheck=False)
mfccs_max = np.max(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  
mfccs_max.resize((40,),refcheck=False)
mfccs_median = np.median(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0) 
mfccs_median.resize((40,),refcheck=False)
mfccs_mean = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  
mfccs_mean.resize((40,),refcheck=False)
mfccs_variance = np.var(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0) 
mfccs_variance.resize((40,),refcheck=False)
mfccs_skewness = skew(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  
mfccs_skewness.resize((40,),refcheck=False)
mfccs_kurtosis = kurtosis(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0) 
mfccs_kurtosis.resize((40,),refcheck=False)
chroma_stft=np.mean(librosa.feature.chroma_stft(y=y, sr=sr,n_chroma=40).T,axis=0)
chrom2=np.std(librosa.feature.chroma_stft(y=y, sr=sr,n_chroma=40).T,axis=0)
rmse1 = np.mean(librosa.feature.rms(y=y).T,axis=0)
rmse1.resize((40,),refcheck=False)
rmse2 = np.std(librosa.feature.rms(y=y).T,axis=0)
rmse2.resize((40,),refcheck=False)
spec_bw1 = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr,n_fft=2048,hop_length=512).T,axis=0)
spec_bw1.resize((40,),refcheck=False)
spec_bw2 = np.std(librosa.feature.spectral_bandwidth(y=y, sr=sr,n_fft=2048,hop_length=512).T,axis=0)
spec_bw2.resize((40,),refcheck=False)
rolloff1 = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr,n_fft=2048,hop_length=512).T,axis=0)
rolloff1.resize((40,),refcheck=False)
rolloff2 = np.std(librosa.feature.spectral_rolloff(y=y, sr=sr,n_fft=2048,hop_length=512).T,axis=0)
rolloff2.resize((40,),refcheck=False)
centroid1 = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr,n_fft=2048,hop_length=512).T,axis=0)
centroid1.resize((40,),refcheck=False)
centroid2 = np.std(librosa.feature.spectral_centroid(y=y, sr=sr,n_fft=2048,hop_length=512).T,axis=0)
centroid2.resize((40,),refcheck=False)
zcr1 = np.mean(librosa.feature.zero_crossing_rate(y).T,axis=0)
zcr1.resize((40,),refcheck=False)
zcr2 = np.std(librosa.feature.zero_crossing_rate(y).T,axis=0)
zcr2.resize((40,),refcheck=False)
chrcn1=np.mean(librosa.feature.chroma_cens(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
chrcn1.resize((40,),refcheck=False)
chrcn2=np.std(librosa.feature.chroma_cens(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
chrcn2.resize((40,),refcheck=False)
chrcqt1=np.mean(librosa.feature.chroma_cqt(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
chrcqt1.resize((40,),refcheck=False)
chrcqt2=np.std(librosa.feature.chroma_cens(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
chrcqt2.resize((40,),refcheck=False)
melspectrogram.shape,mel1.shape,mel2.shape,mel3.shape,mel4.shape,mfccs_min.shape,mfccs_max.shape,mfccs_median.shape,mfccs_mean.shape,mfccs_variance.shape,mfccs_skewness.shape,mfccs_kurtosis.shape,chroma_stft.shape,chrom2.shape,rmse1.shape,rmse2.shape,spec_bw1.shape,spec_bw2.shape,rolloff1.shape,rolloff2.shape,centroid1.shape,centroid2.shape,zcr1.shape,zcr2.shape,chrcn1.shape,chrcn2.shape,chrcqt1.shape,chrcqt2.shape

((40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,),
 (40,))

In [95]:
features=np.vstack((melspectrogram,mel1,mel2,mel3,mel4,mfccs_min,mfccs_max,mfccs_median,mfccs_mean,mfccs_variance,mfccs_skewness,mfccs_kurtosis,chroma_stft,chrom2,rmse1,rmse2,spec_bw1,spec_bw2,rolloff1,rolloff2,centroid1,centroid2,zcr1,zcr2,chrcn1,chrcn2,chrcqt1,chrcqt2))
features.reshape

<function ndarray.reshape>

In [96]:
features

array([[7.91354328e-02, 6.10220879e-02, 6.43379465e-02, ...,
        2.78551015e-04, 1.75420006e-04, 9.97673269e-05],
       [3.01072776e-01, 3.24417651e-01, 2.38359332e-01, ...,
        2.71630543e-03, 1.31788035e-03, 8.62481364e-04],
       [9.81561653e-03, 6.90746401e-03, 5.78295300e-03, ...,
        1.12634343e-05, 1.77016409e-05, 8.39356289e-06],
       ...,
       [4.56163362e-02, 1.60106719e-02, 4.67957743e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.65843469e-01, 4.72705901e-01, 6.16207242e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.56163362e-02, 1.60106719e-02, 4.67957743e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [100]:
#preprocessing using entire feature set
from tqdm import tqdm
x_train=[]
x_test=[]
y_train=[]
y_test=[]
path="C:/Users/ishaa/OneDrive/Desktop/dataset/audio/fold"
for i in tqdm(range(len(dataset))):
    fold_no=str(dataset.iloc[i]["fold"])
    file=dataset.iloc[i]["slice_file_name"]
    label=dataset.iloc[i]["classID"]
    filename=path+fold_no+"/"+file
    y,sr=librosa.load(filename)
    mfccs_min = np.min(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  
    mfccs_max = np.max(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  
    mfccs_median = np.median(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0) 

    mfccs_mean = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  

    mfccs_variance = np.var(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0) 

    mfccs_skewness = skew(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0)  

    mfccs_kurtosis = kurtosis(librosa.feature.mfcc(y=y, sr=sr, n_mfcc = 20).T,axis=0) 
    melspectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
    mel1 = np.max(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
    mel2 = np.min(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
    mel3 = np.var(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
    mel4 = np.median(librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40,fmax=8000).T,axis=0)
    chroma_stft=np.mean(librosa.feature.chroma_stft(y=y, sr=sr,n_chroma=40).T,axis=0)
    chrom2=np.std(librosa.feature.chroma_stft(y=y, sr=sr,n_chroma=40).T,axis=0)
    rmse1 = np.mean(librosa.feature.rms(y=y).T,axis=0)
    rmse2 = np.std(librosa.feature.rms(y=y).T,axis=0)
    spec_bw1 = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr).T,axis=0)
    spec_bw2 = np.std(librosa.feature.spectral_bandwidth(y=y, sr=sr).T,axis=0)
    rolloff1 = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr).T,axis=0)
    rolloff2 = np.std(librosa.feature.spectral_rolloff(y=y, sr=sr).T,axis=0)
    centroid1 = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr).T,axis=0)
    centroid2 = np.std(librosa.feature.spectral_centroid(y=y, sr=sr).T,axis=0)
    zcr1 = np.mean(librosa.feature.zero_crossing_rate(y).T,axis=0)
    zcr2 = np.std(librosa.feature.zero_crossing_rate(y).T,axis=0)
    chrcn1=np.mean(librosa.feature.chroma_cens(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
    chrcn2=np.std(librosa.feature.chroma_cens(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
    chrcqt1=np.mean(librosa.feature.chroma_cqt(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
    chrcqt2=np.std(librosa.feature.chroma_cens(y=y, sr=sr,C=None,n_chroma=12,bins_per_octave=36).T,axis=0)
    features
    if(fold_no!='10'):
      x_train.append(features)
      y_train.append(label)
    else:
      x_test.append(features)
      y_test.append(label)

100%|████████████████████████████████████████████████████████████████████████████| 8732/8732 [7:03:27<00:00,  2.91s/it]


In [101]:
len(x_train)+len(x_test)

8732

In [102]:
len(dataset)

8732

In [103]:
#converting the lists into numpy arrays
x_train1=np.array(x_train)
x_test1=np.array(x_test)
y_train1=np.array(y_train)
y_test1=np.array(y_test)
x_train1.shape,x_test1.shape,y_train1.shape,y_test1.shape

((7895, 28, 40), (837, 28, 40), (7895,), (837,))

In [104]:
#reshaping into 2d to save in csv format
x_train_2d=np.reshape(x_train1,(x_train1.shape[0],x_train1.shape[1]*x_train1.shape[2]))
x_test_2d=np.reshape(x_test1,(x_test1.shape[0],x_test1.shape[1]*x_test1.shape[2]))
x_train_2d.shape,x_test_2d.shape

((7895, 1120), (837, 1120))

In [105]:
#saving the data numpy arrays
np.savetxt("train_data1.csv", x_train_2d, delimiter=",")
np.savetxt("test_data1.csv",x_test_2d,delimiter=",")
np.savetxt("train_labels1.csv",y_train,delimiter=",")
np.savetxt("test_labels1.csv",y_test,delimiter=",")