In [2]:
import pandas as pd
import librosa 
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from FeatureExtractor import FeatureExtractor

In [3]:
%matplotlib inline

In [4]:
# Framing parameters shared by the feature extraction cells below: they are
# passed as ``frame_size`` / ``hop_length`` to the FeatureExtractor methods.
FRAME_SIZE = 512
HOP_LENGTH = 256

In [5]:
# Load the clip metadata table; the path is relative to the notebook's working directory.
sound_db = pd.read_csv('./db/UrbanSound8K.csv')

In [6]:
# Check column names, dtypes and non-null counts of the metadata table.
sound_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8732 entries, 0 to 8731
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   slice_file_name  8732 non-null   object 
 1   fsID             8732 non-null   int64  
 2   start            8732 non-null   float64
 3   end              8732 non-null   float64
 4   salience         8732 non-null   int64  
 5   fold             8732 non-null   int64  
 6   classID          8732 non-null   int64  
 7   class            8732 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 545.9+ KB


In [7]:
# Summary statistics for the numeric metadata columns.
sound_db.describe()

Unnamed: 0,fsID,start,end,salience,fold,classID
count,8732.0,8732.0,8732.0,8732.0,8732.0,8732.0
mean,116033.493816,38.645409,42.253312,1.347,5.385937,4.592877
std,57991.017218,74.292126,74.369669,0.476043,2.84682,2.894544
min,344.0,0.0,0.105962,1.0,1.0,0.0
25%,69942.25,3.0,6.839398,1.0,3.0,2.0
50%,118279.0,10.376492,14.0,1.0,5.0,4.0
75%,166942.0,35.131372,38.866979,2.0,8.0,7.0
max,209992.0,600.125356,604.125356,2.0,10.0,9.0


In [8]:
# Preview the first rows of the metadata table.
sound_db.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [13]:
class FeatureExtractor:
    """Stateless helpers that compute frame-based features of an audio signal.

    Every method is a ``@staticmethod``; the class is only a namespace.

    NOTE(review): the notebook also runs ``from FeatureExtractor import
    FeatureExtractor`` in its import cell. Executing this cell rebinds the
    name, so this definition is the one used afterwards -- confirm which of
    the two is meant to be authoritative.
    """

    @staticmethod
    def amplitude_envelope(signal: np.ndarray, frame_size: int, hop_length: int) -> np.ndarray:
        """Return the maximum sample value of each frame of ``signal``.

        Frames start at indices ``0, hop_length, 2 * hop_length, ...`` and
        cover up to ``frame_size`` samples. The signal is not padded, so the
        trailing frames are shorter than ``frame_size``.

        The maximum is taken over the raw sample values (not their absolute
        values). ``np.max`` is used instead of the builtin ``max`` so that a
        NaN inside a frame always propagates to the result rather than being
        kept or dropped depending on its position in the frame.

        Args:
            signal: Sequence of samples, indexed along its first axis.
            frame_size: Number of samples per frame.
            hop_length: Number of samples between consecutive frame starts.

        Returns:
            Array with one value per frame; empty if ``signal`` is empty.

        Raises:
            ValueError: If ``hop_length`` is zero, or if ``frame_size`` yields
                an empty frame.
        """
        frame_starts = range(0, len(signal), hop_length)
        return np.array([np.max(signal[start:start + frame_size]) for start in frame_starts])

    @staticmethod
    def root_mean_square(signal: np.ndarray, frame_size: int, hop_length: int) -> np.ndarray:
        """Return ``librosa.feature.rms`` of ``signal`` for the given framing.

        ``frame_size`` is forwarded as ``frame_length``; the librosa result is
        returned unchanged.
        """
        return librosa.feature.rms(y=signal, frame_length=frame_size, hop_length=hop_length)

    @staticmethod
    def zero_crossing_rate(signal: np.ndarray, frame_size: int, hop_length: int) -> np.ndarray:
        """Return ``librosa.feature.zero_crossing_rate`` of ``signal``.

        ``frame_size`` is forwarded as ``frame_length``; the librosa result is
        returned unchanged.
        """
        return librosa.feature.zero_crossing_rate(y=signal, frame_length=frame_size, hop_length=hop_length)

    @staticmethod
    def short_time_fourier_transform(signal: np.ndarray, hop_length: int, frame_size: int = 2048) -> np.ndarray:
        """Return ``librosa.stft`` of ``signal``.

        Args:
            signal: Audio samples.
            hop_length: Number of samples between consecutive STFT columns.
            frame_size: FFT window size, forwarded as ``n_fft``. The default
                of 2048 is librosa's own default, so callers that omit it get
                the same result as before this parameter existed.
        """
        return librosa.stft(y=signal, n_fft=frame_size, hop_length=hop_length)

    @staticmethod
    def normalize_feature(feature: np.ndarray) -> float:
        """Collapse ``feature`` to a single scalar: ``sqrt(sum(feature ** 2))``.

        Despite the name this does not rescale the input; it returns the
        square root of the sum of squared elements (the L2 norm for real
        input). The value therefore grows with the number of elements, i.e.
        with the number of frames.
        """
        return np.sqrt(np.sum(feature ** 2))

In [10]:
class AudioDataLoader:
    """Builds one feature record per audio clip listed in a metadata DataFrame."""

    def load_from_df(self, df: pd.DataFrame, max_entries: "int | None" = 11, base_dir: str = './db'):
        """Load clips referenced by ``df`` and compute scalar features for each.

        For every processed row the audio file
        ``{base_dir}/fold{fold}/{slice_file_name}`` is loaded with
        ``librosa.load`` and three features are computed with the
        module-level ``FRAME_SIZE`` / ``HOP_LENGTH`` settings, each reduced to
        a scalar by ``FeatureExtractor.normalize_feature``.

        Args:
            df: Metadata table with ``slice_file_name``, ``fold`` and
                ``class`` columns.
            max_entries: Maximum number of rows to process, taken from the top
                of ``df``. The default of 11 reproduces the cutoff that used
                to be hard-coded in the loop; pass ``None`` to process every
                row.
            base_dir: Directory that contains the ``fold<N>`` sub-directories.

        Returns:
            List of dicts with keys ``class``, ``fold``,
            ``amplitude_envelope``, ``root_mean_square`` and
            ``zero_crossing_rate``, in row order.

        Raises:
            ValueError: If ``max_entries`` is negative.
        """
        if max_entries is not None and max_entries < 0:
            raise ValueError("max_entries must be non-negative or None")
        # Nothing to do: return before touching columns or audio files.
        if len(df) == 0 or max_entries == 0:
            return []

        # Select the needed columns once instead of indexing the frame per row.
        rows = df[["slice_file_name", "fold", "class"]]
        if max_entries is not None:
            rows = rows.iloc[:max_entries]

        extractors = (
            ('amplitude_envelope', FeatureExtractor.amplitude_envelope),
            ('root_mean_square', FeatureExtractor.root_mean_square),
            ('zero_crossing_rate', FeatureExtractor.zero_crossing_rate),
        )

        entries = []
        for file_name, fold, class_ in rows.itertuples(index=False, name=None):
            # The sample rate returned by librosa.load is not needed here.
            y, _ = librosa.load(f'{base_dir}/fold{fold}/{file_name}')

            data = {'class': class_, 'fold': fold}
            for feature_name, extract in extractors:
                data[feature_name] = FeatureExtractor.normalize_feature(
                    extract(y, frame_size=FRAME_SIZE, hop_length=HOP_LENGTH)
                )
            entries.append(data)

        return entries

In [14]:
# Extract features for the clips listed in the metadata table and display the records.
# NOTE(review): the recorded output below starts with a complex-valued array that
# nothing in the visible cells prints -- possibly stale output from an earlier
# version; re-run on a fresh kernel to confirm.
data_loader = AudioDataLoader()
new_data_list = data_loader.load_from_df(sound_db)
new_data_list

[[ 2.96968711e-03+0.0000000e+00j -2.17948109e-03+0.0000000e+00j
  -1.08762998e-02+0.0000000e+00j ... -6.21335441e-03+0.0000000e+00j
  -8.27242155e-03+0.0000000e+00j -9.37773567e-03+0.0000000e+00j]
 [-1.81993016e-03-1.7402240e-03j -2.79605296e-03-4.6142540e-03j
   1.52700674e-03-1.1973148e-02j ...  9.86241386e-04-2.1774424e-03j
   4.40360093e-03-4.1657127e-03j  8.21537431e-03-1.2010298e-03j]
 [ 2.79980386e-03+9.1600008e-03j -7.31137348e-03-7.1084253e-03j
   1.84865892e-02+6.6573259e-05j ...  8.52918893e-04-4.5833900e-03j
   6.01946888e-03+6.0609244e-03j -9.39422753e-03+4.6192296e-03j]
 ...
 [ 1.08013701e-04+3.1733591e-05j  3.82775761e-05-1.1194242e-04j
  -1.14763723e-04-1.4671189e-05j ...  1.67667331e-05+1.6730117e-05j
   2.57257816e-05-2.9688028e-05j -3.68145775e-05-2.9631730e-05j]
 [-1.18011681e-04-1.5817373e-05j -7.87453973e-05+6.2321822e-05j
  -1.38074965e-05+4.1881227e-05j ... -1.18769649e-05+9.4693587e-06j
   4.73842874e-06+2.9375940e-05j  3.50911287e-05+1.8470835e-05j]
 [ 1.16822

[{'class': 'dog_bark',
  'fold': 5,
  'amplitude_envelope': 2.5011067,
  'root_mean_square': 0.85973305,
  'zero_crossing_rate': 0.701484631675251},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.20763576,
  'root_mean_square': 0.07694757,
  'zero_crossing_rate': 2.7154218977469546},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.14198396,
  'root_mean_square': 0.056997657,
  'zero_crossing_rate': 1.999477318175537},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.2545674,
  'root_mean_square': 0.09267064,
  'zero_crossing_rate': 2.646289055292365},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.15470709,
  'root_mean_square': 0.060585294,
  'zero_crossing_rate': 2.3008832147927967},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_envelope': 0.17995471,
  'root_mean_square': 0.0806928,
  'zero_crossing_rate': 1.5342353213377127},
 {'class': 'children_playing',
  'fold': 5,
  'amplitude_e

In [None]:
# Turn the list of per-clip feature dicts into a table (one row per clip).
features_data_frame = pd.DataFrame(new_data_list)
features_data_frame

Unnamed: 0,class,fold,amplitude_envelope,root_mean_square,zero_crossing_rate
0,dog_bark,5,2.501107,0.859733,0.701485
1,children_playing,5,0.207636,0.076948,2.715422
2,children_playing,5,0.141984,0.056998,1.999477
3,children_playing,5,0.254567,0.092671,2.646289
4,children_playing,5,0.154707,0.060585,2.300883
5,children_playing,5,0.179955,0.080693,1.534235
6,children_playing,5,0.123882,0.053665,1.46118
7,children_playing,5,0.137885,0.059891,1.455495
8,children_playing,5,0.149603,0.065169,1.404993
9,car_horn,10,0.889928,0.321656,0.939122


In [None]:
# Preview the first rows of the feature table.
features_data_frame.head()

Unnamed: 0,class,fold,amplitude_envelope,root_mean_square,zero_crossing_rate
0,dog_bark,5,2.501107,0.859733,0.701485
1,children_playing,5,0.207636,0.076948,2.715422
2,children_playing,5,0.141984,0.056998,1.999477
3,children_playing,5,0.254567,0.092671,2.646289
4,children_playing,5,0.154707,0.060585,2.300883


In [None]:
# Check dtypes and non-null counts of the extracted features.
features_data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   class               11 non-null     object 
 1   fold                11 non-null     int64  
 2   amplitude_envelope  11 non-null     float32
 3   root_mean_square    11 non-null     float32
 4   zero_crossing_rate  11 non-null     float64
dtypes: float32(2), float64(1), int64(1), object(1)
memory usage: 480.0+ bytes


In [None]:
# Summary statistics of the numeric feature columns.
features_data_frame.describe()

Unnamed: 0,fold,amplitude_envelope,root_mean_square,zero_crossing_rate
count,11.0,11.0,11.0,11.0
mean,5.909091,0.507286,0.185277,1.682394
std,2.0226,0.718446,0.244904,0.656144
min,5.0,0.123882,0.053665,0.701485
25%,5.0,0.145793,0.060238,1.376374
50%,5.0,0.179955,0.076948,1.46118
75%,5.0,0.546729,0.201358,2.15018
max,10.0,2.501107,0.859733,2.715422


In [None]:
# Persist the feature table to the working directory, without the index column.
features_data_frame.to_csv('features_data.csv', index=False)