ideas taken from: https://www.kaggle.com/code/salimhammadi07/esc-50-environmental-sound-classification

In [None]:
!ls data

In [None]:
!ls data/background

In [None]:
from IPython.display import Audio
# Being the last expression of the cell, this renders an inline player
# for the background_00 clip.
Audio('data/background/background_00.wav')

In [None]:
# Inline player for the chainsaw_00 clip, to compare by ear with the background clip.
Audio('data/chainsaw/chainsaw_00.wav')

In [None]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Decode one clip: librosa.load returns the sample array and its sample rate.
y, sr = librosa.load('data/chainsaw/chainsaw_00.wav')
print('y:', y, '\n')
print('y shape:', np.shape(y), '\n')
# The sample rate is expressed in Hz (samples per second), not kHz.
print('Sample Rate (Hz):', sr, '\n')

# The duration is the number of samples divided by the sample rate.
print('Duration of the audio file:', np.shape(y)[0]/sr, 'seconds')

Load all sounds

In [None]:
import os

# Load every clip of each category into memory.
# all_waves maps category name -> list of (samples, sample_rate, file_path).
all_waves = {}
base_path = "data"
for category in ['background', 'chainsaw', 'engine', 'storm']:
    category_dir = os.path.join(base_path, category)
    all_waves[category] = []
    # os.listdir returns entries in arbitrary order; sort them so indexed
    # lookups such as all_waves['storm'][2] select the same clip on every run.
    for audio_file in sorted(os.listdir(category_dir)):
        file_name = os.path.join(category_dir, audio_file)
        # Skip hidden files and sub-directories (e.g. .DS_Store,
        # .ipynb_checkpoints): they are not audio and would make load fail.
        if audio_file.startswith('.') or not os.path.isfile(file_name):
            continue
        y, sr = librosa.load(file_name)
        all_waves[category].append((y, sr, file_name))

# Sound Waves

In [None]:
# Make the first loaded background clip the current (y, sr) and play it
# from its file path.
y, sr, file_name = all_waves['background'][0]
Audio(file_name)

In [None]:
# Plot only a 1000-sample window so individual oscillations are visible.
# The clip's own sample rate is forwarded so the time axis is in real
# seconds instead of relying on waveshow's built-in default rate.
librosa.display.waveshow(y[10000:11000], sr=sr)
plt.show()

In [None]:
def show_in_plots(fn, cant_per_row=3, xlabel=None, ylabel=None):
    """Draw a grid of plots: one row per category, one column per clip.

    Reads the module-level ``all_waves`` dict and, for the first
    ``cant_per_row`` clips of every category, selects a subplot and calls
    ``fn(y, sr)`` to draw on it.

    Args:
        fn: callable taking ``(y, sr)`` that draws on the current axes.
        cant_per_row: number of clips (columns) shown per category.
        xlabel: optional x-axis label applied to every subplot.
        ylabel: optional y-axis label applied to every subplot.
    """
    # One row per category instead of a hard-coded 4, so the grid still
    # works if categories are added to or removed from all_waves.
    n_rows = len(all_waves)
    plt.figure(figsize=(30,30))
    for row, (cat_name, items) in enumerate(all_waves.items()):
        for col, (y, sr, _) in enumerate(items[:cant_per_row]):
            # Compute the slot from (row, col) rather than a running counter:
            # a category with fewer clips than cant_per_row then leaves blank
            # cells instead of shifting later categories onto the wrong row.
            plt.subplot(n_rows, cant_per_row, row * cant_per_row + col + 1)
            fn(y, sr)
            if xlabel:
                plt.xlabel(xlabel)
            if ylabel:
                plt.ylabel(ylabel)
            plt.title(cat_name)

In [None]:
def _waveshow(y, sr):
    """Draw the waveform of one clip on the current axes."""
    # Forward sr (previously ignored) so the time axis matches the clip.
    librosa.display.waveshow(y, sr=sr)


show_in_plots(_waveshow, xlabel="Time")

# Visualize Audio : Fourier Transform

The Fourier transform is a mathematical technique used to decompose a signal into its constituent frequency components. It is widely used in audio signal processing to analyze, filter and manipulate sound signals.

The Fourier transform of a time-domain signal, such as an audio signal, produces a frequency-domain representation of the signal. This representation shows the relative amplitudes of the different frequency components that make up the signal. This information is useful for understanding the characteristics of the sound, such as its pitch and timbre, and for filtering or modifying specific frequency ranges.

There are different types of Fourier transforms, the most common is the discrete Fourier transform (DFT), which is used to convert a discrete-time signal into a discrete-frequency representation. The DFT requires a large amount of computation, so in practice, the fast Fourier transform (FFT) algorithm is often used to efficiently calculate the DFT.

The short-time Fourier transform (STFT) is a variation of the DFT that is used to analyze audio signals. It breaks the audio signal into short segments and applies the DFT to each segment, providing a time-frequency representation of the signal. This is useful for analyzing the frequency content of a sound over time, and for tasks such as pitch detection and audio compression.


In [None]:
# Switch the current (y, sr) to the first chainsaw clip and play it.
y, sr, file_name = all_waves['chainsaw'][0]
Audio(file_name)

In [None]:
# STFT parameters, kept as module-level names because later cells reuse them.
n_fft = 2048 # FFT window size, in samples
hop_length = 512 # number of audio samples between successive STFT columns

# Magnitude STFT: one row per frequency bin, one column per time frame.
X = np.abs(librosa.stft(y, n_fft = n_fft, hop_length = hop_length))
# Frequency in Hz of every STFT row, so the x axis shows Hz instead of the
# bare bin index.
freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
# plt.plot draws one curve per column of X, i.e. one spectrum per time frame.
plt.plot(freqs, X)
plt.xlabel("Frequency (Hz)")
plt.ylabel("Amplitude")
plt.show()

In [None]:
def _fftshow(y, sr):
    """Plot the magnitude spectrum of every STFT frame of one clip.

    Uses the module-level ``n_fft`` and ``hop_length`` settings.
    """
    X = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    # Map STFT rows to Hz so the x axis is a real frequency, not a bin index.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    plt.plot(freqs, X)


show_in_plots(_fftshow, xlabel="Frequency (Hz)", ylabel="Amplitude")

# Spectrogram

A spectrogram is a time-frequency representation of a signal, such as an audio signal. It is a graphical representation of the frequency content of a signal over time, and is often used to visualize and analyze audio signals.

A spectrogram is typically represented as a 2D image, with the x-axis representing time, the y-axis representing frequency, and the intensity of the color or grayscale representing the amplitude of the frequency component at that point in time.

The spectrogram is calculated by applying the Short-Time Fourier Transform (STFT) to the audio signal, which breaks the audio into short segments and applies the Fourier transform to each segment. This produces a set of complex numbers representing the frequency content of the audio for each segment, which are then plotted in the spectrogram.

A spectrogram can be useful for visualizing the frequency content of a sound over time, and for identifying patterns in the audio signal, such as pitch, timbre, and transient events. It can also be used to analyze the characteristics of different sounds, such as the spectral envelope or the harmonic structure, and to segment an audio file into different sound events.

A spectrogram can be used in many audio-related tasks, such as speech recognition, audio source separation, and audio event detection, and it is an essential tool in the field of audio signal processing.

We can display a spectrogram using `librosa.display.specshow`.

In [None]:
def _expectrogramshow(y, sr):
    """Render the dB-scaled STFT spectrogram of one clip, with a colour bar."""
    magnitude = np.abs(librosa.stft(y))
    librosa.display.specshow(
        librosa.amplitude_to_db(magnitude),
        sr=sr,
        x_axis='time',
        y_axis='hz',
    )
    plt.colorbar()


show_in_plots(_expectrogramshow)

# Mel Spectrogram
## The Mel Scale

Studies have shown that humans do not perceive frequencies on a linear scale. We are better at detecting differences in lower frequencies than in higher frequencies. For example, we can easily tell the difference between 500 and 1000 Hz, but we will hardly be able to tell the difference between 10,000 and 10,500 Hz, even though the distance between the two frequencies is the same in both pairs.

In 1937, Stevens, Volkmann, and Newman proposed a unit of pitch such that equal distances in pitch sounded equally distant to the listener. This is called the mel scale. We perform a mathematical operation on frequencies to convert them to the mel scale.

## The Mel Spectrogram

- A mel spectrogram is a spectrogram where the frequencies are converted to the mel scale.

- A mel spectrogram logarithmically renders frequencies above a certain threshold (the corner frequency). For example, in the linearly scaled spectrogram, the vertical space between 1,000 and 2,000Hz is half of the vertical space between 2,000Hz and 4,000Hz. In the mel spectrogram, the space between those ranges is approximately the same. This scaling is analogous to human hearing, where we find it easier to distinguish between similar low frequency sounds than similar high frequency sounds.

- A mel spectrogram computes its output by multiplying frequency-domain values by a filter bank.

In [None]:
# Use the second loaded background clip for the mel-spectrogram example.
y, sr, file_name = all_waves['background'][1]
Audio(file_name)

In [None]:
# Trim leading and trailing silence before analysing the clip.
X, _ = librosa.effects.trim(y)
# melspectrogram returns a power (squared-magnitude) spectrogram by default.
XS = librosa.feature.melspectrogram(y=X, sr=sr)
# A power spectrogram must be converted with power_to_db (10*log10);
# amplitude_to_db (20*log10) would exaggerate the dB range.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()
plt.show()

In [None]:
# Use the third loaded storm clip for the next mel-spectrogram example.
y, sr, file_name = all_waves['storm'][2]
Audio(file_name)

In [None]:
# Same pipeline as for the background clip: trim, mel power spectrogram, dB.
X, _ = librosa.effects.trim(y)
XS = librosa.feature.melspectrogram(y=X, sr=sr)
# melspectrogram yields power by default, so power_to_db (10*log10) is the
# matching conversion; amplitude_to_db would exaggerate the dB range.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()
plt.show()

In [None]:
def _melexpectrogramshow(y, sr):
    """Draw the dB-scaled mel spectrogram of one silence-trimmed clip."""
    X, _ = librosa.effects.trim(y)
    XS = librosa.feature.melspectrogram(y=X, sr=sr)
    # XS is a power spectrogram (melspectrogram's default), so it is
    # converted with power_to_db rather than amplitude_to_db.
    Xdb = librosa.power_to_db(XS, ref=np.max)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar()


show_in_plots(_melexpectrogramshow)

## Filtering Mel Spectrograms

In [None]:
# Floor, in dB relative to each clip's peak, applied before plotting.
threshold = -60


def _filtmelexpectrogramshow(y, sr):
    """Draw a mel spectrogram with everything below ``threshold`` dB floored.

    Reads the module-level ``threshold``.
    """
    X, _ = librosa.effects.trim(y)
    XS = librosa.feature.melspectrogram(y=X, sr=sr)
    # XS is a power spectrogram, hence power_to_db (not amplitude_to_db).
    Xdb = librosa.power_to_db(XS, ref=np.max)
    # Raise every cell quieter than the threshold up to the threshold, so the
    # colour scale is spent only on the louder part of the signal.
    Xdb = np.maximum(Xdb, threshold)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar()


show_in_plots(_filtmelexpectrogramshow)

## Mel spectrograms parameters

In [None]:
# Reuse the third storm clip and show its sample count and sample rate,
# which the parameter experiments below refer to.
y, sr, _ = all_waves['storm'][2]
print(y.shape, sr)

In [None]:
# Default parameters
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128, fmin=0.0, fmax=None)
print(XS.shape)
Xdb = librosa.amplitude_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.show()

The resultant array shape is (128, 431).
- 128 is the number of mel bands
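- 431 is the number of time frames computed: the clip has 220500 samples and the hop length is 512, so there are 220500 // 512 = 430 hops, plus one frame because librosa centres (and pads) the first frame by default.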

In [None]:
# Number of samples divided by the hop length: the approximate frame count.
220500/512

In [None]:
# Number of mel bands. The more bands, the more filter details
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40)
print(XS.shape)
Xdb = librosa.amplitude_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.show()

In [None]:
# increase hop_lengths
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=4096, n_mels=128, fmin=0.0, fmax=None)
print(XS.shape)
Xdb = librosa.amplitude_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.show()

In [None]:
# decrease hop_lengths
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=128, n_mels=128, fmin=0.0, fmax=None)
print(XS.shape)
Xdb = librosa.amplitude_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.show()

In [None]:
# Changing frequencies
fmin = 2000
fmax = 4000
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128, fmin=fmin, fmax=fmax)
print(XS.shape)
Xdb = librosa.amplitude_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', hop_length=512, y_axis='mel', fmin=fmin, fmax=fmax)
plt.colorbar(format='%+2.0f dB')
plt.show()

# Other features
## Zero Crossing Rate

Zero-Crossing Rate: The zero-crossing rate (ZCR) is the rate at which a signal transitions from positive to zero to negative or negative to zero to positive. Its value has been extensively used in both speech recognition and music information retrieval for classifying percussive sounds.

The zero-crossing rate can be utilized as a basic pitch detection algorithm for monophonic tonal signals. Voice activity detection (VAD), which determines whether or not human speech is present in an audio segment, also makes use of zero-crossing rates.

In [None]:
# Use the second loaded engine clip for the zero-crossing example.
y, sr, file_name = all_waves['engine'][1]
Audio(file_name)

In [None]:
# Number of leading samples to look at; also read by _zerocross below.
zoom=50
y = y[0:zoom]
librosa.display.waveshow(y, sr=sr)
# Boolean mask, True where the signal changes sign.
crossings = librosa.zero_crossings(y, pad=False)
# Time stamp of each sample, derived from the real sample rate instead of a
# hard-coded 0.0022 s span; len(y) keeps it valid for clips shorter than zoom.
t = np.arange(len(y)) / sr
# Mark every crossing with a red dot on the zero line.
plt.scatter(t[crossings],y[crossings]*0, color='r',linewidth=7.0)

In [None]:
def _zerocross(y, sr):
    """Plot the first ``zoom`` samples of a clip and mark its zero crossings.

    Reads the module-level ``zoom``.
    """
    y = y[0:zoom]
    librosa.display.waveshow(y, sr=sr)
    crossings = librosa.zero_crossings(y, pad=False)
    # Sample times computed from sr rather than a hard-coded 0.0022 s span;
    # len(y) keeps the mask and the time axis the same length for short clips.
    t = np.arange(len(y)) / sr
    plt.scatter(t[crossings], y[crossings]*0, color='r', linewidth=7.0)


show_in_plots(_zerocross, xlabel="Time")

## Harmonics and Percussive

Decompose an audio time series into harmonic and percussive components

In [None]:
def _harmperc(y, sr):
    """Overlay the harmonic and percussive components of one clip."""
    harmonic, percussive = librosa.effects.hpss(y)
    plt.plot(harmonic, alpha=0.4)
    plt.plot(percussive, color='purple', alpha=0.8)


show_in_plots(_harmperc, xlabel="Time")

## Tempo BPM (Beats Per Minute)

What's a "beat?" People commonly use the word "beat" to mean "a pattern (or rhythm) played by drums." The thing you're making when you create and play patterns in these lessons is "a beat."

But, confusingly, there's another use of the word "beat," which means "a regular, repeating pulse that underlies a musical pattern." People tap their foot along with "the beat" in this context.

Tempo: the speed at which your patterns play back is called the tempo. Tempo is measured in beats per minute, or BPM. So if we talk about a piece of music being "at 120 BPM," we mean that there are 120 beats (pulses) every minute.

Some types of musical patterns have a very clear underlying beat, while others have a more subtle or implied one.

In [None]:
# Print the estimated tempo of the first four clips of each category.
for cat_name, items in all_waves.items():
    print(cat_name)
    for y, sr, file_name in items[:4]:
        # beat_track returns (tempo, beat_frames); only the tempo is shown.
        tempo = librosa.beat.beat_track(y=y, sr=sr)[0]
        print('-', tempo)

In [None]:
# Estimated tempo of every clip, grouped by category.
tempos = {}
for cat_name, items in all_waves.items():
    # beat_track returns (tempo, beat_frames); keep only the tempo.
    values = [librosa.beat.beat_track(y=y, sr=sr)[0] for y, sr, _ in items]
    # Store the estimates (the dict was previously created but never filled).
    tempos[cat_name] = values
    print(f"{cat_name}, mean: {np.mean(values):.2f}, stdev: {np.std(values):.2f}")

## Spectral Centroid

The spectral centroid indicates the frequency around which the energy of a spectrum is centered; in other words, it indicates where the "center of mass" of a sound is located. It is computed as a weighted mean of the frequencies present in the signal, with their magnitudes as the weights.

In [None]:
import sklearn
# `import sklearn` alone does not load the preprocessing submodule, so
# sklearn.preprocessing could raise AttributeError; import it explicitly.
import sklearn.preprocessing

def normalize(x, axis=0):
    """Min-max scale ``x`` along ``axis`` using sklearn's ``minmax_scale``."""
    return sklearn.preprocessing.minmax_scale(x, axis=axis)

def _spectcentro(y, sr):
    """Overlay the normalized spectral centroid (red) on a clip's waveform."""
    librosa.display.waveshow(y, sr=sr, alpha=0.4)
    # spectral_centroid returns a (1, n_frames) array; keep the single row.
    spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    frames = range(len(spectral_centroids))
    # Convert frame indices to seconds with the clip's own sample rate so the
    # curve lines up with the waveform.
    t = librosa.frames_to_time(frames, sr=sr)
    # Min-max scaling is for display only: it brings the Hz values into a
    # range comparable to the waveform amplitude.
    plt.plot(t, normalize(spectral_centroids), color='r')


show_in_plots(_spectcentro, xlabel="Time")

## Spectral Rolloff

The spectral rolloff is a measure of the shape of the spectrum of an audio signal. It is defined as the frequency below which a certain percentage of the total energy of the signal lies. The roll-off point is often expressed as a percentage of the total energy, such as 85% or 95%.

The spectral rolloff can provide information about the tonality of a sound, as sounds that are more tonal will have a lower rolloff point than sounds that are more noise-like. For example, a piano playing a sustained note will have a lower rolloff point than a snare drum hit.

The spectral rolloff can be calculated by first computing the power spectrum of the audio signal, and then finding the frequency below which a certain percentage of the total energy of the signal lies. The spectral rolloff can be computed for different percentage values, and the result can be a single value, or a set of values for different percentages.

The spectral rolloff can be used in various audio-related tasks, such as music genre classification, sound event detection, and speech analysis, as it can be a useful feature to distinguish different audio classes. It can also be used in combination with other features, such as Mel-Frequency Cepstral Coefficients (MFCCs) or Chroma feature, to improve the performance of audio classification tasks.


In [None]:
def _spectrolloff(y, sr):
    """Overlay the normalized spectral rolloff (red) on a clip's waveform."""
    librosa.display.waveshow(y, sr=sr, alpha=0.4)
    # spectral_rolloff returns a (1, n_frames) array; keep the single row.
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    frames = range(len(spectral_rolloff))
    # Frame indices -> seconds, using the clip's own sample rate so the curve
    # lines up with the waveform.
    t = librosa.frames_to_time(frames, sr=sr)
    plt.plot(t, normalize(spectral_rolloff), color='r')


show_in_plots(_spectrolloff, xlabel="Time")

## Spectral Bandwidth

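The spectral bandwidth describes how widely the energy of a spectrum is spread around its centre. In optics it is defined as the width of the band of light at one-half the peak maximum (full width at half maximum, FWHM), usually denoted λSB on the wavelength axis. For audio, `librosa.feature.spectral_bandwidth` computes, for every frame, the p-th order spread of the spectrum around its spectral centroid (p = 2 by default).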

In [None]:
def _spectbandwidth(y, sr):
    """Overlay normalized spectral bandwidths for p = 2, 3, 4 on a waveform."""
    librosa.display.waveshow(y, sr=sr, alpha=0.4)
    # The same +0.01 offset is applied for every order, so compute it once.
    # NOTE(review): the offset presumably keeps silent frames from producing
    # an all-zero spectrum; confirm the intent.
    y_offset = y + 0.01
    spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(y=y_offset, sr=sr)[0]
    spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(y=y_offset, sr=sr, p=3)[0]
    spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(y=y_offset, sr=sr, p=4)[0]
    frames = range(len(spectral_bandwidth_2))
    # Frame indices -> seconds, using the clip's own sample rate so the
    # curves line up with the waveform.
    t = librosa.frames_to_time(frames, sr=sr)
    plt.plot(t, normalize(spectral_bandwidth_2), color='r')
    plt.plot(t, normalize(spectral_bandwidth_3), color='g')
    plt.plot(t, normalize(spectral_bandwidth_4), color='y')
    plt.legend(('p = 2', 'p = 3', 'p = 4'))


show_in_plots(_spectbandwidth, xlabel="Time")

## Chroma feature

Chroma feature is a representation of the harmonic structure of an audio signal. It is a powerful feature for music analysis and can be used for tasks such as music genre classification, chord recognition, and tonality analysis.

A Chroma feature is calculated by first transforming the audio signal into the frequency domain using a Fourier transform. The signal is then mapped into a new feature space called the Chroma space, which consists of 12 bins corresponding to the 12 distinct semitones of Western music.

Each bin represents the energy of the audio signal at a specific pitch class (C, C#, D, D#, etc.) and is calculated by summing the energy of all the notes in the signal that belong to that pitch class. The resulting Chroma feature is a 12-dimensional vector, where each dimension represents the energy of the audio signal at a specific pitch class.

The Chroma feature is robust to changes in tempo and instrumentation, which makes it a useful feature for tasks such as music genre classification and chord recognition. It can also be used in conjunction with other features, such as Mel-Frequency Cepstral Coefficients (MFCCs) or spectral rolloff, to improve the performance of audio classification tasks.


In [None]:
def _chromafeat(y, sr):
    """Draw the chromagram (energy per pitch class over time) of one clip.

    Uses the module-level ``hop_length``.
    """
    chromagram = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
    # Pass sr along with hop_length so the time axis is computed from the
    # clip's real sample rate rather than specshow's default.
    librosa.display.specshow(chromagram, sr=sr, x_axis='time', y_axis='chroma', hop_length=hop_length, cmap='coolwarm')


show_in_plots(_chromafeat, xlabel="Time")

## Mel-Frequency Cepstral Coefficients(MFCCs)

Mel-Frequency Cepstral Coefficients (MFCCs) are a set of features commonly used in speech and music processing applications to represent the spectral characteristics of an audio signal. They are based on the human perception of sound and are designed to capture the spectral envelope of the audio signal, which is the shape of the signal's power spectrum over time.

The MFCCs are calculated in several steps:

- The audio signal is transformed into the frequency domain using a Fourier transform.
- The Mel scale is applied to the frequency axis of the signal, which approximates the non-linear frequency response of the human ear.
- The logarithm of the energy in each Mel-frequency bin is taken to obtain the Mel-frequency spectrogram.
- A Discrete Cosine Transform (DCT) is applied to the Mel-frequency spectrogram to obtain the MFCCs.

The result of these steps is a set of coefficients that represent the spectral envelope of the audio signal. The number of MFCCs used in a given application can vary, but typically, between 12 and 40 coefficients are used.

MFCCs are robust to variations in the audio signal, such as changes in pitch, speed, and noise. They are widely used in speech and music processing tasks such as speech recognition, music genre classification, and speaker identification. They can also be used in conjunction with other features, such as Chroma feature or Spectral rolloff, to improve the performance of audio classification tasks.


In [None]:
def _mfccfeat(y, sr):
    """Draw the MFCC matrix of one clip against time."""
    coefficients = librosa.feature.mfcc(y=y, sr=sr)
    librosa.display.specshow(coefficients, sr=sr, x_axis='time')


show_in_plots(_mfccfeat, xlabel="Time")

# Data Preparation
## Data Augmentation

Data augmentation is a technique used to artificially increase the size of a dataset by applying various transformations to the existing data. This can be useful in situations where the amount of available data is limited, such as with the ESC-50 dataset.

Data augmentation can be used to improve the robustness and generalization of machine learning models by introducing variation into the training data. This can help the model to learn more robust features that are less dependent on specific variations in the data.

There are several types of data augmentation that can be applied to the ESC-50 dataset:

- Time stretching : This technique changes the duration of the audio signal by speeding it up or slowing it down. This can be useful for simulating variations in the tempo of the audio.

- Pitch shifting: This technique changes the pitch of the audio signal by shifting it up or down. This can be useful for simulating variations in the pitch of the audio.

- Volume scaling: This technique changes the volume of the audio signal by scaling it up or down. This can be useful for simulating variations in the loudness of the audio.

- Add noise : This technique adds noise to the audio signal to simulate different noise conditions.

- Time shifting: This technique changes the position of the audio signal in time by shifting it forwards or backwards. This can be useful for simulating variations in the timing of the audio.

- Echo: This technique adds a delayed copy of the audio signal to simulate an echo effect


In [None]:
def add_noise(data, *, mean=0, std=0.1):
    """Return ``data`` with additive Gaussian noise.

    Args:
        data: array of audio samples (any shape).
        mean: mean of the noise distribution.
        std: standard deviation of the noise distribution.

    Returns:
        A new array of the same shape as ``data``; the input is not modified.
    """
    # Size the noise from the full shape rather than len(data), so that
    # multi-dimensional input (e.g. multi-channel audio) also works.
    noise = np.random.normal(mean, std, size=np.shape(data))
    return data + noise
    
def pitch_shifting(data, sr=16000, *, pitch_pm=2, bins_per_octave=12):
    """Shift the pitch of ``data`` by a random number of steps, up or down.

    Args:
        data: 1-D array of audio samples.
        sr: sample rate of ``data``. The default of 16000 is kept for
            backward compatibility; pass the clip's real rate when known.
        pitch_pm: maximum shift in either direction, in steps of
            ``1 / bins_per_octave`` octave.
        bins_per_octave: number of steps per octave (12 = semitones).

    Returns:
        The pitch-shifted signal as a float64 array.
    """
    # uniform() lies in [0, 1); centre it on zero so the shift covers
    # [-pitch_pm, +pitch_pm) instead of only ever going up.
    pitch_change = pitch_pm * 2 * (np.random.uniform() - 0.5)
    return librosa.effects.pitch_shift(y=data.astype('float64'), sr=sr,
                                       n_steps=pitch_change,
                                       bins_per_octave=bins_per_octave)

def random_shift(data, *, max_fraction=0.2):
    """Shift ``data`` in time by a random offset, zero-filling the gap.

    The offset is drawn uniformly from ``±max_fraction`` of the signal
    length. A positive offset delays the signal (zeros at the start); a
    negative one advances it (zeros at the end). The length is unchanged.

    Args:
        data: 1-D array of audio samples.
        max_fraction: largest shift, as a fraction of the signal length.

    Returns:
        A new array with the same length as ``data``.
    """
    n_samples = data.shape[0]
    timeshift_fac = max_fraction * 2 * (np.random.uniform() - 0.5)
    start = int(n_samples * timeshift_fac)
    if start > 0:
        # Delay: prepend zeros, drop the samples pushed past the end.
        return np.pad(data, (start, 0), mode='constant')[:n_samples]
    # Advance: append zeros and drop the first |start| samples. Keeping the
    # first n_samples instead would return the input unshifted.
    return np.pad(data, (0, -start), mode='constant')[-start:]

def volume_scaling(data, *, low=1.5, high=2.5):
    """Multiply ``data`` by a random gain drawn uniformly from [low, high).

    With the default range the signal is always amplified; pass a ``low``
    below 1 to allow attenuation as well.

    Args:
        data: array of audio samples.
        low: lower bound of the gain.
        high: upper bound of the gain.

    Returns:
        A new, scaled array; the input is not modified.
    """
    dyn_change = np.random.uniform(low=low, high=high)
    return data * dyn_change
    
def time_stretching(data, rate=1.5):
    """Time-stretch ``data`` by ``rate`` and return it at the original length.

    Args:
        data: 1-D array of audio samples.
        rate: stretch factor passed to librosa; values above 1 speed the
            signal up (shorter result), values below 1 slow it down.

    Returns:
        The stretched signal, truncated or zero-padded at the end so that it
        has exactly ``len(data)`` samples.
    """
    input_length = len(data)
    # `rate` must be passed by keyword: recent librosa releases reject it as
    # a positional argument.
    streching = librosa.effects.time_stretch(data, rate=rate)

    if len(streching) > input_length:
        return streching[:input_length]
    # Pad the *stretched* signal back up to the input length. Padding `data`
    # would return the unmodified input with extra zeros appended.
    return np.pad(streching, (0, input_length - len(streching)), "constant")



In [None]:
# Reload one clip and play the raw samples as the reference for the
# augmentation examples below.
y, sr = librosa.load('data/background/background_00.wav')
Audio(y, rate=sr)

In [None]:
# Play the clip with Gaussian noise added, using a smaller std than
# add_noise's default.
Audio(add_noise(y, std=0.01), rate=sr)

In [None]:
# Play a randomly pitch-shifted version of the clip.
# NOTE(review): pitch_shifting is not told the clip's sample rate here and
# uses its own 16000 Hz value instead; confirm this is intended.
Audio(pitch_shifting(y), rate=sr)