ideas taken from: https://www.kaggle.com/code/salimhammadi07/esc-50-environmental-sound-classification

In [None]:
# Shell escape: list the contents of data/sounds/.
!ls data/sounds/

In [None]:
# Shell escape: list the contents of data/sounds/background/.
!ls data/sounds/background/

In [None]:
from IPython.display import Audio
# Build an Audio display object for background_00.wav; as the last expression
# of the cell it is what the notebook renders.
Audio('data/sounds/background/background_00.wav')

In [None]:
# Same as above, for chainsaw_00.wav.
Audio('data/sounds/chainsaw/chainsaw_00.wav')

In [None]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load one clip: librosa.load returns the samples and the sample rate.
y, sr = librosa.load('data/sounds/chainsaw/chainsaw_00.wav')
print('y:', y, '\n')
print('y shape:', np.shape(y), '\n')
# librosa reports the sample rate in Hz (samples per second), not kHz.
print('Sample Rate (Hz):', sr, '\n')

# Duration in seconds = number of samples / sample rate.
print('Duration of the audio file:', np.shape(y)[0]/sr, 'seconds')

Load all sounds

In [None]:
import os

# Map each category name to a list of (samples, sample_rate, file_path) tuples.
all_waves = {}
base_path = "data/sounds"
for category in ['background', 'chainsaw', 'engine', 'storm']:
    category_dir = os.path.join(base_path, category)
    all_waves[category] = []
    # os.listdir returns entries in arbitrary order; sort them so positional
    # lookups such as all_waves['storm'][2] select the same clip on every run.
    for audio_file in sorted(os.listdir(category_dir)):
        file_name = os.path.join(category_dir, audio_file)
        # Skip hidden entries and sub-directories (e.g. .DS_Store,
        # .ipynb_checkpoints), which librosa.load cannot decode.
        if audio_file.startswith('.') or not os.path.isfile(file_name):
            continue
        y, sr = librosa.load(file_name)
        all_waves[category].append((y, sr, file_name))

# Sound Waves

In [None]:
# Pick the first loaded background clip; y and sr are reused by the next cell.
y, sr, file_name = all_waves['background'][0]
Audio(file_name)

In [None]:
# Zoom in on a 1000-sample window of the waveform.
zoom_window = slice(10000, 11000)
librosa.display.waveshow(y[zoom_window])
plt.show()

In [None]:
def show_in_plots(fn, cant_per_row=3, xlabel=None, ylabel=None):
    """Draw a grid of plots with one row per category in ``all_waves``.

    Args:
        fn: Callable invoked as ``fn(y, sr)``; it must draw on the current axes.
        cant_per_row: Maximum number of clips plotted for each category.
        xlabel: Optional x-axis label applied to every subplot.
        ylabel: Optional y-axis label applied to every subplot.

    Reads the module-level ``all_waves`` dict (category -> list of
    ``(y, sr, file_name)`` tuples).
    """
    n_rows = len(all_waves)
    if n_rows == 0 or cant_per_row < 1:
        return  # nothing to draw

    plt.figure(figsize=(30, 30))
    for row, (cat_name, items) in enumerate(all_waves.items()):
        for col, (y, sr, _) in enumerate(items[:cant_per_row]):
            # Compute the slot from (row, col) so a category with fewer clips
            # than cant_per_row does not shift the following categories.
            plt.subplot(n_rows, cant_per_row, row * cant_per_row + col + 1)
            fn(y, sr)
            if xlabel:
                plt.xlabel(xlabel)
            if ylabel:
                plt.ylabel(ylabel)
            plt.title(cat_name)
    # Show explicitly, like the other cells, so the grid also renders outside
    # inline notebook backends.
    plt.show()

In [None]:
def _waveshow(y, sr):
    """Plot the waveform of ``y`` on the current axes, timed by ``sr``."""
    # Pass sr explicitly: waveshow otherwise assumes its default of 22050 Hz,
    # which would mis-scale the time axis for clips loaded at another rate.
    librosa.display.waveshow(y, sr=sr)

show_in_plots(_waveshow, xlabel="Time")

# Visualize Audio : Fourier Transform

The Fourier transform is a mathematical technique used to decompose a signal into its constituent frequency components. It is widely used in audio signal processing to analyze, filter and manipulate sound signals.

The Fourier transform of a time-domain signal, such as an audio signal, produces a frequency-domain representation of the signal. This representation shows the relative amplitudes of the different frequency components that make up the signal. This information is useful for understanding the characteristics of the sound, such as its pitch and timbre, and for filtering or modifying specific frequency ranges.

There are different types of Fourier transforms; the most common is the discrete Fourier transform (DFT), which converts a discrete-time signal into a discrete-frequency representation. Computing the DFT directly requires a large amount of computation, so in practice the fast Fourier transform (FFT) algorithm is used to calculate it efficiently.

The short-time Fourier transform (STFT) is a variation of the DFT that is used to analyze audio signals. It breaks the audio signal into short segments and applies the DFT to each segment, providing a time-frequency representation of the signal. This is useful for analyzing the frequency content of a sound over time, and for tasks such as pitch detection and audio compression.


In [None]:
# Pick the first loaded chainsaw clip; y and sr are reused by the STFT cell below.
y, sr, file_name = all_waves['chainsaw'][0]
Audio(file_name)

In [None]:
# STFT parameters
n_fft = 2048  # FFT window size, in samples
hop_length = 512  # number of audio samples between adjacent STFT columns

# Magnitude of the STFT: one row per frequency bin, one column per frame.
X = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
# Plot against the bin centre frequencies so the x-axis is in Hz rather than
# bin index; each line drawn is one STFT frame.
freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
plt.plot(freqs, X)
plt.xlabel("Frequency (Hz)")
plt.ylabel("Amplitude")
plt.show()

In [None]:
def _fftshow(y, sr):
    """Plot the STFT magnitude of ``y`` against frequency in Hz.

    Uses the module-level ``n_fft`` and ``hop_length``; one line is drawn per
    STFT frame.
    """
    X = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    # Use the bin centre frequencies so the x-axis is in Hz, not bin index.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=n_fft)
    plt.plot(freqs, X)

show_in_plots(_fftshow, xlabel="Frequency (Hz)", ylabel="Amplitude")

# Spectrogram

A spectrogram is a time-frequency representation of a signal, such as an audio signal. It is a graphical representation of the frequency content of a signal over time, and is often used to visualize and analyze audio signals.

A spectrogram is typically represented as a 2D image, with the x-axis representing time, the y-axis representing frequency, and the intensity of the color or grayscale representing the amplitude of the frequency component at that point in time.

The spectrogram is calculated by applying the Short-Time Fourier Transform (STFT) to the audio signal, which breaks the audio into short segments and applies the Fourier transform to each segment. This produces a set of complex numbers representing the frequency content of the audio for each segment; their magnitudes (usually converted to decibels) are then plotted in the spectrogram.

A spectrogram can be useful for visualizing the frequency content of a sound over time, and for identifying patterns in the audio signal, such as pitch, timbre, and transient events. It can also be used to analyze the characteristics of different sounds, such as the spectral envelope or the harmonic structure, and to segment an audio file into different sound events.

A spectrogram can be used in many audio-related tasks, such as speech recognition, audio source separation, and audio event detection, and it is an essential tool in the field of audio signal processing.

We can display a spectrogram using `librosa.display.specshow`.

In [None]:
def _expectrogramshow(y, sr):
    """Draw the dB-scaled STFT magnitude spectrogram of ``y`` with a colorbar."""
    magnitude = abs(librosa.stft(y))
    librosa.display.specshow(
        librosa.amplitude_to_db(magnitude),
        sr=sr,
        x_axis='time',
        y_axis='hz',
    )
    plt.colorbar()

show_in_plots(_expectrogramshow)

# Mel Spectrogram
## The Mel Scale

Studies have shown that humans do not perceive frequencies on a linear scale. We are better at detecting differences in lower frequencies than in higher frequencies. For example, we can easily tell the difference between 500 and 1000 Hz, but we will hardly be able to tell the difference between 10,000 and 10,500 Hz, even though the distance between the two frequencies in each pair is the same.

In 1937, Stevens, Volkmann, and Newman proposed a unit of pitch such that equal distances in pitch sounded equally distant to the listener. This is called the mel scale. We perform a mathematical operation on frequencies to convert them to the mel scale.

## The Mel Spectrogram

- A mel spectrogram is a spectrogram where the frequencies are converted to the mel scale.

- A mel spectrogram logarithmically renders frequencies above a certain threshold (the corner frequency). For example, in the linearly scaled spectrogram, the vertical space between 1,000 and 2,000Hz is half of the vertical space between 2,000Hz and 4,000Hz. In the mel spectrogram, the space between those ranges is approximately the same. This scaling is analogous to human hearing, where we find it easier to distinguish between similar low frequency sounds than similar high frequency sounds.

- A mel spectrogram computes its output by multiplying frequency-domain values by a filter bank.

In [None]:
# Pick the second loaded background clip; y and sr are reused by the next cell.
y, sr, file_name = all_waves['background'][1]
Audio(file_name)

In [None]:
# Trim leading/trailing silence before computing the mel spectrogram.
X, _ = librosa.effects.trim(y)
XS = librosa.feature.melspectrogram(y=X, sr=sr)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()
plt.show()

In [None]:
# Pick the third loaded storm clip; y and sr are reused by the next cell.
y, sr, file_name = all_waves['storm'][2]
Audio(file_name)

In [None]:
# Trim leading/trailing silence before computing the mel spectrogram.
X, _ = librosa.effects.trim(y)
XS = librosa.feature.melspectrogram(y=X, sr=sr)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()
plt.show()

In [None]:
def _melexpectrogramshow(y, sr):
    """Draw the dB-scaled mel spectrogram of the silence-trimmed ``y`` with a colorbar."""
    X, _ = librosa.effects.trim(y)
    XS = librosa.feature.melspectrogram(y=X, sr=sr)
    # melspectrogram returns power (default power=2.0), so power_to_db is the
    # matching conversion; amplitude_to_db would double every dB value.
    Xdb = librosa.power_to_db(XS, ref=np.max)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar()

show_in_plots(_melexpectrogramshow)

## Filtering Mel Spectrograms

In [None]:
# Floor (in dB relative to the peak) below which energy is flattened.
threshold = -60

def _filtmelexpectrogramshow(y, sr):
    """Draw the mel spectrogram of ``y`` with values below ``threshold`` dB clipped.

    Reads the module-level ``threshold``.
    """
    X, _ = librosa.effects.trim(y)
    XS = librosa.feature.melspectrogram(y=X, sr=sr)
    # melspectrogram returns power (default power=2.0), so power_to_db is the
    # matching conversion; amplitude_to_db would double every dB value.
    Xdb = librosa.power_to_db(XS, ref=np.max)
    # Raise everything quieter than the threshold up to the threshold.
    Xdb = np.maximum(Xdb, threshold)
    librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar()

show_in_plots(_filtmelexpectrogramshow)

## Mel spectrograms parameters

In [None]:
# Reuse the third loaded storm clip for the parameter experiments below,
# and print its sample-array shape and sample rate.
y, sr, _ = all_waves['storm'][2]
print(y.shape, sr)

In [None]:
# Default parameters, spelled out explicitly
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128, fmin=0.0, fmax=None)
print(XS.shape)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.show()

The resultant array shape is (128, 431).
- 128 is the number of mel bands
- 431 is the number of time frames: 220500 / 512 ≈ 430.66, and with librosa's default centered framing the frame count is 1 + floor(220500 / 512) = 431.

In [None]:
# Number of samples divided by the hop length, using the figures quoted above.
220500/512

In [None]:
# Number of mel bands. The more bands, the finer the frequency resolution.
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=40)
print(XS.shape)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.show()

In [None]:
# Increase hop_length: fewer, coarser time frames.
hop_length = 4096
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=hop_length, n_mels=128, fmin=0.0, fmax=None)
print(XS.shape)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
# specshow must be given the same hop_length, otherwise it assumes 512 and
# the time axis is mis-scaled.
librosa.display.specshow(Xdb, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
plt.show()

In [None]:
# Decrease hop_length: more, finer time frames.
hop_length = 128
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=hop_length, n_mels=128, fmin=0.0, fmax=None)
print(XS.shape)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
# specshow must be given the same hop_length, otherwise it assumes 512 and
# the time axis is mis-scaled.
librosa.display.specshow(Xdb, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
plt.show()

In [None]:
# Restrict the mel filter bank to the 2000-4000 Hz band.
fmin = 2000
fmax = 4000
XS = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512, n_mels=128, fmin=fmin, fmax=fmax)
print(XS.shape)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
# Pass the same fmin/fmax so the mel axis is labelled for the restricted band.
librosa.display.specshow(Xdb, sr=sr, x_axis='time', hop_length=512, y_axis='mel', fmin=fmin, fmax=fmax)
plt.colorbar(format='%+2.0f dB')
plt.show()

## Animal sounds

In [None]:
# Shell escape: list the contents of data/sounds/animals.
! ls data/sounds/animals

In [None]:
# Animal recordings referenced by this notebook; change the index below to
# audition a different one. The last entry is the one currently selected.
animal_files = [
    'data/sounds/animals/Katze_miaut.mp3',
    'data/sounds/animals/Puma.mp3',
    'data/sounds/animals/Tiger.mp3',
    'data/sounds/animals/donkey.mp3',
    'data/sounds/animals/Elefant.mp3',
]
audio_file = animal_files[-1]
Audio(audio_file)


In [None]:
# Load at most the first 5 seconds of the selected recording.
y, sr = librosa.load(audio_file, duration=5)
XS = librosa.feature.melspectrogram(y=y, sr=sr)
# melspectrogram returns a power spectrogram (default power=2.0), so convert
# with power_to_db; amplitude_to_db would double every dB value.
Xdb = librosa.power_to_db(XS, ref=np.max)
librosa.display.specshow(Xdb, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar()
plt.show()