https://thecleverprogrammer.com/2024/12/16/audio-data-processing-and-analysis-with-python/

TensorFlow can be used to develop models for a wide range of tasks, including natural language processing, image recognition, handwriting recognition, and computational simulations such as solving partial differential equations.


TensorFlow is a powerful open-source machine-learning framework developed by Google that lets developers build and train ML models. It is used to implement machine-learning and deep-learning applications, and to research and develop new ideas in artificial intelligence. TensorFlow is designed to be used from the Python programming language, which makes the framework easy to understand.



In [4]:

import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Load the 'train' split of the 'nsynth' dataset together with its info object.
# `with_info` is a boolean flag: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have behaved the same).
dataset, info = tfds.load('nsynth', split='train', with_info=True)
print(info)


[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\jagad\tensorflow_datasets\nsynth\full\2.3.3...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# repr of `dataset`.
dataset


In [None]:
# Inspect a single record and list the feature names it exposes.
for sample in dataset.take(1):
    print("Available keys:")
    for feature_name in sample:
        print(feature_name)

In [None]:
def preprocessed_nsynth(sample):
    """Reduce one record to an ``(audio, pitch)`` pair.

    Args:
        sample: Mapping that provides the ``'audio'`` and ``'pitch'`` entries.

    Returns:
        The 2-tuple ``(sample['audio'], sample['pitch'])``.
    """
    return sample['audio'], sample['pitch']

# Project every record down to an (audio, pitch) pair.
processed_dataset = dataset.map(preprocessed_nsynth)

# Preview one example. `audio` and `label` remain bound after the loop; the
# playback cell further down reads `audio`.
for audio, label in processed_dataset.take(1):
    audio_shape = audio.shape
    print(f"Audio Shape: {audio_shape}")
    pitch_value = label.numpy()
    print(f"Label (Pitch): {pitch_value}")

Convert the audio tensor to a NumPy array and play it using IPython's Audio display.

In [None]:
pip install IPython

In [None]:
from IPython.display import Audio
import numpy as np

# `audio` is the loop variable left bound by the preview loop in the earlier
# cell; .numpy() converts that tensor to a NumPy array.
audio_np = audio.numpy()

# Bare expression so the notebook displays the array.
audio_np



In [None]:

# Render an inline audio player for the clip, using a rate of 16000.
Audio(data=audio_np, rate=16000)

In [None]:
import plotly.graph_objects as go
# Line plot of the raw samples; with only `y` supplied, plotly uses the
# sample index for the x-axis.
waveform_trace = go.Scatter(y=audio_np, mode='lines', name='Audio')

fig = go.Figure(data=[waveform_trace])
fig.update_layout(
    title='Audio waveform',
    xaxis_title='Time',
    yaxis_title='Amplitude',
    width=800,
    height=400,
)
fig.show()

Analyze the spectrogram, which provides a time-frequency representation of the audio.
A spectrogram is a visual representation of a signal's strength, or “loudness”, over time at the various frequencies present in a particular waveform.

In [None]:
import librosa
# Short-time Fourier transform (n_fft=1024), then its magnitude in decibels.
spectrogram = librosa.stft(audio_np, n_fft=1024)
spectrogram_db = librosa.amplitude_to_db(np.abs(spectrogram))

# Axis values for plotting: one time value per column spread over the clip
# length at 16000 samples per second, and one frequency value per row spread
# from 0 up to half of 16000.
clip_seconds = len(audio_np) / 16000
half_rate = 16000 / 2
time = np.linspace(0, clip_seconds, spectrogram_db.shape[1])
frequency = np.linspace(0, half_rate, spectrogram_db.shape[0])


In [None]:
# Draw the spectrogram on a brand-new figure. Wrapping `fig.add_trace(...)`
# inside go.Figure(...) appended the heatmap to the previously defined `fig`
# and then copied that figure, so its existing traces were drawn together with
# the spectrogram.
#
# The y-axis is logarithmic and a log axis cannot place the 0 Hz bin
# (`frequency` starts at 0), so the first frequency row is left out.
fig = go.Figure(
    go.Heatmap(
        z=spectrogram_db[1:],
        x=time,
        y=frequency[1:],
        colorbar=dict(title='Amplitude (dB)'),
    )
)
fig.update_layout(
    title="Spectrogram",
    xaxis_title="Time (seconds)",
    yaxis=dict(title="Frequency (Hz)", type='log'),
    template='plotly_dark',
)

fig.show()

In [None]:
# NOTE(review): this cell reads `instrument_families` and `instrument_counts`,
# which are only assigned in the following cell. Run top-to-bottom it raises
# NameError; the same comprehension is repeated in that following cell.
mapped_family_counts = {instrument_families[family_id]: count 
                        for family_id, count in instrument_counts.items()}

In [None]:
from collections import Counter
import numpy

# Tally instrument-family ids across the first 1000 records.
# int() turns the NumPy scalar returned by .numpy() into a plain Python int,
# which is used both as the Counter key and as a list index below.
instrument_counts = Counter(
    int(sample['instrument']['family'].numpy())
    for sample in dataset.take(1000)
)

# NSynth defines 11 instrument families, indexed 0-10 in this order. The
# earlier 12-entry list had an extra "Synth Pad" at index 10, which caused
# family id 10 (vocal) to be reported as "Synth Pad".
instrument_families = [
    "Bass", "Brass", "Flute", "Guitar", "Keyboard", "Mallet",
    "Organ", "Reed", "String", "Synth Lead", "Vocal",
]

# Re-key the counts by family name.
mapped_family_counts = {
    instrument_families[family_id]: count
    for family_id, count in instrument_counts.items()
}
print(mapped_family_counts)



In [None]:
import plotly.express as px

# Bar chart of how many of the sampled records fall into each instrument
# family. The chart title previously misspelled "distribution".
fig = px.bar(
    x=list(mapped_family_counts.keys()),
    y=list(mapped_family_counts.values()),
    title='instrument family distribution',
)

fig.update_layout(
    xaxis_title='instrument family',
    yaxis_title='count of instruments',
)
fig.show()

Analyze the Mel spectrogram, which maps audio frequencies onto the Mel scale to approximate human perception of sound.

In [None]:
# Mel-scaled spectrogram of the clip (sr=16000, 1024-sample FFT window,
# hop of 512 samples), followed by conversion to decibels.
mel_spectrogram = librosa.feature.melspectrogram(
    y=audio_np,
    sr=16000,
    n_fft=1024,
    hop_length=512,
)
mel_spectrogram_db = librosa.power_to_db(mel_spectrogram)

# Bare expression so the notebook displays the matrix.
mel_spectrogram_db



In [None]:
# Build axes that match the mel matrix itself. The `time` / `frequency` arrays
# from the STFT cell describe a different grid (that STFT did not use
# hop_length=512 and its rows are linear FFT bins), so reusing them labelled
# both axes incorrectly.
n_mels, n_mel_frames = mel_spectrogram_db.shape

# One time stamp per frame, using the hop_length=512 passed to melspectrogram.
mel_time = librosa.frames_to_time(
    np.arange(n_mel_frames), sr=16000, hop_length=512
)

# Centre frequency of each mel band. melspectrogram was called without
# fmin/fmax, so this assumes librosa's defaults of 0 Hz and sr / 2
# (TODO confirm against the installed librosa version). The two extra points
# are the outer band edges and are dropped.
mel_frequency = librosa.mel_frequencies(
    n_mels=n_mels + 2, fmin=0.0, fmax=16000 / 2
)[1:-1]

fig = go.Figure(
    go.Heatmap(
        z=mel_spectrogram_db,
        x=mel_time,
        y=mel_frequency,
        colorbar=dict(title="Amplitude (dB)"),
    )
)
fig.update_layout(
    title='mel spectrogram',
    xaxis_title='time',
    yaxis_title='frequency',
    template='plotly_dark',
)
fig.show()

In [None]:
# 13 MFCCs for the clip at sr=16000; every other setting is left at the
# library default.
mfcc = librosa.feature.mfcc(
    y=audio_np,
    sr=16000,
    n_mfcc=13,
)

# Bare expression so the notebook displays the coefficient matrix.
mfcc

In [None]:
# Build axes that match the MFCC matrix. Its rows are coefficient indices, not
# frequencies, and its columns are frames, so the STFT `time` / `frequency`
# arrays used before did not describe this matrix.
n_coefficients, n_mfcc_frames = mfcc.shape

# The mfcc call did not pass hop_length; this assumes librosa's default of 512
# samples per frame (TODO confirm against the installed librosa version).
mfcc_time = librosa.frames_to_time(
    np.arange(n_mfcc_frames), sr=16000, hop_length=512
)
mfcc_index = np.arange(n_coefficients)

fig = go.Figure(
    go.Heatmap(
        z=mfcc,
        x=mfcc_time,
        y=mfcc_index,
        colorbar=dict(title='mfcc value'),
    )
)

fig.update_layout(
    title='mfcc',
    xaxis_title='time',
    yaxis_title='mfcc coefficient',
    template='plotly_dark',
)
fig.show()

In [None]:

# Two augmented variants of the clip: one pitch-shifted with n_steps=2 and one
# time-stretched with rate=1.5.
audio_pitch_shifted = librosa.effects.pitch_shift(audio_np, sr=16000, n_steps=2)
audio_time_stretched = librosa.effects.time_stretch(audio_np, rate=1.5)

# Overlay the original and both variants as line traces on one figure.
fig = go.Figure()
for trace_name, signal in (
    ('Original', audio_np),
    ('Pitch Shifted', audio_pitch_shifted),
    ('Time Stretched', audio_time_stretched),
):
    fig.add_trace(go.Scatter(y=signal, mode='lines', name=trace_name))
fig.show()