# Advanced Audio Visualization Techniques

This notebook demonstrates advanced techniques for visualizing audio data using the CTC-SpeechRefinement package. We'll explore various visualization methods that can provide insights into different aspects of audio signals.

## Setup

First, let's import the necessary libraries and set up the environment.

In [None]:
# Add the project root to the Python path
import sys
import os
sys.path.append(os.path.abspath('..'))

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pandas as pd
import seaborn as sns
from IPython.display import Audio, display
import glob
from pathlib import Path
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go
import plotly.express as px
from scipy import signal

# Import from the project
from ctc_speech_refinement.core.preprocessing.audio import preprocess_audio

# Global plotting defaults applied to every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'figure.dpi': 100,
})

## Load Audio Data

Let's load an audio file and examine its basic properties.

In [None]:
# Location of the recording to analyse
audio_file = "../data/test1/test1_01.wav"  # Update this path to your audio file

# sr=None is passed so librosa does not resample to its default rate
audio_data, sample_rate = librosa.load(audio_file, sr=None)

# Summarise the recording: one line per property
num_samples = len(audio_data)
duration_seconds = num_samples / sample_rate
print(
    f"Audio file: {audio_file}",
    f"Sample rate: {sample_rate} Hz",
    f"Duration: {duration_seconds:.2f} seconds",
    f"Number of samples: {num_samples}",
    sep="\n",
)

# Embed an audio player in the cell output
display(Audio(audio_data, rate=sample_rate))

## 1. Enhanced Waveform Visualization

Let's create enhanced visualizations of the audio waveform.

In [None]:
# Plain time-domain view of the signal
fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.waveshow(audio_data, sr=sample_rate, alpha=0.6, ax=ax)
ax.set_title('Audio Waveform')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
fig.tight_layout()
plt.show()

In [None]:
# Enhanced waveform with envelope
def compute_envelope(audio_data, frame_length=2048, hop_length=512):
    """Return the peak-amplitude envelope of a 1-D signal.

    The signal is cut into frames that start every `hop_length` samples and
    span up to `frame_length` samples (trailing frames are shorter). Each
    frame contributes its largest absolute sample value, so the result is
    non-negative and `+envelope` / `-envelope` bound the waveform on both
    sides, including frames whose largest excursion is negative.

    Args:
        audio_data: 1-D sequence or array of audio samples.
        frame_length: Maximum number of samples per analysis frame.
        hop_length: Number of samples between consecutive frame starts.

    Returns:
        1-D NumPy array with one envelope value per frame; empty when
        `audio_data` is empty.
    """
    # Rectify once up front instead of once per frame
    magnitudes = np.abs(np.asarray(audio_data))
    frame_starts = range(0, len(magnitudes), hop_length)
    return np.array([magnitudes[start:start + frame_length].max() for start in frame_starts])

# Framing parameters for the envelope computation
frame_length = 2048
hop_length = 512

# One envelope value per hop, plus the matching frame start times in seconds
envelope = compute_envelope(audio_data, frame_length, hop_length)
envelope_times = np.arange(len(envelope)) * (hop_length / sample_rate)

# Draw the waveform, then trace the envelope and its mirror image over it
fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.waveshow(audio_data, sr=sample_rate, alpha=0.6, ax=ax)
ax.plot(envelope_times, envelope, color='red', linewidth=2, label='Envelope')
ax.plot(envelope_times, -envelope, color='red', linewidth=2)
ax.set_title('Audio Waveform with Envelope')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Scatter every sample against time, coloured by its absolute amplitude
times = np.arange(len(audio_data)) / sample_rate

fig, ax = plt.subplots(figsize=(14, 5))
sample_points = ax.scatter(times, audio_data, c=np.abs(audio_data), cmap='viridis', s=1, alpha=0.5)
fig.colorbar(sample_points, ax=ax, label='Absolute Amplitude')
ax.set_title('Audio Waveform with Color-Coded Amplitude')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
fig.tight_layout()
plt.show()

## 2. Enhanced Spectrogram Visualization

Let's create enhanced visualizations of the audio spectrogram.

In [None]:
# STFT analysis parameters (reused by later cells)
n_fft = 2048
hop_length = 512

# Complex STFT -> magnitude -> decibels relative to the largest magnitude
stft = librosa.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
stft_magnitude = np.abs(stft)
stft_db = librosa.amplitude_to_db(stft_magnitude, ref=np.max)

# Log-frequency spectrogram with a dB colorbar
fig, ax = plt.subplots(figsize=(14, 5))
spec_img = librosa.display.specshow(
    stft_db,
    sr=sample_rate,
    x_axis='time',
    y_axis='log',
    hop_length=hop_length,
    ax=ax,
)
fig.colorbar(spec_img, ax=ax, format='%+2.0f dB')
ax.set_title('Spectrogram')
fig.tight_layout()
plt.show()

In [None]:
# Spectrogram drawn with a hand-built colour ramp
# RGB anchors, low to high: black -> blue -> cyan -> yellow -> red
colors = [(0, 0, 0), (0, 0, 1), (0, 1, 1), (1, 1, 0), (1, 0, 0)]
custom_cmap = LinearSegmentedColormap.from_list('custom_cmap', colors)

fig, ax = plt.subplots(figsize=(14, 5))
spec_img = librosa.display.specshow(
    stft_db,
    sr=sample_rate,
    x_axis='time',
    y_axis='log',
    hop_length=hop_length,
    cmap=custom_cmap,
    ax=ax,
)
fig.colorbar(spec_img, ax=ax, format='%+2.0f dB')
ax.set_title('Enhanced Spectrogram with Custom Colormap')
fig.tight_layout()
plt.show()

In [None]:
# Spectrogram stacked above the waveform on a shared time axis.
# Constrained layout is used instead of tight_layout so the colorbar margin is
# reserved for the whole column: both panels keep the same width and a given
# time lines up vertically between the spectrogram and the waveform.
fig = plt.figure(figsize=(14, 8), layout='constrained')

# Top panel: log-frequency spectrogram with its dB colorbar
ax1 = fig.add_subplot(2, 1, 1)
spec_img = librosa.display.specshow(
    stft_db,
    sr=sample_rate,
    x_axis='time',
    y_axis='log',
    hop_length=hop_length,
    ax=ax1,
)
fig.colorbar(spec_img, ax=ax1, format='%+2.0f dB')
ax1.set_title('Spectrogram')

# Bottom panel: waveform; sharex keeps zoom/pan in sync with the top panel
ax2 = fig.add_subplot(2, 1, 2, sharex=ax1)
librosa.display.waveshow(audio_data, sr=sample_rate, ax=ax2)
ax2.set_title('Waveform')
ax2.set_xlabel('Time (s)')
ax2.set_ylabel('Amplitude')

plt.show()

## 3. 3D Visualization

Let's create 3D visualizations of the audio data.

In [None]:
# 3D spectrogram using matplotlib
fig = plt.figure(figsize=(14, 8))
ax = fig.add_subplot(111, projection='3d')

# Time (s) of each STFT frame and centre frequency (Hz) of each STFT bin
times = librosa.times_like(stft[0], sr=sample_rate, hop_length=hop_length)
freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)

# Keep every 4th frame/bin so the surface stays light enough to render
downsample_factor_time = 4
downsample_factor_freq = 4
times_downsampled = times[::downsample_factor_time]
freqs_downsampled = freqs[::downsample_factor_freq]
stft_db_downsampled = stft_db[::downsample_factor_freq, ::downsample_factor_time]

# 3D axes only support linear scales, so the log-frequency axis is built by
# plotting log10(frequency) and relabelling the ticks in Hz. Bins below
# min_freq are dropped: they fall outside the displayed range and the 0 Hz
# bin has no logarithm.
min_freq = 20
nyquist = sample_rate / 2
in_range = freqs_downsampled >= min_freq

# Create meshgrid (frequencies in Hz; converted to log10 only for plotting)
time_grid, freq_grid = np.meshgrid(times_downsampled, freqs_downsampled[in_range])

# Plot 3D surface
surf = ax.plot_surface(
    time_grid,
    np.log10(freq_grid),
    stft_db_downsampled[in_range],
    cmap='viridis',
    alpha=0.8,
)

# Add colorbar
fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5, label='Magnitude (dB)')

# Set labels
ax.set_xlabel('Time (s)')
ax.set_ylabel('Frequency (Hz)')
ax.set_zlabel('Magnitude (dB)')
ax.set_title('3D Spectrogram')

# Label the (linear) log10 axis with the frequencies it represents
tick_freqs = [f for f in (20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000) if f <= nyquist]
ax.set_yticks(np.log10(tick_freqs))
ax.set_yticklabels([f"{f:g}" for f in tick_freqs])
ax.set_ylim(np.log10(min_freq), np.log10(nyquist))

plt.tight_layout()
plt.show()

In [None]:
# Interactive 3D spectrogram rendered with plotly
# Time (s) of each STFT frame and centre frequency (Hz) of each STFT bin
times = librosa.times_like(stft[0], sr=sample_rate, hop_length=hop_length)
freqs = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)

# Thin the grid along both axes before building the surface
downsample_factor_time = 4
downsample_factor_freq = 4
time_step = slice(None, None, downsample_factor_time)
freq_step = slice(None, None, downsample_factor_freq)
times_downsampled = times[time_step]
freqs_downsampled = freqs[freq_step]
stft_db_downsampled = stft_db[freq_step, time_step]

# 2-D coordinate grids matching the shape of the downsampled dB matrix
time_grid, freq_grid = np.meshgrid(times_downsampled, freqs_downsampled)

# Build the surface trace and wrap it in a figure
surface = go.Surface(z=stft_db_downsampled, x=time_grid, y=freq_grid, colorscale='Viridis')
fig = go.Figure(data=[surface])

# Axis titles and a log-scaled frequency axis; the range is passed as log10 values
scene_settings = dict(
    xaxis_title='Time (s)',
    yaxis_title='Frequency (Hz)',
    zaxis_title='Magnitude (dB)',
    yaxis=dict(type='log', range=[np.log10(20), np.log10(sample_rate/2)]),
)
fig.update_layout(
    title='Interactive 3D Spectrogram',
    scene=scene_settings,
    width=900,
    height=700,
)

fig.show()

## 4. Time-Frequency Analysis Visualization

Let's create visualizations for time-frequency analysis.

In [None]:
# Constant-Q transform, converted to dB relative to its largest magnitude
cqt = librosa.cqt(audio_data, sr=sample_rate, hop_length=hop_length)
cqt_magnitude = np.abs(cqt)
cqt_db = librosa.amplitude_to_db(cqt_magnitude, ref=np.max)

# Display with note names on the frequency axis
fig, ax = plt.subplots(figsize=(14, 5))
cqt_img = librosa.display.specshow(
    cqt_db,
    sr=sample_rate,
    x_axis='time',
    y_axis='cqt_note',
    hop_length=hop_length,
    ax=ax,
)
fig.colorbar(cqt_img, ax=ax, format='%+2.0f dB')
ax.set_title('Constant-Q Transform (CQT)')
fig.tight_layout()
plt.show()

In [None]:
# Compute a continuous wavelet transform of (at most) the first second of audio.
# scipy.signal.cwt and scipy.signal.ricker are deprecated and have been removed
# from recent SciPy releases, so the same computation is done with the local
# helpers below, built on scipy.signal.convolve.
def _ricker_wavelet(points, a):
    """Return a Ricker ("Mexican hat") wavelet.

    Args:
        points: Number of samples in the returned wavelet.
        a: Width parameter of the wavelet.

    Returns:
        1-D float array of length `points`, symmetric about its midpoint.
    """
    amplitude = 2 / (np.sqrt(3 * a) * np.pi ** 0.25)
    offsets = np.arange(points) - (points - 1.0) / 2
    scaled_sq = (offsets / a) ** 2
    return amplitude * (1 - scaled_sq) * np.exp(-scaled_sq / 2)


def _ricker_cwt(data, widths):
    """Convolve `data` with a Ricker wavelet of each width.

    Args:
        data: 1-D signal.
        widths: Sequence of wavelet width parameters.

    Returns:
        Float array of shape (len(widths), len(data)); row i is the signal
        convolved with the wavelet of width widths[i].
    """
    data = np.asarray(data, dtype=np.float64)
    output = np.empty((len(widths), len(data)), dtype=np.float64)
    for row, width in enumerate(widths):
        # Cap the kernel at the signal length so 'same' mode yields len(data) columns
        n_points = min(10 * width, len(data))
        output[row] = signal.convolve(data, _ricker_wavelet(n_points, width), mode='same')
    return output


widths = np.arange(1, 31)
segment = audio_data[:sample_rate]  # at most one second of samples
segment_duration = len(segment) / sample_rate  # < 1 s when the clip is shorter
cwtmatr = _ricker_cwt(segment, widths)

# Plot wavelet transform
# origin='lower' puts row 0 (smallest scale) at the bottom so the image agrees
# with the y-axis values; the half-step padding centres each row on its scale.
# Symmetric colour limits keep zero at the middle of the colormap.
color_limit = np.abs(cwtmatr).max()
plt.figure(figsize=(14, 5))
plt.imshow(
    cwtmatr,
    extent=[0, segment_duration, widths[0] - 0.5, widths[-1] + 0.5],
    origin='lower',
    cmap='viridis',
    aspect='auto',
    vmax=color_limit,
    vmin=-color_limit,
)
plt.colorbar(label='Amplitude')
plt.title('Wavelet Transform (First Second of Audio)')
plt.xlabel('Time (s)')
plt.ylabel('Scale')
plt.tight_layout()
plt.show()

## 5. Feature Visualization

Let's create visualizations for various audio features.

In [None]:
# Mel-frequency cepstral coefficients, using the same framing as the STFT
n_mfcc = 13
mfccs = librosa.feature.mfcc(
    y=audio_data,
    sr=sample_rate,
    n_mfcc=n_mfcc,
    n_fft=n_fft,
    hop_length=hop_length,
)

# One row per coefficient, one column per frame
fig, ax = plt.subplots(figsize=(14, 5))
mfcc_img = librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time', hop_length=hop_length, ax=ax)
fig.colorbar(mfcc_img, ax=ax)
ax.set_title('MFCCs')
fig.tight_layout()
plt.show()

In [None]:
# Chroma features computed from the STFT, using the same framing as above
chroma = librosa.feature.chroma_stft(
    y=audio_data,
    sr=sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
)

# Chromagram with a chroma-labelled vertical axis
fig, ax = plt.subplots(figsize=(14, 5))
chroma_img = librosa.display.specshow(
    chroma,
    sr=sample_rate,
    x_axis='time',
    y_axis='chroma',
    hop_length=hop_length,
    ax=ax,
)
fig.colorbar(chroma_img, ax=ax)
ax.set_title('Chromagram')
fig.tight_layout()
plt.show()

In [None]:
# Frame-wise spectral descriptors, all computed with the same framing
framing = dict(y=audio_data, n_fft=n_fft, hop_length=hop_length)
spectral_centroid = librosa.feature.spectral_centroid(sr=sample_rate, **framing)[0]
spectral_bandwidth = librosa.feature.spectral_bandwidth(sr=sample_rate, **framing)[0]
spectral_contrast = librosa.feature.spectral_contrast(sr=sample_rate, **framing)  # computed but not plotted below
spectral_flatness = librosa.feature.spectral_flatness(**framing)[0]
spectral_rolloff = librosa.feature.spectral_rolloff(sr=sample_rate, **framing)[0]

# Timestamp (s) of each feature frame
feature_times = librosa.times_like(spectral_centroid, sr=sample_rate, hop_length=hop_length)

# Three stacked panels: centroid + rolloff, bandwidth, flatness
fig, (ax_freq, ax_bandwidth, ax_flatness) = plt.subplots(3, 1, figsize=(14, 10))

ax_freq.semilogy(feature_times, spectral_centroid, label='Centroid')
ax_freq.semilogy(feature_times, spectral_rolloff, label='Rolloff', alpha=0.7)
ax_freq.set_title('Spectral Centroid and Rolloff')
ax_freq.set_xlabel('Time (s)')
ax_freq.set_ylabel('Frequency (Hz)')
ax_freq.legend()

ax_bandwidth.semilogy(feature_times, spectral_bandwidth, label='Bandwidth')
ax_bandwidth.set_title('Spectral Bandwidth')
ax_bandwidth.set_xlabel('Time (s)')
ax_bandwidth.set_ylabel('Frequency (Hz)')
ax_bandwidth.legend()

ax_flatness.plot(feature_times, spectral_flatness, label='Flatness')
ax_flatness.set_title('Spectral Flatness')
ax_flatness.set_xlabel('Time (s)')
ax_flatness.set_ylabel('Flatness')
ax_flatness.legend()

fig.tight_layout()
plt.show()

## 6. Combined Visualization

Let's create a combined visualization that shows multiple aspects of the audio signal.

In [None]:
# Five stacked views of the same recording on a shared time axis.
# Constrained layout is used instead of tight_layout: it reserves the colorbar
# margin for the whole column, so panels with and without a colorbar keep the
# same width and a given time lines up vertically across all five panels.
fig = plt.figure(figsize=(14, 15), layout='constrained')

# Panel 1: waveform (x label blanked; only the bottom panel labels time)
ax1 = fig.add_subplot(5, 1, 1)
librosa.display.waveshow(audio_data, sr=sample_rate, ax=ax1)
ax1.set_title('Waveform')
ax1.set_xlabel('')

# Panel 2: log-frequency spectrogram
ax2 = fig.add_subplot(5, 1, 2, sharex=ax1)
spec_img = librosa.display.specshow(
    stft_db,
    sr=sample_rate,
    x_axis='time',
    y_axis='log',
    hop_length=hop_length,
    ax=ax2,
)
fig.colorbar(spec_img, ax=ax2, format='%+2.0f dB')
ax2.set_title('Spectrogram')
ax2.set_xlabel('')

# Panel 3: MFCCs
ax3 = fig.add_subplot(5, 1, 3, sharex=ax1)
mfcc_img = librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time', hop_length=hop_length, ax=ax3)
fig.colorbar(mfcc_img, ax=ax3)
ax3.set_title('MFCCs')
ax3.set_xlabel('')

# Panel 4: chromagram
ax4 = fig.add_subplot(5, 1, 4, sharex=ax1)
chroma_img = librosa.display.specshow(
    chroma,
    sr=sample_rate,
    x_axis='time',
    y_axis='chroma',
    hop_length=hop_length,
    ax=ax4,
)
fig.colorbar(chroma_img, ax=ax4)
ax4.set_title('Chromagram')
ax4.set_xlabel('')

# Panel 5: spectral centroid and rolloff on a log frequency scale
ax5 = fig.add_subplot(5, 1, 5, sharex=ax1)
ax5.semilogy(feature_times, spectral_centroid, label='Centroid')
ax5.semilogy(feature_times, spectral_rolloff, label='Rolloff', alpha=0.7)
ax5.set_title('Spectral Features')
ax5.set_xlabel('Time (s)')
ax5.set_ylabel('Frequency (Hz)')
ax5.legend()

plt.show()

## Conclusion

In this notebook, we explored several advanced techniques for visualizing audio data: enhanced waveform and spectrogram plots, 3D spectrograms, time-frequency analyses (the constant-Q and wavelet transforms), feature plots (MFCCs, chroma, and spectral features), and a combined multi-panel view.

These visualization techniques provide valuable insights into different aspects of audio signals, which can be useful for understanding the characteristics of speech signals and for feature extraction in speech recognition tasks. By using these techniques, we can gain a deeper understanding of the audio data and make more informed decisions about preprocessing and feature extraction.