# Basic Audio Exploratory Data Analysis

This notebook demonstrates how to perform basic exploratory data analysis on audio files for the CTC-SpeechRefinement project. The analysis cells below work directly with librosa, NumPy, pandas, and Matplotlib.

## Setup

First, let's import the necessary libraries and set up the environment.

In [None]:
# Add the project root to the Python path
import sys
import os
sys.path.append(os.path.abspath('..'))

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pandas as pd
import seaborn as sns
from IPython.display import Audio, display
import glob
from pathlib import Path

# Import from the project
from ctc_speech_refinement.core.preprocessing.audio import preprocess_audio
from ctc_speech_refinement.core.eda.descriptive_stats import analyze_descriptive_stats
from ctc_speech_refinement.core.eda.time_domain import analyze_time_domain
from ctc_speech_refinement.core.eda.frequency_domain import analyze_frequency_domain

# Plot defaults shared by every figure in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'figure.dpi': 100,
})

## Load Audio Data

Let's load an audio file and examine its basic properties.

In [None]:
# Location of the clip to analyze
audio_file = "../data/test1/test1_01.wav"  # Update this path to your audio file

# sr=None keeps the file's native sample rate instead of resampling
audio_data, sample_rate = librosa.load(audio_file, sr=None)

# Summarize the basic properties of the loaded signal
n_samples = len(audio_data)
duration_s = n_samples / sample_rate
summary_lines = [
    f"Audio file: {audio_file}",
    f"Sample rate: {sample_rate} Hz",
    f"Duration: {duration_s:.2f} seconds",
    f"Number of samples: {n_samples}",
]
print("\n".join(summary_lines))

# Embed an audio player for listening to the clip
display(Audio(audio_data, rate=sample_rate))

## Visualize Waveform

Let's visualize the waveform of the audio file.

In [None]:
# Draw the waveform of the whole clip on an explicit Axes
fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.waveshow(audio_data, sr=sample_rate, ax=ax)
ax.set(title='Waveform', xlabel='Time (s)', ylabel='Amplitude')
fig.tight_layout()
plt.show()

## Descriptive Statistics

Let's compute and visualize some descriptive statistics of the audio data.

In [None]:
# Summary statistics of the raw sample values
amp_min = np.min(audio_data)
amp_max = np.max(audio_data)
mean_square = np.mean(np.square(audio_data))

stats = {
    'Mean': np.mean(audio_data),
    'Median': np.median(audio_data),
    'Std Dev': np.std(audio_data),
    'Min': amp_min,
    'Max': amp_max,
    'Range': amp_max - amp_min,
    'RMS': np.sqrt(mean_square),
}

# Show the statistics as a one-column table (last expression is rendered)
stats_table = pd.DataFrame(stats, index=['Value'])
stats_table.T

In [None]:
# Histogram of sample amplitudes with the mean and median marked
amp_mean = np.mean(audio_data)
amp_median = np.median(audio_data)

fig, ax = plt.subplots(figsize=(14, 5))
ax.hist(audio_data, bins=100, alpha=0.7)
for value, color, name in ((amp_mean, 'r', 'Mean'), (amp_median, 'g', 'Median')):
    ax.axvline(value, color=color, linestyle='dashed', linewidth=2, label=f'{name}: {value:.4f}')
ax.set(title='Amplitude Distribution', xlabel='Amplitude', ylabel='Count')
ax.legend()
fig.tight_layout()
plt.show()

## Time Domain Analysis

Let's analyze the audio in the time domain.

In [None]:
# Compute envelope
def compute_envelope(audio_data, frame_length=2048, hop_length=512):
    """Return the maximum sample value of each frame of a 1-D signal.

    Frames start every ``hop_length`` samples and span up to ``frame_length``
    samples; frames near the end of the signal are shorter because the slice
    is truncated at the last sample.

    Parameters
    ----------
    audio_data : array-like
        1-D sequence of samples.
    frame_length : int
        Maximum number of samples per frame.
    hop_length : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    numpy.ndarray
        One value per frame. Empty when ``audio_data`` is empty.

    Notes
    -----
    This is the signed maximum (upper envelope), not the maximum of the
    absolute value.
    """
    samples = np.asarray(audio_data)
    # ndarray.max() reduces each frame in NumPy instead of having the builtin
    # max() iterate over the samples one Python object at a time.
    return np.array([
        samples[start:start + frame_length].max()
        for start in range(0, len(samples), hop_length)
    ])

# Framing used for the envelope. These are defined here because the time axis
# below needs the hop length, and `hop_length` inside compute_envelope is only
# a function parameter (referencing it at cell level raises NameError).
envelope_frame_length = 2048
envelope_hop_length = 512

envelope = compute_envelope(
    audio_data,
    frame_length=envelope_frame_length,
    hop_length=envelope_hop_length,
)
# Each envelope value corresponds to a frame starting at index * hop samples
envelope_times = np.arange(len(envelope)) * (envelope_hop_length / sample_rate)

# Plot envelope
plt.figure(figsize=(14, 5))
plt.plot(envelope_times, envelope)
plt.title('Audio Envelope')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.tight_layout()
plt.show()

In [None]:
# Frame-wise RMS energy; the feature comes back with a leading axis of
# length 1, so take row 0 to get a 1-D series
rms_frames = librosa.feature.rms(y=audio_data, frame_length=2048, hop_length=512)
energy = rms_frames[0]
energy_times = librosa.times_like(energy, sr=sample_rate, hop_length=512)

# Energy over time
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(energy_times, energy)
ax.set(title='Energy (RMS)', xlabel='Time (s)', ylabel='Energy')
fig.tight_layout()
plt.show()

In [None]:
# Frame-wise zero crossing rate (row 0 of the returned feature matrix)
zcr_frames = librosa.feature.zero_crossing_rate(y=audio_data, frame_length=2048, hop_length=512)
zcr = zcr_frames[0]
zcr_times = librosa.times_like(zcr, sr=sample_rate, hop_length=512)

# Zero crossing rate over time
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(zcr_times, zcr)
ax.set(title='Zero Crossing Rate', xlabel='Time (s)', ylabel='Rate')
fig.tight_layout()
plt.show()

## Frequency Domain Analysis

Let's analyze the audio in the frequency domain.

In [None]:
# STFT magnitude converted to dB relative to its own maximum
stft_magnitude = np.abs(librosa.stft(audio_data, n_fft=2048, hop_length=512))
D = librosa.amplitude_to_db(stft_magnitude, ref=np.max)

# Spectrogram on a log-frequency axis
fig, ax = plt.subplots(figsize=(14, 5))
spec_img = librosa.display.specshow(
    D, sr=sample_rate, x_axis='time', y_axis='log', hop_length=512, ax=ax
)
fig.colorbar(spec_img, ax=ax, format='%+2.0f dB')
ax.set_title('Spectrogram')
fig.tight_layout()
plt.show()

In [None]:
# Mel-scaled power spectrogram (128 bands), then dB relative to its maximum
mel_spec = librosa.feature.melspectrogram(
    y=audio_data,
    sr=sample_rate,
    n_fft=2048,
    hop_length=512,
    n_mels=128,
)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

# Mel spectrogram display
fig, ax = plt.subplots(figsize=(14, 5))
mel_img = librosa.display.specshow(
    mel_spec_db, sr=sample_rate, x_axis='time', y_axis='mel', hop_length=512, ax=ax
)
fig.colorbar(mel_img, ax=ax, format='%+2.0f dB')
ax.set_title('Mel Spectrogram')
fig.tight_layout()
plt.show()

In [None]:
# Frame-wise spectral centroid (row 0 of the returned feature matrix)
centroid_frames = librosa.feature.spectral_centroid(
    y=audio_data, sr=sample_rate, n_fft=2048, hop_length=512
)
spectral_centroid = centroid_frames[0]
spectral_centroid_times = librosa.times_like(spectral_centroid, sr=sample_rate, hop_length=512)

# Centroid over time, frequency on a log axis
fig, ax = plt.subplots(figsize=(14, 5))
ax.semilogy(spectral_centroid_times, spectral_centroid)
ax.set(title='Spectral Centroid', xlabel='Time (s)', ylabel='Frequency (Hz)')
fig.tight_layout()
plt.show()

In [None]:
# Frame-wise spectral bandwidth (row 0 of the returned feature matrix)
bandwidth_frames = librosa.feature.spectral_bandwidth(
    y=audio_data, sr=sample_rate, n_fft=2048, hop_length=512
)
spectral_bandwidth = bandwidth_frames[0]
spectral_bandwidth_times = librosa.times_like(spectral_bandwidth, sr=sample_rate, hop_length=512)

# Bandwidth over time, frequency on a log axis
fig, ax = plt.subplots(figsize=(14, 5))
ax.semilogy(spectral_bandwidth_times, spectral_bandwidth)
ax.set(title='Spectral Bandwidth', xlabel='Time (s)', ylabel='Frequency (Hz)')
fig.tight_layout()
plt.show()

## Pitch and Timbre Analysis

Let's analyze the pitch and timbre of the audio.

In [None]:
# Compute pitch candidates. piptrack returns two arrays of the same shape,
# indexed here as [frequency bin, frame]: candidate pitches and their magnitudes.
pitches, magnitudes = librosa.piptrack(y=audio_data, sr=sample_rate, n_fft=2048, hop_length=512)
pitch_times = librosa.times_like(pitches[0], sr=sample_rate, hop_length=512)

# For every frame, keep the pitch of the bin with the highest magnitude
# (vectorized equivalent of looping over frames and calling argmax).
# NOTE: the strongest bin is only a rough estimate of the fundamental; it can
# land on a harmonic rather than on f0.
strongest_bin = magnitudes.argmax(axis=0)
pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]

# Frames without a detected pitch come back as 0 Hz, which cannot be drawn on
# a logarithmic axis; mask them as NaN so they appear as gaps in the curve.
pitch = np.where(pitch > 0, pitch, np.nan)

# Plot pitch
plt.figure(figsize=(14, 5))
plt.semilogy(pitch_times, pitch)
plt.title('Pitch (Fundamental Frequency)')
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.tight_layout()
plt.show()

In [None]:
# 13 mel-frequency cepstral coefficients per frame
mfccs = librosa.feature.mfcc(
    y=audio_data,
    sr=sample_rate,
    n_mfcc=13,
    n_fft=2048,
    hop_length=512,
)

# Coefficients over time as a heat map
fig, ax = plt.subplots(figsize=(14, 5))
mfcc_img = librosa.display.specshow(mfccs, sr=sample_rate, x_axis='time', hop_length=512, ax=ax)
fig.colorbar(mfcc_img, ax=ax)
ax.set_title('MFCCs')
fig.tight_layout()
plt.show()

## Silence Detection

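Let's detect silent regions in the audio. We do this by locating the non-silent intervals; everything outside those intervals is treated as silence.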

In [None]:
# Locate the non-silent intervals. librosa.effects.split returns
# (start, end) sample indices of the NON-silent stretches; anything more than
# `top_db` decibels below the reference level counts as silence.
top_db = 20  # Adjust this threshold as needed (larger = less is treated as silence)
non_silent_regions = librosa.effects.split(audio_data, top_db=top_db)

# Convert sample indices to seconds
non_silent_regions_time = [
    (start / sample_rate, end / sample_rate) for start, end in non_silent_regions
]

# Print non-silent regions
print("Non-silent regions:")
for i, (start, end) in enumerate(non_silent_regions_time):
    print(f"Region {i+1}: {start:.2f}s - {end:.2f}s (duration: {end-start:.2f}s)")

# Plot waveform with non-silent regions highlighted
plt.figure(figsize=(14, 5))
librosa.display.waveshow(audio_data, sr=sample_rate, alpha=0.5)

# Highlight non-silent regions
for start, end in non_silent_regions_time:
    plt.axvspan(start, end, color='red', alpha=0.3)

plt.title('Waveform with Non-Silent Regions Highlighted')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.tight_layout()
plt.show()

## Conclusion

In this notebook, we've performed a basic exploratory data analysis of an audio file. We've examined its waveform, computed descriptive statistics, and analyzed it in both the time and frequency domains. We've also looked at pitch and timbre features, and detected silent regions.

This analysis provides a good starting point for understanding the characteristics of the audio data, which can be useful for preprocessing and feature extraction for speech recognition tasks.