# Audio Anomaly Detection

This notebook demonstrates how to detect anomalies in audio files using the CTC-SpeechRefinement package. We'll explore various techniques for identifying unusual patterns in audio signals that might indicate noise, artifacts, or other issues that could affect speech recognition performance.

## Setup

First, let's import the necessary libraries and set up the environment.

In [None]:
# Add the project root to the Python path
import sys
import os
sys.path.append(os.path.abspath('..'))

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import pandas as pd
import seaborn as sns
from IPython.display import Audio, display
import glob
from pathlib import Path
from scipy import stats
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Import from the project
from ctc_speech_refinement.core.preprocessing.audio import preprocess_audio
from ctc_speech_refinement.core.eda.anomaly_detection import analyze_anomalies

# Set up plotting
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Fall back to the legacy name so the notebook also runs on older installs.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

## Load Audio Data

Let's load an audio file and examine its basic properties.

In [None]:
# Location of the recording to analyse
audio_file = "../data/test1/test1_01.wav"  # Update this path to your audio file

# Read the samples; sr=None asks librosa to keep the file's native sampling rate
audio_data, sample_rate = librosa.load(audio_file, sr=None)

# Summarise the recording, one line per property
summary_lines = (
    f"Audio file: {audio_file}",
    f"Sample rate: {sample_rate} Hz",
    f"Duration: {len(audio_data) / sample_rate:.2f} seconds",
    f"Number of samples: {len(audio_data)}",
)
print(*summary_lines, sep="\n")

# Embed an audio player for the loaded signal
display(Audio(audio_data, rate=sample_rate))

## 1. Amplitude Anomaly Detection

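Let's detect anomalies in the amplitude of the audio signal. We first plot the raw waveform, then flag outlier samples with two methods: a z-score threshold and an Isolation Forest.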

In [None]:
# Draw the full waveform on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.waveshow(audio_data, sr=sample_rate, ax=ax)
ax.set_title('Audio Waveform')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
fig.tight_layout()
plt.show()

In [None]:
# Detect amplitude anomalies using z-score
def detect_amplitude_anomalies_zscore(audio_data, threshold=3.0):
    """Flag samples whose absolute z-score exceeds ``threshold``.

    Args:
        audio_data: Array-like of audio samples.
        threshold: Absolute z-score above which a sample is flagged.

    Returns:
        Tuple ``(anomalies, z_scores)``: the indices of the flagged samples
        and the absolute z-score computed for every sample.
    """
    samples = np.asarray(audio_data)

    # An empty or constant signal has zero spread, so its z-score is
    # undefined and would come back as NaN. Report "no deviation" instead so
    # downstream plots and comparisons see finite values.
    if samples.size == 0 or np.ptp(samples) == 0:
        z_scores = np.zeros(samples.shape, dtype=float)
    else:
        z_scores = np.abs(stats.zscore(samples))

    # np.where returns one index array per axis; [0] selects the first axis
    anomalies = np.where(z_scores > threshold)[0]

    return anomalies, z_scores

# Detect amplitude anomalies
# The cut-off is defined once: it drives both the detection and the threshold
# line in the z-score plot, so the two cannot drift apart.
amplitude_threshold = 4.0
amplitude_anomalies, z_scores = detect_amplitude_anomalies_zscore(audio_data, threshold=amplitude_threshold)

# Print number of anomalies
print(f"Number of amplitude anomalies detected: {len(amplitude_anomalies)}")
print(f"Percentage of anomalies: {len(amplitude_anomalies) / len(audio_data) * 100:.4f}%")

# Plot waveform with anomalies highlighted
plt.figure(figsize=(14, 8))

plt.subplot(2, 1, 1)
librosa.display.waveshow(audio_data, sr=sample_rate, alpha=0.5)
if len(amplitude_anomalies) > 0:
    # Sample indices are converted to seconds to match the waveform's time axis
    plt.scatter(amplitude_anomalies / sample_rate, audio_data[amplitude_anomalies], color='red', s=10, label='Anomalies')
    # The scatter is the only labelled artist on this axes, so the legend is
    # drawn only when it exists (an empty legend triggers a warning).
    plt.legend()
plt.title('Audio Waveform with Amplitude Anomalies')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')

plt.subplot(2, 1, 2)
plt.plot(z_scores)
plt.axhline(y=amplitude_threshold, color='r', linestyle='--', label='Threshold')
plt.title('Z-scores')
plt.xlabel('Sample')
plt.ylabel('Z-score')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Detect amplitude anomalies using Isolation Forest
def detect_amplitude_anomalies_iforest(audio_data, contamination=0.01):
    """Flag outlier samples by fitting an Isolation Forest to the amplitudes.

    Args:
        audio_data: NumPy array of audio samples (must support ``reshape``).
        contamination: Passed to ``IsolationForest`` as its ``contamination``
            setting.

    Returns:
        Tuple ``(anomalies, y_pred)``: the indices whose prediction is -1 and
        the full prediction array returned by ``fit_predict``.
    """
    # The estimator expects a 2-D matrix: one row per sample, one feature column
    samples_column = audio_data.reshape(-1, 1)

    # Fixed seed keeps the result reproducible between runs
    forest = IsolationForest(contamination=contamination, random_state=42)
    labels = forest.fit_predict(samples_column)

    # A label of -1 marks a sample the forest considers anomalous
    outlier_positions = np.flatnonzero(labels == -1)

    return outlier_positions, labels

# Detect amplitude anomalies using Isolation Forest
amplitude_anomalies_iforest, y_pred = detect_amplitude_anomalies_iforest(audio_data, contamination=0.01)

# Print number of anomalies
print(f"Number of amplitude anomalies detected (Isolation Forest): {len(amplitude_anomalies_iforest)}")
print(f"Percentage of anomalies: {len(amplitude_anomalies_iforest) / len(audio_data) * 100:.4f}%")

# Plot waveform with anomalies highlighted
plt.figure(figsize=(14, 5))
librosa.display.waveshow(audio_data, sr=sample_rate, alpha=0.5)
if len(amplitude_anomalies_iforest) > 0:
    # Sample indices are converted to seconds to match the waveform's time axis
    plt.scatter(amplitude_anomalies_iforest / sample_rate, audio_data[amplitude_anomalies_iforest], color='red', s=10, label='Anomalies (Isolation Forest)')
    # The scatter is the only labelled artist, so the legend is drawn only
    # when it exists (an empty legend triggers a warning).
    plt.legend()
plt.title('Audio Waveform with Amplitude Anomalies (Isolation Forest)')
plt.xlabel('Time (s)')
plt.ylabel('Amplitude')
plt.tight_layout()
plt.show()

## 2. Spectral Anomaly Detection

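Let's detect anomalies in the spectral content of the audio signal. We compute a spectrogram for reference, then flag frames whose mean spectral contrast has an unusually large z-score.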

In [None]:
# Short-time Fourier transform settings
n_fft = 2048
hop_length = 512

# Complex STFT -> magnitude -> decibels relative to the largest magnitude
stft = librosa.stft(audio_data, n_fft=n_fft, hop_length=hop_length)
stft_magnitude = np.abs(stft)
stft_db = librosa.amplitude_to_db(stft_magnitude, ref=np.max)

# Render the spectrogram on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 5))
spec_image = librosa.display.specshow(stft_db, sr=sample_rate, x_axis='time', y_axis='log', hop_length=hop_length, ax=ax)
fig.colorbar(spec_image, ax=ax, format='%+2.0f dB')
ax.set_title('Spectrogram')
fig.tight_layout()
plt.show()

In [None]:
# Detect spectral anomalies using spectral contrast
def detect_spectral_anomalies(audio_data, sample_rate, n_fft=2048, hop_length=512, threshold=3.0):
    """Flag frames whose mean spectral contrast is a z-score outlier.

    Args:
        audio_data: Audio samples, passed to librosa as ``y``.
        sample_rate: Sampling rate of ``audio_data``, passed to librosa as ``sr``.
        n_fft: FFT window size forwarded to ``spectral_contrast``.
        hop_length: Hop length forwarded to ``spectral_contrast``.
        threshold: Absolute z-score above which a frame is flagged.

    Returns:
        Tuple ``(anomalies, z_scores, mean_contrast)``: the indices of the
        flagged frames, the absolute z-score of every frame, and the
        per-frame contrast averaged over the first axis of the contrast
        matrix.
    """
    # Compute spectral contrast
    contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sample_rate, n_fft=n_fft, hop_length=hop_length)

    # Compute mean contrast across frequency bands (axis 0), one value per frame
    mean_contrast = np.mean(contrast, axis=0)

    # A constant (or empty) contrast curve has zero spread, so its z-score is
    # undefined and would come back as NaN. Report zeros instead so the
    # z-score plot stays finite.
    if mean_contrast.size == 0 or np.ptp(mean_contrast) == 0:
        z_scores = np.zeros(mean_contrast.shape, dtype=float)
    else:
        z_scores = np.abs(stats.zscore(mean_contrast))

    # np.where returns one index array per axis; [0] selects the frame axis
    anomalies = np.where(z_scores > threshold)[0]

    return anomalies, z_scores, mean_contrast

# Detect spectral anomalies
# The cut-off is defined once so detection and the plotted threshold line
# agree. The STFT settings are passed explicitly so the frame-to-time
# conversion below uses the same hop length as the detector.
spectral_threshold = 3.0
spectral_anomalies, spectral_z_scores, mean_contrast = detect_spectral_anomalies(
    audio_data,
    sample_rate,
    n_fft=n_fft,
    hop_length=hop_length,
    threshold=spectral_threshold,
)

# Print number of anomalies
print(f"Number of spectral anomalies detected: {len(spectral_anomalies)}")
print(f"Percentage of anomalies: {len(spectral_anomalies) / len(mean_contrast) * 100:.4f}%")

# Plot spectral contrast with anomalies highlighted
# One timestamp per contrast frame, shared by the curve and the markers
contrast_times = librosa.times_like(mean_contrast, sr=sample_rate, hop_length=hop_length)

plt.figure(figsize=(14, 8))

plt.subplot(2, 1, 1)
plt.plot(contrast_times, mean_contrast)
if len(spectral_anomalies) > 0:
    plt.scatter(contrast_times[spectral_anomalies], mean_contrast[spectral_anomalies], color='red', s=30, label='Anomalies')
    # The scatter is the only labelled artist on this axes, so the legend is
    # drawn only when it exists (an empty legend triggers a warning).
    plt.legend()
plt.title('Mean Spectral Contrast with Anomalies')
plt.xlabel('Time (s)')
plt.ylabel('Mean Contrast')

plt.subplot(2, 1, 2)
plt.plot(spectral_z_scores)
plt.axhline(y=spectral_threshold, color='r', linestyle='--', label='Threshold')
plt.title('Spectral Z-scores')
plt.xlabel('Frame')
plt.ylabel('Z-score')
plt.legend()

plt.tight_layout()
plt.show()

## 3. Temporal Anomaly Detection

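Let's detect anomalies in the temporal patterns of the audio signal. We compute two frame-level features, energy (RMS) and zero crossing rate, and use an Isolation Forest to flag frames where the pair is unusual.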

In [None]:
# Frame-level energy (RMS); [0] takes the first row of librosa's 2-D output
rms = librosa.feature.rms(y=audio_data, frame_length=n_fft, hop_length=hop_length)[0]
rms_times = librosa.times_like(rms, sr=sample_rate, hop_length=hop_length)

# Frame-level zero crossing rate, using the same framing as the energy
zcr = librosa.feature.zero_crossing_rate(audio_data, frame_length=n_fft, hop_length=hop_length)[0]
zcr_times = librosa.times_like(zcr, sr=sample_rate, hop_length=hop_length)

# Two stacked panels: energy on top, zero crossing rate below
fig, (energy_ax, zcr_ax) = plt.subplots(2, 1, figsize=(14, 8))

energy_ax.plot(rms_times, rms)
energy_ax.set_title('Energy (RMS)')
energy_ax.set_xlabel('Time (s)')
energy_ax.set_ylabel('Energy')

zcr_ax.plot(zcr_times, zcr)
zcr_ax.set_title('Zero Crossing Rate')
zcr_ax.set_xlabel('Time (s)')
zcr_ax.set_ylabel('Rate')

fig.tight_layout()
plt.show()

In [None]:
# Detect temporal anomalies using energy and zero crossing rate
def detect_temporal_anomalies(rms, zcr, threshold=3.0, contamination=0.05):
    """Flag frames whose (energy, zero-crossing-rate) pair is an outlier.

    The two features are stacked column-wise, standardised, and passed to an
    Isolation Forest.

    Args:
        rms: Per-frame energy values.
        zcr: Per-frame zero crossing rates; stacked next to ``rms``, so both
            must have the same length.
        threshold: Not used by this detector. It is kept only so existing
            callers that pass it keep working.
        contamination: Passed to ``IsolationForest`` as its ``contamination``
            setting. Defaults to the previously hard-coded 0.05.

    Returns:
        Tuple ``(anomalies, y_pred)``: the indices of frames predicted as -1
        and the full prediction array returned by ``fit_predict``.
    """
    # Standardize features
    features = np.column_stack((rms, zcr))
    features_scaled = StandardScaler().fit_transform(features)

    # Apply Isolation Forest (fixed seed keeps the result reproducible)
    clf = IsolationForest(contamination=contamination, random_state=42)
    y_pred = clf.fit_predict(features_scaled)

    # Find anomalies (y_pred == -1)
    anomalies = np.where(y_pred == -1)[0]

    return anomalies, y_pred

# Detect temporal anomalies
temporal_anomalies, temporal_y_pred = detect_temporal_anomalies(rms, zcr)

# Print number of anomalies
print(f"Number of temporal anomalies detected: {len(temporal_anomalies)}")
print(f"Percentage of anomalies: {len(temporal_anomalies) / len(rms) * 100:.4f}%")

# Plot energy and zero crossing rate with anomalies highlighted
# The anomaly markers are the only labelled artists in either panel, so each
# legend is drawn only when markers exist (an empty legend triggers a warning).
has_temporal_anomalies = len(temporal_anomalies) > 0

plt.figure(figsize=(14, 8))

plt.subplot(2, 1, 1)
plt.plot(rms_times, rms, alpha=0.7)
if has_temporal_anomalies:
    plt.scatter(rms_times[temporal_anomalies], rms[temporal_anomalies], color='red', s=30, label='Anomalies')
    plt.legend()
plt.title('Energy (RMS) with Temporal Anomalies')
plt.xlabel('Time (s)')
plt.ylabel('Energy')

plt.subplot(2, 1, 2)
plt.plot(zcr_times, zcr, alpha=0.7)
if has_temporal_anomalies:
    plt.scatter(zcr_times[temporal_anomalies], zcr[temporal_anomalies], color='red', s=30, label='Anomalies')
    plt.legend()
plt.title('Zero Crossing Rate with Temporal Anomalies')
plt.xlabel('Time (s)')
plt.ylabel('Rate')

plt.tight_layout()
plt.show()

## 4. Using the Package's Anomaly Detection

Let's use the package's built-in anomaly detection function.

In [None]:
# Use the package's anomaly detection function
anomaly_results = analyze_anomalies(
    audio_data,
    sample_rate,
    title_prefix="Sample Audio"
)

# Display the figures
# Only the figure objects are needed, so iterate the dict's values.
# plt.show() displays every open figure, so each figure is laid out first and
# show() is called once. Calling it inside the loop would render figures that
# have not been laid out yet.
# NOTE(review): with the notebook inline backend, a per-iteration show()
# presumably also closes the figures, so later iterations would re-create
# them empty - confirm against the backend in use.
for fig in anomaly_results['figures'].values():
    plt.figure(fig.number)
    plt.tight_layout()
plt.show()

In [None]:
# Display anomaly detection results
# Each row is: heading to print, key holding the anomaly indices, key holding
# the percentage. Headings after the first start with a blank line.
report_sections = (
    ("Amplitude Anomalies:", 'amplitude_anomalies', 'amplitude_anomaly_percentage'),
    ("\nSpectral Anomalies:", 'spectral_anomalies', 'spectral_anomaly_percentage'),
    ("\nTemporal Anomalies:", 'temporal_anomalies', 'temporal_anomaly_percentage'),
)

for heading, indices_key, percentage_key in report_sections:
    print(heading)
    print(f"Number of anomalies: {len(anomaly_results[indices_key])}")
    print(f"Percentage: {anomaly_results[percentage_key]:.4f}%")

## 5. Listening to Anomalous Segments

Let's extract and listen to the anomalous segments of the audio.

In [None]:
# Extract and play amplitude anomalies
def extract_anomalous_segments(audio_data, sample_rate, anomalies, window_size=1024):
    """Cut a window of audio around every anomaly position.

    Args:
        audio_data: Sliceable sequence of audio samples.
        sample_rate: Sampling rate of ``audio_data``; not used in the body.
        anomalies: Iterable of sample indices to centre the windows on.
        window_size: Nominal window length in samples; ``window_size // 2``
            samples are taken on each side of the anomaly.

    Returns:
        List with one slice of ``audio_data`` per anomaly, in input order.
        Windows that would run past either end of the signal are clipped.
    """
    half_window = window_size // 2
    total_samples = len(audio_data)

    # Clamp each window to [0, total_samples) so edge anomalies stay valid
    return [
        audio_data[max(0, center - half_window):min(total_samples, center + half_window)]
        for center in anomalies
    ]

# Extract amplitude anomalous segments
if len(amplitude_anomalies) > 0:
    segment_window = 1024

    # Flagged samples tend to come in runs, and a window around each one
    # would give near-identical overlapping clips. Keep only the first
    # sample of each run, i.e. those at least one window after the previous
    # flagged sample. The prepended value makes the first anomaly always count.
    anomaly_gaps = np.diff(amplitude_anomalies, prepend=amplitude_anomalies[0] - segment_window)
    amplitude_event_starts = amplitude_anomalies[anomaly_gaps >= segment_window]

    # Only the first few events are played, so only those are extracted
    amplitude_anomaly_segments = extract_anomalous_segments(
        audio_data, sample_rate, amplitude_event_starts[:5], window_size=segment_window
    )

    # Play the first few anomalous segments
    print("Amplitude Anomalous Segments:")
    for i, segment in enumerate(amplitude_anomaly_segments):
        print(f"Segment {i+1}:")
        display(Audio(segment, rate=sample_rate))
else:
    print("No amplitude anomalies detected.")

In [None]:
# Extract and play spectral anomalies
if len(spectral_anomalies) > 0:
    # Convert frame indices to sample indices
    spectral_anomaly_samples = np.asarray(spectral_anomalies) * hop_length

    # Adjacent flagged frames are hop_length apart, less than the n_fft
    # window, so their clips would overlap heavily. Keep only the first frame
    # of each run, i.e. those at least one window after the previous flagged
    # frame. The prepended value makes the first anomaly always count.
    frame_gaps = np.diff(spectral_anomaly_samples, prepend=spectral_anomaly_samples[0] - n_fft)
    spectral_event_starts = spectral_anomaly_samples[frame_gaps >= n_fft]

    # Extract spectral anomalous segments (only the few that are played)
    spectral_anomaly_segments = extract_anomalous_segments(
        audio_data, sample_rate, spectral_event_starts[:5], window_size=n_fft
    )

    # Play the first few anomalous segments
    print("Spectral Anomalous Segments:")
    for i, segment in enumerate(spectral_anomaly_segments):
        print(f"Segment {i+1}:")
        display(Audio(segment, rate=sample_rate))
else:
    print("No spectral anomalies detected.")

## Conclusion

In this notebook, we've explored various techniques for detecting anomalies in audio signals. We've detected amplitude anomalies using z-scores and Isolation Forest, spectral anomalies using spectral contrast, and temporal anomalies using energy and zero crossing rate.

Anomaly detection is an important step in audio data analysis, as it can help identify unusual patterns that might indicate noise, artifacts, or other issues that could affect speech recognition performance. By detecting and addressing these anomalies, we can improve the quality of the audio data and enhance the performance of speech recognition systems.