# Pipeline 2 Imports

In [None]:
!pip install torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
import torch
import torchaudio
import torchaudio.functional as F

# Pipeline 2 Functions

[STFT Window Functions](https://www.audiolabs-erlangen.de/resources/MIR/FMP/C2/C2_STFT-Window.html)
- Hann window => raised-cosine window that drops smoothly to 0 at the section (frame) boundaries
  - Trade-off: it smears (broadens) frequency peaks somewhat, but in exchange it reduces ripple artifacts

[Types of Window Functions](https://en.wikipedia.org/wiki/Window_function)

In [1]:
# BASE Short-Time Fourier Transform (STFT) Computation
def _tukey_window(win_length, alpha=0.5):
    """Build a periodic Tukey (tapered-cosine) window; torch has no built-in one.

    `alpha` is the fraction of the window spent inside the two cosine tapers:
    alpha <= 0 gives a rectangular window, alpha >= 1 gives a Hann window.
    The periodic convention (period = win_length) matches the default of
    torch.hann_window / torch.bartlett_window.
    """
    dtype = torch.get_default_dtype()
    if alpha <= 0:
        return torch.ones(win_length, dtype=dtype)
    alpha = min(alpha, 1.0)

    # Width (in samples) of each cosine taper.
    taper = alpha * win_length / 2.0
    n = torch.arange(win_length, dtype=dtype)
    # Distance of every sample from the nearest window edge.
    edge = torch.minimum(n, win_length - n)
    ramp = 0.5 * (1.0 - torch.cos(torch.pi * edge / taper))
    # Cosine ramp inside the tapers, flat 1.0 in the middle section.
    return torch.where(edge < taper, ramp, torch.ones_like(ramp))


def compute_stft(waveform, window, sample_rate=16000):
    """
    Compute the STFT magnitude of a batch of waveforms.

    Parameters:
        waveform (Tensor): Audio waveform, shape (B, N)
        window (str): Window function type, one of
            "Rectangular", "Triangle", "Hann", "Tukey"
        sample_rate (int): Sampling rate; only sets the 25 ms window length
            (n_fft and hop_length are fixed)

    Return tensor: STFT magnitude, shape (B, 257, T). With torch.stft's
        default centering, T = 1 + N // 160, so T depends on the input length.

    Raises:
        ValueError: if `window` is not one of the supported names.
    """
    n_fft = 512
    # window size = win_length
    win_length = int(0.025 * sample_rate)  # 25 ms → 400 samples at 16 kHz
    hop_length = 160

    device = waveform.device

    # Different types of window functions
    if window == "Rectangular":
        window_fn = torch.ones(win_length, device=device)
    elif window == "Triangle":
        # torch's triangular window is the Bartlett window.
        window_fn = torch.bartlett_window(win_length, device=device)
    elif window == "Hann":
        window_fn = torch.hann_window(win_length, device=device)
    elif window == "Tukey":
        window_fn = _tukey_window(win_length).to(device)
    else:
        # Fail early instead of handing the raw string to torch.stft.
        raise ValueError(
            f"Unsupported window type: {window!r}. "
            "Expected 'Rectangular', 'Triangle', 'Hann' or 'Tukey'."
        )

    # Compute stft, output shape: (B, n_fft // 2 + 1, T) = (B, 257, T)
    stft = torch.stft(
        waveform, n_fft=n_fft, hop_length=hop_length, win_length=win_length,
        window=window_fn, return_complex=True
    )

    # Magnitude (abs) of complex STFT
    stft_mag = torch.abs(stft)
    return stft_mag

In [None]:
# MEL Spectrogram Conversion
def stft_to_mel(stft_mag, sample_rate=16000, n_mels=80):
    """
    Convert STFT magnitude to Mel Spectrogram.

    Parameters:
        stft_mag (Tensor): STFT magnitude, shape (B, n_freqs, T)
            (n_freqs = 257 for the n_fft = 512 used by compute_stft)
        sample_rate (int): Sampling rate of the original signal
        n_mels (int): Number of Mel filters

    Returns tensor: Mel spectrogram of shape (B, T, n_mels), i.e. time-first
    """
    # Derive the number of frequency bins from the input, not a fixed 257.
    n_freqs = stft_mag.shape[1]

    # Create Mel filter bank, (n_freqs, n_mels).
    # melscale_fbanks replaces create_fb_matrix, which newer torchaudio
    # releases no longer provide; the arguments are the same.
    mel_filter_bank = torchaudio.functional.melscale_fbanks(
        n_freqs=n_freqs, f_min=0.0, f_max=sample_rate / 2.0,
        n_mels=n_mels, sample_rate=sample_rate
    )
    # The filter bank is built on CPU in float32; match the input so the
    # matmul also works for GPU or float64 spectrograms.
    mel_filter_bank = mel_filter_bank.to(device=stft_mag.device, dtype=stft_mag.dtype)

    # Apply the Mel filter bank, transpose to time-first format:
    # (B, T, n_freqs) @ (n_freqs, n_mels) -> (B, T, n_mels)
    mel_spec = torch.matmul(stft_mag.transpose(1, 2), mel_filter_bank)
    return mel_spec

In [None]:
# MFCC Extraction
def compute_mfcc_stack(mel_spec, n_mfcc=13):
    """
    Compute MFCCs, deltas, and delta-deltas from a Mel spectrogram.

    Parameters:
        mel_spec (Tensor): Input Mel spectrogram of shape (B, n_mels, T),
            i.e. frequency-first.
        n_mfcc (int): Number of cepstral coefficients to keep (default 13)

    Returns tensor: Final stacked features of shape (B, T, 3 * n_mfcc)
        (39 features with the default n_mfcc = 13)

    NOTE(review): stft_to_mel in this notebook returns (B, T, n_mels); its
    output needs .transpose(1, 2) before being passed here — confirm the
    call site does this.
    NOTE(review): no log compression is applied here; conventional MFCCs take
    the DCT of log-Mel energies — confirm the caller passes log-scaled input.
    """
    # Batch, Frequency, Time (unpacking also enforces a 3-D input)
    _, n_mels, _ = mel_spec.shape

    # DCT (Discrete Cosine Transform) Type-II basis with orthonormal scaling.
    # torchaudio exposes the basis matrix (create_dct) rather than a
    # "compute_dct" function; requesting n_mfcc columns gives the first
    # n_mfcc DCT coefficients directly.
    dct_mat = F.create_dct(n_mfcc=n_mfcc, n_mels=n_mels, norm="ortho")
    dct_mat = dct_mat.to(device=mel_spec.device, dtype=mel_spec.dtype)
    # (n_mels, n_mfcc)

    # Apply the DCT over the frequency axis:
    # (B, T, n_mels) @ (n_mels, n_mfcc) -> (B, T, n_mfcc) -> (B, n_mfcc, T)
    mfcc = torch.matmul(mel_spec.transpose(1, 2), dct_mat).transpose(1, 2)

    # Delta (first derivative) along the time axis
    delta = F.compute_deltas(mfcc)
    # (B, n_mfcc, T)

    # Delta-Delta (second derivative)
    delta_delta = F.compute_deltas(delta)
    # (B, n_mfcc, T)

    # Stack all coefficients (MFCC, delta, delta-delta) along the feature axis
    mfcc_stack = torch.cat([mfcc, delta, delta_delta], dim=1)
    # (B, 3 * n_mfcc, T)

    # Transpose to time-first format
    return mfcc_stack.transpose(1, 2)
    # (B, T, 3 * n_mfcc)