# Lecture 10B — Notebook 10B.3: MFCC Pipeline (Log-Mel → DCT → Cepstra + Deltas)

**Purpose:** Implement MFCCs step by step and connect them back to the cepstrum idea, now applied to mel-band energies instead of the full spectrum.


In [None]:
import os, json, math, re
from pathlib import Path

import numpy as np
import scipy.signal as sig
import scipy.fft as fft
import matplotlib.pyplot as plt

# Optional audio playback
try:
    from IPython.display import Audio, display
    HAS_IPY_AUDIO = True
except Exception:
    HAS_IPY_AUDIO = False

# Optional recording
try:
    import sounddevice as sd
    HAS_SD = True
except Exception as e:
    HAS_SD = False
    print("sounddevice not available (recording disabled).", e)

# ---------- Project paths ----------
# Everything this notebook reads or writes lives under one folder inside the
# current working directory.
PROJECT_ROOT = Path.cwd() / "EE519_L10B_Project"
REC_DIR = PROJECT_ROOT / "recordings"   # WAV clips (read by the analysis cell)
FIG_DIR = PROJECT_ROOT / "figures"      # target directory of save_fig()
RES_DIR = PROJECT_ROOT / "results"
MANIFEST_PATH = PROJECT_ROOT / "manifest.json"   # default path of load/save_manifest()

# Create the sub-folders up front (parents=True also creates PROJECT_ROOT);
# exist_ok=True makes re-running the cell harmless.
for d in [REC_DIR, FIG_DIR, RES_DIR]:
    d.mkdir(parents=True, exist_ok=True)

def load_manifest(path=MANIFEST_PATH):
    """Return the manifest stored as JSON at *path*.

    If the file does not exist, a fresh manifest skeleton is returned instead:
    an empty ``clips`` list and ``created_utc`` set to ``None``.
    """
    if not path.exists():
        return {"course":"EE519","lecture":"10B","created_utc":None,"clips":[]}
    raw_text = path.read_text()
    return json.loads(raw_text)

def save_manifest(manifest, path=MANIFEST_PATH):
    """Write *manifest* to *path* as indented JSON and report the location.

    A manifest whose ``created_utc`` is missing or ``None`` is stamped (in
    place) with the current ``np.datetime64("now")`` before being written.
    """
    stamp = manifest.get("created_utc")
    if stamp is None:
        manifest["created_utc"] = str(np.datetime64("now"))
    serialized = json.dumps(manifest, indent=2)
    path.write_text(serialized)
    print("Saved manifest:", path)

def save_fig(fig, name, dpi=150):
    """Save *fig* as ``FIG_DIR / name`` with a tight bounding box.

    Prints the destination and returns it.
    """
    target = FIG_DIR / name
    fig.savefig(target, bbox_inches="tight", dpi=dpi)
    print("Saved figure:", target)
    return target

# ---------- WAV I/O ----------
import wave
def write_wav(path: Path, x: np.ndarray, fs: int):
    """Write *x* to *path* as a mono, 16-bit PCM WAV file at sample rate *fs*.

    Samples are converted to float32, clipped to [-1, 1], scaled by 32767 and
    cast to int16 before being written.
    """
    samples = np.clip(np.asarray(x, dtype=np.float32), -1.0, 1.0)
    pcm = (samples * 32767.0).astype(np.int16)
    with wave.open(str(path), "wb") as out:
        out.setframerate(fs)
        out.setsampwidth(2)    # 2 bytes per sample -> 16-bit
        out.setnchannels(1)    # mono
        out.writeframes(pcm.tobytes())

def read_wav(path: Path):
    """Read a 16-bit PCM WAV file and return ``(fs, x)``.

    Args:
        path: Location of the WAV file.

    Returns:
        ``fs`` is the sample rate in Hz as stored in the file header; ``x`` is
        a 1-D float32 array with samples scaled by 1/32768. Files with more
        than one channel are down-mixed to mono by averaging the channels, so
        ``len(x)`` always equals the number of frames.

    Raises:
        ValueError: If the file does not use 2-byte (16-bit) samples. Decoding
            such data as int16 would silently produce garbage.
    """
    with wave.open(str(path), "rb") as wf:
        fs = wf.getframerate()
        n_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        if sample_width != 2:
            raise ValueError(
                f"Unsupported sample width {sample_width} bytes in {path}; "
                "only 16-bit PCM WAV files are supported"
            )
        raw = wf.readframes(wf.getnframes())
    # WAV PCM data is little-endian regardless of the host byte order.
    x = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
    if n_channels > 1:
        # Samples are interleaved frame by frame; drop a trailing partial
        # frame (truncated file) so the reshape cannot fail.
        usable = (len(x) // n_channels) * n_channels
        x = x[:usable].reshape(-1, n_channels).mean(axis=1)
    return fs, x

def peak_normalize(x, target=0.98):
    """Scale *x* so that its largest absolute value becomes (about) *target*."""
    peak = np.max(np.abs(x))
    divisor = peak + 1e-12   # keeps an all-zero signal from dividing by zero
    return (x / divisor) * target

def play_audio(x, fs, label="audio"):
    """Show an inline audio player for *x* at sample rate *fs*.

    When the IPython audio widgets could not be imported, only a notice
    containing *label* is printed.
    """
    if HAS_IPY_AUDIO:
        display(Audio(x, rate=fs))
    else:
        print("(Audio playback not available)", label)

def record_clip(seconds=2.0, fs=16000):
    """Record *seconds* of single-channel float32 audio at *fs* Hz.

    Blocks until the recording has finished and returns ``(fs, x)`` with the
    recorded array squeezed. Raises RuntimeError when ``sounddevice`` could
    not be imported.
    """
    if not HAS_SD:
        raise RuntimeError("sounddevice not available. Load wav files instead.")
    print(f"Recording {seconds:.1f}s @ {fs} Hz ...")
    n_samples = int(seconds*fs)
    recording = sd.rec(n_samples, samplerate=fs, channels=1, dtype="float32")
    sd.wait()   # sd.rec returns immediately; wait for the capture to complete
    return fs, recording.squeeze()

def add_clip_to_manifest(filename, label, fs, notes=""):
    """Append a clip entry to the global ``manifest`` and save it.

    The entry records *filename*, *label*, ``int(fs)``, *notes*, the current
    timestamp and an empty ``selections`` dict. Returns the index of the new
    entry in ``manifest["clips"]``.
    """
    entry = dict(
        filename=filename,
        label=label,
        fs=int(fs),
        notes=notes,
        added_utc=str(np.datetime64("now")),
        selections={},
    )
    clips = manifest["clips"]
    clips.append(entry)
    save_manifest(manifest)
    return len(clips) - 1

def list_clips():
    """Print one line per clip in the global ``manifest``: index, label, file, fs, notes."""
    for index, entry in enumerate(manifest["clips"]):
        print(f"[{index}] {entry['label']:14s}  {entry['filename']}  fs={entry['fs']}  notes={entry.get('notes','')}")

# ---------- Selection + framing helpers ----------
def seconds_to_samples(t0, t1, fs, xlen):
    """Convert a time selection ``[t0, t1]`` in seconds to sample indices.

    Both ends are rounded to the nearest sample and clamped to ``[0, xlen]``.
    Raises ValueError when the resulting range is empty.
    """
    start = int(max(0, round(t0*fs)))
    stop = int(min(xlen, round(t1*fs)))
    if start < stop:
        return start, stop
    raise ValueError("Bad selection: t1 must be > t0")

def samples_to_frame_range(s0, s1, N, H, xlen):
    """Map the sample range ``[s0, s1]`` to a frame index range ``(f0, f1)``.

    *N* is the frame length, *H* the hop and *xlen* the signal length. Both
    indices are limited to ``[0, (xlen - N) // H]``. If the first estimate
    gives ``f1 < f0``, the range is recomputed from ``s0 // H`` alone.
    """
    last_frame = int((xlen-N)//H)
    first = max(0, int((s0 - N)//H) + 1)
    last = min(int((s1)//H), last_frame)
    if first <= last:
        return first, last
    # Fallback: anchor the range on the frame that starts at or before s0.
    first = max(0, int(s0//H))
    return first, min(last_frame, first)

def frame_signal(x, N, H):
    """Slice *x* into overlapping frames of length *N* taken every *H* samples.

    A signal shorter than *N* is zero-padded at the end to exactly one frame.
    Trailing samples that do not fill a complete frame are dropped. Returns an
    array of shape ``(num_frames, N)``.
    """
    shortfall = N - len(x)
    if shortfall > 0:
        x = np.pad(x, (0, shortfall))
    n_frames = (len(x) - N)//H + 1
    starts = H*np.arange(n_frames)
    offsets = np.arange(N)
    # Broadcasting offsets against starts yields one row of sample indices per frame.
    return x[offsets[None,:] + starts[:,None]]

def db(x):
    """Return ``20 * log10(x)`` with *x* floored at 1e-12 to avoid ``log(0)``."""
    floored = np.maximum(x, 1e-12)
    return 20*np.log10(floored)

# ---------- Auditory scale helpers ----------
def hz_to_mel(hz):
    """Convert frequency in Hz to mels using ``2595 * log10(1 + hz / 700)``."""
    ratio = hz/700.0
    return 2595.0 * np.log10(1.0 + ratio)

def mel_to_hz(mel):
    """Convert mels back to Hz using ``700 * (10 ** (mel / 2595) - 1)``."""
    exponent = mel/2595.0
    return 700.0 * (10**exponent - 1.0)

def mel_filterbank(fs, nfft, n_mels=26, fmin=0.0, fmax=None):
    """Build a bank of triangular filters spaced uniformly on the mel scale.

    Args:
        fs: Sample rate in Hz.
        nfft: FFT length; the filters cover the ``nfft//2 + 1`` one-sided bins.
        n_mels: Number of triangular filters.
        fmin: Lower edge of the first filter in Hz.
        fmax: Upper edge of the last filter in Hz (defaults to ``fs/2``).

    Returns:
        ``(fb, freqs, hz_pts)`` where ``fb`` has shape
        ``(n_mels, nfft//2 + 1)``, ``freqs`` holds the FFT bin frequencies and
        ``hz_pts`` the ``n_mels + 2`` filter edge frequencies in Hz.

    Each filter rises linearly from 0 at its lower edge bin to 1 at its centre
    bin and falls back towards 0 at its upper edge bin. When neighbouring
    edges land on the same FFT bin (many filters, short FFT) the edges are
    pushed apart by one bin so the filter keeps a non-zero peak whenever its
    centre bin lies inside the spectrum.
    """
    if fmax is None:
        fmax = fs/2
    # Edge frequencies: uniform in mel, then mapped back to Hz.
    m_pts = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels+2)
    hz_pts = mel_to_hz(m_pts)
    # One-sided FFT bin frequencies and the bin index of every filter edge.
    freqs = np.linspace(0, fs/2, nfft//2 + 1)
    n_bins = len(freqs)
    bins = np.clip(np.floor((nfft+1) * hz_pts / fs).astype(int), 0, n_bins-1)
    fb = np.zeros((n_mels, n_bins), dtype=np.float64)
    for i in range(n_mels):
        b0, b1, b2 = (int(b) for b in bins[i:i+3])
        # Enforce b0 < b1 < b2. Testing "<=" (not "==") matters: after b1 has
        # been bumped it can exceed b2, which used to leave the filter empty.
        if b1 <= b0:
            b1 = b0 + 1
        if b2 <= b1:
            b2 = b1 + 1
        rise = np.arange(b0, b1)
        # A bumped edge may point one bin past the spectrum; never write there.
        fall = np.arange(b1, min(b2, n_bins))
        fb[i, rise] = (rise - b0) / (b1 - b0)
        fb[i, fall] = (b2 - fall) / (b2 - b1)
    return fb, freqs, hz_pts

# Load the manifest from MANIFEST_PATH (or start a fresh one) into the
# module-level `manifest` that add_clip_to_manifest() and list_clips() use.
manifest = load_manifest()
print("Project root:", PROJECT_ROOT)
print("Clips in manifest:", len(manifest["clips"]))


## 1) Choose clip + selection


In [None]:
# Show the available clips, then pick one clip and one named segment of it.
list_clips()
CLIP_IDX = 0
segment_name = "seg1"

clip = manifest["clips"][CLIP_IDX]
fs, x = read_wav(REC_DIR/clip["filename"])
x = peak_normalize(x)
# The selection must provide the keys "N", "H", "s0" and "s1".
# NOTE(review): nothing in this notebook writes "analysis_segments";
# presumably an earlier notebook stored it in the manifest — confirm.
sel = clip["selections"]["analysis_segments"][segment_name]

# Frame length N and hop H (in samples) come from the stored selection.
N = sel["N"]; H = sel["H"]
xseg = x[sel["s0"]:sel["s1"]]

# Hann window (sym=False) applied to every frame before the FFT.
window = sig.windows.hann(N, sym=False)
frames = frame_signal(xseg, N, H)
NFFT = 2048 if fs <= 16000 else 4096

# One-sided spectrum of each windowed frame, then power scaled by 1/NFFT.
X = fft.rfft(frames * window[None,:], n=NFFT, axis=1)
P = (np.abs(X)**2) / NFFT  # (T, F)

# Project the power spectrum onto the mel filters: M has shape (T, n_mels).
n_mels = 40
fb, _, _ = mel_filterbank(fs, NFFT, n_mels=n_mels)
M = P @ fb.T
# Natural log of the band energies; 1e-12 avoids log(0) for empty bands.
logM = np.log(M + 1e-12)

print("logM:", logM.shape)


## 2) DCT → MFCCs

MFCCs are essentially a DCT (decorrelation) of log-mel energies.

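We build the orthonormal DCT-II matrix by hand so that every step is visible; `scipy.fft.dct(logM, type=2, norm="ortho", axis=1)[:, :n_ceps]` gives the same coefficients.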


In [None]:
def dct2_matrix(N, K):
    """Return the first *K* rows of the orthonormal DCT-II matrix for *N* inputs.

    The result has shape ``(K, N)``: row ``k`` is the cosine basis vector
    ``cos(pi/N * (n + 0.5) * k)`` scaled by ``1/sqrt(N)`` for ``k == 0`` and by
    ``sqrt(2/N)`` otherwise.
    """
    band_centers = np.arange(N)[None,:] + 0.5    # (1, N): mel band index + 1/2
    orders = np.arange(K)[:,None]                # (K, 1): cepstral index
    basis = np.cos(np.pi/N * band_centers * orders)
    # Orthonormal scaling, one factor per row.
    scale = np.full((K, 1), np.sqrt(2/N))
    scale[0] = 1/np.sqrt(N)
    return basis * scale

# Keep the first n_ceps DCT coefficients of every frame's log-mel vector.
n_ceps = 13
D = dct2_matrix(n_mels, n_ceps)       # (n_ceps, n_mels)
mfcc = logM @ D.T                     # (T, n_ceps)

# mean/std are taken over all frames and coefficients together.
print("mfcc:", mfcc.shape, "mean/std:", mfcc.mean(), mfcc.std())


In [None]:
# MFCCs as an image: frames along x, cepstral index along y (origin at bottom).
fig = plt.figure(figsize=(10,4))
plt.imshow(mfcc.T, origin="lower", aspect="auto")
plt.title("MFCCs (image view)")
plt.xlabel("Frame index"); plt.ylabel("Cepstral index")
plt.colorbar(label="value")
plt.tight_layout(); plt.show()
# The segment name is sanitised to [a-zA-Z0-9_] before it goes into the file name.
# NOTE(review): the figure is saved after plt.show(); this relies on `fig`
# keeping its contents once shown — confirm for the backend in use.
save_fig(fig, f"L10B_mfcc_clip{CLIP_IDX}_{re.sub(r'[^a-zA-Z0-9_]+','_',segment_name)}.png")


## 3) Cepstral liftering (optional)

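A sinusoidal lifter, $w[n] = 1 + \frac{L}{2}\sin\left(\frac{\pi n}{L}\right)$, leaves $c_0$ unchanged and boosts the higher-order coefficients (the weight peaks at $n = L/2$), which are otherwise much smaller in magnitude than the low-order ones.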


In [None]:
L = 22  # lifter parameter
n = np.arange(n_ceps)
# Sinusoidal weight per cepstral index: 1 at n = 0, largest at n = L/2.
lifter = 1 + (L/2)*np.sin(np.pi*n/L)
# Broadcast the (n_ceps,) weights over all frames of the (T, n_ceps) MFCCs.
mfcc_lift = mfcc * lifter[None,:]

# Same image view as for the raw MFCCs, now for the liftered coefficients.
fig = plt.figure(figsize=(10,4))
plt.imshow(mfcc_lift.T, origin="lower", aspect="auto")
plt.title("MFCCs after cepstral liftering")
plt.xlabel("Frame index"); plt.ylabel("Cepstral index")
plt.colorbar(label="value")
plt.tight_layout(); plt.show()
# The segment name is sanitised to [a-zA-Z0-9_] before it goes into the file name.
save_fig(fig, f"L10B_mfcc_lift_clip{CLIP_IDX}_{re.sub(r'[^a-zA-Z0-9_]+','_',segment_name)}.png")


## 4) Delta and delta-delta (dynamic features)

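We compute simple regression deltas across time: $\Delta c_t = \dfrac{\sum_{n=1}^{N} n\,(c_{t+n} - c_{t-n})}{2\sum_{n=1}^{N} n^2}$, where frames beyond either end of the segment are replaced by the nearest edge frame. Applying the same operator to the deltas gives the delta-deltas.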


In [None]:
def deltas(feat, N=2):
    """Compute regression-based delta features along the time axis.

    For every frame ``t`` the delta is::

        sum_{n=1..N} n * (feat[t+n] - feat[t-n]) / (2 * sum_{n=1..N} n**2)

    Frame indices outside ``[0, T-1]`` are replaced by the nearest edge frame.

    Args:
        feat: Array of shape ``(T, D)``, one feature vector per frame.
        N: Half-width of the regression window in frames; must be >= 1.

    Returns:
        Array of shape ``(T, D)``. Floating-point inputs keep their dtype; any
        other dtype (e.g. integers) yields float64 so that fractional deltas
        are not truncated.

    Raises:
        ValueError: If *feat* is not 2-D or *N* is smaller than 1 (which would
            make the normalisation term zero).
    """
    feat = np.asarray(feat)
    if feat.ndim != 2:
        raise ValueError(f"feat must be 2-D (T, D), got shape {feat.shape}")
    if N < 1:
        raise ValueError("N must be >= 1")
    if np.issubdtype(feat.dtype, np.floating):
        out_dtype = feat.dtype
    else:
        out_dtype = np.float64
    if feat.size == 0:
        return np.zeros(feat.shape, dtype=out_dtype)
    T = feat.shape[0]
    # Replicating the edge frames N times on both sides is equivalent to
    # clamping t+n / t-n to [0, T-1], and lets every lag be a plain slice.
    padded = np.pad(feat.astype(np.float64), ((N, N), (0, 0)), mode="edge")
    num = np.zeros(feat.shape, dtype=np.float64)
    for n in range(1, N+1):
        num += n * (padded[N+n:N+n+T] - padded[N-n:N-n+T])
    denom = 2 * sum(n*n for n in range(1, N+1))
    return (num / denom).astype(out_dtype, copy=False)

# First-order deltas of the liftered MFCCs; the delta-deltas are obtained by
# applying the same operator to the deltas.
d1 = deltas(mfcc_lift, N=2)
d2 = deltas(d1, N=2)

print("delta shapes:", d1.shape, d2.shape)


### Quick visualization: compare static vs delta (first coefficient)


In [None]:
# Column 0 (c0) of the liftered MFCCs and of its two time derivatives, over frames.
fig = plt.figure(figsize=(10,3))
plt.plot(mfcc_lift[:,0], label="c0 (static)")
plt.plot(d1[:,0], label="Δc0")
plt.plot(d2[:,0], label="ΔΔc0")
plt.title("MFCC dynamics (first coefficient)")
plt.xlabel("Frame index"); plt.legend()
plt.tight_layout(); plt.show()
# The segment name is sanitised to [a-zA-Z0-9_] before it goes into the file name.
save_fig(fig, f"L10B_mfcc_deltas_clip{CLIP_IDX}_{re.sub(r'[^a-zA-Z0-9_]+','_',segment_name)}.png")


## Wrap-up
**What you learned:** MFCC = DCT(log-mel), plus liftering and deltas for dynamics.  

## Reflection
1) Why does taking the DCT of log-mel energies resemble computing a cepstrum?  
2) Which MFCC indices are most sensitive to spectral slope vs formants?  
3) Why do deltas help classifiers?
