# Voice Input Pipeline (PI1–PI5)

This notebook implements the microphone recording pipeline requirements with explicit sections:

- **PI1 (Basic):** Capture audio and save in a standard format (`.wav`)
- **PI2 (Basic):** Start/stop recording UI
- **PI3 (Expected):** Real-time audio level monitoring + quality feedback
- **PI4 (Expected):** Save metadata (timestamp, duration, sample rate, path)
- **PI5 (Advanced):** Automated preprocessing (noise reduction + normalization)

> If a package is missing, install it in your environment first, e.g. `pip install numpy pandas scipy sounddevice ipywidgets`.

In [25]:
# PI1–PI5 imports and setup
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import sounddevice as sd
# print(sd.query_devices())  # Uncomment this line to list the available audio devices and find the index of your microphone input if you have multiple devices.
from scipy.io.wavfile import write as wav_write
import ipywidgets as widgets
from IPython.display import display, clear_output
# Capture settings shared by every recording helper in this notebook.
SAMPLE_RATE = 16000
CHANNELS = 1

# Output locations: raw captures, preprocessed audio, and the metadata table.
RECORDINGS_DIR = Path("data/raw/recordings")
PROCESSED_DIR = Path("data/processed/recordings")
METADATA_CSV = Path("data/processed/recordings_metadata.csv")

# Create every output directory up front so later cells can write without checking.
for _output_dir in (RECORDINGS_DIR, PROCESSED_DIR, METADATA_CSV.parent):
    _output_dir.mkdir(parents=True, exist_ok=True)
del _output_dir

# Mutable recorder state shared between the UI handlers and the audio callback.
state = dict(
    is_recording=False,
    stream=None,
    frames=[],
    started_at=None,
    latest_raw_path=None,
    latest_processed_path=None,
    latest_duration_sec=0.0,
    latest_level=0.0,
)

# MIC_DEVICE_INDEX = <your device index>  # With several audio input devices, set this to the index of the one to use (see the output of `sd.query_devices()`).
print("Setup complete.")

   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
>  1 Microphone Array (Intel® Smart , MME (4 in, 0 out)
   2 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  3 LG FULL HD (NVIDIA High Definit, MME (0 in, 2 out)
   4 Speakers (Realtek(R) Audio), MME (0 in, 2 out)
   5 Primary Sound Capture Driver, Windows DirectSound (2 in, 0 out)
   6 Microphone Array (Intel® Smart Sound Technology for Digital Microphones), Windows DirectSound (4 in, 0 out)
   7 Primary Sound Driver, Windows DirectSound (0 in, 2 out)
   8 LG FULL HD (NVIDIA High Definition Audio), Windows DirectSound (0 in, 2 out)
   9 Speakers (Realtek(R) Audio), Windows DirectSound (0 in, 2 out)
  10 Speakers (Realtek(R) Audio), Windows WASAPI (0 in, 2 out)
  11 LG FULL HD (NVIDIA High Definition Audio), Windows WASAPI (0 in, 2 out)
  12 Microphone Array (Intel® Smart Sound Technology for Digital Microphones), Windows WASAPI (2 in, 0 out)
  13 Microphone Array 1 (), Windows WDM-KS (2 in, 0 out)
  14 Microphone Array 2 (),

## PI1 (Basic): Capture audio from microphone and save in standard format

In [26]:
# PI1: Capture audio and save in standard format (.wav)
def save_wav(audio_float32: np.ndarray, sample_rate: int, out_path: Path) -> Path:
    """Write floating-point audio to a 16-bit PCM WAV file.

    Args:
        audio_float32: Samples nominally in [-1.0, 1.0]. Values outside that
            range are clipped; NaN samples are written as silence (0).
        sample_rate: Sampling rate in Hz passed to the WAV writer.
        out_path: Destination file. Missing parent directories are created.

    Returns:
        ``out_path``, unchanged.
    """
    # NaN has no int16 representation, so replace it with 0 before scaling.
    samples = np.nan_to_num(np.asarray(audio_float32), nan=0.0)
    audio_clipped = np.clip(samples, -1.0, 1.0)
    audio_int16 = (audio_clipped * 32767).astype(np.int16)
    # Do not rely on the setup cell having created the target directory.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    wav_write(str(out_path), sample_rate, audio_int16)
    return out_path

def record_fixed_duration(seconds: float = 3.0, sample_rate: int = SAMPLE_RATE, channels: int = CHANNELS, device=None) -> Path:
    """Record from the microphone for a fixed duration and save the result as WAV.

    Args:
        seconds: Requested recording length in seconds.
        sample_rate: Sampling rate in Hz.
        channels: Number of input channels; mono recordings are flattened to 1-D.
        device: Optional input device index or name forwarded to ``sd.rec``.
            ``None`` uses the default input device. Run ``sd.query_devices()``
            to list the available devices.

    Returns:
        Path of the WAV file written under ``RECORDINGS_DIR``.

    Raises:
        ValueError: If ``seconds`` and ``sample_rate`` yield no frames to record.
    """
    num_frames = int(seconds * sample_rate)
    if num_frames <= 0:
        raise ValueError(
            f"Nothing to record: seconds={seconds!r} at sample_rate={sample_rate!r} yields {num_frames} frames"
        )

    print(f"Recording for {seconds:.1f}s...")
    recording = sd.rec(num_frames, samplerate=sample_rate, channels=channels, dtype="float32", device=device)
    sd.wait()  # sd.rec returns immediately; block until the buffer is filled
    if channels == 1:
        recording = recording[:, 0]

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = RECORDINGS_DIR / f"mic_capture_{timestamp}.wav"
    save_wav(recording, sample_rate, out_path)

    state["latest_raw_path"] = str(out_path)
    # Store the duration actually captured (frames / rate), as the UI recorder does.
    state["latest_duration_sec"] = num_frames / sample_rate

    print(f"Saved: {out_path}")
    return out_path

## PI2 (Basic): Simple UI for start/stop recording


In [None]:
# PI2: Simple UI for start/stop recording
# Controls: only "Start" is clickable until a recording is in progress.
start_button = widgets.Button(
    description="Start Recording",
    button_style="success",
)
stop_button = widgets.Button(
    description="Stop Recording",
    button_style="danger",
    disabled=True,
)

# Read-outs: recorder status, live input level, and a coarse quality verdict.
status_label = widgets.HTML(value="<b>Status:</b> Idle")
level_bar = widgets.FloatProgress(
    value=0.0,
    min=0.0,
    max=0.2,
    description="Level",
    bar_style="",
)
quality_label = widgets.HTML(value="<b>Quality:</b> N/A")

# Output area for messages printed by the button handlers.
out = widgets.Output()

def rms_level(x: np.ndarray) -> float:
    """Return the root-mean-square amplitude of ``x``.

    Args:
        x: Audio samples as an array (or any array-like) of any numeric dtype.

    Returns:
        The RMS value as a Python float; ``0.0`` for empty input.
    """
    # Compute in float64: squaring in the input dtype overflows for integer PCM
    # such as int16, and float64 also avoids float32 accumulation error.
    samples = np.asarray(x, dtype=np.float64)
    if samples.size == 0:
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))

def level_to_quality(level: float) -> str:
    """Translate an RMS level into a short quality description.

    The level is compared against ascending upper bounds; the first bound it
    falls strictly below selects the label. Levels at or above the last bound
    are reported as very loud.
    """
    upper_bounds = (
        (0.01, "Too quiet"),
        (0.04, "Good"),
        (0.12, "Loud"),
    )
    for bound, label in upper_bounds:
        if level < bound:
            return label
    return "Very loud / possible clipping"

def _audio_callback(indata, frames, time, status):
    """Stream callback: buffer the incoming chunk and record its RMS level.

    Passed as ``callback`` to the ``sd.InputStream`` created in
    ``start_recording``. Only ``indata`` is used; ``frames``, ``time`` and
    ``status`` are accepted but ignored.
    """
    # Copy before storing -- presumably the stream reuses the input buffer
    # after the callback returns (confirm against the sounddevice docs).
    block = indata.copy()
    samples = block[:, 0] if CHANNELS == 1 else block
    state["frames"].append(samples)
    state["latest_level"] = rms_level(samples)

import time

def poll_ui_updates(interval=0.1):
    """Mirror the latest input level into the level bar and quality label.

    Loops for as long as ``state["is_recording"]`` is true, sleeping
    ``interval`` seconds between refreshes. The call blocks its thread for the
    whole recording, so it should be run off the main thread.
    """
    while True:
        if not state["is_recording"]:
            break
        # Cap the bar at its maximum so loud input cannot exceed the widget range.
        level_bar.value = min(state["latest_level"], level_bar.max)
        verdict = level_to_quality(state["latest_level"])
        quality_label.value = f"<b>Quality:</b> {verdict}"
        time.sleep(interval)

def start_recording(_):
    """Button handler: open the input stream and begin capturing audio.

    Does nothing if a recording is already running. If the stream cannot be
    opened or started, the recorder stays idle, the status label reports the
    failure, and the error text is printed to ``out``.

    Args:
        _: The clicked button (unused).
    """
    if state["is_recording"]:
        return
    import threading  # local import keeps this cell runnable on its own

    state["frames"] = []
    state["latest_level"] = 0.0
    stream = None
    try:
        stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, dtype="float32", callback=_audio_callback)
        # stream = sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS, dtype="float32", callback=_audio_callback, device=MIC_DEVICE_INDEX) # Uncomment this line and set MIC_DEVICE_INDEX if you have multiple audio input devices and want to specify which one to use.
        stream.start()
    except Exception as exc:  # broad on purpose: widget callbacks have no caller to handle errors
        if stream is not None:
            stream.close()
        state["stream"] = None
        status_label.value = "<b>Status:</b> Could not start recording"
        with out:
            clear_output(wait=True)
            print(f"Could not start recording: {exc}")
        return

    # Mark the recorder active only once the stream is running, so a failed
    # start cannot leave the UI stuck in the "recording" state.
    state["stream"] = stream
    state["started_at"] = datetime.now()
    state["is_recording"] = True
    status_label.value = "<b>Status:</b> Recording..."
    start_button.disabled = True
    stop_button.disabled = False

    # poll_ui_updates blocks while recording, so run it on a daemon thread; it
    # exits by itself once state["is_recording"] becomes False.
    threading.Thread(target=poll_ui_updates, daemon=True).start()

def stop_recording(_):
    """Button handler: stop the stream and save the captured audio as WAV.

    Does nothing if no recording is running. The recorder state and buttons are
    reset even when stopping or closing the stream raises. If no frames were
    captured, only the status label is updated. Otherwise the frames are
    concatenated, written under ``RECORDINGS_DIR``, and the path and duration
    are stored in ``state``.

    Args:
        _: The clicked button (unused).
    """
    if not state["is_recording"]:
        return
    stream = state["stream"]
    try:
        if stream is not None:
            try:
                stream.stop()
            finally:
                stream.close()
    finally:
        # Always return to the idle state; otherwise a stream error would leave
        # is_recording set and both handlers would refuse to do anything.
        state["stream"] = None
        state["is_recording"] = False
        start_button.disabled = False
        stop_button.disabled = True

    frames = state["frames"]
    if len(frames) == 0:
        status_label.value = "<b>Status:</b> No audio captured"
        return

    audio = np.concatenate(frames)
    duration_sec = len(audio) / SAMPLE_RATE
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = RECORDINGS_DIR / f"mic_capture_ui_{timestamp}.wav"
    save_wav(audio, SAMPLE_RATE, out_path)

    state["latest_raw_path"] = str(out_path)
    state["latest_duration_sec"] = float(duration_sec)
    status_label.value = f"<b>Status:</b> Saved {out_path.name} ({duration_sec:.2f}s)"
    with out:
        clear_output(wait=True)
        print(f"Saved raw audio: {out_path}")

# Attach the click handlers, then render the recorder panel.
stop_button.on_click(stop_recording)
start_button.on_click(start_recording)
display(
    widgets.VBox(
        [
            widgets.HTML("<h4>Microphone Recorder</h4>"),
            widgets.HBox([start_button, stop_button]),
            status_label,
            level_bar,
            quality_label,
            out,
        ]
    )
)

VBox(children=(HTML(value='<h4>Microphone Recorder</h4>'), HBox(children=(Button(button_style='success', descr…

## PI3 (Expected): Real-time audio level monitoring and recording-quality feedback

In [28]:
# PI3: Real-time audio level monitoring and quality feedback
# (Implemented in the PI2 cell: _audio_callback stores each chunk's RMS level in state["latest_level"], and poll_ui_updates copies it to level_bar and quality_label.)

## PI4 (Expected): Save metadata (timestamp, duration, sample rate, path)

In [29]:
# PI4: Save metadata (timestamp, duration, sample rate, path)
def save_metadata(timestamp: str, duration: float, sample_rate: int, path: str):
    """Append one recording's metadata as a row of ``METADATA_CSV``.

    Args:
        timestamp: Timestamp string identifying the recording.
        duration: Recording length in seconds.
        sample_rate: Sampling rate in Hz.
        path: Location of the saved audio file.
    """
    row = {"timestamp": timestamp, "duration": duration, "sample_rate": sample_rate, "path": path}
    METADATA_CSV.parent.mkdir(parents=True, exist_ok=True)
    # Append instead of reading and rewriting the whole table: the cost stays
    # constant per recording, and an existing but empty file (which
    # pd.read_csv rejects) simply receives the header first.
    needs_header = not METADATA_CSV.exists() or METADATA_CSV.stat().st_size == 0
    pd.DataFrame([row]).to_csv(METADATA_CSV, mode="a", header=needs_header, index=False)

## PI5 (Advanced): Automated preprocessing (noise reduction + normalization)

In [30]:
# PI5: Automated preprocessing (noise reduction + normalization)
def preprocess_audio(audio: np.ndarray, sample_rate: int, noise_seconds: float = 0.5, gate_factor: float = 1.5) -> np.ndarray:
    """Apply a simple noise gate and peak normalization to ``audio``.

    The noise floor is estimated as the standard deviation of the first
    ``noise_seconds`` of the signal. Samples whose magnitude is below
    ``gate_factor`` times that estimate are zeroed (a time-domain amplitude
    gate, not a spectral one). The result is then scaled by its peak magnitude.

    Args:
        audio: Input samples.
        sample_rate: Sampling rate in Hz, used to size the noise window.
        noise_seconds: Length of the leading window used for the noise estimate.
        gate_factor: Multiplier applied to the noise estimate to get the gate threshold.

    Returns:
        The processed signal as ``float32``. Empty input is returned empty.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max raises on an empty array; there is nothing to process anyway.
        return audio.astype(np.float32)

    noise_len = max(0, min(int(sample_rate * noise_seconds), len(audio)))
    if noise_len > 0:
        threshold = np.std(audio[:noise_len]) * gate_factor
    else:
        # No noise window available: leave the gate fully open.
        threshold = 0.0
    reduced = np.where(np.abs(audio) < threshold, 0, audio)

    # Peak normalization; the epsilon avoids dividing by zero for an all-zero signal.
    normed = reduced / (np.max(np.abs(reduced)) + 1e-8)
    return normed.astype(np.float32)

# Example usage:
def record_and_preprocess(seconds: float = 3.0):
    """Record from the microphone, preprocess the audio, and save it with metadata.

    The processed audio is written under ``PROCESSED_DIR`` (not the raw
    recordings directory), a metadata row is appended via ``save_metadata``,
    and ``state["latest_processed_path"]`` is updated.

    Args:
        seconds: Requested recording length in seconds.

    Returns:
        Path of the processed WAV file.

    Raises:
        ValueError: If ``seconds`` yields no frames at ``SAMPLE_RATE``.
    """
    num_frames = int(seconds * SAMPLE_RATE)
    if num_frames <= 0:
        raise ValueError(f"Nothing to record: seconds={seconds!r} yields {num_frames} frames")

    print(f"Recording for {seconds:.1f}s...")
    recording = sd.rec(num_frames, samplerate=SAMPLE_RATE, channels=CHANNELS, dtype="float32")
    sd.wait()  # sd.rec returns immediately; block until the buffer is filled
    if CHANNELS == 1:
        recording = recording[:, 0]

    processed = preprocess_audio(recording, SAMPLE_RATE)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    out_path = PROCESSED_DIR / f"mic_processed_{timestamp}.wav"
    save_wav(processed, SAMPLE_RATE, out_path)

    # Record the duration actually captured (frames / rate).
    save_metadata(timestamp, num_frames / SAMPLE_RATE, SAMPLE_RATE, str(out_path))
    state["latest_processed_path"] = str(out_path)
    print(f"Saved processed audio: {out_path}")
    return out_path