# STOI Audio Analysis

This notebook generates a synthetic audio signal, derives a noisy copy and a time-shifted copy of it, and computes the STOI (Short-Time Objective Intelligibility) score of each against the reference. It mirrors `run_pesq.ipynb`.

In [4]:
# %% pip install librosa soundfile pystoi

import numpy as np
import librosa
import soundfile as sf
from pystoi import stoi

In [5]:
def generate_audio(filename="speech_sample_v0.wav", sr=16000, duration_s=3):
    """Synthesize a test tone, write it to disk, and load it back.

    The signal is a decaying 440 Hz sine plus a steady 880 Hz sine. It is not
    speech; it only exercises the pipeline mechanically.

    Args:
        filename: Path of the WAV file to write and then re-read.
        sr: Sample rate in Hz used for synthesis, writing and loading.
        duration_s: Length of the generated signal in seconds.

    Returns:
        Tuple ``(y, sr)`` as returned by ``librosa.load`` for the written file.
    """
    n_samples = int(duration_s * sr)

    # Sample instants are k / sr. np.linspace(0, duration, n) would include
    # the endpoint and stretch the spacing to duration / (n - 1), which makes
    # the tones slightly off-frequency when played back at `sr`.
    t = np.arange(n_samples) / sr
    audio = 0.5 * np.sin(2*np.pi*440*t) * np.exp(-t) + 0.3 * np.sin(2*np.pi*880*t)
    sf.write(filename, audio, sr)
    print("Generated synthetic audio.")

    # Round-trip through the file so downstream code sees exactly what a
    # consumer of the WAV would load.
    y, sr = librosa.load(filename, sr=sr, mono=True)
    return y, sr

def add_noise(audio, snr_db=10, rng=None):
    """Return ``audio`` plus zero-mean Gaussian noise at the requested SNR.

    Args:
        audio: Array-like signal; its mean square is taken as signal power.
        snr_db: Target signal-to-noise ratio in decibels.
        rng: Optional object with a ``normal(loc, scale, size)`` method, such
            as ``np.random.default_rng(seed)``. When omitted, NumPy's global
            ``np.random`` state is used, so ``np.random.seed`` still controls
            the result.

    Returns:
        New array with the same shape as ``audio`` containing signal + noise.
    """
    audio = np.asarray(audio)

    # Signal power is the mean square of the samples.
    sig_power = np.mean(audio ** 2)
    # SNR_dB = 10 * log10(P_signal / P_noise)  =>  P_noise = P_signal / 10^(SNR/10)
    noise_power = sig_power / (10 ** (snr_db / 10))

    # Draw from the injected generator when given, so callers can make the
    # noise reproducible without touching global state.
    sampler = np.random if rng is None else rng
    noise = sampler.normal(0, np.sqrt(noise_power), audio.shape)
    return audio + noise

def time_shift(audio, shift_ms, sr):
    """Shift ``audio`` in time, zero-padding the vacated samples.

    Args:
        audio: 1-D signal to shift.
        shift_ms: Shift in milliseconds. Positive values delay the signal
            (zeros at the start); negative values advance it (zeros at the
            end).
        sr: Sample rate in Hz, used to convert milliseconds to samples.

    Returns:
        A new array of the same length as ``audio``. The input is never
        returned directly, so callers may modify the result freely.
    """
    # int() truncates toward zero, so sub-sample shifts round down in magnitude.
    shift_samples = int(shift_ms * sr / 1000)

    if shift_samples == 0:
        # Return a copy rather than the input itself, matching the other
        # branches, which always produce a fresh array.
        return np.array(audio)

    output = np.zeros_like(audio)
    if shift_samples > 0:
        # Delay: drop the tail, leave leading zeros.
        output[shift_samples:] = audio[:-shift_samples]
    else:
        # Advance: drop the head, leave trailing zeros.
        output[:shift_samples] = audio[-shift_samples:]
    return output

In [6]:
# Seed NumPy's global generator so the noisy variant, and therefore its STOI
# score, is the same on every Restart & Run All.
np.random.seed(0)

print("Loading audio...")
ref, sr = generate_audio()
print(f"Loaded audio: {len(ref)/sr:.2f}s at {sr}Hz")

# 1. Generate v1: reference plus Gaussian noise at 10 dB SNR
print("Generating v1 (Noisy)...")
v1 = add_noise(ref, snr_db=10)
# Keep a copy on disk for listening / inspection
sf.write("speech_sample_v1_stoi.wav", v1, sr)

# 2. Generate v2: reference delayed by 200 ms (zero-padded at the start)
print("Generating v2 (Shifted 200ms)...")
v2 = time_shift(ref, shift_ms=200, sr=sr)
# Keep a copy on disk for listening / inspection
sf.write("speech_sample_v2_stoi.wav", v2, sr)

# Calculate STOI
# Signature: stoi(ref, deg, fs, extended=False)

print("\nCalculating STOI scores...")

# Reference against itself: sanity check for the metric
score_v0 = stoi(ref, ref, sr, extended=False)
print(f"STOI (Reference vs Reference v0): {score_v0:.4f}")

score_v1 = stoi(ref, v1, sr, extended=False)
print(f"STOI (Reference vs Noisy v1): {score_v1:.4f}")

score_v2 = stoi(ref, v2, sr, extended=False)
print(f"STOI (Reference vs Shifted v2): {score_v2:.4f}")

print("\nNote: STOI requires time-aligned signals. A low score for v2 is expected due to the shift.")

Loading audio...
Generated synthetic audio.
Loaded audio: 3.00s at 16000Hz
Generating v1 (Noisy)...
Generating v2 (Shifted 200ms)...

Calculating STOI scores...
STOI (Reference vs Reference v0): 1.0000
STOI (Reference vs Noisy v1): 0.0891
STOI (Reference vs Shifted v2): 0.7289

Note: STOI requires time-aligned signals. A low score for v2 is expected due to the shift.
