In [1]:
# Kaggle Default setup code

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ===========================
# Mel-spectrograms aligned to NOR augmentation params
# ===========================

# Dependencies for this cell, one import per line
# (numpy is already imported by the first cell; it is repeated here).
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt

# Path of the sample audio file that every figure in this cell is generated from.
SAMPLE_WAV = "/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/YAF_pleasant_surprised/YAF_witch_ps.wav"

# All figures are written under ./images; create the folder if it is missing.
os.makedirs("images", exist_ok=True)


# Core constants

SR = 16000                       # sampling rate in Hz
DURATION = 4.0                   # clip length in seconds (MAX_SEC = 4)
TARGETLEN = int(SR * DURATION)   # clip length in samples

# NOR augmentation ranges
RANGES = {
    "snr": (15, 25),         # dB
    "pitch": (-1.5, 1.5),    # semitones
    "stretch": (0.9, 1.1),   # rate
    "shift_frac": 0.08,      # fraction of length (±8%)
    "bp": (100, 7000),       # Hz
}

# ----------------
# I/O + normalisation
# ----------------
def load_16k_mono(path, target_sr=SR, peak=0.98):
    """Load an audio file as mono, bring it to ``target_sr`` and peak-normalise it.

    Parameters
    ----------
    path : audio file location, handed to ``librosa.load``.
    target_sr : sampling rate (Hz) the returned signal should have.
    peak : absolute peak amplitude of the returned signal.

    Returns
    -------
    (signal, sr) : the float32 signal scaled so its largest absolute sample
        is ~``peak``, and the sampling rate of that signal.
    """
    # sr=None keeps the file's own sampling rate; resampling is done explicitly below.
    audio, file_sr = librosa.load(path, sr=None, mono=True)
    if file_sr == target_sr:
        out_sr = file_sr
    else:
        audio = librosa.resample(audio, orig_sr=file_sr, target_sr=target_sr)
        out_sr = target_sr
    # The epsilon keeps the division finite for an all-zero signal.
    peak_abs = np.max(np.abs(audio)) + 1e-9
    normalised = peak * (audio / peak_abs)
    return normalised.astype(np.float32), out_sr

def fix_duration(x, sr=SR, seconds=DURATION):
    """Return ``x`` as a float32 array of exactly ``int(seconds * sr)`` samples.

    Longer inputs are cut at the end; shorter inputs are zero-padded at the end.
    """
    target_len = int(seconds * sr)
    missing = target_len - len(x)
    if missing <= 0:
        # Already long enough: keep the first target_len samples.
        return x[:target_len].astype(np.float32)
    padded = np.zeros(target_len, dtype=np.float32)
    padded[:len(x)] = x
    return padded

# ----------------
# Augmentations 
# ----------------
def add_noise_snr(x, snr_db=20.0, rng=None):
    """Add white Gaussian noise at a target SNR and peak-normalise the result.

    Parameters
    ----------
    x : array-like signal of any shape; it is converted to float32.
    snr_db : desired signal-to-noise ratio in dB (signal power / noise power).
    rng : optional ``numpy.random.Generator``. Pass a seeded generator for
        reproducible noise; a fresh unseeded one is created when omitted.

    Returns
    -------
    float32 array with the same shape as ``x`` whose largest absolute value
    is ~0.98. An empty input is returned as an empty float32 array.
    """
    rng = np.random.default_rng() if rng is None else rng
    # Work in float32 so integer or list input cannot overflow or fail in x**2.
    x = np.asarray(x, dtype=np.float32)
    if x.size == 0:
        # np.max below has no identity on an empty array; nothing to add noise to.
        return x.copy()
    sig_pow = float(np.mean(x**2) + 1e-12)
    noise_pow = sig_pow / (10**(snr_db/10))
    # size=x.shape (not len(x)) so every sample of an N-d input gets its own noise value.
    noise = rng.normal(0.0, np.sqrt(noise_pow), size=x.shape).astype(np.float32)
    y = x + noise
    # Rescaling signal and noise together leaves the SNR unchanged.
    y = y / (np.max(np.abs(y)) + 1e-9)
    return (0.98 * y).astype(np.float32)

def pitch_shift(x, sr=SR, semitones=+1.0):
    """Pitch-shift ``x`` by ``semitones`` and return a peak-normalised float32 clip.

    The shifted signal is divided by its absolute peak, passed through
    ``fix_duration`` (which uses its default clip length) and scaled by 0.98.
    """
    n_steps = float(semitones)
    shifted = librosa.effects.pitch_shift(x, sr=sr, n_steps=n_steps)
    peak_abs = np.max(np.abs(shifted)) + 1e-9   # epsilon guards an all-zero result
    normalised = shifted / peak_abs
    fitted = fix_duration(normalised, sr)
    return (0.98 * fitted).astype(np.float32)

def time_stretch(x, rate=0.96):
    """Time-stretch ``x`` by ``rate`` and return a peak-normalised float32 clip.

    ``rate`` is clipped to the bounds configured in ``RANGES["stretch"]`` so
    the limit lives in one place instead of being repeated here as literals.
    The stretched signal is divided by its absolute peak, passed through
    ``fix_duration`` (default sampling rate and clip length) and scaled by 0.98.
    """
    min_rate, max_rate = RANGES["stretch"]
    rate = float(np.clip(rate, min_rate, max_rate))
    y = librosa.effects.time_stretch(x, rate=rate)
    y = y / (np.max(np.abs(y)) + 1e-9)   # epsilon guards an all-zero result
    # Stretching changes the number of samples, so the clip is refitted afterwards.
    return (0.98 * fix_duration(y)).astype(np.float32)

def time_shift_frac(x, sr=SR, frac=0.08):
    """Circularly shift ``x`` by ``frac`` of its length and return it as float32.

    A positive ``frac`` shifts to the right; for a 4 s clip and ``frac=0.08``
    that is ~320 ms. ``np.roll`` is used, so samples pushed past one end
    re-enter at the other. ``sr`` is accepted but not used.
    """
    offset = int(frac * len(x))   # int() truncates toward zero
    return np.roll(x, offset).astype(np.float32)

def bandpass(x, sr=SR, low_hz=100.0, high_hz=7000.0, order=4):
    """Zero-phase Butterworth band-pass filter followed by peak normalisation.

    Parameters
    ----------
    x : 1-D signal to filter.
    sr : sampling rate of ``x`` in Hz.
    low_hz, high_hz : pass-band edges in Hz. ``low_hz`` is raised to at least
        1 Hz and ``high_hz`` is lowered to at most 100 Hz below Nyquist.
    order : order passed to ``scipy.signal.butter``.

    Returns
    -------
    float32 array scaled so its largest absolute value is ~0.98.

    Raises
    ------
    ValueError : propagated from SciPy when the band edges are invalid or
        ``x`` is too short for the filter's edge padding.
    """
    # Local import keeps this function self-contained; the file-level import
    # only brings in butter and filtfilt.
    from scipy.signal import sosfiltfilt

    nyq = 0.5 * sr
    low = max(1.0, low_hz) / nyq
    high = min(high_hz, sr/2 - 100.0) / nyq
    # Second-order sections instead of (b, a): SciPy documents the
    # transfer-function form as numerically sensitive and recommends SOS.
    sos = butter(order, [low, high], btype='bandpass', output='sos')
    y = sosfiltfilt(sos, x).astype(np.float32)
    y = y / (np.max(np.abs(y)) + 1e-9)   # epsilon guards an all-zero result
    return (0.98 * y).astype(np.float32)

# ----------------
# Mel-spectrogram config
# ----------------
# STFT / mel filter-bank settings
MEL_N_FFT = 512     # FFT size in samples
MEL_HOP = 160       # hop length in samples (~10 ms @ 16 kHz)
MEL_WIN = 400       # window length in samples (~25 ms @ 16 kHz)
MEL_N_MELS = 80     # number of mel bands
MEL_FMIN = 50       # lowest mel filter frequency, Hz
MEL_FMAX = 8000     # highest mel filter frequency, Hz

# Colour-scale limits (dB) shared by every saved mel panel
DB_VMIN = -80
DB_VMAX = 0

def mel_db(x, sr=SR):
    """Return the mel power spectrogram of ``x`` converted to dB.

    The STFT and filter-bank settings come from the ``MEL_*`` constants.
    ``ref=1.0`` is passed to ``power_to_db``, so the dB values are relative to
    unit power rather than to each signal's own maximum.
    """
    mel_settings = dict(
        n_fft=MEL_N_FFT,
        hop_length=MEL_HOP,
        win_length=MEL_WIN,
        n_mels=MEL_N_MELS,
        fmin=MEL_FMIN,
        fmax=MEL_FMAX,
        power=2.0,
    )
    mel_power = librosa.feature.melspectrogram(y=x, sr=sr, **mel_settings)
    return librosa.power_to_db(mel_power, ref=1.0)

def save_mel(Sdb, out_path, title=None):
    """Render a dB mel spectrogram and write it to ``out_path``.

    The colour scale is fixed to ``[DB_VMIN, DB_VMAX]``. The figure is closed
    after saving.
    """
    fig, ax = plt.subplots(figsize=(4.2, 3.1))
    librosa.display.specshow(
        Sdb,
        sr=SR,
        hop_length=MEL_HOP,
        x_axis='time',
        y_axis='mel',
        vmin=DB_VMIN,
        vmax=DB_VMAX,
        ax=ax,
    )
    if title:
        ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=220, bbox_inches="tight")
    plt.close(fig)

def save_wave(x, out_path, title=None):
    """Render the waveform of ``x`` (sampled at ``SR``) and write it to ``out_path``.

    The figure is closed after saving.
    """
    fig, ax = plt.subplots(figsize=(6.4, 2.3))
    librosa.display.waveshow(x, sr=SR, ax=ax)
    if title:
        ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=220, bbox_inches="tight")
    plt.close(fig)


# Load the sample clip, then pad/trim it to exactly DURATION seconds.
x, sr = load_16k_mono(SAMPLE_WAV, target_sr=SR)
x = fix_duration(x, sr=SR, seconds=DURATION)

# Waveform + original mel (the mel is computed once and reused for mel_orig.png below)
save_wave(x, "images/waveplot_example.png", title=f"Waveform ({DURATION:g} s)")
mel_orig = mel_db(x, SR)
save_mel(mel_orig, "images/mel_spectrogram_example.png", title="Mel-spectrogram (original)")

# Representative augmentations INSIDE NOR ranges
snr_mid     = float(np.mean(RANGES["snr"]))          # 20 dB
pitch_demo  = +1.0                                   # within [-1.5, +1.5]
stretch_dem = 0.96                                   # within [0.9, 1.1]
shift_frac  = float(RANGES["shift_frac"])            # 0.08 of length
bp_low, bp_high = RANGES["bp"]                       # 100–7000 Hz

# Fixed seed so the noise panel is identical on every Restart & Run All.
NOISE_SEED = 0

x_noise   = add_noise_snr(x, snr_db=snr_mid,
                          rng=np.random.default_rng(NOISE_SEED))     # + Noise (20 dB SNR)
x_pitch   = pitch_shift(x, sr=SR, semitones=pitch_demo)              # + Pitch (+1.0 st)
x_stretch = time_stretch(x, rate=stretch_dem)                        # + Stretch (×0.96)
x_shift   = time_shift_frac(x, sr=SR, frac=shift_frac)               # + Shift (+8% length ≈ +320 ms)
x_bpf     = bandpass(x, sr=SR, low_hz=bp_low, high_hz=bp_high)       # + Band-pass (100–7000 Hz)

# Exact shift in ms for the clip length; round() so float error cannot
# truncate a value such as 319.999… down to 319.
shift_ms = int(round(shift_frac * DURATION * 1000))

# Saving individual mel panels with a fixed dB scale for comparability.
# The ":+" format prints the sign itself, so negative demo values render as
# "-1.0" instead of "+-1.0".
mel_panels = [
    (mel_orig,          "images/mel_orig.png",     "Original"),
    (mel_db(x_noise,   SR), "images/mel_noise.png",    f"+ Noise ({snr_mid:.0f} dB SNR)"),
    (mel_db(x_pitch,   SR), "images/mel_pitch.png",    f"+ Pitch ({pitch_demo:+.1f} st)"),
    (mel_db(x_stretch, SR), "images/mel_stretch.png",  f"+ Time-stretch (×{stretch_dem:.2f})"),
    (mel_db(x_shift,   SR), "images/mel_shift.png",    f"+ Time shift ({shift_ms:+d} ms)"),
    (mel_db(x_bpf,     SR), "images/mel_bandpass.png", f"+ Band-pass ({int(bp_low)}–{int(bp_high)} Hz)"),
]
for panel_db, panel_path, panel_title in mel_panels:
    save_mel(panel_db, panel_path, panel_title)
