In [1]:
# %pip install -U pandas numpy scikit-learn pydub opensmile joblib

import glob
import json
import warnings
from pathlib import Path

import joblib
import numpy as np
import opensmile
import pandas as pd
from pydub import AudioSegment, effects as FX

# ========= CONFIG ============================================================
# Single root for every data path: edit DATA_DIR only when the project moves.
DATA_DIR   = Path(r"C:\Users\herie\OneDrive - Fundacion Universidad de las Americas Puebla\Proyectos\En Proceso\Paralinguistic Speech Classification for Human Vocalizations\data")
TEST_DIR   = DATA_DIR / "raw" / "Testing_data"        # input WAVs to classify
OUT_DIR    = DATA_DIR / "processed" / "Testing_data"  # processed WAVs + predictions CSV
MODEL_PATH = Path("infantcry_eGeMAPS_logreg.joblib")  # from previous training (relative to the working directory)
PRED_CSV   = OUT_DIR / "predictions_testing.csv"

# Audio processing (must match training)
TARGET_SR  = 16_000           # Hz
TARGET_SW  = 2                # bytes per sample (2 = 16-bit)
MAKE_MONO  = True             # down-mix multi-channel input to one channel
HPF, LPF   = 150, 6000        # Hz; a falsy value disables that filter
GATE_K     = 1.5              # gate threshold = K * RMS
NORM_MODE  = "rms"            # "peak" or "rms"
TARGET_RMS = 0.1              # RMS level (float scale) that "rms" normalization scales to
TH_ABSTAIN = 0.55             # top probability below this is reported as "abstain"
EPS        = 1e-12            # small constant guarding RMS / peak divisions against zero

# Module-level side effect: make sure the output directory exists before processing.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ========= HELPERS ===========================================================
def clean_seg(seg: AudioSegment) -> AudioSegment:
    """Band-limit a segment with the configured HPF / LPF cut-offs.

    The high-pass stage runs first, then the low-pass stage. A stage is
    skipped when its cut-off constant is falsy (e.g. 0 or None).
    """
    stages = ((HPF, FX.high_pass_filter), (LPF, FX.low_pass_filter))
    for cutoff_hz, apply_filter in stages:
        if cutoff_hz:
            seg = apply_filter(seg, cutoff_hz)
    return seg

def seg_to_float(seg: AudioSegment):
    """Convert a pydub segment to float32 samples and report its sample rate.

    The segment is first coerced to the configured format: one channel when
    MAKE_MONO is set, TARGET_SR frames per second and TARGET_SW bytes per
    sample when those constants are truthy. The integer samples are then
    divided by 2 ** (8 * sample_width - 1), i.e. 32768 for 16-bit audio.

    Returns:
        (y, sr): the float32 sample array and the frame rate of the
        converted segment.
    """
    # NOTE(review): with MAKE_MONO disabled, multi-channel samples are
    # presumably returned interleaved in one flat array — confirm before use.
    if MAKE_MONO and seg.channels != 1:
        seg = seg.set_channels(1)
    if TARGET_SR and seg.frame_rate != TARGET_SR:
        seg = seg.set_frame_rate(TARGET_SR)
    if TARGET_SW and seg.sample_width != TARGET_SW:
        seg = seg.set_sample_width(TARGET_SW)

    full_scale = float(1 << (8 * seg.sample_width - 1))  # 32768 for 16-bit
    samples = np.array(seg.get_array_of_samples())
    return samples.astype(np.float32) / full_scale, seg.frame_rate

def gate(y: np.ndarray, k: float) -> np.ndarray:
    """Zero every sample whose magnitude is below ``k`` times the signal RMS.

    The RMS is computed over the whole array (EPS is added under the square
    root). The input is left untouched; a gated copy is returned.
    """
    level = float(np.sqrt(np.mean(np.square(y)) + EPS))
    too_quiet = np.abs(y) < k * level
    gated = y.copy()
    gated[too_quiet] = 0.0
    return gated

def normalize(y: np.ndarray, mode: str, target_rms: float) -> np.ndarray:
    """Rescale a signal by its peak or by its RMS.

    Args:
        y: Sample array.
        mode: "peak" divides by (max |y| + EPS). Any other value selects RMS
            scaling — the mode string is not validated.
        target_rms: RMS level to scale to; only used for RMS scaling.

    Returns:
        The rescaled array. RMS-scaled output is clipped to [-1, 1].
    """
    if mode != "peak":
        current_rms = float(np.sqrt(np.mean(np.square(y)) + EPS))
        gain = target_rms / current_rms
        return np.clip(y * gain, -1.0, 1.0)

    peak = float(np.max(np.abs(y)) + EPS)
    return y / peak

def float_to_seg(y: np.ndarray, sr: int) -> AudioSegment:
    """Pack float samples into a mono, 16-bit AudioSegment at rate ``sr``.

    Samples are multiplied by 32767, clipped to the int16 range and cast to
    int16 before being handed to pydub as raw bytes.
    """
    scaled = y * 32767.0
    pcm16 = np.clip(scaled, -32768, 32767).astype(np.int16)
    return AudioSegment(
        pcm16.tobytes(),
        frame_rate=int(sr),
        sample_width=2,
        channels=1,
    )

# ========= LOAD MODEL ========================================================
# joblib files are pickle-based: only load bundles produced by a trusted training run.
bundle = joblib.load(MODEL_PATH)

# "model" is the fitted classifier (CalibratedClassifierCV per the training run);
# "columns" is the feature column order it was trained with.
clf, cols_ref = bundle["model"], bundle["columns"]
class_map = bundle.get("class_map")    # optional numeric->text map; None when the key is absent
classes = clf.classes_.tolist()        # model's class labels (e.g., [1,2,3,4,5])

# openSMILE extractor (eGeMAPSv02 Functionals)
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# ========= PROCESS + FEATURE + PREDICT ======================================
# Collect every WAV directly inside TEST_DIR, in sorted order.
wav_paths = sorted(glob.glob(str(TEST_DIR / "*.wav")))
if not wav_paths:
    # Explicit raise rather than `assert`: assertions are stripped under `python -O`.
    raise FileNotFoundError(f"No WAVs found in {TEST_DIR}")

X_rows = []  # one single-row feature DataFrame per input file, tagged with a "file" column

for p in wav_paths:
    p = Path(p)
    # 1) load + clean + gate + normalize
    seg = AudioSegment.from_file(str(p))
    seg = clean_seg(seg)
    y, sr = seg_to_float(seg)
    y = gate(y, GATE_K)
    y = normalize(y, NORM_MODE, TARGET_RMS)

    # 2) save processed wav
    seg_out = float_to_seg(y, sr)
    out_path = OUT_DIR / p.name
    # export() returns the still-open output file object; close it right away so
    # the handle is released before openSMILE reads the file back.
    seg_out.export(str(out_path), format="wav").close()

    # 3) extract features on processed wav
    feat = smile.process_file(str(out_path))  # 1-row DataFrame
    feat["file"] = p.name                     # carried along so rows can be indexed by file name
    X_rows.append(feat.reset_index(drop=True))

# Stack features and align columns to training reference
Xdf = pd.concat(X_rows, axis=0, ignore_index=True).set_index("file")

# Training features that the extractor did not produce here. They are still
# filled with 0.0 (as before), but no longer silently: zero-filled features
# can distort predictions, so the mismatch is surfaced to the reader.
missing = [c for c in cols_ref if c not in Xdf.columns]
if missing:
    warnings.warn(
        f"{len(missing)} training feature column(s) missing from extracted "
        f"features; filled with 0.0: {missing}"
    )

# A single reindex adds missing columns as 0.0, drops extras, and reorders
# the columns to match cols_ref.
Xdf = Xdf.reindex(columns=list(cols_ref), fill_value=0.0)

# Predict probabilities
probs = clf.predict_proba(Xdf.to_numpy())  # one row per file, one column per entry of `classes`
kmax  = probs.argmax(axis=1)               # column index of the most probable class per file
pmax  = probs.max(axis=1)                  # probability of that class (same value as probs[row, kmax[row]])
pred_raw = [classes[k] for k in kmax]      # raw model label of the winning class

# Human label mapping and abstain
def to_label(c, mapping=None):
    """Translate a raw model class into its human-readable label.

    Args:
        c: Raw class label, as found in the model's class list.
        mapping: Optional label map to use instead of the module-level
            ``class_map``. ``None`` (the default) means "use ``class_map``".

    Returns:
        The mapped label, or ``c`` unchanged when no map is available or the
        map has no entry for this class.
    """
    if mapping is None:
        mapping = class_map
    if not mapping:
        return c
    if c in mapping:
        return mapping[c]
    # A map that went through JSON has string keys ("3" instead of 3); try the
    # string form before giving up so such maps do not silently never match.
    return mapping.get(str(c), c)

# Final label per file: the mapped class when the top probability reaches
# TH_ABSTAIN, otherwise the literal "abstain".
pred_label = [
    to_label(cls) if top_p >= TH_ABSTAIN else "abstain"
    for cls, top_p in zip(pred_raw, pmax)
]

# Build output table: identification columns first, then one "p_<label>"
# column per model class, rows ordered by file name.
prob_cols = {f"p_{to_label(cls)}": probs[:, j] for j, cls in enumerate(classes)}
out = (
    pd.DataFrame({
        "file": Xdf.index,
        "pred": pred_label,
        "p_max": pmax,
        "raw_class": [to_label(cls) for cls in pred_raw],  # top class even when "pred" abstains
    })
    .assign(**prob_cols)
    .sort_values("file")
)

# Save CSV
PRED_CSV.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(PRED_CSV, index=False)

print(f"Processed WAVs saved to: {OUT_DIR}")
print(f"Predictions CSV:        {PRED_CSV}")
print(out.head(10))  # head() already returns fewer rows when the table is shorter




Processed WAVs saved to: C:\Users\herie\OneDrive - Fundacion Universidad de las Americas Puebla\Proyectos\En Proceso\Paralinguistic Speech Classification for Human Vocalizations\data\processed\Testing_data
Predictions CSV:        C:\Users\herie\OneDrive - Fundacion Universidad de las Americas Puebla\Proyectos\En Proceso\Paralinguistic Speech Classification for Human Vocalizations\data\processed\Testing_data\predictions_testing.csv
     file     pred     p_max  raw_class       p_1       p_2       p_3  \
0   1.wav        4  0.790200          4  0.064782  0.012788  0.006066   
1  10.wav        2  0.633429          2  0.005956  0.633429  0.019739   
2  11.wav        4  0.562059          4  0.000000  0.011775  0.004929   
3  12.wav  abstain  0.497636          2  0.000000  0.497636  0.024416   
4  13.wav        2  0.670719          2  0.000000  0.670719  0.007073   
5  14.wav        5  0.732311          5  0.000000  0.000000  0.019601   
6  15.wav        5  0.976580          5  0.000000  0.0

In [1]:
# NOTE(review): leftover scratch cell — `a` is not referenced anywhere in the
# visible notebook; candidate for removal before sharing.
a = 1