In [1]:
# %pip install -U pandas numpy scikit-learn pydub opensmile matplotlib joblib

from pathlib import Path
import glob, numpy as np, pandas as pd, joblib
from pydub import AudioSegment, effects as FX
import opensmile
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.calibration import CalibratedClassifierCV




In [None]:
# ==== CONFIG =================================================================
CSV_LABELS = Path("data/raw/training_label.csv")
RAW_DIR    = Path("data/raw/Training_data")
PROC_DIR   = Path(r"C:\Users\herie\OneDrive - Fundacion Universidad de las Americas Puebla\Proyectos\En Proceso\Paralinguistic Speech Classification for Human Vocalizations\data\processed\Training_data")
PROC_DIR.mkdir(parents=True, exist_ok=True)

TARGET_SR  = 16_000
TARGET_SW  = 2            # 16-bit
MAKE_MONO  = True
HPF, LPF   = 150, 6000    # Hz
GATE_K     = 1.5          # τ = K·RMS
NORM_MODE  = "rms"        # "peak" or "rms"
TARGET_RMS = 0.1
TH_ABSTAIN = 0.55         # abstain if max prob < threshold

# Optional mapping if you know numeric→text classes:
# CLASS_MAP = {1:"healthy", 2:"asphyxia", 3:"hipoacusia", 4:"hiperbilirrubinemia", 5:"hipotiroidismo"}
CLASS_MAP = None

EPS = 1e-12

In [3]:

# ==== AUDIO HELPERS ==========================================================
def clean_seg(seg: AudioSegment) -> AudioSegment:
    if HPF: seg = FX.high_pass_filter(seg, HPF)
    if LPF: seg = FX.low_pass_filter(seg, LPF)
    return seg

def seg_to_float(seg: AudioSegment):
    if MAKE_MONO and seg.channels != 1: seg = seg.set_channels(1)
    if TARGET_SR   and seg.frame_rate != TARGET_SR: seg = seg.set_frame_rate(TARGET_SR)
    if TARGET_SW   and seg.sample_width != TARGET_SW: seg = seg.set_sample_width(TARGET_SW)
    sr = seg.frame_rate; sw = seg.sample_width
    arr = np.array(seg.get_array_of_samples())
    peak = float(1 << (8*sw - 1))  # 32768 for 16-bit
    y = arr.astype(np.float32) / peak
    return y, sr

def gate(y: np.ndarray, k: float) -> np.ndarray:
    rms = float(np.sqrt(np.mean(y**2) + EPS))
    thr = k * rms
    y2 = y.copy()
    y2[np.abs(y2) < thr] = 0.0
    return y2

def normalize(y: np.ndarray, mode: str, target_rms: float) -> np.ndarray:
    if mode == "peak":
        m = float(np.max(np.abs(y)) + EPS); return y / m
    rms = float(np.sqrt(np.mean(y**2) + EPS))
    y2  = y * (target_rms / rms)
    return np.clip(y2, -1.0, 1.0)

def float_to_seg(y: np.ndarray, sr: int) -> AudioSegment:
    y16 = np.clip(y * 32767.0, -32768, 32767).astype(np.int16)
    return AudioSegment(y16.tobytes(), frame_rate=int(sr), sample_width=2, channels=1)

In [4]:
# ==== LOAD LABELS & PROCESS AUDIO ===========================================
labels = pd.read_csv(CSV_LABELS)
labels["nombre_archivo"] = labels["nombre_archivo"].astype(str)
if CLASS_MAP: labels["clase_txt"] = labels["clase"].map(CLASS_MAP)

files = sorted(glob.glob(str(RAW_DIR/"*.wav")))
df = pd.DataFrame({"path": files, "nombre_archivo": [Path(f).name for f in files]})
df = df.merge(labels[["nombre_archivo","clase"]], on="nombre_archivo", how="inner")
assert len(df), "No labeled WAVs found."

proc_paths, y_numeric = [], []
for p, fname, cls in df[["path","nombre_archivo","clase"]].itertuples(index=False, name=None):
    seg = AudioSegment.from_file(p)
    seg = clean_seg(seg)
    y, sr = seg_to_float(seg)
    y = gate(y, GATE_K)
    y = normalize(y, NORM_MODE, TARGET_RMS)
    seg_out = float_to_seg(y, sr)
    outp = PROC_DIR / fname
    seg_out.export(str(outp), format="wav")
    proc_paths.append(str(outp))
    y_numeric.append(cls)

y = np.array(y_numeric)

In [5]:
# ==== FEATURE EXTRACTION (openSMILE) =========================================
# eGeMAPSv02 functionals (88-D) – robust low-dimensional baseline
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)
feats = []
for wav in proc_paths:
    df_feat = smile.process_file(wav)  # 1-row DataFrame
    df_feat["file"] = Path(wav).name
    feats.append(df_feat.reset_index(drop=True))
Xdf = pd.concat(feats, axis=0, ignore_index=True).set_index("file")
X = Xdf.to_numpy()

In [6]:
# ==== MODEL: CALIBRATED LOGISTIC REGRESSION =================================
base = make_pipeline(StandardScaler(with_mean=True), LogisticRegression(max_iter=2000, n_jobs=None))
clf  = CalibratedClassifierCV(base, method="isotonic", cv=3)  # probability calibration

# CV predictions for evaluation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
probs = cross_val_predict(clf, X, y, cv=skf, method="predict_proba", n_jobs=-1)
pred  = probs.argmax(1)
f1m   = f1_score(y, pred, average="macro")
print("CV macro-F1:", f1m)
print(confusion_matrix(y, pred))
print(classification_report(y, pred, digits=3))

CV macro-F1: 0.012131237937689549
[[ 0  0  0  0  0  0]
 [ 1  1  0  0  3  0]
 [ 0 22  0  0  9  0]
 [ 0  3  1  0  3  0]
 [ 0  0  0 29  1  0]
 [ 0  8  1  2 47  0]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000         0
           1      0.029     0.200     0.051         5
           2      0.000     0.000     0.000        31
           3      0.000     0.000     0.000         7
           4      0.016     0.033     0.022        30
           5      0.000     0.000     0.000        58

    accuracy                          0.015       131
   macro avg      0.008     0.039     0.012       131
weighted avg      0.005     0.015     0.007       131



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
# Fit on all data and persist
clf.fit(X, y)
joblib.dump({"model": clf, "columns": Xdf.columns.tolist(), "class_map": CLASS_MAP}, "infantcry_eGeMAPS_logreg.joblib")

['infantcry_eGeMAPS_logreg.joblib']

In [None]:
# ==== INFERENCE WRAPPER WITH ABSTAIN ========================================
def predict_wav(path_wav: str, model_bundle_path="infantcry_eGeMAPS_logreg.joblib", theta=TH_ABSTAIN):
    seg = AudioSegment.from_file(path_wav)
    seg = clean_seg(seg)
    y, sr = seg_to_float(seg)
    y = gate(y, GATE_K)
    y = normalize(y, NORM_MODE, TARGET_RMS)
    seg_out = float_to_seg(y, sr)  # optional: overwrite or keep temp
    tmp = Path(path_wav).with_suffix(".proc.wav"); seg_out.export(str(tmp), format="wav")

    mb = joblib.load(model_bundle_path)
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    xf = smile.process_file(str(tmp))
    xf = xf[mb["columns"]].to_numpy()
    probs = mb["model"].predict_proba(xf)[0]
    k = probs.argmax(); p = probs[k]
    label = mb["class_map"].get(k+1, k+1) if mb["class_map"] else int(k+1)  # numeric→text if map provided
    return {"label": label if p >= theta else "abstain", "p_max": float(p), "probs": probs.tolist()}

# ==== TEST INFERENCE =========================================================
# Example usage:
res = predict_wav(str(proc_paths[0]))
print(res)


# predict

In [None]:
# %pip install -U pandas numpy scikit-learn pydub opensmile joblib

from pathlib import Path
import glob, json
import numpy as np, pandas as pd, joblib
from pydub import AudioSegment, effects as FX
import opensmile

# ========= CONFIG ============================================================
TEST_DIR   = Path(r"C:\Users\herie\OneDrive - Fundacion Universidad de las Americas Puebla\Proyectos\En Proceso\Paralinguistic Speech Classification for Human Vocalizations\data\raw\Testing_data")
OUT_DIR    = Path(r"C:\Users\herie\OneDrive - Fundacion Universidad de las Americas Puebla\Proyectos\En Proceso\Paralinguistic Speech Classification for Human Vocalizations\data\processed\Testing_data")
MODEL_PATH = Path("models\infantcry_eGeMAPS_logreg.joblib")  # from previous training
PRED_CSV   = OUT_DIR / "predictions_testing.csv"

# Audio processing (must match training)
TARGET_SR  = 16_000           # Hz
TARGET_SW  = 2                # 16-bit
MAKE_MONO  = True
HPF, LPF   = 150, 6000        # Hz
GATE_K     = 1.5              # gate threshold = K * RMS
NORM_MODE  = "rms"            # "peak" or "rms"
TARGET_RMS = 0.1
TH_ABSTAIN = 0.55
EPS        = 1e-12

OUT_DIR.mkdir(parents=True, exist_ok=True)

# ========= HELPERS ===========================================================
def clean_seg(seg: AudioSegment) -> AudioSegment:
    if HPF: seg = FX.high_pass_filter(seg, HPF)
    if LPF: seg = FX.low_pass_filter(seg, LPF)
    return seg

def seg_to_float(seg: AudioSegment):
    if MAKE_MONO and seg.channels != 1: seg = seg.set_channels(1)
    if TARGET_SR   and seg.frame_rate != TARGET_SR: seg = seg.set_frame_rate(TARGET_SR)
    if TARGET_SW   and seg.sample_width != TARGET_SW: seg = seg.set_sample_width(TARGET_SW)
    sr = seg.frame_rate; sw = seg.sample_width
    arr = np.array(seg.get_array_of_samples())
    peak = float(1 << (8*sw - 1))  # 32768 for 16-bit
    y = arr.astype(np.float32) / peak
    return y, sr

def gate(y: np.ndarray, k: float) -> np.ndarray:
    rms = float(np.sqrt(np.mean(y**2) + EPS))
    thr = k * rms
    y2 = y.copy()
    y2[np.abs(y2) < thr] = 0.0
    return y2

def normalize(y: np.ndarray, mode: str, target_rms: float) -> np.ndarray:
    if mode == "peak":
        m = float(np.max(np.abs(y)) + EPS)
        return y / m
    rms = float(np.sqrt(np.mean(y**2) + EPS))
    y2  = y * (target_rms / rms)
    return np.clip(y2, -1.0, 1.0)

def float_to_seg(y: np.ndarray, sr: int) -> AudioSegment:
    y16 = np.clip(y * 32767.0, -32768, 32767).astype(np.int16)
    return AudioSegment(y16.tobytes(), frame_rate=int(sr), sample_width=2, channels=1)

# ========= LOAD MODEL ========================================================
bundle = joblib.load(MODEL_PATH)
clf      = bundle["model"]                     # CalibratedClassifierCV
cols_ref = bundle["columns"]                   # training feature columns order
class_map = bundle.get("class_map", None)      # optional numeric->text map
classes = clf.classes_.tolist()                # model's class labels (e.g., [1,2,3,4,5])

# openSMILE extractor (eGeMAPSv02 Functionals)
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# ========= PROCESS + FEATURE + PREDICT ======================================
wav_paths = sorted(glob.glob(str(TEST_DIR / "*.wav")))
assert wav_paths, f"No WAVs found in {TEST_DIR}"

records = []
X_rows, idx = [], []

for p in wav_paths:
    p = Path(p)
    # 1) load + clean + gate + normalize
    seg = AudioSegment.from_file(str(p))
    seg = clean_seg(seg)
    y, sr = seg_to_float(seg)
    y = gate(y, GATE_K)
    y = normalize(y, NORM_MODE, TARGET_RMS)

    # 2) save processed wav
    seg_out = float_to_seg(y, sr)
    out_path = OUT_DIR / p.name
    seg_out.export(str(out_path), format="wav")

    # 3) extract features on processed wav
    feat = smile.process_file(str(out_path))  # 1-row DataFrame
    feat["file"] = p.name
    X_rows.append(feat.reset_index(drop=True))
    idx.append(p.name)

# Stack features and align columns to training reference
Xdf = pd.concat(X_rows, axis=0, ignore_index=True).set_index("file")
# Add missing columns as zeros, drop extras, then reorder
for c in cols_ref:
    if c not in Xdf.columns:
        Xdf[c] = 0.0
extra = [c for c in Xdf.columns if c not in cols_ref]
if extra:
    Xdf = Xdf.drop(columns=extra)
Xdf = Xdf[cols_ref]

# Predict probabilities
probs = clf.predict_proba(Xdf.to_numpy())
kmax  = probs.argmax(axis=1)
pmax  = probs[np.arange(len(pmax:=probs.max(axis=1))), kmax]
pred_raw = [classes[k] for k in kmax]

# Human label mapping and abstain
def to_label(c):
    if class_map:
        return class_map.get(c, c)
    return c

pred_label = [to_label(c) if p >= TH_ABSTAIN else "abstain" for c, p in zip(pred_raw, pmax)]

# Build output table
prob_cols = {f"p_{to_label(c)}": probs[:, i] for i, c in enumerate(classes)}
out = pd.DataFrame({
    "file": Xdf.index,
    "pred": pred_label,
    "p_max": pmax,
    "raw_class": [to_label(c) for c in pred_raw],
    **prob_cols
}).sort_values("file")

# Save CSV
PRED_CSV.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(PRED_CSV, index=False)

print(f"Processed WAVs saved to: {OUT_DIR}")
print(f"Predictions CSV:        {PRED_CSV}")
print(out.head(min(10, len(out))))
