In [10]:
import pandas as pd

In [11]:
# Load the long-format spectrum table; the rendered output shows one row per
# (nombre_archivo, freq_hz) pair together with a `clase` label and a `mag` value.
fft_df = pd.read_parquet("results/fft_long_0_4000.parquet")
# Bare expression: displays the (truncated) frame as the cell output.
fft_df

Unnamed: 0,nombre_archivo,clase,freq_hz,mag
0,1.wav,1,0.000000,101.098053
1,1.wav,1,0.004443,23.934931
2,1.wav,1,0.008887,7.819663
3,1.wav,1,0.013330,6.795501
4,1.wav,1,0.017774,14.025899
...,...,...,...,...
16714549,182.wav,5,3999.537109,29.588720
16714550,182.wav,5,3999.634521,19.994854
16714551,182.wav,5,3999.731934,34.740459
16714552,182.wav,5,3999.829590,28.425756


In [12]:
import numpy as np
import pandas as pd

# Tiny positive offset added to power values before np.log(...) in the helpers
# below, so that zero-power bins never evaluate log(0).
_EPS = 1e-18

def _pearson_kurtosis(x: np.ndarray) -> float:
    x = np.asarray(x, float)
    if x.size == 0: return np.nan
    mu = x.mean()
    v  = ((x-mu)**2).mean()
    if v <= 0: return np.nan
    m4 = ((x-mu)**4).mean()
    return m4/(v*v)

def _spec_entropy_from_power(p: np.ndarray) -> float:
    s = p.sum()
    n = p.size
    if n == 0 or s <= 0: return np.nan
    q = p/s
    return float((-(q*np.log(q+_EPS)).sum())/np.log(n))

def _rolloff(freq: np.ndarray, p: np.ndarray, frac: float) -> float:
    if p.size == 0 or p.sum() <= 0: return np.nan
    idx = np.argsort(freq)
    f = freq[idx]; w = p[idx].cumsum()
    k = frac * w[-1]
    j = np.searchsorted(w, k, side="left")
    j = min(j, f.size-1)
    return float(f[j])

def _linreg_slope_r2(x: np.ndarray, y: np.ndarray):
    x = np.asarray(x, float); y = np.asarray(y, float)
    vx = x.var()
    vy = y.var()
    if x.size < 2 or vx <= 0 or vy <= 0: return (np.nan, np.nan)
    cov = ((x-x.mean())*(y-y.mean())).mean()
    slope = cov / vx
    r2 = (cov**2)/(vx*vy)
    return float(slope), float(r2)

def _quartile_metrics(g: pd.DataFrame) -> pd.Series:
    """Summarise the spectrum rows of one (file, frequency-bin) group.

    Parameters
    ----------
    g : pd.DataFrame
        Rows of a single group; must provide ``freq_hz`` and ``mag`` columns.
        Power is taken as ``mag ** 2``.

    Returns
    -------
    pd.Series
        Keys, in order: ``mean``, ``std`` (magnitude statistics),
        ``spec_centroid``, ``spec_entropy``, ``spec_bandwidth``,
        ``spec_skewness``, ``spec_kurtosis`` (power-weighted spectral moments),
        ``flatness``, ``crest``, ``slope``, ``slope_r2``, ``f_peak``,
        ``mag_peak``, ``sum_power`` and ``count``. Metrics that are undefined
        for the group are ``nan``.
    """
    freqs = g["freq_hz"].to_numpy(dtype=float)
    mags = g["mag"].to_numpy(dtype=float)
    power = mags * mags

    count = power.size
    total_power = power.sum()
    # Offset keeps log() finite for zero-power bins; reused by flatness and slope.
    log_power = np.log(power + _EPS)

    # Defaults for everything that needs at least one row.
    mag_mean = mag_std = np.nan
    flatness = crest = np.nan
    peak_freq = peak_mag = np.nan

    if count:
        mag_mean = mags.mean()
        if count > 1:
            # Sample standard deviation needs two or more rows.
            mag_std = mags.std(ddof=1)

        # Flatness: geometric over arithmetic mean of the power values.
        mean_power = power.mean()
        geo_mean = float(np.exp(log_power.mean()))
        if mean_power and np.isfinite(geo_mean):
            flatness = float(geo_mean / mean_power)

        # Crest: peak magnitude over RMS magnitude.
        rms = np.sqrt(mean_power)
        if rms > 0:
            crest = float(mags.max() / rms)

        # Location and height of the largest magnitude.
        peak_idx = int(np.argmax(mags))
        peak_freq = float(freqs[peak_idx])
        peak_mag = float(mags[peak_idx])

    # Power-weighted spectral moments; undefined when the group has no power.
    centroid = bandwidth = skewness = kurtosis = entropy = np.nan
    if total_power > 0:
        centroid = float((freqs * power).sum() / total_power)
        offset = freqs - centroid
        bandwidth = np.sqrt(float((offset ** 2 * power).sum() / total_power))
        if bandwidth > 0:
            skewness = float(((offset ** 3 * power).sum() / total_power) / (bandwidth ** 3))
            kurtosis = float(((offset ** 4 * power).sum() / total_power) / (bandwidth ** 4))
        entropy = _spec_entropy_from_power(power)

    # Linear trend of log-power against frequency.
    slope, r2 = _linreg_slope_r2(freqs, log_power)

    return pd.Series({
        "mean": mag_mean,
        "std": mag_std,
        "spec_centroid": centroid,
        "spec_entropy": entropy,
        "spec_bandwidth": bandwidth,
        "spec_skewness": skewness,
        "spec_kurtosis": kurtosis,
        "flatness": flatness,
        "crest": crest,
        "slope": slope,
        "slope_r2": r2,
        "f_peak": peak_freq,
        "mag_peak": peak_mag,
        "sum_power": total_power,
        "count": count,
    })

def _global_metrics(g: pd.DataFrame) -> pd.Series:
    """Summarise the whole (band-limited) spectrum of one file.

    Parameters
    ----------
    g : pd.DataFrame
        All rows of a single file; must provide ``freq_hz`` and ``mag``
        columns. Power is taken as ``mag ** 2``.

    Returns
    -------
    pd.Series
        Keys, in order: ``generalmean``, ``generalstd``, ``generalkurtosis``
        (magnitude statistics), ``spec_centroid_all``, ``spec_entropy_all``,
        ``spec_bandwidth_all``, ``spec_skewness_all``, ``spec_kurtosis_all``,
        ``spec_rolloff85_hz``, ``spec_rolloff95_hz``, ``spec_median_freq_hz``
        (the 50 % roll-off), ``flatness_all``, ``crest_all``, ``slope_all``,
        ``slope_r2_all``, ``f_peak_all``, ``mag_peak_all`` and ``total_power``.
        Metrics that are undefined for the file are ``nan``.
    """
    freqs = g["freq_hz"].to_numpy(dtype=float)
    mags = g["mag"].to_numpy(dtype=float)
    power = mags * mags

    n_rows = mags.size
    total = power.sum()
    # Offset keeps log() finite for zero-power bins; reused by flatness and slope.
    log_power = np.log(power + _EPS)

    # Magnitude statistics and everything else that needs at least one row.
    mag_mean = mag_std = np.nan
    flatness = crest = np.nan
    peak_freq = peak_mag = np.nan
    if n_rows:
        mag_mean = float(mags.mean())
        if n_rows > 1:
            mag_std = float(mags.std(ddof=1))

        # Flatness: geometric over arithmetic mean of the power values.
        mean_power = power.mean()
        geo_mean = float(np.exp(log_power.mean()))
        if mean_power and np.isfinite(geo_mean):
            flatness = float(geo_mean / mean_power)

        # Crest: peak magnitude over RMS magnitude.
        rms = np.sqrt(mean_power)
        if rms > 0:
            crest = float(mags.max() / rms)

        # Location and height of the largest magnitude.
        peak_idx = int(np.argmax(mags))
        peak_freq = float(freqs[peak_idx])
        peak_mag = float(mags[peak_idx])

    mag_kurtosis = _pearson_kurtosis(mags)

    # Power-weighted spectral moments and roll-off points; undefined without power.
    centroid = bandwidth = skewness = kurtosis = entropy = np.nan
    roll85 = roll95 = roll50 = np.nan
    if total > 0:
        centroid = float((freqs * power).sum() / total)
        offset = freqs - centroid
        bandwidth = np.sqrt(float((offset ** 2 * power).sum() / total))
        if bandwidth > 0:
            skewness = float(((offset ** 3 * power).sum() / total) / (bandwidth ** 3))
            kurtosis = float(((offset ** 4 * power).sum() / total) / (bandwidth ** 4))
        entropy = _spec_entropy_from_power(power)
        roll85 = _rolloff(freqs, power, 0.85)
        roll95 = _rolloff(freqs, power, 0.95)
        roll50 = _rolloff(freqs, power, 0.50)

    # Linear trend of log-power against frequency.
    slope, r2 = _linreg_slope_r2(freqs, log_power)

    return pd.Series({
        "generalmean": mag_mean,
        "generalstd": mag_std,
        "generalkurtosis": mag_kurtosis,
        "spec_centroid_all": centroid,
        "spec_entropy_all": entropy,
        "spec_bandwidth_all": bandwidth,
        "spec_skewness_all": skewness,
        "spec_kurtosis_all": kurtosis,
        "spec_rolloff85_hz": roll85,
        "spec_rolloff95_hz": roll95,
        "spec_median_freq_hz": roll50,
        "flatness_all": flatness,
        "crest_all": crest,
        "slope_all": slope,
        "slope_r2_all": r2,
        "f_peak_all": peak_freq,
        "mag_peak_all": peak_mag,
        "total_power": float(total),
    })

def build_quartile_features_plus(
    fft_df: pd.DataFrame,
    fmax: float = 4000.0,
    nbins: int = 4,
    keep_class: bool = True
) -> pd.DataFrame:
    """Build one row of spectral features per file from a long-format FFT table.

    The band ``[0, fmax]`` is split into ``nbins`` equal-width frequency bins.
    For every (file, bin) pair the metrics of ``_quartile_metrics`` plus a
    ``power_ratio`` (bin power over the file's total in-band power) are
    computed and pivoted to columns named ``"{bin}q{metric}"``; the whole-band
    metrics of ``_global_metrics`` are appended.

    Parameters
    ----------
    fft_df : pd.DataFrame
        Must contain ``nombre_archivo``, ``freq_hz`` and ``mag``; ``clase`` is
        optional. A column named ``mbre_archivo`` is accepted as an alias of
        ``nombre_archivo``. The input frame is not modified.
    fmax : float
        Upper edge of the analysed band. Rows outside ``[0, fmax]`` are dropped.
    nbins : int
        Number of equal-width bins (bins are labelled ``1..nbins``).
    keep_class : bool
        When true and a ``clase`` column exists, the first ``clase`` value of
        each file is placed right after ``nombre_archivo``.

    Returns
    -------
    pd.DataFrame
        One row per file. If no row falls inside the band, an empty frame with
        only a ``nombre_archivo`` column is returned.

    Raises
    ------
    ValueError
        If one of the required columns is missing.
    """
    # No up-front copy of the (potentially very large) input: every step below
    # that changes data produces a new object, so the caller's frame is untouched.
    df = fft_df
    if "nombre_archivo" not in df.columns and "mbre_archivo" in df.columns:
        df = df.rename(columns={"mbre_archivo": "nombre_archivo"})
    for c in ("nombre_archivo", "freq_hz", "mag"):
        if c not in df.columns:
            raise ValueError(f"Missing column: {c}")

    has_class = "clase" in df.columns
    cols = ["nombre_archivo", "freq_hz", "mag"] + (["clase"] if has_class else [])

    # auto kHz→Hz if needed: a maximum of at most 10 cannot be Hz when fmax > 10
    freq = df["freq_hz"]
    fmax_obs = pd.to_numeric(freq, errors="coerce").max()
    if np.isfinite(fmax_obs) and fmax_obs <= 10.0 and fmax > 10.0:
        freq = freq.astype(float) * 1000.0

    # clip band, keeping only the columns that are needed downstream
    in_band = (freq >= 0) & (freq <= fmax)
    df = df.loc[in_band, cols].copy()
    df["freq_hz"] = freq.loc[in_band].to_numpy()
    if df.empty:
        return pd.DataFrame(columns=["nombre_archivo"])

    # equal-width frequency bins labelled 1..nbins
    edges = np.linspace(0.0, fmax, nbins + 1)
    labels = list(range(1, nbins + 1))
    df["q"] = pd.cut(df["freq_hz"], bins=edges, labels=labels, include_lowest=True, right=True)

    # Select the value columns explicitly so apply() never operates on the
    # grouping columns (deprecated behaviour in recent pandas releases).
    value_cols = ["freq_hz", "mag"]

    # per-bin metrics
    grp_q = df.groupby(["nombre_archivo", "q"], sort=False, observed=True)[value_cols]
    q_feats = grp_q.apply(_quartile_metrics)

    # add power ratio per bin vs file total
    total_power = (
        (df["mag"].astype(float) ** 2)
        .groupby(df["nombre_archivo"], observed=True)
        .sum()
        .rename("total_power")
    )
    q_feats = q_feats.join(total_power, on="nombre_archivo")
    q_feats["power_ratio"] = q_feats["sum_power"] / q_feats["total_power"]
    q_feats = q_feats.drop(columns=["total_power"])

    # pivot wide to 1q*, 2q*, …
    wide = q_feats.unstack("q")
    wide.columns = [f"{int(q)}q{metric}" for metric, q in wide.columns]

    # ensure every bin/metric column exists, adding the missing ones in one step
    base_metrics = ["mean", "std", "spec_centroid", "spec_entropy", "spec_bandwidth",
                    "spec_skewness", "spec_kurtosis", "flatness", "crest",
                    "slope", "slope_r2", "f_peak", "mag_peak", "sum_power", "count", "power_ratio"]
    present = set(wide.columns)
    missing = [f"{q}q{m}" for q in labels for m in base_metrics if f"{q}q{m}" not in present]
    if missing:
        wide = wide.reindex(columns=[*wide.columns, *missing])

    # global metrics
    grp_f = df.groupby("nombre_archivo", sort=False, observed=True)[value_cols]
    g_feats = grp_f.apply(_global_metrics)

    out = pd.concat([wide, g_feats], axis=1).reset_index()

    # attach class if present
    attach_class = keep_class and has_class
    if attach_class:
        cls = df.groupby("nombre_archivo", sort=False)["clase"].first().rename("clase").reset_index()
        out = cls.merge(out, on="nombre_archivo", how="right")

    # reorder head
    head = ["nombre_archivo"] + (["clase"] if attach_class else [])
    out = out[head + [c for c in out.columns if c not in head]]
    return out


In [13]:
# Build the per-file feature table from fft_df, splitting 0-4000 Hz into 8
# equal-width bins (so columns run 1q* .. 8q*) and keeping the `clase` label.
features_df = build_quartile_features_plus(fft_df, fmax=4000.0, nbins=8, keep_class=True)

  q_feats = grp_q.apply(_quartile_metrics)
  g_feats = grp_f.apply(_global_metrics)


In [14]:
# Load the second long-format spectrum table; its rendered output has no `clase` column.
# NOTE(review): an earlier note here pointed at "results/test_fft_long.csv", but the
# code reads a parquet file from the working directory (not from results/) — confirm
# which location is intended.
fft_df2  = pd.read_parquet("test_fft_long.parquet")
# Bare expression: displays the (truncated) frame as the cell output.
fft_df2

Unnamed: 0,nombre_archivo,freq_hz,mag
0,1.wav,0.000000,4352.191406
1,1.wav,0.051712,68.802200
2,1.wav,0.103423,80.728325
3,1.wav,0.155135,103.900597
4,1.wav,0.206846,48.982105
...,...,...,...
10571954,39.wav,3999.683594,1.008719
10571955,39.wav,3999.762695,0.464301
10571956,39.wav,3999.841797,0.717871
10571957,39.wav,3999.920898,1.351254


In [15]:
# Same feature extraction as for features_df (8 bins over 0-4000 Hz), applied to fft_df2.
# keep_class=True only attaches a label when a `clase` column exists; the output
# below has none.
features2_df = build_quartile_features_plus(fft_df2, fmax=4000.0, nbins=8, keep_class=True)
# Display the resulting feature table.
features2_df

  q_feats = grp_q.apply(_quartile_metrics)
  g_feats = grp_f.apply(_global_metrics)


Unnamed: 0,nombre_archivo,1qmean,2qmean,3qmean,4qmean,5qmean,6qmean,7qmean,8qmean,1qstd,...,spec_rolloff85_hz,spec_rolloff95_hz,spec_median_freq_hz,flatness_all,crest_all,slope_all,slope_r2_all,f_peak_all,mag_peak_all,total_power
0,1.wav,44.646703,262.371773,387.913823,196.019828,112.867148,89.186084,183.446396,39.679286,51.656685,...,2077.510498,3242.674316,1155.131958,0.166757,17.846808,-0.000305,0.027974,0.0,4352.191406,4600150000.0
1,10.wav,1.764025,4.134446,2.579844,1.323256,1.985162,1.463862,0.847785,0.797766,7.962886,...,1075.816162,2373.277344,113.673409,0.09749,181.950218,-0.000467,0.08467,0.0,707.302612,1007146.0
2,11.wav,80.060872,562.938533,698.089855,194.404218,143.179116,144.654953,138.284557,90.192041,70.846572,...,1352.87207,2364.552246,1057.785645,0.128379,14.788341,-0.000439,0.063048,0.0,6213.827637,24875690000.0
3,12.wav,1.032495,4.072379,2.708354,1.283481,1.854254,1.82006,1.287681,1.031298,6.281753,...,1492.861938,2900.472412,914.69043,0.112791,115.74762,2e-06,2e-06,0.0,418.674072,489890.1
4,13.wav,0.886648,4.965763,5.146765,3.246612,5.150373,4.564243,1.796601,1.989749,7.869969,...,2498.724365,2828.062012,1052.04541,0.126748,114.799384,0.000246,0.017818,0.0,678.251953,2099889.0
5,14.wav,14.653233,45.349516,147.758928,47.056187,17.674406,17.68288,14.821948,3.467968,14.317297,...,1453.604004,1675.010254,1236.812866,0.061688,9.891776,-0.001054,0.213279,1143.500366,691.576355,173705100.0
6,15.wav,72.497004,160.502243,301.453871,76.036632,56.290851,56.27043,65.747613,76.518571,58.568056,...,1475.03479,3307.362549,1197.237305,0.218069,6.926441,-0.000364,0.057165,1229.99646,1087.660889,451645100.0
7,16.wav,0.864656,3.558472,3.119422,1.257159,2.908333,0.970577,0.952702,1.085697,7.735972,...,1829.552734,2470.215332,547.44574,0.079002,171.794789,-0.000144,0.006584,0.0,673.765503,944514.0
8,17.wav,678.457643,364.312823,290.264269,352.216583,263.210275,87.806182,61.713359,53.445192,566.100365,...,1748.504639,2179.911377,308.165375,0.149352,22.52289,-0.001401,0.540756,0.0,9194.616211,26953490000.0
9,18.wav,18.165811,121.328098,123.767642,32.237805,36.556065,34.888335,24.454995,7.337508,28.461198,...,1317.720581,2222.101318,1016.14209,0.077357,14.742714,-0.000724,0.136714,0.0,1372.871948,195538600.0


In [16]:
# Display the feature table built from fft_df (includes the `clase` column).
features_df

Unnamed: 0,nombre_archivo,clase,1qmean,2qmean,3qmean,4qmean,5qmean,6qmean,7qmean,8qmean,...,spec_rolloff85_hz,spec_rolloff95_hz,spec_median_freq_hz,flatness_all,crest_all,slope_all,slope_r2_all,f_peak_all,mag_peak_all,total_power
0,1.wav,1,137.842234,152.599879,169.722858,79.361699,32.167138,42.170346,54.615921,25.054296,...,1491.029907,2619.628906,948.501099,0.139937,186.457673,-0.000977,0.288460,29.277962,25754.902344,1.717506e+10
1,100.wav,4,28.709245,90.594988,112.371034,99.579382,97.527817,53.941948,67.853725,11.701225,...,2589.554443,3237.848877,1634.019043,0.241413,21.517343,-0.000478,0.072781,0.000000,2017.091431,2.037686e+08
2,101.wav,4,83.813208,308.518916,476.514665,340.270231,261.854506,157.873041,268.726813,50.262713,...,2511.566895,3211.632568,1277.455811,0.214018,29.404735,-0.000353,0.040017,0.000000,9933.294922,1.358534e+10
3,102.wav,4,59.069083,240.859903,449.225510,277.060761,198.504706,253.149420,405.688683,48.908448,...,3165.892822,3334.579590,1768.466675,0.196412,34.229077,-0.000052,0.000760,0.000000,11449.164062,1.524684e+10
4,103.wav,4,68.893842,177.747416,334.663519,356.989510,273.510929,307.799863,387.184501,64.052258,...,3141.569580,3312.933594,2060.596680,0.236562,24.313856,0.000113,0.004162,0.000000,8018.140137,1.100957e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,95.wav,4,102.470965,307.013566,528.320801,555.437152,297.473840,169.750643,249.940310,61.483224,...,2166.263184,3126.041016,1660.199341,0.206711,12.613259,-0.000364,0.042700,0.000000,4997.024902,1.181288e+10
127,96.wav,4,63.331120,249.610218,392.897599,401.460804,195.710557,101.864100,181.547575,44.126938,...,2078.457764,3186.797119,1552.634888,0.171251,15.313666,-0.000377,0.040924,0.000000,4521.676758,5.013386e+09
128,97.wav,4,84.628423,293.840371,481.949749,596.259690,339.471856,110.704481,157.911902,56.136676,...,2058.252197,2391.632568,1709.069458,0.156440,18.259828,-0.000449,0.056630,0.000000,7167.521484,1.446975e+10
129,98.wav,4,77.201243,313.438162,450.354769,361.353516,176.969634,107.731511,157.528783,42.734077,...,1964.265991,3024.677979,1253.615356,0.179505,15.818186,-0.000546,0.091341,0.000000,4825.437988,5.820491e+09


In [17]:
# Display the feature table built from fft_df2 (no `clase` column).
features2_df

Unnamed: 0,nombre_archivo,1qmean,2qmean,3qmean,4qmean,5qmean,6qmean,7qmean,8qmean,1qstd,...,spec_rolloff85_hz,spec_rolloff95_hz,spec_median_freq_hz,flatness_all,crest_all,slope_all,slope_r2_all,f_peak_all,mag_peak_all,total_power
0,1.wav,44.646703,262.371773,387.913823,196.019828,112.867148,89.186084,183.446396,39.679286,51.656685,...,2077.510498,3242.674316,1155.131958,0.166757,17.846808,-0.000305,0.027974,0.0,4352.191406,4600150000.0
1,10.wav,1.764025,4.134446,2.579844,1.323256,1.985162,1.463862,0.847785,0.797766,7.962886,...,1075.816162,2373.277344,113.673409,0.09749,181.950218,-0.000467,0.08467,0.0,707.302612,1007146.0
2,11.wav,80.060872,562.938533,698.089855,194.404218,143.179116,144.654953,138.284557,90.192041,70.846572,...,1352.87207,2364.552246,1057.785645,0.128379,14.788341,-0.000439,0.063048,0.0,6213.827637,24875690000.0
3,12.wav,1.032495,4.072379,2.708354,1.283481,1.854254,1.82006,1.287681,1.031298,6.281753,...,1492.861938,2900.472412,914.69043,0.112791,115.74762,2e-06,2e-06,0.0,418.674072,489890.1
4,13.wav,0.886648,4.965763,5.146765,3.246612,5.150373,4.564243,1.796601,1.989749,7.869969,...,2498.724365,2828.062012,1052.04541,0.126748,114.799384,0.000246,0.017818,0.0,678.251953,2099889.0
5,14.wav,14.653233,45.349516,147.758928,47.056187,17.674406,17.68288,14.821948,3.467968,14.317297,...,1453.604004,1675.010254,1236.812866,0.061688,9.891776,-0.001054,0.213279,1143.500366,691.576355,173705100.0
6,15.wav,72.497004,160.502243,301.453871,76.036632,56.290851,56.27043,65.747613,76.518571,58.568056,...,1475.03479,3307.362549,1197.237305,0.218069,6.926441,-0.000364,0.057165,1229.99646,1087.660889,451645100.0
7,16.wav,0.864656,3.558472,3.119422,1.257159,2.908333,0.970577,0.952702,1.085697,7.735972,...,1829.552734,2470.215332,547.44574,0.079002,171.794789,-0.000144,0.006584,0.0,673.765503,944514.0
8,17.wav,678.457643,364.312823,290.264269,352.216583,263.210275,87.806182,61.713359,53.445192,566.100365,...,1748.504639,2179.911377,308.165375,0.149352,22.52289,-0.001401,0.540756,0.0,9194.616211,26953490000.0
9,18.wav,18.165811,121.328098,123.767642,32.237805,36.556065,34.888335,24.454995,7.337508,28.461198,...,1317.720581,2222.101318,1016.14209,0.077357,14.742714,-0.000724,0.136714,0.0,1372.871948,195538600.0


In [18]:
# Persist both feature tables as parquet files; index=False leaves the
# positional row index out of the files.
features_df.to_parquet("results/train.parquet", index=False)
features2_df.to_parquet("results/test.parquet", index=False)

In [19]:
# Display features_df once more after saving (same expression as the earlier display cell).
features_df

Unnamed: 0,nombre_archivo,clase,1qmean,2qmean,3qmean,4qmean,5qmean,6qmean,7qmean,8qmean,...,spec_rolloff85_hz,spec_rolloff95_hz,spec_median_freq_hz,flatness_all,crest_all,slope_all,slope_r2_all,f_peak_all,mag_peak_all,total_power
0,1.wav,1,137.842234,152.599879,169.722858,79.361699,32.167138,42.170346,54.615921,25.054296,...,1491.029907,2619.628906,948.501099,0.139937,186.457673,-0.000977,0.288460,29.277962,25754.902344,1.717506e+10
1,100.wav,4,28.709245,90.594988,112.371034,99.579382,97.527817,53.941948,67.853725,11.701225,...,2589.554443,3237.848877,1634.019043,0.241413,21.517343,-0.000478,0.072781,0.000000,2017.091431,2.037686e+08
2,101.wav,4,83.813208,308.518916,476.514665,340.270231,261.854506,157.873041,268.726813,50.262713,...,2511.566895,3211.632568,1277.455811,0.214018,29.404735,-0.000353,0.040017,0.000000,9933.294922,1.358534e+10
3,102.wav,4,59.069083,240.859903,449.225510,277.060761,198.504706,253.149420,405.688683,48.908448,...,3165.892822,3334.579590,1768.466675,0.196412,34.229077,-0.000052,0.000760,0.000000,11449.164062,1.524684e+10
4,103.wav,4,68.893842,177.747416,334.663519,356.989510,273.510929,307.799863,387.184501,64.052258,...,3141.569580,3312.933594,2060.596680,0.236562,24.313856,0.000113,0.004162,0.000000,8018.140137,1.100957e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,95.wav,4,102.470965,307.013566,528.320801,555.437152,297.473840,169.750643,249.940310,61.483224,...,2166.263184,3126.041016,1660.199341,0.206711,12.613259,-0.000364,0.042700,0.000000,4997.024902,1.181288e+10
127,96.wav,4,63.331120,249.610218,392.897599,401.460804,195.710557,101.864100,181.547575,44.126938,...,2078.457764,3186.797119,1552.634888,0.171251,15.313666,-0.000377,0.040924,0.000000,4521.676758,5.013386e+09
128,97.wav,4,84.628423,293.840371,481.949749,596.259690,339.471856,110.704481,157.911902,56.136676,...,2058.252197,2391.632568,1709.069458,0.156440,18.259828,-0.000449,0.056630,0.000000,7167.521484,1.446975e+10
129,98.wav,4,77.201243,313.438162,450.354769,361.353516,176.969634,107.731511,157.528783,42.734077,...,1964.265991,3024.677979,1253.615356,0.179505,15.818186,-0.000546,0.091341,0.000000,4825.437988,5.820491e+09
