In [None]:
import librosa
import numpy as np
from scipy.stats import skew, entropy


def extract_audio_features(
    path: str,
    sr: int = 22050,
    n_fft: int = 1024,
    hop_length: int = 256,
    trim_db: int = 50,
    n_mels: int = 64
) -> tuple:
    """Extract comprehensive audio features from a single audio file.

    Args:
        path (str): Path to the audio file.
        sr (int, optional): Target sampling rate in Hz. Defaults to 22050.
        n_fft (int, optional): FFT window size for spectral analysis. Defaults to 1024.
        hop_length (int, optional): Number of samples between successive frames. Defaults to 256.
        trim_db (int, optional): Threshold in dB below reference for trimming silence. Defaults to 50.
        n_mels (int, optional): Number of mel bands for mel spectrogram. Defaults to 64.

    Returns:
        tuple: A tuple containing:
            - dict: Dictionary of extracted audio features
            - numpy.ndarray: Trimmed audio signal
            - int: Sample rate
    """
    
    y, sr = librosa.load(path, sr=sr, mono=True)

    # =========================
    # Trimming & structure
    # =========================
    y_trim, idx = librosa.effects.trim(y, top_db=trim_db)

    duration = len(y) / sr
    trimmed_duration = len(y_trim) / sr
    silence_ratio = 1.0 - (trimmed_duration / duration if duration > 0 else 0)

    trim_start_sec = idx[0] / sr
    trim_end_sec = idx[1] / sr

    # =========================
    # RMS envelope
    # =========================
    rms = librosa.feature.rms(
        y=y_trim, hop_length=hop_length
    )[0]

    rms_mean = np.mean(rms)
    rms_std = np.std(rms)

    # =========================
    # Attack time (transient strength)
    # =========================
    if len(rms) > 0 and rms_mean > 0:
        peak_idx = np.argmax(rms)
        attack_time = peak_idx * hop_length / sr
    else:
        attack_time = 0.0

    # =========================
    # Spectral features
    # =========================
    centroid = librosa.feature.spectral_centroid(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )[0]

    bandwidth = librosa.feature.spectral_bandwidth(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )[0]

    rolloff = librosa.feature.spectral_rolloff(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )[0]

    flatness = librosa.feature.spectral_flatness(
        y=y_trim, n_fft=n_fft, hop_length=hop_length
    )[0]

    contrast = librosa.feature.spectral_contrast(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    # =========================
    # Spectral flux (temporal change)
    # =========================
    spectral_flux = librosa.onset.onset_strength(
        y=y_trim, sr=sr, hop_length=hop_length
    )

    # =========================
    # Zero-crossing rate
    # =========================
    zcr = librosa.feature.zero_crossing_rate(
        y_trim, hop_length=hop_length
    )[0]

    # =========================
    # Temporal centroid
    # =========================
    frame_times = librosa.frames_to_time(
        np.arange(len(rms)), sr=sr, hop_length=hop_length
    )

    temporal_centroid = (
        np.sum(frame_times * rms) / np.sum(rms)
        if np.sum(rms) > 0 else 0.0
    )

    # =========================
    # Directionality (slopes)
    # =========================
    def slope(x):
        if len(x) < 2:
            return 0.0
        return np.polyfit(np.arange(len(x)), x, 1)[0]

    rms_slope = slope(rms)
    centroid_slope = slope(centroid)

    # =========================
    # Early vs late energy
    # =========================
    mid = len(rms) // 2
    early_energy = np.sum(rms[:mid])
    late_energy = np.sum(rms[mid:])
    early_late_energy_ratio = (
        late_energy / early_energy if early_energy > 0 else 0.0
    )

    # =========================
    # Harmonic / percussive
    # =========================
    y_harm, y_perc = librosa.effects.hpss(y_trim)

    harm_rms = np.mean(librosa.feature.rms(y=y_harm))
    perc_rms = np.mean(librosa.feature.rms(y=y_perc))

    harmonic_percussive_ratio = (
        harm_rms / perc_rms if perc_rms > 0 else 0.0
    )

    # =========================
    # Onsets & rhythm
    # =========================
    onsets = librosa.onset.onset_detect(
        y=y_trim, sr=sr, hop_length=hop_length, units="time"
    )

    onset_count = len(onsets)
    onset_density = onset_count / trimmed_duration if trimmed_duration > 0 else 0.0
    mean_inter_onset_interval = (
        np.mean(np.diff(onsets)) if len(onsets) > 1 else 0.0
    )

    # =========================
    # Pitch (pyin)
    # =========================
    f0, voiced_flag, voiced_prob = librosa.pyin(
        y_trim,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("C7"),
        hop_length=hop_length
    )

    f0_clean = f0[~np.isnan(f0)]
    pitch_conf = voiced_prob[~np.isnan(f0)]

    # =========================
    # Chroma
    # =========================
    chroma = librosa.feature.chroma_stft(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    chroma_mean = np.mean(chroma, axis=1)
    chroma_std = np.std(chroma, axis=1)
    chroma_entropy = entropy(chroma_mean + 1e-8)

    # =========================
    # MFCCs
    # =========================
    mfcc = librosa.feature.mfcc(
        y=y_trim, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length
    )

    # =========================
    # Log-mel spectrogram (NEW)
    # =========================
    mel = librosa.feature.melspectrogram(
        y=y_trim, sr=sr, n_fft=n_fft,
        hop_length=hop_length, n_mels=n_mels
    )

    log_mel = librosa.power_to_db(mel)

    # =========================
    # Assemble feature row
    # =========================
    row = {
        # structure
        "duration_sec": duration,
        "trimmed_duration_sec": trimmed_duration,
        "silence_ratio": silence_ratio,
        "trim_start_sec": trim_start_sec,
        "trim_end_sec": trim_end_sec,

        # energy
        "rms_mean": rms_mean,
        "rms_std": rms_std,
        "peak_amplitude": float(np.max(np.abs(y_trim))),
        "crest_factor": (
            float(np.max(np.abs(y_trim)) / rms_mean)
            if rms_mean > 0 else 0.0
        ),
        "attack_time_sec": attack_time,

        # texture
        "zcr_mean": float(np.mean(zcr)),
        "zcr_std": float(np.std(zcr)),

        # envelope
        "temporal_centroid": temporal_centroid,
        "rms_slope": rms_slope,
        "spectral_centroid_slope": centroid_slope,
        "early_late_energy_ratio": early_late_energy_ratio,

        # spectral shape
        "spectral_centroid_mean": float(np.mean(centroid)),
        "spectral_centroid_std": float(np.std(centroid)),
        "spectral_centroid_skew": float(skew(centroid)),
        "spectral_bandwidth_mean": float(np.mean(bandwidth)),
        "spectral_bandwidth_std": float(np.std(bandwidth)),
        "spectral_rolloff_mean": float(np.mean(rolloff)),
        "spectral_rolloff_std": float(np.std(rolloff)),

        # spectral texture
        "spectral_flatness_mean": float(np.mean(flatness)),
        "spectral_flatness_std": float(np.std(flatness)),
        "spectral_contrast_mean": float(np.mean(contrast)),
        "spectral_contrast_std": float(np.std(contrast)),
        "spectral_flux_mean": float(np.mean(spectral_flux)),
        "spectral_flux_std": float(np.std(spectral_flux)),

        # harmonicity
        "harmonic_rms": float(harm_rms),
        "percussive_rms": float(perc_rms),
        "harmonic_percussive_ratio": float(harmonic_percussive_ratio),

        # rhythm
        "onset_count": int(onset_count),
        "onset_density": float(onset_density),
        "mean_inter_onset_interval": float(mean_inter_onset_interval),

        # pitch
        "f0_mean": float(np.mean(f0_clean)) if len(f0_clean) else 0.0,
        "f0_std": float(np.std(f0_clean)) if len(f0_clean) else 0.0,
        "pitch_confidence_mean": float(np.mean(pitch_conf)) if len(pitch_conf) else 0.0,
        "pitch_confidence_std": float(np.std(pitch_conf)) if len(pitch_conf) else 0.0,

        # chroma
        "chroma_entropy": float(chroma_entropy),

        # mel
        "log_mel_mean": float(np.mean(log_mel)),
        "log_mel_std": float(np.std(log_mel)),
    }

    # individual chroma bins
    chroma_labels = ["C","Cs","D","Ds","E","F","Fs","G","Gs","A","As","B"]
    for i, note in enumerate(chroma_labels):
        row[f"chroma_mean_{note}"] = float(chroma_mean[i])
        row[f"chroma_std_{note}"] = float(chroma_std[i])

    # MFCC stats
    for i in range(mfcc.shape[0]):
        row[f"mfcc_{i+1:02d}_mean"] = float(np.mean(mfcc[i]))
        row[f"mfcc_{i+1:02d}_std"] = float(np.std(mfcc[i]))

    return row, y_trim, sr


In [None]:
import os
import pandas as pd
import soundfile as sf

data_scratch_path = "/Users/jackcdawson/Desktop/dev/sample_identifier/data/scratch"

rows = []
labels = []

for root, dirs, files in os.walk(os.path.join(data_scratch_path, "original")):
    for file in files:
        if file.endswith('.wav'):
            wav_path = os.path.join(root, file)
            sample_name = file.split('.')[0]

            # Extract category structure from path
            rel_path = os.path.relpath(wav_path, data_scratch_path)
            path_parts = rel_path.split(os.sep)

            category = path_parts[0] if len(path_parts) > 1 else "unknown"
            sub_category = path_parts[1] if len(path_parts) > 2 else "unknown"

            # # Generate unique ID
            # sample_id = str(uuid.uuid4())

            # Extract features
            row, y_trim, sr = extract_audio_features(wav_path)
            row['id'] = sample_name
            row['category'] = category
            row['sub_category'] = sub_category
            rows.append(row)

            # Create label entry
            labels.append({
                'id': sample_name,
                'top_level': category,
                'sub_level': sub_category,
                'temporal': 'loop' if sub_category.lower() == 'loops' else 'one-shot',
                'source': 'personal'
            })

            sf.write(os.path.join(data_scratch_path, f"trimmed/{sample_name}_trimmed.wav"), y_trim, sr)

df = pd.DataFrame(rows)
df_labels = pd.DataFrame(labels)

# Save labels
df_labels.to_csv('/Users/jackcdawson/Desktop/dev/sample_identifier/data/clean/labels_personal.csv', index=False)

  return pitch_tuning(


In [5]:
df.index = df['id']

In [6]:
df.sort_values(by=["silence_ratio"], ascending=False)

Unnamed: 0_level_0,duration_sec,trimmed_duration_sec,silence_ratio,trim_start_sec,trim_end_sec,rms_mean,rms_std,peak_amplitude,crest_factor,attack_time_sec,...,mfcc_10_std,mfcc_11_mean,mfcc_11_std,mfcc_12_mean,mfcc_12_std,mfcc_13_mean,mfcc_13_std,id,category,sub_category
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hat_closed,0.5,0.1161,0.7678,0.0,0.1161,0.055976,0.041304,0.576802,10.304511,0.03483,...,7.034547,-3.177755,5.419004,9.683946,5.881507,-0.574009,3.521704,hat_closed,original,unknown
clap,0.5,0.25542,0.489161,0.0,0.25542,0.068497,0.078845,0.719108,10.498418,0.04644,...,4.967082,-6.791317,5.30431,-2.085245,4.799422,-4.38009,5.261796,clap,original,unknown
cymbal,5.327619,2.786395,0.476991,0.0,2.786395,0.010599,0.015995,0.211972,20.000032,0.04644,...,3.840393,2.177898,4.891622,0.442917,3.861923,1.486579,4.502636,cymbal,original,unknown
riser,5.177143,2.902494,0.439364,0.580499,3.482993,0.017137,0.019137,0.208965,12.194122,1.520907,...,5.525521,-3.180156,4.15,7.328354,4.87631,13.343936,9.051413,riser,original,unknown
impact,1.465442,1.277098,0.128524,0.0,1.277098,0.036299,0.02416,0.969669,26.7131,0.04644,...,4.987221,-14.368458,5.505005,-6.764637,5.942872,-11.242653,5.793324,impact,original,unknown
808,5.130113,5.03873,0.017813,0.0,5.03873,0.100602,0.110983,0.943981,9.383314,0.04644,...,2.638034,4.728661,8.179127,5.629911,4.060727,8.617923,3.895463,808,original,unknown
kick,0.233651,0.2322,0.006211,0.0,0.2322,0.274865,0.197483,0.864746,3.146071,0.05805,...,10.913725,11.538823,12.271362,12.514277,9.093626,9.13727,6.470431,kick,original,unknown
fall,5.625034,5.596009,0.00516,0.0,5.596009,0.16641,0.084569,0.638384,3.836213,1.404807,...,7.73018,-2.410898,10.093077,-16.699436,12.579381,-9.04696,15.414943,fall,original,unknown
drum_loop,45.573061,45.573061,0.0,0.0,45.573061,0.00749,0.002984,0.176044,23.50297,27.283447,...,5.037512,-4.58763,5.096181,5.531654,3.956225,-12.985322,5.576625,drum_loop,original,unknown
SFX_vocal,0.312426,0.312426,0.0,0.0,0.312426,0.028869,0.016975,0.169659,5.87693,0.08127,...,4.9744,-6.164548,6.665954,5.180094,8.694826,3.130861,5.076095,SFX_vocal,original,unknown
