In [None]:
import uuid
import json
import librosa
import pandas as pd
import os
import numpy as np
from scipy.stats import skew, entropy


def extract_audio_features(
    path,
    sr=22050,
    n_fft=1024,
    hop_length=256,
    trim_db=40
):
    """Extract a flat dictionary of summary audio features from one file.

    The file is loaded with ``librosa.load(path, sr=sr, mono=True)``, leading
    and trailing silence is trimmed with ``librosa.effects.trim``, and every
    feature except ``duration_sec`` is computed on the trimmed signal.

    Parameters
    ----------
    path : str or path-like
        Audio file to analyse.
    sr : int or None
        Sample rate passed to ``librosa.load``. The rate returned by the
        loader is the one used for all subsequent analysis.
    n_fft : int
        FFT size for the spectral, chroma and MFCC features.
    hop_length : int
        Hop size (in samples) for all frame-based features.
    trim_db : float
        ``top_db`` threshold passed to ``librosa.effects.trim``.

    Returns
    -------
    dict
        Feature name -> scalar. Numeric values are plain Python ``float`` /
        ``int`` so the row can be serialised without numpy-specific handling.
        Notes on two keys whose names are easy to misread:

        * ``early_late_energy_ratio`` is late-half RMS sum divided by
          early-half RMS sum.
        * ``spectral_entropy_mean`` is the entropy of the time-averaged
          chroma vector.

    Raises
    ------
    ValueError
        If the file decodes to zero samples.
    """
    y, sr = librosa.load(path, sr=sr, mono=True)
    if y.size == 0:
        # Fail early with a readable message; otherwise the peak computation
        # below fails on an empty array with an opaque reduction error.
        raise ValueError(f"Audio file contains no samples: {path}")

    # --- trimming ---
    y_trim, idx = librosa.effects.trim(y, top_db=trim_db)
    trim_start = idx[0] / sr
    trim_end = idx[1] / sr

    duration = len(y) / sr
    trimmed_duration = len(y_trim) / sr
    silence_ratio = 1.0 - (trimmed_duration / duration if duration > 0 else 0)

    # --- RMS envelope ---
    # n_fft is not passed here, so librosa's default frame length applies.
    rms = librosa.feature.rms(y=y_trim, hop_length=hop_length)[0]
    rms_mean = np.mean(rms)
    rms_total = np.sum(rms)
    peak = np.max(np.abs(y_trim))

    # --- Spectral features ---
    centroid = librosa.feature.spectral_centroid(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )[0]
    bandwidth = librosa.feature.spectral_bandwidth(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )[0]
    rolloff = librosa.feature.spectral_rolloff(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )[0]
    flatness = librosa.feature.spectral_flatness(
        y=y_trim, n_fft=n_fft, hop_length=hop_length
    )[0]
    contrast = librosa.feature.spectral_contrast(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    # --- ZCR ---
    zcr = librosa.feature.zero_crossing_rate(
        y_trim, hop_length=hop_length
    )[0]

    # --- Temporal centroid ---
    # RMS-weighted mean frame time; 0 when the envelope is entirely zero.
    frame_times = librosa.frames_to_time(
        np.arange(len(rms)), sr=sr, hop_length=hop_length
    )
    temporal_centroid = np.sum(frame_times * rms) / rms_total if rms_total > 0 else 0

    # --- Slopes (directionality) ---
    def slope(x):
        # Linear-fit slope per frame index; undefined for fewer than 2 points.
        if len(x) < 2:
            return 0.0
        return np.polyfit(np.arange(len(x)), x, 1)[0]

    rms_slope = slope(rms)
    centroid_slope = slope(centroid)

    # --- Early / late energy ---
    mid = len(rms) // 2
    early_energy = np.sum(rms[:mid])
    late_energy = np.sum(rms[mid:])
    # Despite the key name, this is late / early.
    early_late_energy_ratio = (
        late_energy / early_energy if early_energy > 0 else 0
    )

    # --- Harmonic / percussive ---
    y_harm, y_perc = librosa.effects.hpss(y_trim)
    harm_rms = np.mean(librosa.feature.rms(y=y_harm))
    perc_rms = np.mean(librosa.feature.rms(y=y_perc))
    hp_ratio = harm_rms / perc_rms if perc_rms > 0 else 0

    # --- Onsets ---
    onsets = librosa.onset.onset_detect(
        y=y_trim, sr=sr, hop_length=hop_length, units="time"
    )
    onset_count = len(onsets)
    onset_density = onset_count / trimmed_duration if trimmed_duration > 0 else 0
    mean_ioi = np.mean(np.diff(onsets)) if len(onsets) > 1 else 0

    # --- Pitch ---
    # sr must be passed explicitly: pyin otherwise assumes its own default
    # sample rate, which yields wrong f0 values whenever the audio was loaded
    # at a different rate.
    f0, voiced_flag, voiced_prob = librosa.pyin(
        y_trim,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("C7"),
        sr=sr,
        hop_length=hop_length
    )
    has_pitch = ~np.isnan(f0)
    f0_clean = f0[has_pitch]
    pitch_conf = voiced_prob[has_pitch]

    # --- Chroma ---
    chroma = librosa.feature.chroma_stft(
        y=y_trim, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    # --- MFCCs ---
    mfcc = librosa.feature.mfcc(
        y=y_trim, sr=sr, n_mfcc=13, n_fft=n_fft, hop_length=hop_length
    )

    # --- Assemble row ---
    row = {
        # structure
        "duration_sec": duration,
        "trimmed_duration_sec": trimmed_duration,
        "sample_rate": sr,
        "channels": 1,
        "silence_ratio": silence_ratio,
        "trim_start_sec": float(trim_start),
        "trim_end_sec": float(trim_end),

        # energy
        "rms_mean": float(rms_mean),
        "rms_std": float(np.std(rms)),
        "peak_amplitude": float(peak),
        "crest_factor": float(peak / rms_mean) if rms_mean > 0 else 0.0,

        # texture
        "zcr_mean": float(np.mean(zcr)),
        "zcr_std": float(np.std(zcr)),

        # envelope (cast so every value is a plain Python float)
        "temporal_centroid": float(temporal_centroid),
        "rms_slope": float(rms_slope),
        "spectral_centroid_slope": float(centroid_slope),
        "early_late_energy_ratio": float(early_late_energy_ratio),

        # spectral shape
        "spectral_centroid_mean": float(np.mean(centroid)),
        "spectral_centroid_std": float(np.std(centroid)),
        "spectral_centroid_skew": float(skew(centroid)),
        "spectral_bandwidth_mean": float(np.mean(bandwidth)),
        "spectral_bandwidth_std": float(np.std(bandwidth)),
        "spectral_rolloff_mean": float(np.mean(rolloff)),
        "spectral_rolloff_std": float(np.std(rolloff)),

        # spectral texture
        "spectral_flatness_mean": float(np.mean(flatness)),
        "spectral_flatness_std": float(np.std(flatness)),
        "spectral_contrast_mean": float(np.mean(contrast)),
        "spectral_contrast_std": float(np.std(contrast)),
        # Entropy of the time-averaged chroma vector (key name kept as-is).
        "spectral_entropy_mean": float(entropy(np.mean(chroma, axis=1))),
    }

    # harmonic / percussive
    row["harmonic_rms"] = float(harm_rms)
    row["percussive_rms"] = float(perc_rms)
    row["harmonic_percussive_ratio"] = float(hp_ratio)

    # rhythm
    row["onset_count"] = int(onset_count)
    row["onset_density"] = float(onset_density)
    row["mean_inter_onset_interval"] = float(mean_ioi)

    # pitch (0.0 when no frame has a pitch estimate)
    row["f0_mean"] = float(np.mean(f0_clean)) if len(f0_clean) else 0.0
    row["f0_std"] = float(np.std(f0_clean)) if len(f0_clean) else 0.0
    row["pitch_confidence_mean"] = float(np.mean(pitch_conf)) if len(pitch_conf) else 0.0
    row["pitch_confidence_std"] = float(np.std(pitch_conf)) if len(pitch_conf) else 0.0

    # chroma
    chroma_labels = ["C","Cs","D","Ds","E","F","Fs","G","Gs","A","As","B"]
    for i, note in enumerate(chroma_labels):
        row[f"chroma_mean_{note}"] = float(np.mean(chroma[i]))
        row[f"chroma_std_{note}"] = float(np.std(chroma[i]))

    # mfccs
    for i in range(13):
        row[f"mfcc_{i+1:02d}_mean"] = float(np.mean(mfcc[i]))
        row[f"mfcc_{i+1:02d}_std"] = float(np.std(mfcc[i]))

    return row

In [None]:
# Project root. Defaults to the original local checkout; set the
# SAMPLE_IDENTIFIER_ROOT environment variable to run this cell elsewhere.
project_root = os.environ.get(
    "SAMPLE_IDENTIFIER_ROOT",
    "/Users/jackcdawson/Desktop/dev/sample_identifier",
)
data_raw_personal_path = os.path.join(project_root, "data", "raw", "personal")
labels_personal_path = os.path.join(project_root, "data", "clean", "labels_personal.csv")

rows = []
labels = []
failed_files = []  # (path, error) for every file that could not be processed

for root, dirs, files in os.walk(data_raw_personal_path):
    # Sorting in place makes os.walk descend in a deterministic order, so the
    # resulting rows are ordered the same way on every run.
    dirs.sort()
    for file in sorted(files):
        # Case-insensitive match so files named e.g. "KICK.WAV" are included.
        if not file.lower().endswith('.wav'):
            continue

        wav_path = os.path.join(root, file)

        # Extract category structure from path:
        #   <category>/<sub_category>/.../<file>.wav
        rel_path = os.path.relpath(wav_path, data_raw_personal_path)
        path_parts = rel_path.split(os.sep)

        category = path_parts[0] if len(path_parts) > 1 else "unknown"
        sub_category = path_parts[1] if len(path_parts) > 2 else "unknown"

        # Extract features. A single unreadable file should not discard the
        # features already computed, so record the failure and move on.
        try:
            row = extract_audio_features(wav_path)
        except Exception as exc:
            failed_files.append((wav_path, repr(exc)))
            print(f"Skipping {wav_path}: {exc!r}")
            continue

        # Generate unique ID shared by the feature row and its label entry
        sample_id = str(uuid.uuid4())

        row['id'] = sample_id
        row['category'] = category
        row['sub_category'] = sub_category
        rows.append(row)

        # Create label entry
        labels.append({
            'id': sample_id,
            'top_level': category,
            'sub_level': sub_category,
            'temporal': 'loop' if sub_category.lower() == 'loops' else 'one-shot',
            'source': 'personal'
        })

if failed_files:
    print(f"{len(failed_files)} file(s) could not be processed; see `failed_files`.")

df = pd.DataFrame(rows)
df_labels = pd.DataFrame(labels)

# Save labels (create the output directory on a fresh checkout)
os.makedirs(os.path.dirname(labels_personal_path), exist_ok=True)
df_labels.to_csv(labels_personal_path, index=False)

In [None]:
# # Inspect the label distribution: value counts for every label column except `id`

# columns = df_labels[df_labels.columns.drop('id')].columns.tolist()

# for col in columns:
#     print(df_labels[col].value_counts())