# Cough Detection Feature Extraction

This notebook reproduces the feature extraction for the classical ML pipeline from the research paper on cough detection using multimodal biosignals.

## Objective

Extract dataset features for three modality configurations:
1. **IMU-only**: 40 handcrafted features from accelerometer and gyroscope
2. **Audio-only**: 65 features from outer microphone (MFCC + spectral + time-domain)
3. **Multimodal**: Combined 105 features (Audio + IMU)

## Method

- **Window size**: 0.4 seconds (6400 audio samples @ 16kHz, 40 IMU samples @ 100Hz)
- **Data augmentation**: Random temporal shifts (aug_factor=2)

In [None]:
# Import required libraries
import sys
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats, signal
import librosa
from tqdm import tqdm
import os
import warnings
# Silence every warning category for the rest of the session. This keeps the
# notebook output tidy but also hides runtime/deprecation warnings raised by
# the numerical libraries used below.
warnings.filterwarnings('ignore')

# Add src directory to path
# The relative path is resolved against the current working directory.
sys.path.append(os.path.abspath('../src'))
# NOTE(review): later cells use FS_AUDIO, FS_IMU and get_samples_for_subject
# without defining them; they presumably come from these star imports — confirm.
from helpers import *
from dataset_gen import *

print("✓ All imports successful")

In [None]:
# Seed NumPy's global RNG so later np.random draws are repeatable
np.random.seed(42)

# Windowing / augmentation settings taken from the paper
WINDOW_LEN = 0.4  # window duration in seconds
AUG_FACTOR = 2    # passed as aug_factor when each subject is loaded

# Report the configuration, including the per-window sample counts implied by
# the sampling rates FS_AUDIO and FS_IMU (not defined in this cell).
config_report = (
    f"Configuration:",
    f"  Window length: {WINDOW_LEN}s",
    f"  Expected audio samples: {int(WINDOW_LEN * FS_AUDIO)}",
    f"  Expected IMU samples: {int(WINDOW_LEN * FS_IMU)}",
    f"  Augmentation factor: {AUG_FACTOR}",
)
for report_line in config_report:
    print(report_line)

## Feature Extraction Functions

### Audio Features (65 total)

1. **MFCC (52)**: 13 coefficients × 4 statistics (mean, std, min, max)
2. **Spectral (10)**: Centroid, rolloff, bandwidth, flatness, contrast; PSD total power and dominant frequency; PSD-weighted spectral spread plus the third and fourth central moments (un-normalised skewness/kurtosis)
3. **Time-domain (3)**: Zero-crossing rate, RMS energy, crest factor

### IMU Features (40 total)

For 8 signals (3 accel + accel_L2 + 3 gyro + gyro_L2):
- Line length, zero-crossing rate, kurtosis, crest factor, RMS = 5 features per signal

In [None]:
def extract_audio_features(audio_window, fs=16000):
    """
    Compute the 65-element handcrafted feature vector for one audio window.

    Layout of the returned vector:
        [0:52]   MFCC statistics: for each of the 13 coefficient rows, its
                 mean, std, min and max (in that order)
        [52:57]  means of librosa's spectral centroid, rolloff, bandwidth,
                 flatness and contrast outputs
        [57:59]  total power of the Welch PSD and the frequency of its peak
        [59:62]  PSD-weighted spread, third central moment and fourth
                 central moment (the moments are NOT normalised by the spread)
        [62:65]  zero-crossing rate, RMS and crest factor

    Args:
        audio_window: 1D array of audio samples
        fs: Sampling frequency in Hz (default 16000)

    Returns:
        np.array: 65 features
    """
    # --- MFCC statistics (52): 4 summary statistics per coefficient row
    mfcc_matrix = librosa.feature.mfcc(y=audio_window, sr=fs, n_mfcc=13)
    mfcc_stats = [
        reducer(row)
        for row in mfcc_matrix
        for reducer in (np.mean, np.std, np.min, np.max)
    ]

    # --- librosa spectral descriptors (5), each collapsed to a single mean
    centroid_mean = np.mean(librosa.feature.spectral_centroid(y=audio_window, sr=fs))
    rolloff_mean = np.mean(librosa.feature.spectral_rolloff(y=audio_window, sr=fs))
    bandwidth_mean = np.mean(librosa.feature.spectral_bandwidth(y=audio_window, sr=fs))
    flatness_mean = np.mean(librosa.feature.spectral_flatness(y=audio_window))
    contrast_mean = np.mean(librosa.feature.spectral_contrast(y=audio_window, sr=fs))

    # --- Welch PSD descriptors (2)
    freqs, power = signal.welch(audio_window, fs=fs)
    total_power = np.sum(power)
    peak_freq = freqs[np.argmax(power)]

    # --- Moments of the PSD treated as a distribution over frequency (3)
    # The small constant guards against division by zero for an all-zero PSD.
    weights = power / (np.sum(power) + 1e-10)
    mean_freq = np.sum(freqs * weights)
    deviation = freqs - mean_freq
    spread = np.sqrt(np.sum((deviation**2) * weights))
    third_moment = np.sum((deviation**3) * weights)
    fourth_moment = np.sum((deviation**4) * weights)

    # --- Time-domain descriptors (3)
    zcr = librosa.feature.zero_crossing_rate(audio_window)[0].mean()
    rms = np.sqrt(np.mean(audio_window**2))
    crest = np.max(np.abs(audio_window)) / (rms + 1e-10)  # peak-to-RMS ratio

    return np.array([
        *mfcc_stats,
        centroid_mean, rolloff_mean, bandwidth_mean, flatness_mean, contrast_mean,
        total_power, peak_freq,
        spread, third_moment, fourth_moment,
        zcr, rms, crest,
    ])

# Test on random data
# Smoke test: a 6400-sample window of Gaussian noise must produce exactly
# 65 features.
# NOTE(review): this draws from the global NumPy RNG that an earlier cell
# seeds; if later steps consume the same np.random stream, removing or
# reordering this cell may change their draws — confirm.
test_audio = np.random.randn(6400)
test_features = extract_audio_features(test_audio)
print(f"✓ Audio feature extractor: {len(test_features)} features")
assert len(test_features) == 65, f"Expected 65 features, got {len(test_features)}"

In [None]:
def extract_imu_features(imu_window):
    """
    Compute the 40-element handcrafted feature vector for one IMU window.

    Eight traces are analysed, in this order: the three mean-centred
    accelerometer axes, the per-sample L2 norm of those three axes, the three
    mean-centred gyroscope axes, and the per-sample L2 norm of those.
    Each trace contributes, in order: line length, zero-crossing rate,
    kurtosis, crest factor, RMS.

    Args:
        imu_window: (40, 6) array - [Accel_x, Accel_y, Accel_z, Gyro_Y, Gyro_P, Gyro_R]

    Returns:
        np.array: 40 features (8 signals × 5 features)
    """
    # Remove the per-channel mean over the window (paper requirement)
    centered = imu_window - np.mean(imu_window, axis=0, keepdims=True)

    # Per-sample magnitudes of the centred accelerometer / gyroscope vectors.
    # NOTE(review): the norms themselves are not re-centred and are never
    # negative, so their zero-crossing feature is non-zero only when a norm is
    # exactly 0 at some sample — confirm this matches the paper's intent.
    accel_mag = np.linalg.norm(centered[:, 0:3], axis=1)
    gyro_mag = np.linalg.norm(centered[:, 3:6], axis=1)

    traces = (
        centered[:, 0], centered[:, 1], centered[:, 2], accel_mag,
        centered[:, 3], centered[:, 4], centered[:, 5], gyro_mag,
    )

    collected = []
    for trace in traces:
        rms = np.sqrt(np.mean(trace**2))
        collected += [
            # Line length: summed absolute sample-to-sample change
            np.sum(np.abs(np.diff(trace))),
            # Zero-crossing rate: sign changes divided by the trace length
            np.sum(np.diff(np.sign(trace)) != 0) / len(trace),
            # Kurtosis with scipy's default settings
            stats.kurtosis(trace),
            # Crest factor: peak magnitude over RMS (epsilon avoids 0/0)
            np.max(np.abs(trace)) / (rms + 1e-10),
            # RMS power
            rms,
        ]

    return np.array(collected)

# Test on random data
# Smoke test: a (40, 6) window of Gaussian noise must produce exactly
# 40 features. This rebinds `test_features`.
# NOTE(review): this draws from the global NumPy RNG that an earlier cell
# seeds; if later steps consume the same np.random stream, removing or
# reordering this cell may change their draws — confirm.
test_imu = np.random.randn(40, 6)
test_features = extract_imu_features(test_imu)
print(f"✓ IMU feature extractor: {len(test_features)} features")
assert len(test_features) == 40, f"Expected 40 features, got {len(test_features)}"

In [None]:
def extract_features_for_dataset(audio_data, imu_data, modality='all'):
    """
    Extract features for entire dataset

    Windows are processed one at a time. For each window the audio features
    (if requested) come first in the row, followed by the IMU features
    (if requested). Any NaN or +/-Inf in the final matrix is replaced by 0.0
    and a warning line is printed.

    Args:
        audio_data: (N, 6400, 2) - [outer_mic, body_mic]; only channel 0 is
            read. Not accessed when modality='imu_only'.
        imu_data: (N, 40, 6). Not accessed when modality='audio_only'.
        modality: 'imu_only', 'audio_only', or 'all'

    Returns:
        X: (N, n_features) feature matrix

    Raises:
        ValueError: if `modality` is not one of the three supported values,
            or if modality='all' and the two inputs hold a different number
            of windows.
    """
    valid_modalities = ('imu_only', 'audio_only', 'all')
    if modality not in valid_modalities:
        # Without this check an unknown modality would quietly yield a
        # matrix with zero feature columns.
        raise ValueError(
            f"Unknown modality {modality!r}; expected one of {valid_modalities}"
        )

    use_audio = modality != 'imu_only'
    use_imu = modality != 'audio_only'

    if use_audio and use_imu and audio_data.shape[0] != imu_data.shape[0]:
        raise ValueError(
            f"audio_data has {audio_data.shape[0]} windows but imu_data has "
            f"{imu_data.shape[0]}; both must describe the same windows"
        )

    # Take the window count from an input that is actually used, so the
    # unused modality does not have to be supplied.
    N = audio_data.shape[0] if use_audio else imu_data.shape[0]
    features_list = []

    for i in tqdm(range(N), desc=f"Extracting {modality} features"):
        sample_features = []

        if use_audio:
            # Use outer microphone (index 0)
            audio_outer = audio_data[i, :, 0]
            sample_features.extend(extract_audio_features(audio_outer))

        if use_imu:
            imu_window = imu_data[i, :, :]
            sample_features.extend(extract_imu_features(imu_window))

        features_list.append(sample_features)

    X = np.array(features_list)

    # Handle NaN/Inf values
    n_nan = np.sum(np.isnan(X))
    n_inf = np.sum(np.isinf(X))
    if n_nan or n_inf:
        print(f"Warning: Replacing {n_nan} NaN and {n_inf} Inf values")
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

    return X

print("✓ Batch feature extraction function ready")

## Data Loading

Load raw windowed data from all 15 subjects using `get_samples_for_subject()` from `dataset_gen.py`.

In [None]:
# Locate dataset folder: prefer the Kaggle input mount when it exists,
# otherwise fall back to the parent directory.
kaggle_dataset_dir = '/kaggle/input/edge-ai-cough-count'
if os.path.exists(kaggle_dataset_dir):
    base_dir = kaggle_dataset_dir
else:
    base_dir = ".."

# Try the candidate locations in order and keep the first one that exists.
data_folder = None
for candidate_folder in (base_dir + '/public_dataset/', '../data/public_dataset/'):
    if os.path.exists(candidate_folder):
        data_folder = candidate_folder
        break

if data_folder is None:
    raise FileNotFoundError(
        "Cannot find public_dataset/. Please download from: "
        "https://zenodo.org/record/7562332"
    )

# Every sub-directory of the dataset folder is treated as one subject ID;
# sorting makes the processing order deterministic.
subject_ids = sorted(
    entry for entry in os.listdir(data_folder)
    if os.path.isdir(os.path.join(data_folder, entry))
)

print(f"✓ Found {len(subject_ids)} subjects: {subject_ids}")

In [None]:
# Load raw windowed data from all subjects
all_audio = []
all_imu = []
all_labels = []
all_subjects = []
failed_subjects = []  # (subject id, error message) for every subject skipped

print("Loading dataset (this may take a few minutes)...\n")

for subj_id in tqdm(subject_ids, desc="Loading subjects"):
    try:
        audio, imu, labels, n_coughs = get_samples_for_subject(
            data_folder, subj_id,
            window_len=WINDOW_LEN,
            aug_factor=AUG_FACTOR
        )

        all_audio.append(audio)
        all_imu.append(imu)
        all_labels.append(labels)
        # One subject tag per window so windows can later be grouped by subject
        all_subjects.extend([subj_id] * len(labels))

        print(f"  {subj_id}: {n_coughs} coughs → {len(labels)} windows "
              f"({np.sum(labels==1)} cough, {np.sum(labels==0)} non-cough)")
    except Exception as e:
        # Best effort: a subject that fails to load is reported and skipped,
        # but remembered so the failure is not lost in the progress output.
        print(f"  {subj_id}: Error - {e}")
        failed_subjects.append((subj_id, str(e)))
        continue

# Fail with an explicit message instead of letting np.concatenate raise an
# unrelated "need at least one array" error when nothing could be loaded.
if not all_labels:
    raise RuntimeError(
        f"No subject could be loaded from {data_folder!r}; "
        f"failures: {failed_subjects}"
    )

if failed_subjects:
    print(f"\nWarning: skipped {len(failed_subjects)} subject(s) that failed to load: "
          f"{[subj for subj, _ in failed_subjects]}")

# Concatenate all subjects
audio_data = np.concatenate(all_audio, axis=0)
imu_data = np.concatenate(all_imu, axis=0)
labels = np.concatenate(all_labels, axis=0)
subjects = np.array(all_subjects)

print(f"\n{'='*70}")
print(f"Total dataset:")
print(f"  Audio shape: {audio_data.shape}")
print(f"  IMU shape: {imu_data.shape}")
print(f"  Labels: {len(labels)} ({np.sum(labels==1)} coughs, {np.sum(labels==0)} non-coughs)")
print(f"  Unique subjects: {len(np.unique(subjects))}")
print(f"  Class balance: {np.sum(labels==1)/len(labels)*100:.1f}% coughs")
print(f"{'='*70}")

In [None]:
# Sanity checks on window sizes and on the number of distinct subjects
assert audio_data.shape[1] == 6400, f"Expected 6400 audio samples, got {audio_data.shape[1]}"
assert imu_data.shape[1] == 40, f"Expected 40 IMU samples, got {imu_data.shape[1]}"
assert len(np.unique(subjects)) == 15, f"Expected 15 subjects, got {len(np.unique(subjects))}"

# Visualize the first window labelled as a cough
idx = np.nonzero(labels == 1)[0][0]
fig, axes = plt.subplots(2, 1, figsize=(12, 6))

# One entry per subplot: (trace, line width, title, y-axis label)
panel_specs = [
    (
        audio_data[idx, :, 0],
        0.5,
        f"Sample Cough - Outer Microphone (Subject {subjects[idx]})",
        "Amplitude",
    ),
    (
        -imu_data[idx, :, 2],  # channel 2 plotted with its sign flipped
        1,
        "Accelerometer Z (negated)",
        "Acceleration",
    ),
]
for panel, (trace, line_width, panel_title, y_label) in zip(axes, panel_specs):
    panel.plot(trace, linewidth=line_width)
    panel.set_title(panel_title)
    panel.set_xlabel("Sample Index")
    panel.set_ylabel(y_label)
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ Data loaded and verified successfully")

## Feature Extraction

Extract handcrafted features for all three modalities:
1. IMU-only: 40 features
2. Audio-only: 65 features
3. Multimodal: 105 features

**Note**: This may take 10-20 minutes depending on hardware.

In [None]:
import multiprocessing
from joblib import Parallel, delayed


def _replace_non_finite(X, name):
    """Return X with NaN and +/-Inf replaced by 0.0, printing a warning if any were found."""
    n_nan = np.sum(np.isnan(X))
    n_inf = np.sum(np.isinf(X))
    if n_nan or n_inf:
        print(f"  Warning: Replacing {n_nan} NaN and {n_inf} Inf values in {name}")
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    return X


print("Extracting features for all modalities...\n")

N = audio_data.shape[0]
n_cpus = multiprocessing.cpu_count()

# Configure parallelization based on available cores: at most 8 worker
# processes, and a second BLAS thread per worker only when workers × threads
# still fits in the core count (8 workers × 2 threads on an 8-core machine
# would oversubscribe it).
n_jobs = min(n_cpus, 8)
blas_threads = 2 if n_cpus >= 2 * n_jobs else 1

# Limit BLAS threading to prevent oversubscription
# NOTE(review): these variables are set after NumPy has been imported, so they
# presumably only affect the worker processes started below — confirm.
os.environ['OMP_NUM_THREADS'] = str(blas_threads)
os.environ['OPENBLAS_NUM_THREADS'] = str(blas_threads)
os.environ['MKL_NUM_THREADS'] = str(blas_threads)

print(f"Hardware: {n_cpus} CPU cores detected")
print(f"Configuration: {n_jobs} workers × {blas_threads} BLAS threads = {n_jobs * blas_threads} total\n")

# ===================================================================
# Step 1/2: Extract audio features (65 features from outer mic)
# ===================================================================
print("Step 1/2: Extracting audio features...")
# The tqdm bar wraps the task generator, so it tracks tasks handed to the
# pool rather than tasks completed.
audio_features_list = Parallel(n_jobs=n_jobs, backend='loky')(
    delayed(extract_audio_features)(audio_data[i, :, 0])
    for i in tqdm(range(N), desc="Audio features")
)
X_audio = _replace_non_finite(np.array(audio_features_list), "audio")

# ===================================================================
# Step 2/2: Extract IMU features (40 features)
# ===================================================================
print("\nStep 2/2: Extracting IMU features...")
imu_features_list = Parallel(n_jobs=n_jobs, backend='loky')(
    delayed(extract_imu_features)(imu_data[i, :, :])
    for i in tqdm(range(N), desc="IMU features")
)
X_imu = _replace_non_finite(np.array(imu_features_list), "IMU")

# ===================================================================
# Combine for multimodal (65 audio + 40 IMU = 105 features)
# ===================================================================
# Audio columns come first, IMU columns second.
X_all = np.concatenate([X_audio, X_imu], axis=1)

print(f"\n{'='*70}")
print(f"Feature extraction complete:")
print(f"  Audio-only: {X_audio.shape} (65 features)")
print(f"  IMU-only: {X_imu.shape} (40 features)")
print(f"  Multimodal: {X_all.shape} (105 features)")
print(f"{'='*70}")

In [None]:
# Persist the three feature matrices together with the labels and the
# per-window subject IDs, so the extraction does not have to be repeated.
save_path = 'extracted_features.npz'
arrays_to_save = {
    'X_imu': X_imu,
    'X_audio': X_audio,
    'X_all': X_all,
    'labels': labels,
    'subjects': subjects,
}
np.savez(save_path, **arrays_to_save)

print(f"✓ Features saved to {save_path}")
print(f"  To load: data = np.load('{save_path}')")