# 02 — Extract Speech Biomarkers from DementiaBank

Extracts four feature streams per recording:
1. Handcrafted acoustic features (eGeMAPS-style) → 216-D
2. wav2vec 2.0 embeddings → 768-D
3. Handcrafted linguistic features → 14-D
4. Sentence-BERT transcript embeddings → 384-D

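Total: 1382-D per recording (216 + 768 + 14 + 384). The feature matrix and labels for all recordings are saved together in a single compressed `.npz` file.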

In [None]:
# Install dependencies (Colab)
# !pip install transformers librosa parselmouth sentence-transformers -q

# Put the project checkout first on the import path so that the project-local
# imports below (`config`, and `data.preprocessing` in later cells) resolve.
# NOTE(review): the path is Colab/Google-Drive specific — adjust when running elsewhere.
import sys
sys.path.insert(0, '/content/drive/MyDrive/alzheimer-research')

import numpy as np
import torch
from pathlib import Path
from tqdm.auto import tqdm

# Must come after the sys.path.insert above.
from config import Config
cfg = Config()
# Presumably creates the output directories (e.g. cfg.embedding_dir) — confirm in config.py.
cfg.ensure_dirs()

# Run models on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {DEVICE}')

## 1. Configure DementiaBank Paths

In [None]:
# === CONFIGURE THESE ===
# Root of the local DementiaBank Pitt corpus; replace the placeholder path.
DEMENTIABANK_DIR = Path('path/to/DementiaBank/Pitt')
AUDIO_DIR = DEMENTIABANK_DIR.joinpath('audio')              # .wav files
TRANSCRIPT_DIR = DEMENTIABANK_DIR.joinpath('transcripts')   # .cha files or .txt
# Single archive that receives the combined features and labels.
OUTPUT_PATH = cfg.embedding_dir / 'speech_features.npz'

# Layout assumed by the constants above and by the commented-out glob /
# transcript templates in the later cells (adapt if your copy differs):
# DementiaBank/Pitt/
#   audio/
#     Dementia/
#       *.wav  (the glob templates match .wav only; extend them for .mp3)
#     Control/
#       *.wav
#   transcripts/
#     <audio file stem>.txt

## 2. Extract Handcrafted Acoustic Features

In [None]:
from data.preprocessing import extract_acoustic_handcrafted

# Recordings and their class labels; the two lists are filled in lockstep so
# that labels[i] belongs to audio_files[i].
audio_files = []
labels = []

# Adapt this to your DementiaBank directory structure
# for wav in sorted(AUDIO_DIR.glob('Dementia/**/*.wav')):
#     audio_files.append(wav)
#     labels.append(2)  # map to ordinal class based on MMSE/diagnosis
# for wav in sorted(AUDIO_DIR.glob('Control/**/*.wav')):
#     audio_files.append(wav)
#     labels.append(0)

print(f'Total audio files: {len(audio_files)}')

# One handcrafted feature vector per recording, in audio_files order.
acoustic_features = [
    extract_acoustic_handcrafted(wav_path)
    for wav_path in tqdm(audio_files, desc='Acoustic features')
]

# np.stack cannot take an empty list, so an empty run yields a (0, 216) matrix.
if acoustic_features:
    acoustic_features = np.stack(acoustic_features)
else:
    acoustic_features = np.zeros((0, 216))
print(f'Acoustic features shape: {acoustic_features.shape}')

## 3. Extract wav2vec 2.0 Embeddings

In [None]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa

# 'facebook/wav2vec2-base' is the self-supervised (audio-only) checkpoint and
# ships no tokenizer, so the combined Wav2Vec2Processor cannot be loaded for
# it. Embedding extraction only needs the audio feature extractor anyway.
WAV2VEC_CHECKPOINT = 'facebook/wav2vec2-base'
# Sampling rate used both to resample the audio and to tell the extractor
# what rate the samples are at; the two must agree.
WAV2VEC_SAMPLE_RATE = 16000

# Kept under the name `processor` so the rest of the cell reads as before.
processor = Wav2Vec2FeatureExtractor.from_pretrained(WAV2VEC_CHECKPOINT)
wav2vec_model = Wav2Vec2Model.from_pretrained(WAV2VEC_CHECKPOINT).to(DEVICE)
wav2vec_model.eval()

# One mean-pooled embedding per recording, in audio_files order.
wav2vec_embeds = []
for path in tqdm(audio_files, desc='wav2vec2'):
    # Load as mono and resample to the rate passed to the extractor below.
    audio, sr = librosa.load(str(path), sr=WAV2VEC_SAMPLE_RATE, mono=True)
    inputs = processor(
        audio,
        sampling_rate=WAV2VEC_SAMPLE_RATE,
        return_tensors='pt',
        padding=True,
    )
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = wav2vec_model(**inputs)
        # Mean-pool over the time dimension, then drop only the batch axis
        # (one recording per forward pass) so the result is always 1-D.
        embed = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
    wav2vec_embeds.append(embed)

# np.stack cannot take an empty list, so an empty run yields a (0, 768) matrix.
wav2vec_embeds = np.stack(wav2vec_embeds) if wav2vec_embeds else np.zeros((0, 768))
print(f'wav2vec2 embeddings shape: {wav2vec_embeds.shape}')

## 4. Extract Linguistic Features and Sentence-BERT Embeddings

In [None]:
from data.preprocessing import extract_linguistic_handcrafted
from sentence_transformers import SentenceTransformer

# Load transcripts (adapt to your format: .cha, .txt, or Whisper ASR).
# The template below yields one transcript per entry of audio_files, in the
# same order, which is what the later concatenation cell relies on.
transcripts = []
# for path in audio_files:
#     transcript_path = TRANSCRIPT_DIR / (path.stem + '.txt')
#     if transcript_path.exists():
#         transcripts.append(transcript_path.read_text())
#     else:
#         # Use Whisper ASR as fallback
#         transcripts.append('')  # placeholder

# Handcrafted linguistic features: one vector per transcript.
ling_features = [
    extract_linguistic_handcrafted(transcript)
    for transcript in tqdm(transcripts, desc='Linguistic features')
]
# np.stack cannot take an empty list, so an empty run yields a (0, 14) matrix.
if ling_features:
    ling_features = np.stack(ling_features)
else:
    ling_features = np.zeros((0, 14))

# Sentence-BERT embeddings of the raw transcript text. The model is loaded
# even when there is nothing to encode.
sbert = SentenceTransformer('all-MiniLM-L6-v2', device=str(DEVICE))
if transcripts:
    sbert_embeds = sbert.encode(transcripts, show_progress_bar=True)
else:
    sbert_embeds = np.zeros((0, 384))
print(f'Linguistic features: {ling_features.shape}')
print(f'Sentence-BERT embeddings: {sbert_embeds.shape}')

## 5. Concatenate and Save

In [None]:
# Concatenate all feature streams: [acoustic(216) | wav2vec(768) | linguistic(14) | sbert(384)]
if audio_files:
    # Every stream, and the label list, must have exactly one row per
    # recording. Check this up front: a mismatch between feature streams would
    # otherwise surface as an opaque np.concatenate error, and a mismatch in
    # `labels` would be saved silently as a misaligned dataset.
    n_recordings = len(audio_files)
    row_counts = {
        'acoustic_features': len(acoustic_features),
        'wav2vec_embeds': len(wav2vec_embeds),
        'ling_features': len(ling_features),
        'sbert_embeds': len(sbert_embeds),
        'labels': len(labels),
    }
    mismatched = {name: n for name, n in row_counts.items() if n != n_recordings}
    if mismatched:
        raise ValueError(
            f'Expected {n_recordings} rows (one per audio file) in every stream, '
            f'but found {mismatched}. Re-run the extraction cells so that they '
            'all cover the same recordings (e.g. load one transcript per audio file).'
        )

    all_features = np.concatenate(
        [acoustic_features, wav2vec_embeds, ling_features, sbert_embeds],
        axis=1,
    )
    all_labels = np.array(labels, dtype=np.int64)

    print(f'Combined features shape: {all_features.shape}')  # (N, 1382)
    print(f'Labels shape: {all_labels.shape}')

    # Features are stored as float32; labels keep their int64 dtype.
    np.savez_compressed(
        str(OUTPUT_PATH),
        features=all_features.astype(np.float32),
        labels=all_labels,
    )
    print(f'Saved to {OUTPUT_PATH}')
else:
    print('No audio files found — configure DEMENTIABANK_DIR above')