In [1]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import synapseclient
import json
from sklearn.cluster import KMeans
from collections import defaultdict
import glob
import librosa
from tqdm import tqdm
import os
import shutil

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

sns.set()

In [2]:
# Authenticate Synapse login credentials
# WARNING: This might raise an error is your .synapseConfig file isn't configured correctly.
syn = synapseclient.Synapse()
syn.login()

Welcome, Yadong Liu!



In [9]:
POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"
TEMP_PATH = "audio_files/temp"

# Returns file path to converted .wav file
def convert_to_wav(input_file, output_dir):
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{base_name}.wav")
    try:
        audio = AudioSegment.from_file(input_file, format="m4a")
        audio.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"Conversion failed for {input_file}: {e}")
        return None

# Adapted from https://github.com/Sage-Bionetworks/mPower-sdata/blob/master/examples/mPower-bootstrap.py
# Limited to 30 results for now
def extract_audio_files(output_dir, limit=10, diagnosis=False):
    os.makedirs(output_dir, exist_ok=True)
    
    # Sample the survey data
    survey_data = pd.read_csv("survey_data.csv")
    survey_data = survey_data[survey_data["professional-diagnosis"] == diagnosis]
    healthcodes = "','".join(survey_data["healthCode"]) # can use random sampling here to get different subset of the data
    
    # Download audio files from Synapse client
    query = f"SELECT * FROM syn5511444 WHERE healthCode IN ('{healthcodes}') LIMIT {limit}"
    subset_query = syn.tableQuery(query)
    subset_df = subset_query.asDataFrame()
    subset_df["audio_audio.m4a"] = subset_df["audio_audio.m4a"].astype(str) # Convert int file id's to string for comparison
    file_map = syn.downloadTableColumns(subset_query, "audio_audio.m4a")

    # Setup mapping between healthCodes and .wav files
    mappings = []

    for file_handle_id, m4a_path in file_map.items():
        wav_path = convert_to_wav(m4a_path, output_dir)

        matched_row = subset_df[subset_df["audio_audio.m4a"] == file_handle_id]
        if not matched_row.empty:
            healthcode = matched_row.iloc[0]["healthCode"]
            mappings.append({"healthCode": healthcode, "wav_path": wav_path})
        else:
            print(f"File handle ID {file_handle_id} not found in subset_df")

    # Create DataFrame for mappings
    mapping_df = pd.DataFrame(mappings)
    return mapping_df

# Feature extraction

In [10]:
data = pd.read_csv("data.csv")

In [12]:
# Change as needed
POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"

positive_samples = extract_audio_files(POSITIVE_FOLDER_PATH, 30, True)
negative_samples = extract_audio_files(NEGATIVE_FOLDER_PATH, 30, False)

/entity/syn5511444/table/download/csv/async: 0.00it [00:00, ?it/s]
Downloading files: 100%|█| 6.08k/6.08k [00:00<00:00, 13.6kB/

[syn5511444]: Downloaded to /Users/kyledy/.synapseCache/195/159333195/SYNAPSE_TABLE_QUERY_159333195.csv


Downloading files: 100%|█| 6.08k/6.08k [00:00<00:00, 13.4kB/

Downloading 30 files, 0 cached locally



/file/bulk/async:   0%|         | 0.00/1.00 [00:06<?, ?it/s]
Downloading files:  33%|▎| 12.2M/37.4M [00:01<00:02, 10.6MB/

[syn5511444]: Downloaded to /var/folders/lw/63lbkk3s63qf24jmd15c3r8h0000gn/T/tmp4or4tm3m/table_file_download.zip


Downloading files: 100%|█| 37.4M/37.4M [00:01<00:00, 28.6MB/
/entity/syn5511444/table/download/csv/async: 0.00it [00:00, ?it/s]
Downloading files: 100%|█| 6.14k/6.14k [00:00<00:00, 17.2kB/

[syn5511444]: Downloaded to /Users/kyledy/.synapseCache/208/159333208/SYNAPSE_TABLE_QUERY_159333208.csv


Downloading files: 100%|█| 6.14k/6.14k [00:00<00:00, 16.9kB/

Downloading 30 files, 0 cached locally



/file/bulk/async:   0%|         | 0.00/1.00 [00:06<?, ?it/s]
Downloading files:  25%|▎| 8.53M/33.7M [00:01<00:03, 7.55MB/

[syn5511444]: Downloaded to /var/folders/lw/63lbkk3s63qf24jmd15c3r8h0000gn/T/tmp3xgwji15/table_file_download.zip


Downloading files: 100%|█| 33.7M/33.7M [00:01<00:00, 26.3MB/


Conversion failed for /Users/kyledy/.synapseCache/273/5394273/audio_audio.m4a-71d0e099-4985-4040-b20a-a1a0a07e98a37942076271865568625.tmp: Decoding failed. ffmpeg returned error code: 69

Output from ffmpeg/avlib:

ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_3 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 -

In [6]:
df_positive = pd.merge(positive_samples, data, on="healthCode", how="left")
df_negative = pd.merge(negative_samples, data, on="healthCode", how="left")

In [7]:
df_positive

Unnamed: 0,healthCode,wav_path,medTimepoint,age,are-caretaker,deep-brain-stimulation,diagnosis-year,education,employment,gender,...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,0085ab2b-7d74-4e88-8117-e9259adb6266,audio_files/positive/audio_audio.m4a-41cc7e9c-...,Immediately before Parkinson medication,29.0,False,False,2014.0,High School Diploma/GED,Self-employed,Male,...,true,True,"""White or Caucasian""",Easy,True,False,True,14.0,"version 1.0, build 7",iPhone 6 Plus
1,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-1acd3581-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
2,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-1acd3581-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
3,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-72dd011d-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
4,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-72dd011d-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Another time,69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
2620,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Another time,69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
2621,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Another time,69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
2622,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Just after Parkinson medication (at your best),69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)


# Praat

In [7]:
def extract_formants_from_folder(folder_path, df, max_files=None):
    features = []
    i = 0

    for wav in tqdm(glob.glob(os.path.join(folder_path, "*.wav"))):
        try:
            audio, sr = librosa.load(wav)
            onset, offset = 0.25, 0.75
            meanf0 = np.nan
            while np.isnan(meanf0):
                trimmed_audio, _ = clip_audio(audio, onset=onset, offset=offset)
                trimmed_sound = parselmouth.Sound(trimmed_audio)

                meanf0, stdevf0, minf0, maxf0 = get_f0(trimmed_sound, 75, 500)

                # Work in progress
                # Map gender of .wav file from healthCode
                gender = df.loc[df["healthCode"] == health_code, "gender"].values
                gender = gender_str[0].lower() if len(gender_str) > 0 else "unknown"
                gender_flag = 1 if gender_str == "female" else 0
                
                f1, f2, f3 = get_formants(trimmed_sound, 75, 500, gender_flag)

                onset += 0.01
                offset += 0.01

            features.append({
                "wav_path": wav,
                "F0 (mean)": meanf0,
                "F0 (std)": stdevf0,
                "F0 (min)": minf0,
                "F0 (max)": maxf0,
                "F1 (mean)": np.nanmean(f1),
                "F1 (std)": np.nanstd(f1),
                "F2 (mean)": np.nanmean(f2),
                "F2 (std)": np.nanstd(f2),
                "F3 (mean)": np.nanmean(f3),
                "F3 (std)": np.nanstd(f3)
            })

            i += 1
            if max_files and i >= max_files:
                break

        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

    return pd.DataFrame(features)

In [8]:
positive_formants = extract_formants_from_folder(POSITIVE_FOLDER_PATH, df_positive)

100%|█████████████████████████████████████████| 30/30 [00:06<00:00,  4.30it/s]


In [9]:
negative_formants = extract_formants_from_folder(NEGATIVE_FOLDER_PATH, df_negative)

100%|█████████████████████████████████████████| 30/30 [00:04<00:00,  6.31it/s]


In [10]:
positive_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),F3 (std)
0,audio_files/positive/audio_audio.m4a-72dd011d-...,230.591408,2.399089,224.402999,237.737373,1102.051765,36.458219,1515.907221,41.468079,2258.28224,25.943916
1,audio_files/positive/audio_audio.m4a-3a8b66ab-...,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,273.001286
2,audio_files/positive/audio_audio.m4a-10b592e2-...,300.229027,49.307367,141.51904,339.928644,810.379761,53.596763,1841.211907,165.378091,2638.718643,292.949467
3,audio_files/positive/audio_audio.m4a-1acd3581-...,253.03602,3.473263,242.206243,262.289794,1102.770783,43.885383,1458.713573,31.608115,2289.020862,37.62577
4,audio_files/positive/audio_audio.m4a-e7fdf3cc-...,251.728731,75.787193,77.175275,308.61325,780.891174,97.430419,1590.545083,112.121676,2313.542313,452.306301


In [11]:
negative_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),F3 (std)
0,audio_files/negative/audio_audio.m4a-49152779-...,196.423378,2.350699,192.11185,202.000689,1152.865669,112.606352,1519.420704,52.928292,2539.225481,32.684939
1,audio_files/negative/audio_audio.m4a-151e4bdd-...,226.339273,1.430775,221.205646,229.919545,652.531217,40.652569,1410.149868,146.996224,2277.604056,189.138268
2,audio_files/negative/audio_audio.m4a-6019e6da-...,208.625822,1.068123,205.781886,211.328788,1208.985216,27.970193,1522.994275,52.769631,2477.165182,21.764772
3,audio_files/negative/audio_audio.m4a-bd9c437c-...,203.389504,2.341553,200.232952,223.392036,1184.889428,133.28679,1457.021646,20.206735,2644.805187,31.097343
4,audio_files/negative/audio_audio.m4a-25c88478-...,207.139184,32.872138,95.022622,229.834387,1129.449222,202.458502,1512.20587,208.428535,2657.471902,56.049327


In [12]:
df_positive = pd.merge(positive_formants, df_positive, on="wav_path", how="left")
df_negative = pd.merge(negative_formants, df_negative, on="wav_path", how="left")

In [15]:
# Drop .wav path and put healthCode at the front
df_positive = df_positive.drop(columns=["wav_path"])
df_negative = df_negative.drop(columns=["wav_path"])

cols = ['healthCode'] + [col for col in df_positive.columns if col != 'healthCode']
df_positive = df_positive[cols]

cols = ['healthCode'] + [col for col in df_negative.columns if col != 'healthCode']
df_negative = df_negative[cols]

# Save to .csv files
df_positive.to_csv("positive.csv", index=False)
df_negative.to_csv("negative.csv", index=False)

In [19]:
df_positive.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,0085b356-0550-4cf1-85bd-2bcd89bf1201,230.591408,2.399089,224.402999,237.737373,1102.051765,36.458219,1515.907221,41.468079,2258.28224,...,True,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
1,0085b356-0550-4cf1-85bd-2bcd89bf1201,230.591408,2.399089,224.402999,237.737373,1102.051765,36.458219,1515.907221,41.468079,2258.28224,...,True,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
2,00dc061b-8151-44cc-8eae-4d10f11a5ab6,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
3,00dc061b-8151-44cc-8eae-4d10f11a5ab6,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
4,00dc061b-8151-44cc-8eae-4d10f11a5ab6,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)


In [20]:
df_negative.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,008b878d-8b12-428a-99bb-d39e1db26512,196.423378,2.350699,192.11185,202.000689,1152.865669,112.606352,1519.420704,52.928292,2539.225481,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)
1,00547584-0c04-4228-a5d5-c68f7d59f176,226.339273,1.430775,221.205646,229.919545,652.531217,40.652569,1410.149868,146.996224,2277.604056,...,True,False,"""White or Caucasian""",Very easy,True,False,True,2.0,"version 1.0, build 7",iPhone 6 Plus
2,008b878d-8b12-428a-99bb-d39e1db26512,208.625822,1.068123,205.781886,211.328788,1208.985216,27.970193,1522.994275,52.769631,2477.165182,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)
3,008b878d-8b12-428a-99bb-d39e1db26512,203.389504,2.341553,200.232952,223.392036,1184.889428,133.28679,1457.021646,20.206735,2644.805187,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)
4,008b878d-8b12-428a-99bb-d39e1db26512,207.139184,32.872138,95.022622,229.834387,1129.449222,202.458502,1512.20587,208.428535,2657.471902,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)


In [16]:
print(df_positive["healthCode"].nunique())

3


In [17]:
print(df_negative["healthCode"].nunique())

5
