In [4]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import synapseclient
import json
from sklearn.cluster import KMeans
from collections import defaultdict
import glob
import librosa
from tqdm import tqdm
import os
import shutil

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

sns.set()

In [5]:
# Authenticate Synapse login credentials
# WARNING: This might raise an error if your .synapseConfig file isn't configured correctly.
syn = synapseclient.Synapse()
syn.login()

Welcome, Yadong Liu!



In [12]:
########################################################################
#                                                                      #
#  Change paths as needed -- will create folders if they don't exist   #
#                                                                      #
########################################################################

POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"
CSV_FILES_PATH = "csv_files/"

os.makedirs(POSITIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(NEGATIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(CSV_FILES_PATH, exist_ok=True)

In [24]:
# Helper to skip files shorter than 0.1s
def is_valid_audio(file_path):
    try:
        audio = AudioSegment.from_file(file_path, format="m4a")
        return len(audio) > 100
    except:
        return False

# Returns file path to converted .wav file
def convert_to_wav(input_file, output_dir):
    if not is_valid_audio(input_file):
        print(f"Skipping invalid or empty file: {input_file}")
        return None

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{base_name}.wav")
    try:
        audio = AudioSegment.from_file(input_file, format="m4a")
    except Exception:
        try:
            audio = AudioSegment.from_file(input_file, format="mp4")
        except Exception as e:
            print(f"Conversion failed for {input_file}: {e}")
            return None

    try:
        audio.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"Export failed for {input_file}: {e}")
        return None

# Adapted from https://github.com/Sage-Bionetworks/mPower-sdata/blob/master/examples/mPower-bootstrap.py
# Limited to 30 results for now
def extract_audio_files(output_dir, limit=10, diagnosis=False):
    os.makedirs(output_dir, exist_ok=True)
    
    # Sample the survey data
    survey_data = pd.read_csv(CSV_FILES_PATH + "survey_data.csv")
    survey_data = survey_data[survey_data["professional-diagnosis"] == diagnosis]
    healthcodes = "','".join(survey_data["healthCode"]) 
    
    # Download audio files from Synapse client
    query = f"SELECT * FROM syn5511444 WHERE healthCode IN ('{healthcodes}') LIMIT {limit}"
    subset_query = syn.tableQuery(query)
    subset_df = subset_query.asDataFrame()
    subset_df["audio_audio.m4a"] = subset_df["audio_audio.m4a"].astype(str) # Convert int file id's to string for comparison
    file_map = syn.downloadTableColumns(subset_query, "audio_audio.m4a")

    # Setup mapping between healthCodes and .wav files
    mappings = []

    for file_handle_id, m4a_path in file_map.items():
        wav_path = convert_to_wav(m4a_path, output_dir)

        matched_row = subset_df[subset_df["audio_audio.m4a"] == file_handle_id]
        if not matched_row.empty:
            healthcode = matched_row.iloc[0]["healthCode"]
            mappings.append({"healthCode": healthcode, "wav_path": wav_path})
        else:
            print(f"File handle ID {file_handle_id} not found in subset_df")

    # Create DataFrame for mappings
    mapping_df = pd.DataFrame(mappings)
    return mapping_df

# Feature extraction

In [25]:
data = pd.read_csv(CSV_FILES_PATH + "data.csv")

In [26]:
# Change as needed
POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"

positive_samples = extract_audio_files(POSITIVE_FOLDER_PATH, 30, True)
negative_samples = extract_audio_files(NEGATIVE_FOLDER_PATH, 30, False)

/entity/syn5511444/table/download/csv/async: 0.00it [00:00, ?it/s]


Downloading 0 files, 30 cached locally


/entity/syn5511444/table/download/csv/async: 0.00it [00:00, ?it/s]


Downloading 0 files, 30 cached locally
Skipping invalid or empty file: /Users/kyledy/.synapseCache/273/5394273/audio_audio.m4a-71d0e099-4985-4040-b20a-a1a0a07e98a37942076271865568625.tmp
Skipping invalid or empty file: /Users/kyledy/.synapseCache/244/5404244/audio_audio.m4a-504ca027-99bf-468d-9f8e-4fb6963a30fb1024572782711100970.tmp
Skipping invalid or empty file: /Users/kyledy/.synapseCache/88/5403088/audio_audio.m4a-4d609256-5480-406a-99b6-a9c2aaebc61b42309751438389271.tmp
Skipping invalid or empty file: /Users/kyledy/.synapseCache/210/5408210/audio_audio.m4a-6e853b29-c048-4c30-a82e-77553efeb5213172332672847450263.tmp


In [27]:
df_positive = pd.merge(positive_samples, data, on="healthCode", how="left")
df_negative = pd.merge(negative_samples, data, on="healthCode", how="left")

In [28]:
df_positive

Unnamed: 0,healthCode,wav_path,medTimepoint,age,are-caretaker,deep-brain-stimulation,diagnosis-year,education,employment,gender,...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,0085ab2b-7d74-4e88-8117-e9259adb6266,audio_files/positive/audio_audio.m4a-41cc7e9c-...,Immediately before Parkinson medication,29.0,False,False,2014.0,High School Diploma/GED,Self-employed,Male,...,true,True,"""White or Caucasian""",Easy,True,False,True,14.0,"version 1.0, build 7",iPhone 6 Plus
1,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-1acd3581-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
2,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-1acd3581-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
3,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-72dd011d-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
4,0085b356-0550-4cf1-85bd-2bcd89bf1201,audio_files/positive/audio_audio.m4a-72dd011d-...,Another time,60.0,False,False,2009.0,Master's Degree,Retired,Male,...,true,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Another time,69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
2620,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Another time,69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
2621,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Another time,69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
2622,00dc061b-8151-44cc-8eae-4d10f11a5ab6,audio_files/positive/audio_audio.m4a-3b28ad9d-...,Just after Parkinson medication (at your best),69.0,False,False,2014.0,Master's Degree,Retired,Female,...,true,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)


In [33]:
# Extract all features from .wav files in folder
def extract_formants_from_folder(folder_path, df, max_files=None):
    features = []
    i = 0

    for wav in tqdm(glob.glob(os.path.join(folder_path, "*.wav"))):
        try:
            # Match .wav file to its current row in the DataFrame
            row = df.loc[df["wav_path"] == wav]
            if row.empty:
                print(f"No metadata found for: {wav}")
                continue

            health_code = row["healthCode"].values[0]
            gender_str = row["gender"].values[0].lower() if "gender" in row and pd.notna(row["gender"].values[0]) else "unknown"
            gender_flag = 1 if gender_str == "female" else 0
            
            audio, sr = librosa.load(wav)
            onset, offset = 0.25, 0.75
            meanf0 = np.nan
            
            while np.isnan(meanf0):
                trimmed_audio, _ = clip_audio(audio, onset=onset, offset=offset)
                trimmed_sound = parselmouth.Sound(trimmed_audio)

                meanf0, stdevf0, minf0, maxf0 = get_f0(trimmed_sound, 75, 500)
                f1, f2, f3 = get_formants(trimmed_sound, 75, 500, gender_flag)

                onset += 0.01
                offset += 0.01

            features.append({
                "wav_path": wav,
                "F0 (mean)": meanf0,
                "F0 (std)": stdevf0,
                "F0 (min)": minf0,
                "F0 (max)": maxf0,
                "F1 (mean)": np.nanmean(f1),
                "F1 (std)": np.nanstd(f1),
                "F2 (mean)": np.nanmean(f2),
                "F2 (std)": np.nanstd(f2),
                "F3 (mean)": np.nanmean(f3),
                "F3 (std)": np.nanstd(f3)
            })

            i += 1
            if max_files and i >= max_files:
                break

        except Exception as e:
            print(f"Error processing {wav}: {e}")
            continue

    return pd.DataFrame(features)

In [34]:
positive_formants = extract_formants_from_folder(POSITIVE_FOLDER_PATH, df_positive)

100%|███████████████████████████████████████| 30/30 [00:09<00:00,  3.30it/s]


In [35]:
negative_formants = extract_formants_from_folder(NEGATIVE_FOLDER_PATH, df_negative)

100%|███████████████████████████████████████| 26/26 [00:06<00:00,  4.25it/s]


In [36]:
positive_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),F3 (std)
0,audio_files/positive/audio_audio.m4a-72dd011d-...,230.591408,2.399089,224.402999,237.737373,1085.074038,53.931202,1503.779086,52.001865,2250.408643,31.141481
1,audio_files/positive/audio_audio.m4a-3a8b66ab-...,286.389448,5.498245,275.717617,297.798765,808.477043,64.207051,1644.618996,75.978357,2520.817333,215.523942
2,audio_files/positive/audio_audio.m4a-10b592e2-...,300.229027,49.307367,141.51904,339.928644,816.928062,39.952422,1882.938269,79.652621,2712.057102,229.861989
3,audio_files/positive/audio_audio.m4a-1acd3581-...,253.03602,3.473263,242.206243,262.289794,1076.340602,53.020303,1447.041129,31.746537,2278.063227,43.102371
4,audio_files/positive/audio_audio.m4a-e7fdf3cc-...,251.728731,75.787193,77.175275,308.61325,801.616408,103.771853,1618.275978,94.43285,2695.245723,313.557399


In [37]:
negative_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),F3 (std)
0,audio_files/negative/audio_audio.m4a-ce5c7b06-...,166.053026,30.812671,88.012128,190.595737,568.68349,78.335734,1337.480034,202.816209,2074.473439,51.100553
1,audio_files/negative/audio_audio.m4a-5999cc83-...,203.484932,1.003725,201.02116,206.226139,933.627495,123.926589,1369.266689,152.387755,1982.586852,68.932353
2,audio_files/negative/audio_audio.m4a-d200f37f-...,229.947894,24.984073,105.958481,243.159261,778.475717,83.993764,1304.991784,49.500314,2743.945906,239.251125
3,audio_files/negative/audio_audio.m4a-3cfd2533-...,180.457955,14.454882,86.948365,197.201555,794.880817,107.680633,1377.85544,47.812858,1988.553307,61.861211
4,audio_files/negative/audio_audio.m4a-da2d92a6-...,169.091796,47.210757,91.554623,230.516014,566.070513,38.604653,1678.7605,193.479762,2612.818669,231.088947


In [12]:
df_positive = pd.merge(positive_formants, df_positive, on="wav_path", how="left")
df_negative = pd.merge(negative_formants, df_negative, on="wav_path", how="left")

In [15]:
# Drop .wav path and put healthCode at the front
df_positive = df_positive.drop(columns=["wav_path"])
df_negative = df_negative.drop(columns=["wav_path"])

cols = ['healthCode'] + [col for col in df_positive.columns if col != 'healthCode']
df_positive = df_positive[cols]

cols = ['healthCode'] + [col for col in df_negative.columns if col != 'healthCode']
df_negative = df_negative[cols]

# Save to .csv files
df_positive.to_csv("positive.csv", index=False)
df_negative.to_csv("negative.csv", index=False)

In [19]:
df_positive.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,0085b356-0550-4cf1-85bd-2bcd89bf1201,230.591408,2.399089,224.402999,237.737373,1102.051765,36.458219,1515.907221,41.468079,2258.28224,...,True,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
1,0085b356-0550-4cf1-85bd-2bcd89bf1201,230.591408,2.399089,224.402999,237.737373,1102.051765,36.458219,1515.907221,41.468079,2258.28224,...,True,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
2,00dc061b-8151-44cc-8eae-4d10f11a5ab6,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
3,00dc061b-8151-44cc-8eae-4d10f11a5ab6,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
4,00dc061b-8151-44cc-8eae-4d10f11a5ab6,286.389448,5.498245,275.717617,297.798765,808.422339,58.308488,1645.390667,82.62726,2474.294525,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)


In [20]:
df_negative.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,008b878d-8b12-428a-99bb-d39e1db26512,196.423378,2.350699,192.11185,202.000689,1152.865669,112.606352,1519.420704,52.928292,2539.225481,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)
1,00547584-0c04-4228-a5d5-c68f7d59f176,226.339273,1.430775,221.205646,229.919545,652.531217,40.652569,1410.149868,146.996224,2277.604056,...,True,False,"""White or Caucasian""",Very easy,True,False,True,2.0,"version 1.0, build 7",iPhone 6 Plus
2,008b878d-8b12-428a-99bb-d39e1db26512,208.625822,1.068123,205.781886,211.328788,1208.985216,27.970193,1522.994275,52.769631,2477.165182,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)
3,008b878d-8b12-428a-99bb-d39e1db26512,203.389504,2.341553,200.232952,223.392036,1184.889428,133.28679,1457.021646,20.206735,2644.805187,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)
4,008b878d-8b12-428a-99bb-d39e1db26512,207.139184,32.872138,95.022622,229.834387,1129.449222,202.458502,1512.20587,208.428535,2657.471902,...,True,False,"""White or Caucasian""",Very easy,,,True,,"version 1.0, build 7",iPhone 5s (GSM)


In [16]:
print(df_positive["healthCode"].nunique())

3


In [17]:
print(df_negative["healthCode"].nunique())

5
