In [1]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import synapseclient
import json
from sklearn.cluster import KMeans
from collections import defaultdict
import glob
import librosa
from tqdm import tqdm
import os
import shutil

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

sns.set()

In [2]:
# Authenticate Synapse login credentials
# WARNING: This might raise an error if your .synapseConfig file isn't configured correctly.
syn = synapseclient.Synapse()
syn.login()

Welcome, Yadong Liu!



In [3]:
########################################################################
#                                                                      #
#  Change paths as needed -- will create folders if they don't exist   #
#                                                                      #
########################################################################

POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"
CSV_FILES_PATH = "csv_files/"

os.makedirs(POSITIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(NEGATIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(CSV_FILES_PATH, exist_ok=True)

In [4]:
# Helper to skip files shorter than 0.1s
def is_valid_audio(file_path):
    try:
        audio = AudioSegment.from_file(file_path, format="m4a")
        return len(audio) > 100
    except:
        return False

# Returns file path to converted .wav file
def convert_to_wav(input_file, output_dir):
    if not is_valid_audio(input_file):
        print(f"Skipping invalid or empty file: {input_file}")
        return None

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{base_name}.wav")
    try:
        audio = AudioSegment.from_file(input_file, format="m4a")
    except Exception:
        try:
            audio = AudioSegment.from_file(input_file, format="mp4")
        except Exception as e:
            print(f"Conversion failed for {input_file}: {e}")
            return None

    try:
        audio.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"Export failed for {input_file}: {e}")
        return None

# Adapted from https://github.com/Sage-Bionetworks/mPower-sdata/blob/master/examples/mPower-bootstrap.py
# Limited to 30 results for now
def extract_audio_files(output_dir, limit=10, diagnosis=False):
    os.makedirs(output_dir, exist_ok=True)
    
    # Sample the survey data
    survey_data = pd.read_csv(CSV_FILES_PATH + "survey_data.csv")
    survey_data = survey_data[survey_data["professional-diagnosis"] == diagnosis]
    healthcodes = "','".join(survey_data["healthCode"]) 
    
    # Download audio files from Synapse client
    query = f"SELECT * FROM syn5511444 WHERE healthCode IN ('{healthcodes}') LIMIT {limit}"
    subset_query = syn.tableQuery(query)
    subset_df = subset_query.asDataFrame()
    subset_df["audio_audio.m4a"] = subset_df["audio_audio.m4a"].astype(str) # Convert int file id's to string for comparison
    file_map = syn.downloadTableColumns(subset_query, "audio_audio.m4a")

    # Setup mapping between healthCodes and .wav files
    mappings = []

    for file_handle_id, m4a_path in file_map.items():
        wav_path = convert_to_wav(m4a_path, output_dir)

        matched_row = subset_df[subset_df["audio_audio.m4a"] == file_handle_id]
        if not matched_row.empty:
            healthcode = matched_row.iloc[0]["healthCode"]
            mappings.append({"healthCode": healthcode, "wav_path": wav_path})
        else:
            print(f"File handle ID {file_handle_id} not found in subset_df")

    # Create DataFrame for mappings
    mapping_df = pd.DataFrame(mappings)
    return mapping_df

# Feature extraction

In [5]:
data = pd.read_csv(CSV_FILES_PATH + "data.csv")

In [6]:
# Change as needed
POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"

positive_samples = extract_audio_files(POSITIVE_FOLDER_PATH, 30, True)
negative_samples = extract_audio_files(NEGATIVE_FOLDER_PATH, 30, False)

/entity/syn5511444/table/download/csv/async: 0.00it [00:00, ?it/s]


Downloading 0 files, 30 cached locally


/entity/syn5511444/table/download/csv/async: 0.00it [00:00, ?it/s]


Downloading 0 files, 30 cached locally
Skipping invalid or empty file: /Users/kyledy/.synapseCache/273/5394273/audio_audio.m4a-71d0e099-4985-4040-b20a-a1a0a07e98a37942076271865568625.tmp
Skipping invalid or empty file: /Users/kyledy/.synapseCache/244/5404244/audio_audio.m4a-504ca027-99bf-468d-9f8e-4fb6963a30fb1024572782711100970.tmp
Skipping invalid or empty file: /Users/kyledy/.synapseCache/88/5403088/audio_audio.m4a-4d609256-5480-406a-99b6-a9c2aaebc61b42309751438389271.tmp
Skipping invalid or empty file: /Users/kyledy/.synapseCache/210/5408210/audio_audio.m4a-6e853b29-c048-4c30-a82e-77553efeb5213172332672847450263.tmp


In [7]:
df_positive = pd.merge(positive_samples, data, on="healthCode", how="left")
df_negative = pd.merge(negative_samples, data, on="healthCode", how="left")

In [8]:
# Extract all features from .wav files in folder
def extract_formants_from_folder(folder_path, df, max_files=None):
    features = []
    i = 0

    for wav in tqdm(glob.glob(os.path.join(folder_path, "*.wav"))):
        try:
            # Match .wav file to its current row in the DataFrame
            row = df.loc[df["wav_path"] == wav]
            if row.empty:
                print(f"No metadata found for: {wav}")
                continue

            # Get gender data from .wav file for data extraction
            health_code = row["healthCode"].values[0]
            gender_str = row["gender"].values[0].lower() if "gender" in row and pd.notna(row["gender"].values[0]) else "unknown"
            gender_flag = 1 if gender_str == "female" else 0
            
            audio, sr = librosa.load(wav, sr=None)
            onset, offset = 0.25, 0.75
            meanf0 = np.nan
            
            while np.isnan(meanf0):
                trimmed_audio, _ = clip_audio(audio, onset=onset, offset=offset)
                trimmed_sound = parselmouth.Sound(trimmed_audio, sampling_frequency=sr)

                meanf0, stdevf0, minf0, maxf0 = get_f0(trimmed_sound, 75, 500)
                f1, f2, f3 = get_formants(trimmed_sound, 75, 500, gender_flag)
                localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter = get_jitter(trimmed_sound, minF0=75, maxF0=500)
                localShimmer, shimmer_absolute, shimmer_rap, shimmer_rap5, shimmer_ddp = get_shimmer(trimmed_sound, minF0=75, maxF0=500)
                hnr = get_harmonic_to_noise_ratio(trimmed_sound, minF0=75)
                
                onset += 0.01
                offset += 0.01

            features.append({
                "wav_path": wav,
                "F0 (mean)": meanf0,
                "F0 (std)": stdevf0,
                "F0 (min)": minf0,
                "F0 (max)": maxf0,
                "F1 (mean)": np.nanmean(f1),
                "F1 (std)": np.nanstd(f1),
                "F2 (mean)": np.nanmean(f2),
                "F2 (std)": np.nanstd(f2),
                "F3 (mean)": np.nanmean(f3),
                "F3 (std)": np.nanstd(f3),
                "Jitter (local)": localJitter,
                "Jitter (absolute)": localabsoluteJitter,
                "Jitter (rap)": rapJitter,
                "Jitter (ppq5)": ppq5Jitter,
                "Jitter (ddp)": ddpJitter,
                "Shimmer (local)": localShimmer,
                "Shimmer (absolute)": shimmer_absolute,
                "Shimmer (rap)": shimmer_rap,
                "Shimmer (ppq5)": shimmer_rap5,
                "Shimmer (ddp)": shimmer_ddp,
                "Harmonic to Noise Ratio": hnr
            })

            i += 1
            if max_files and i >= max_files:
                break

        except Exception as e:
            print(f"Error processing {wav}: {e}")
            continue

    return pd.DataFrame(features)

In [9]:
positive_formants = extract_formants_from_folder(POSITIVE_FOLDER_PATH, df_positive)

100%|███████████████████████████████████████| 30/30 [00:17<00:00,  1.69it/s]


In [10]:
negative_formants = extract_formants_from_folder(NEGATIVE_FOLDER_PATH, df_negative)

100%|███████████████████████████████████████| 26/26 [00:13<00:00,  1.91it/s]


In [11]:
positive_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,Jitter (absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (absolute),Shimmer (rap),Shimmer (ppq5),Shimmer (ddp),Harmonic to Noise Ratio
0,audio_files/positive/audio_audio.m4a-72dd011d-...,115.304598,1.243896,111.961458,119.03608,669.136259,21.529192,1113.544897,19.208123,2599.997243,...,2.5e-05,0.001234,0.001516,0.003701,0.025862,0.228738,0.012541,0.015026,0.037622,19.070828
1,audio_files/positive/audio_audio.m4a-3a8b66ab-...,143.191866,2.766204,137.615912,149.304561,610.158538,51.988865,1302.705347,111.012253,1951.8809,...,2.5e-05,0.002134,0.002126,0.006403,0.045375,0.404832,0.025605,0.0286,0.076816,18.553681
2,audio_files/positive/audio_audio.m4a-10b592e2-...,156.25658,8.470789,75.370748,170.600689,842.21951,153.327415,1339.33678,232.603063,2679.475374,...,5.6e-05,0.005071,0.004844,0.015212,0.081705,0.731309,0.044697,0.050183,0.134091,15.251761
3,audio_files/positive/audio_audio.m4a-1acd3581-...,126.509691,1.764763,120.920627,131.367106,666.963028,19.235238,1130.533715,16.419243,2569.586519,...,2.5e-05,0.001493,0.00146,0.00448,0.016826,0.150832,0.008314,0.009772,0.024942,25.128684
4,audio_files/positive/audio_audio.m4a-e7fdf3cc-...,146.205015,10.86108,75.568297,155.088418,606.531963,67.598403,1234.019525,214.416388,2737.254355,...,4.1e-05,0.003402,0.003061,0.010205,0.038137,0.338156,0.020545,0.022578,0.061636,14.132022


In [12]:
negative_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,Jitter (absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (absolute),Shimmer (rap),Shimmer (ppq5),Shimmer (ddp),Harmonic to Noise Ratio
0,audio_files/negative/audio_audio.m4a-ce5c7b06-...,89.14081,1.242084,85.476781,92.804281,395.881952,53.339396,1023.059067,33.623883,2836.240868,...,6.1e-05,0.002971,0.003214,0.008913,0.103869,0.885331,0.06012,0.063538,0.180361,12.664165
1,audio_files/negative/audio_audio.m4a-5999cc83-...,101.733111,0.600686,99.927042,103.558181,557.158714,18.023093,961.113519,24.300927,2546.240515,...,7.1e-05,0.004277,0.004273,0.01283,0.106335,0.936331,0.059105,0.068865,0.177315,12.688305
2,audio_files/negative/audio_audio.m4a-d200f37f-...,117.786824,1.459923,114.244285,121.783029,579.731669,52.207497,1429.044456,81.849431,2512.068002,...,5.2e-05,0.003402,0.002486,0.010206,0.041635,0.381749,0.022829,0.02453,0.068487,17.667577
3,audio_files/negative/audio_audio.m4a-3cfd2533-...,91.11789,2.86041,86.551296,98.974165,653.173006,93.742535,1031.352198,214.534963,2569.47875,...,3.5e-05,0.001542,0.001996,0.004625,0.034768,0.321067,0.017792,0.02173,0.053376,17.125627
4,audio_files/negative/audio_audio.m4a-da2d92a6-...,101.359918,3.635009,93.479691,116.070773,352.380936,99.048628,1340.34371,99.068336,2197.754397,...,0.000313,0.018309,0.020409,0.054928,0.139586,1.161437,0.073698,0.078681,0.221094,8.550629


In [13]:
df_positive = pd.merge(positive_formants, df_positive, on="wav_path", how="left")
df_negative = pd.merge(negative_formants, df_negative, on="wav_path", how="left")

In [14]:
# Drop .wav path and put healthCode at the front
df_positive = df_positive.drop(columns=["wav_path"])
df_negative = df_negative.drop(columns=["wav_path"])

cols = ['healthCode'] + [col for col in df_positive.columns if col != 'healthCode']
df_positive = df_positive[cols]

cols = ['healthCode'] + [col for col in df_negative.columns if col != 'healthCode']
df_negative = df_negative[cols]

# Uncomment the lines below to save to .csv files!
df_positive.to_csv(CSV_FILES_PATH + "positive.csv", index=False)
df_negative.to_csv(CSV_FILES_PATH + "negative.csv", index=False)

In [15]:
df_positive.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,0085b356-0550-4cf1-85bd-2bcd89bf1201,115.304598,1.243896,111.961458,119.03608,669.136259,21.529192,1113.544897,19.208123,2599.997243,...,True,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0.5, build 12",iPhone 5s (GSM)
1,0085b356-0550-4cf1-85bd-2bcd89bf1201,115.304598,1.243896,111.961458,119.03608,669.136259,21.529192,1113.544897,19.208123,2599.997243,...,True,True,"""White or Caucasian""",Very easy,True,False,False,15.0,"version 1.0, build 7",iPhone 5s (GSM)
2,00dc061b-8151-44cc-8eae-4d10f11a5ab6,143.191866,2.766204,137.615912,149.304561,610.158538,51.988865,1302.705347,111.012253,1951.8809,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
3,00dc061b-8151-44cc-8eae-4d10f11a5ab6,143.191866,2.766204,137.615912,149.304561,610.158538,51.988865,1302.705347,111.012253,1951.8809,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)
4,00dc061b-8151-44cc-8eae-4d10f11a5ab6,143.191866,2.766204,137.615912,149.304561,610.158538,51.988865,1302.705347,111.012253,1951.8809,...,True,True,"""White or Caucasian""",Very easy,False,False,False,,"version 1.0, build 7",iPhone 5c (GSM)


In [16]:
df_negative.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,1a8695e5-a596-41f1-af9d-8a8d732353d7,89.14081,1.242084,85.476781,92.804281,395.881952,53.339396,1023.059067,33.623883,2836.240868,...,True,False,"""White or Caucasian""",Very easy,False,,True,,"version 1.0, build 7",iPhone 5s (GSM)
1,1a8695e5-a596-41f1-af9d-8a8d732353d7,89.14081,1.242084,85.476781,92.804281,395.881952,53.339396,1023.059067,33.623883,2836.240868,...,True,False,"""White or Caucasian""",Very easy,False,,True,,"version 1.0, build 7",iPhone 5s (GSM)
2,1a8695e5-a596-41f1-af9d-8a8d732353d7,89.14081,1.242084,85.476781,92.804281,395.881952,53.339396,1023.059067,33.623883,2836.240868,...,True,False,"""White or Caucasian""",Very easy,False,,True,,"version 1.0, build 7",iPhone 5s (GSM)
3,45b4e2ca-8d15-4736-828c-829e3d4177f4,101.733111,0.600686,99.927042,103.558181,557.158714,18.023093,961.113519,24.300927,2546.240515,...,True,False,"""White or Caucasian""",Very easy,True,,True,4.0,"version 1.0, build 7",iPhone 6
4,45b4e2ca-8d15-4736-828c-829e3d4177f4,101.733111,0.600686,99.927042,103.558181,557.158714,18.023093,961.113519,24.300927,2546.240515,...,True,False,"""White or Caucasian""",Very easy,True,,True,4.0,"version 1.0, build 7",iPhone 6
