In [1]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import os
import synapseclient
import librosa
import os

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

In [2]:
# Authenticate Synapse login credentials
# WARNING: This might raise an error if your .synapseConfig file isn't configured correctly.
syn = synapseclient.Synapse()
syn.login()

Welcome, Yadong Liu!



## File paths

In [3]:
########################################################################
#                                                                      #
#  Change paths as needed -- will create folders if they don't exist   #
#                                                                      #
########################################################################

POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"
CSV_FILES_PATH = "csv_files/"

os.makedirs(POSITIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(NEGATIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(CSV_FILES_PATH, exist_ok=True)

## Helper functions

In [4]:
# Helper to skip files shorter than 0.1s
def is_valid_audio(file_path):
    try:
        audio = AudioSegment.from_file(file_path, format="m4a")
        return len(audio) > 100
    except:
        return False

# Returns file path to converted .wav file
def convert_to_wav(input_file, output_dir):
    if not is_valid_audio(input_file):
        print(f"Skipping invalid or empty file: {input_file}")
        return None

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{base_name}.wav")
    try:
        audio = AudioSegment.from_file(input_file, format="m4a")
    except Exception:
        try:
            audio = AudioSegment.from_file(input_file, format="mp4")
        except Exception as e:
            print(f"Conversion failed for {input_file}: {e}")
            return None

    try:
        audio.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"Export failed for {input_file}: {e}")
        return None

def convert_folder_to_wav(input_dir):
    for filename in os.listdir(input_dir):
        if filename.endswith(".m4a"):
            input_path = os.path.join(input_dir, filename)
            health_code = os.path.splitext(filename)[0]
            wav_path = os.path.join(input_dir, f"{health_code}.wav")
            if os.path.exists(wav_path):
                continue
            result = convert_to_wav(input_path, input_dir)

# Adapted from https://github.com/Sage-Bionetworks/mPower-sdata/blob/master/examples/mPower-bootstrap.py
# Downloads ALL matching audio files, customize batch size accordingly.
def extract_audio_files(output_dir, diagnosis=False, batch_size=100):
    os.makedirs(output_dir, exist_ok=True)

    # Load and filter survey data
    survey_data = pd.read_csv(CSV_FILES_PATH + "survey_data.csv")
    survey_data = survey_data[survey_data["professional-diagnosis"] == diagnosis]
    all_healthcodes = survey_data["healthCode"].tolist()

    all_mappings = []

    for i in range(0, len(all_healthcodes), batch_size):
        batch = all_healthcodes[i:i + batch_size]
        healthcodes_str = "','".join(batch)
        
        query = f"SELECT * FROM syn5511444 WHERE healthCode IN ('{healthcodes_str}')"
        subset_query = syn.tableQuery(query)
        subset_df = subset_query.asDataFrame()
        subset_df["audio_audio.m4a"] = subset_df["audio_audio.m4a"].astype(str)
        file_map = syn.downloadTableColumns(subset_query, "audio_audio.m4a")

        for file_handle_id, m4a_path in file_map.items():
            wav_path = convert_to_wav(m4a_path, output_dir)
            matched_row = subset_df[subset_df["audio_audio.m4a"] == file_handle_id]
            if not matched_row.empty:
                healthcode = matched_row.iloc[0]["healthCode"]
                all_mappings.append({"healthCode": healthcode, "wav_path": wav_path})
            else:
                print(f"File handle ID {file_handle_id} not found in subset_df")

    return pd.DataFrame(all_mappings)

In [5]:
# Extract all features from .wav files in folder
def extract_formants_from_folder(folder_path, df, batch_size=100, max_files=None):
    wav_files = glob.glob(os.path.join(folder_path, "*.wav"))
    if max_files:
        wav_files = wav_files[:max_files]

    all_features = []

    for i in range(0, len(wav_files), batch_size):
        batch = wav_files[i:i + batch_size]
        batch_features = []

        for wav in batch:
            try:
                # Match .wav file to its current row in the DataFrame
                row = df.loc[df["wav_path"] == wav]
                if row.empty:
                    print(f"No metadata found for: {wav}")
                    continue

                # Get gender data from .wav file for data extraction
                health_code = row["healthCode"].values[0]
                gender_str = row["gender"].values[0].lower() if "gender" in row and pd.notna(row["gender"].values[0]) else "unknown"
                gender_flag = 1 if gender_str == "female" else 0

                audio, sr = librosa.load(wav, sr=None)
                onset, offset = 0.25, 0.75
                meanf0 = np.nan

                while np.isnan(meanf0):
                    trimmed_audio, _ = clip_audio(audio, onset=onset, offset=offset)
                    trimmed_sound = parselmouth.Sound(trimmed_audio, sampling_frequency=sr)

                    meanf0, stdevf0, minf0, maxf0 = get_f0(trimmed_sound, 75, 500)
                    f1, f2, f3 = get_formants(trimmed_sound, 75, 500, gender_flag)
                    localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter = get_jitter(trimmed_sound, minF0=75, maxF0=500)
                    localShimmer, shimmer_absolute, shimmer_rap, shimmer_rap5, shimmer_ddp = get_shimmer(trimmed_sound, minF0=75, maxF0=500)
                    hnr = get_harmonic_to_noise_ratio(trimmed_sound, minF0=75)

                    onset += 0.01
                    offset += 0.01

                batch_features.append({
                    "wav_path": wav,
                    "F0 (mean)": meanf0,
                    "F0 (std)": stdevf0,
                    "F0 (min)": minf0,
                    "F0 (max)": maxf0,
                    "F1 (mean)": np.nanmean(f1),
                    "F1 (std)": np.nanstd(f1),
                    "F2 (mean)": np.nanmean(f2),
                    "F2 (std)": np.nanstd(f2),
                    "F3 (mean)": np.nanmean(f3),
                    "F3 (std)": np.nanstd(f3),
                    "Jitter (local)": localJitter,
                    "Jitter (absolute)": localabsoluteJitter,
                    "Jitter (rap)": rapJitter,
                    "Jitter (ppq5)": ppq5Jitter,
                    "Jitter (ddp)": ddpJitter,
                    "Shimmer (local)": localShimmer,
                    "Shimmer (absolute)": shimmer_absolute,
                    "Shimmer (rap)": shimmer_rap,
                    "Shimmer (ppq5)": shimmer_rap5,
                    "Shimmer (ddp)": shimmer_ddp,
                    "Harmonic to Noise Ratio": hnr
                })

            except Exception as e:
                print(f"Error processing {wav}: {e}")
                continue

        all_features.extend(batch_features)
        print(f"Processed batch {i // batch_size + 1} of {len(wav_files) // batch_size + 1}")

    return pd.DataFrame(all_features)

# Feature extraction

In [6]:
data = pd.read_csv(CSV_FILES_PATH + "misc/data.csv")
data = data.drop_duplicates(subset="healthCode", keep="first")

In [7]:
# Uncomment this only if you want to re-extract the audio files
# positive_samples = extract_audio_files(POSITIVE_FOLDER_PATH, True)
# negative_samples = extract_audio_files(NEGATIVE_FOLDER_PATH, False)

# Filter out rows where wav_path is not found
# positive_samples = positive_samples[positive_samples["wav_path"].notna() & (positive_samples["wav_path"] != "")]
# negative_samples = negative_samples[negative_samples["wav_path"].notna() & (negative_samples["wav_path"] != "")]

# Save mapping between audio files and healthcodes 
# positive_samples.to_csv(CSV_FILES_PATH + "healthcode_to_wav_positive.csv", index=False)
# negative_samples.to_csv(CSV_FILES_PATH + "healthcode_to_wav_negative.csv", index=False)

In [8]:
# Sanity check positive and negative data
positive_samples = pd.read_csv(CSV_FILES_PATH + "misc/healthcode_to_wav_positive.csv")
negative_samples = pd.read_csv(CSV_FILES_PATH + "misc/healthcode_to_wav_negative.csv")

print(f"Positive samples: {positive_samples.shape}")
print(f"Negative samples: {negative_samples.shape}")

Positive samples: (39376, 2)
Negative samples: (23425, 2)


In [9]:
# Merge the audio file data and survey data
df_positive = pd.merge(positive_samples, data, on="healthCode", how="left")
df_negative = pd.merge(negative_samples, data, on="healthCode", how="left")

# Assertions to make sure the merge is correct
assert len(positive_samples) == len(df_positive), "Mismatch in positive sample count!"
assert len(negative_samples) == len(df_negative), "Mismatch in negative sample count!"

In [10]:
# Uncomment to re-extract feature data
# positive_formants = extract_formants_from_folder(POSITIVE_FOLDER_PATH, df_positive)

# Extract positive values into .csv file
# positive_formants.to_csv(CSV_FILES_PATH + "misc/positive_features_only.csv", index=False)

# Clean up rows and save to .csv file
# df_positive = pd.merge(positive_formants, df_positive, on="wav_path", how="left")

# Drop wav_path and move healthCode to the leftmost column
# df_positive = df_positive.drop(columns=["wav_path"])
# cols = ['healthCode'] + [col for col in df_positive.columns if col != 'healthCode']
# df_positive = df_positive[cols]

# Save to .csv file
# df_positive.to_csv(CSV_FILES_PATH + "positive.csv", index=False)

In [11]:
positive_features = pd.read_csv(CSV_FILES_PATH + "positive.csv")
print(f"Positive features: {positive_features.shape}")

  positive_features = pd.read_csv(CSV_FILES_PATH + "positive.csv")



Positive features: (39342, 51)


In [12]:
# Uncomment to re-extract feature data
# negative_formants = extract_formants_from_folder(NEGATIVE_FOLDER_PATH, df_negative)

# Extract negative values into .csv file
# negative_formants.to_csv(CSV_FILES_PATH + "misc/negative_features_only.csv", index=False)

# Clean up rows and save to .csv file
# df_negative = pd.merge(negative_formants, df_negative, on="wav_path", how="left")

# Drop wav_path and move healthCode to the leftmost column
# df_negative = df_negative.drop(columns=["wav_path"])
# cols = ['healthCode'] + [col for col in df_negative.columns if col != 'healthCode']
# df_negative = df_negative[cols]

# Save to .csv file
# df_negative.to_csv(CSV_FILES_PATH + "negative.csv", index=False)

In [13]:
negative_features = pd.read_csv(CSV_FILES_PATH + "negative.csv")
print(f"Negative features: {negative_features.shape}")

Negative features: (23393, 51)


In [14]:
print(f"Unique positive healthcodes: {positive_features.nunique()}")

Unique positive healthcodes: healthCode                   970
F0 (mean)                  39342
F0 (std)                   39333
F0 (min)                   39342
F0 (max)                   39342
F1 (mean)                  39341
F1 (std)                   39337
F2 (mean)                  39341
F2 (std)                   39337
F3 (mean)                  39341
F3 (std)                   39337
Jitter (local)             39332
Jitter (absolute)          39332
Jitter (rap)               39323
Jitter (ppq5)              39291
Jitter (ddp)               39323
Shimmer (local)            39320
Shimmer (absolute)         39320
Shimmer (rap)              39299
Shimmer (ppq5)             39227
Shimmer (ddp)              39299
Harmonic to Noise Ratio    39342
medTimepoint                   4
age                           66
are-caretaker                  2
deep-brain-stimulation         2
diagnosis-year                30
education                      8
employment                     7
gender        

In [15]:
print(f"Unique negative healthcodes: {negative_features.nunique()}")

Unique negative healthcodes: healthCode                  3982
F0 (mean)                  23368
F0 (std)                   23358
F0 (min)                   23368
F0 (max)                   23368
F1 (mean)                  23368
F1 (std)                   23364
F2 (mean)                  23368
F2 (std)                   23364
F3 (mean)                  23368
F3 (std)                   23364
Jitter (local)             23365
Jitter (absolute)          23365
Jitter (rap)               23352
Jitter (ppq5)              23332
Jitter (ddp)               23352
Shimmer (local)            23346
Shimmer (absolute)         23346
Shimmer (rap)              23334
Shimmer (ppq5)             23295
Shimmer (ddp)              23334
Harmonic to Noise Ratio    23368
medTimepoint                   4
age                           67
are-caretaker                  2
deep-brain-stimulation         2
diagnosis-year                21
education                      8
employment                     7
gender        