In [1]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import synapseclient
import json
from sklearn.cluster import KMeans
from collections import defaultdict
import glob
import librosa
from tqdm import tqdm
import os
import shutil

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

sns.set()

In [2]:
# Authenticate Synapse login credentials; the resulting `syn` client is the
# module-level object that extract_audio_files() queries.
# WARNING: This might raise an error if your .synapseConfig file isn't configured correctly.
syn = synapseclient.Synapse()
syn.login()

Welcome, Yadong Liu!



## File paths

In [3]:
########################################################################
#                                                                      #
#  Change paths as needed -- will create folders if they don't exist   #
#                                                                      #
########################################################################

POSITIVE_FOLDER_PATH = "audio_files/positive"
NEGATIVE_FOLDER_PATH = "audio_files/negative"
CSV_FILES_PATH = "csv_files/"

# Make sure every configured folder exists before any cell reads or writes it.
for _required_dir in (POSITIVE_FOLDER_PATH, NEGATIVE_FOLDER_PATH, CSV_FILES_PATH):
    os.makedirs(_required_dir, exist_ok=True)

## Helper functions

In [4]:
# Helper to skip files that are missing, undecodable, or not longer than 0.1s
def is_valid_audio(file_path, min_duration_ms=100):
    """Return True when `file_path` decodes to audio longer than `min_duration_ms`.

    Decoding is attempted as "m4a" first and then as "mp4", the same two
    container formats that convert_to_wav() tries, so a file that only decodes
    as "mp4" is no longer rejected here.

    Args:
        file_path: Path of the audio file to check.
        min_duration_ms: Exclusive lower bound on the decoded length in
            milliseconds (``len()`` of a pydub AudioSegment is in ms).

    Returns:
        bool: False when the path is not a file, when neither format can be
        decoded, or when the audio is not longer than `min_duration_ms`.
    """
    if not os.path.isfile(file_path):
        return False

    for container in ("m4a", "mp4"):
        try:
            audio = AudioSegment.from_file(file_path, format=container)
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
            continue
        return len(audio) > min_duration_ms
    return False

# Returns file path to converted .wav file, or None when the file is skipped
def convert_to_wav(input_file, output_dir):
    """Convert one audio file to ``<output_dir>/<basename>.wav``.

    The file is decoded a single time (as "m4a", falling back to "mp4") and
    the decoded audio is reused both for the minimum-length check and for the
    export, instead of decoding once to validate and again to convert.

    Args:
        input_file: Path of the source audio file.
        output_dir: Directory that receives the ``.wav`` file.

    Returns:
        The path of the written ``.wav`` file, or None when the input is
        missing, cannot be decoded, is not longer than 100 ms, or the export
        fails. A message is printed in every None case.
    """
    if not os.path.isfile(input_file):
        print(f"Skipping invalid or empty file: {input_file}")
        return None

    audio = None
    decode_error = None
    for container in ("m4a", "mp4"):
        try:
            audio = AudioSegment.from_file(input_file, format=container)
            break
        except Exception as e:
            decode_error = e
    if audio is None:
        print(f"Conversion failed for {input_file}: {decode_error}")
        return None

    # len() of an AudioSegment is its duration in milliseconds.
    if len(audio) <= 100:
        print(f"Skipping invalid or empty file: {input_file}")
        return None

    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_path = os.path.join(output_dir, f"{base_name}.wav")
    try:
        # export() hands back the open output file; close it so converting a
        # whole folder does not leak one file handle per recording.
        audio.export(output_path, format="wav").close()
    except Exception as e:
        print(f"Export failed for {input_file}: {e}")
        return None
    return output_path

def convert_folder_to_wav(input_dir):
    """Convert every ``.m4a`` file in `input_dir` to a ``.wav`` next to it.

    A file is left alone when a ``.wav`` with the same base name already
    exists in the folder, so calling this repeatedly only converts new files.
    Conversion itself is delegated to convert_to_wav(); nothing is returned.
    """
    for entry in os.listdir(input_dir):
        if not entry.endswith(".m4a"):
            continue

        stem = os.path.splitext(entry)[0]
        target_wav = os.path.join(input_dir, f"{stem}.wav")
        if not os.path.exists(target_wav):
            convert_to_wav(os.path.join(input_dir, entry), input_dir)

# Adapted from https://github.com/Sage-Bionetworks/mPower-sdata/blob/master/examples/mPower-bootstrap.py
# NOT BEING USED FOR NOW, doesn't quite work.
# The number of table rows fetched is capped by `limit` (default 10).
def extract_audio_files(output_dir, limit=10, diagnosis=False):
    """Download recordings for one diagnosis group from Synapse and convert them to .wav.

    Reads ``survey_data.csv`` from CSV_FILES_PATH, keeps the rows whose
    "professional-diagnosis" equals `diagnosis`, queries table syn5511444 for
    those healthCodes, downloads the "audio_audio.m4a" column, and converts
    each downloaded file into `output_dir` with convert_to_wav().

    Relies on the module-level Synapse client `syn`.

    Args:
        output_dir: Folder for the converted .wav files (created if missing).
        limit: Value interpolated into the query's LIMIT clause.
        diagnosis: Value compared against the "professional-diagnosis" column.

    Returns:
        pandas.DataFrame built from one {"healthCode", "wav_path"} record per
        downloaded file that could be matched to a queried row.
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Sample the survey data
    survey_data = pd.read_csv(CSV_FILES_PATH + "survey_data.csv")
    survey_data = survey_data[survey_data["professional-diagnosis"] == diagnosis]
    # Joined with "','" so the result can be dropped between the quotes of IN ('...').
    # NOTE(review): the query is built by string interpolation; a healthCode
    # containing a quote would break it. Presumably healthCodes are UUID-like - confirm.
    healthcodes = "','".join(survey_data["healthCode"]) 
    
    # Download audio files from Synapse client
    query = f"SELECT * FROM syn5511444 WHERE healthCode IN ('{healthcodes}') LIMIT {limit}"
    subset_query = syn.tableQuery(query)
    subset_df = subset_query.asDataFrame()
    subset_df["audio_audio.m4a"] = subset_df["audio_audio.m4a"].astype(str) # Convert int file id's to string for comparison
    # NOTE(review): the loop below assumes file_map maps string file-handle ids
    # to local paths - TODO confirm against the synapseclient documentation.
    file_map = syn.downloadTableColumns(subset_query, "audio_audio.m4a")

    # Setup mapping between healthCodes and .wav files
    mappings = []

    for file_handle_id, m4a_path in file_map.items():
        # convert_to_wav() returns None for skipped/failed files; that None is
        # still recorded as the wav_path below.
        wav_path = convert_to_wav(m4a_path, output_dir)

        matched_row = subset_df[subset_df["audio_audio.m4a"] == file_handle_id]
        if not matched_row.empty:
            # Only the first matching row is used.
            healthcode = matched_row.iloc[0]["healthCode"]
            mappings.append({"healthCode": healthcode, "wav_path": wav_path})
        else:
            print(f"File handle ID {file_handle_id} not found in subset_df")

    # Create DataFrame for mappings
    mapping_df = pd.DataFrame(mappings)
    return mapping_df

def create_sample_df(audio_dir):
    """List the ``.wav`` files in `audio_dir` as a (healthCode, wav_path) table.

    The healthCode is the file name without its extension; wav_path is the
    file name joined onto `audio_dir`.

    Args:
        audio_dir: Folder to scan (not recursive).

    Returns:
        pandas.DataFrame with columns "healthCode" and "wav_path". The columns
        are present even when the folder holds no ``.wav`` files, so a later
        merge on "healthCode" works on an empty folder instead of raising
        KeyError.
    """
    records = [
        {
            "healthCode": os.path.splitext(name)[0],
            "wav_path": os.path.join(audio_dir, name),
        }
        for name in os.listdir(audio_dir)
        if name.endswith(".wav")
    ]
    return pd.DataFrame(records, columns=["healthCode", "wav_path"])

In [5]:
# Column order of the feature table. Listed explicitly so that an empty result
# still has a "wav_path" column and the later merge on that key does not fail.
# Keep in sync with the record built in extract_formants_from_folder().
_FEATURE_COLUMNS = [
    "wav_path",
    "F0 (mean)", "F0 (std)", "F0 (min)", "F0 (max)",
    "F1 (mean)", "F1 (std)",
    "F2 (mean)", "F2 (std)",
    "F3 (mean)", "F3 (std)",
    "Jitter (local)", "Jitter (absolute)", "Jitter (rap)", "Jitter (ppq5)", "Jitter (ddp)",
    "Shimmer (local)", "Shimmer (absolute)", "Shimmer (rap)", "Shimmer (ppq5)", "Shimmer (ddp)",
    "Harmonic to Noise Ratio",
]


def _find_voiced_window(audio, sr, min_f0, max_f0, max_shifts):
    """Slide the analysis window until get_f0() reports a non-NaN mean F0.

    Starts at onset=0.25 / offset=0.75 and moves both by 0.01 per attempt.
    NOTE(review): the units of onset/offset are defined by clip_audio(), which
    is not visible here - presumably fractions of the clip; confirm.

    Returns:
        (sound, (mean, std, min, max)) for the first usable window, or
        (None, None) when `max_shifts` attempts all yield a NaN mean F0.
    """
    onset, offset = 0.25, 0.75
    for _ in range(max_shifts):
        trimmed_audio, _clip_info = clip_audio(audio, onset=onset, offset=offset)
        sound = parselmouth.Sound(trimmed_audio, sampling_frequency=sr)
        f0_stats = get_f0(sound, min_f0, max_f0)
        if not np.isnan(f0_stats[0]):
            return sound, f0_stats
        onset += 0.01
        offset += 0.01
    return None, None


# Extract all features from .wav files in folder
def extract_formants_from_folder(folder_path, df, max_files=None,
                                 min_f0=75, max_f0=500, max_shifts=1000):
    """Compute F0, formant, jitter, shimmer and HNR features for each .wav in a folder.

    Args:
        folder_path: Folder whose ``*.wav`` files are processed.
        df: Metadata table with a "wav_path" column (matched against the
            globbed path) and optionally a "gender" column.
        max_files: Stop after this many files have produced features
            (None or 0 means no cap).
        min_f0, max_f0: Pitch bounds forwarded to the feature helpers
            (previously hard-coded as 75 and 500).
        max_shifts: Upper bound on how many times the analysis window is
            shifted while looking for a window with a defined mean F0. The
            search used to be unbounded.

    Returns:
        pandas.DataFrame with one row per successfully processed file and the
        columns listed in _FEATURE_COLUMNS (also when no file was processed).
        Files without metadata, without a usable window, or that raise during
        processing are reported with a printed message and skipped.
    """
    features = []
    has_gender = "gender" in df.columns

    for wav in tqdm(glob.glob(os.path.join(folder_path, "*.wav"))):
        try:
            # Match .wav file to its current row in the DataFrame
            row = df.loc[df["wav_path"] == wav]
            if row.empty:
                print(f"No metadata found for: {wav}")
                continue

            # Gender selects the formant settings; anything other than
            # "female" (including missing values) maps to flag 0.
            gender_value = row["gender"].values[0] if has_gender else None
            gender_str = gender_value.lower() if pd.notna(gender_value) else "unknown"
            gender_flag = 1 if gender_str == "female" else 0

            audio, sr = librosa.load(wav, sr=None)

            sound, f0_stats = _find_voiced_window(audio, sr, min_f0, max_f0, max_shifts)
            if sound is None:
                print(f"No voiced segment found in: {wav}")
                continue
            meanf0, stdevf0, minf0, maxf0 = f0_stats

            # The remaining features are computed once, on the accepted window
            # only, rather than on every rejected window as well.
            f1, f2, f3 = get_formants(sound, min_f0, max_f0, gender_flag)
            localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter = get_jitter(
                sound, minF0=min_f0, maxF0=max_f0
            )
            localShimmer, shimmer_absolute, shimmer_rap, shimmer_rap5, shimmer_ddp = get_shimmer(
                sound, minF0=min_f0, maxF0=max_f0
            )
            hnr = get_harmonic_to_noise_ratio(sound, minF0=min_f0)

            features.append({
                "wav_path": wav,
                "F0 (mean)": meanf0,
                "F0 (std)": stdevf0,
                "F0 (min)": minf0,
                "F0 (max)": maxf0,
                "F1 (mean)": np.nanmean(f1),
                "F1 (std)": np.nanstd(f1),
                "F2 (mean)": np.nanmean(f2),
                "F2 (std)": np.nanstd(f2),
                "F3 (mean)": np.nanmean(f3),
                "F3 (std)": np.nanstd(f3),
                "Jitter (local)": localJitter,
                "Jitter (absolute)": localabsoluteJitter,
                "Jitter (rap)": rapJitter,
                "Jitter (ppq5)": ppq5Jitter,
                "Jitter (ddp)": ddpJitter,
                "Shimmer (local)": localShimmer,
                "Shimmer (absolute)": shimmer_absolute,
                "Shimmer (rap)": shimmer_rap,
                "Shimmer (ppq5)": shimmer_rap5,
                "Shimmer (ddp)": shimmer_ddp,
                "Harmonic to Noise Ratio": hnr
            })

            if max_files and len(features) >= max_files:
                break

        except Exception as e:
            # Best effort: one bad recording must not abort the whole folder.
            print(f"Error processing {wav}: {e}")
            continue

    return pd.DataFrame(features, columns=_FEATURE_COLUMNS)

## Feature extraction

In [6]:
# Participant metadata: keep a single row per healthCode so the left merges
# below cannot multiply recordings.
data = pd.read_csv(CSV_FILES_PATH + "data.csv").drop_duplicates(subset="healthCode")

In [7]:
# Convert .m4a files in the audio folders to .wav, if not already done.
# Files that already have a matching .wav are skipped, so re-running is safe.
# Comment these calls out if your files are already .wav.
convert_folder_to_wav(POSITIVE_FOLDER_PATH)
convert_folder_to_wav(NEGATIVE_FOLDER_PATH)

In [8]:
# One row per .wav file: healthCode (file name without extension) and wav_path.
positive_samples = create_sample_df(POSITIVE_FOLDER_PATH)
negative_samples = create_sample_df(NEGATIVE_FOLDER_PATH)

In [9]:
# Attach participant metadata to each recording. The left join keeps
# recordings whose healthCode is missing from `data` (their metadata is NaN).
df_positive = pd.merge(positive_samples, data, on="healthCode", how="left")
df_negative = pd.merge(negative_samples, data, on="healthCode", how="left")

In [10]:
# Acoustic features for the positive folder; df_positive supplies the per-file gender lookup.
positive_formants = extract_formants_from_folder(POSITIVE_FOLDER_PATH, df_positive)

100%|███████████████████████████████████████| 30/30 [00:17<00:00,  1.67it/s]


In [11]:
# Acoustic features for the negative folder; df_negative supplies the per-file gender lookup.
negative_formants = extract_formants_from_folder(NEGATIVE_FOLDER_PATH, df_negative)

100%|███████████████████████████████████████| 30/30 [00:15<00:00,  1.88it/s]


In [12]:
# Preview the extracted features for the positive folder.
positive_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,Jitter (absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (absolute),Shimmer (rap),Shimmer (ppq5),Shimmer (ddp),Harmonic to Noise Ratio
0,audio_files/positive/43294479-32e0-4589-92ee-2...,153.412076,55.574165,139.438233,473.554846,648.094246,64.890887,1132.708698,108.01134,2204.562547,...,3.2e-05,0.002259,0.00243,0.006778,0.039764,0.294203,0.01633,0.01522,0.048989,21.971217
1,audio_files/positive/95fe2bb0-4886-4042-b63f-a...,150.861339,1.482082,147.00133,155.6694,613.536178,313.003582,1395.710382,170.062915,2761.025136,...,3.1e-05,0.002723,0.002753,0.008168,0.048344,0.425375,0.027134,0.029843,0.081403,13.639574
2,audio_files/positive/8a4790b3-3bf5-4273-b695-f...,112.424824,3.559497,99.931897,122.194304,570.730571,10.686649,1138.912358,72.033033,2740.589434,...,0.00012,0.007813,0.006422,0.023438,0.040815,0.364879,0.021848,0.024411,0.065544,13.515626
3,audio_files/positive/ef026044-8bf1-4720-be95-e...,145.144574,1.445809,141.847957,149.599826,610.632131,22.262743,899.53153,58.540645,2541.561209,...,1.8e-05,0.001154,0.001386,0.003461,0.029207,0.255505,0.015315,0.017824,0.045946,23.94294
4,audio_files/positive/ea66cb62-1e80-4a09-a98d-e...,101.334401,1.726289,98.695198,109.03007,553.580149,33.351757,1070.78865,39.092115,2263.423378,...,5.9e-05,0.003413,0.003452,0.010239,0.058234,0.532739,0.031182,0.032058,0.093547,16.611779


In [13]:
# Preview the extracted features for the negative folder.
negative_formants.head()

Unnamed: 0,wav_path,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,Jitter (absolute),Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),Shimmer (absolute),Shimmer (rap),Shimmer (ppq5),Shimmer (ddp),Harmonic to Noise Ratio
0,audio_files/negative/ccad1533-2453-4507-9ba0-a...,83.850567,2.641466,75.248781,89.079894,719.573431,7.87777,1127.463523,11.440951,2625.862878,...,7.9e-05,0.002321,0.0033,0.006962,0.040538,0.357132,0.020771,0.02467,0.062312,9.645255
1,audio_files/negative/45373776-ea31-488b-87f3-5...,122.847479,1.320415,118.743152,127.374991,680.396535,42.310084,1105.255437,33.305413,2872.350496,...,3.3e-05,0.002335,0.002456,0.007004,0.046501,0.40687,0.025529,0.027835,0.076588,16.531065
2,audio_files/negative/987f7656-13bc-4b5f-9f45-c...,190.914957,5.124154,176.752711,214.073598,585.164931,45.010121,1592.637137,50.024945,2440.445599,...,2.9e-05,0.003163,0.003102,0.00949,0.061677,0.571933,0.035449,0.037577,0.106348,10.575051
3,audio_files/negative/80398a1f-9dea-4ab4-90a6-4...,121.522505,1.913514,115.677469,126.745879,458.828308,40.452886,1204.922638,35.467532,2443.631301,...,4.3e-05,0.002422,0.003027,0.007265,0.056823,0.510362,0.029281,0.035337,0.087844,14.245651
4,audio_files/negative/41bea500-2d4c-474a-aa57-d...,261.339769,4.446399,245.175805,273.849708,846.285413,90.024777,1540.967449,69.116387,3515.8891,...,1.3e-05,0.001786,0.001844,0.005359,0.016248,0.143838,0.008152,0.009244,0.024455,23.836251


In [14]:
# Join the acoustic features with the metadata on wav_path. Features are the
# left side, so only recordings that produced features are kept.
# NOTE: df_positive / df_negative are rebound here and the following cell
# drops "wav_path" from them, so this cell cannot be re-run on its own
# afterwards - re-run the metadata merge cell first.
df_positive = pd.merge(positive_formants, df_positive, on="wav_path", how="left")
df_negative = pd.merge(negative_formants, df_negative, on="wav_path", how="left")

In [15]:
# Drop .wav path and put healthCode at the front
def _health_code_first(frame):
    """Return `frame` without "wav_path" and with "healthCode" as the first column."""
    trimmed = frame.drop(columns=["wav_path"])
    ordered = ["healthCode"] + [name for name in trimmed.columns if name != "healthCode"]
    return trimmed[ordered]


df_positive = _health_code_first(df_positive)
df_negative = _health_code_first(df_negative)

# Save the final tables to .csv files (comment these lines out to skip saving)
df_positive.to_csv(CSV_FILES_PATH + "positive.csv", index=False)
df_negative.to_csv(CSV_FILES_PATH + "negative.csv", index=False)

In [16]:
# Preview the final positive table (features + participant metadata).
df_positive.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,43294479-32e0-4589-92ee-269b36a70e46,153.412076,55.574165,139.438233,473.554846,648.094246,64.890887,1132.708698,108.01134,2204.562547,...,True,True,"""White or Caucasian""",Easy,False,False,True,,"version 1.0, build 7",iPhone 6
1,95fe2bb0-4886-4042-b63f-a93c1635d874,150.861339,1.482082,147.00133,155.6694,613.536178,313.003582,1395.710382,170.062915,2761.025136,...,False,True,"""White or Caucasian""",Neither easy nor difficult,False,False,False,,"version 1.0.5, build 12","iPod7,1"
2,8a4790b3-3bf5-4273-b695-fe703347d399,112.424824,3.559497,99.931897,122.194304,570.730571,10.686649,1138.912358,72.033033,2740.589434,...,True,True,"""White or Caucasian""",Easy,False,False,True,,"version 1.0, build 7",iPhone 6
3,ef026044-8bf1-4720-be95-e823c7a6e1df,145.144574,1.445809,141.847957,149.599826,610.632131,22.262743,899.53153,58.540645,2541.561209,...,False,True,"""White or Caucasian""",Neither easy nor difficult,True,False,False,7.0,"version 1.0.5, build 12",iPhone 6 Plus
4,ea66cb62-1e80-4a09-a98d-e589925c957c,101.334401,1.726289,98.695198,109.03007,553.580149,33.351757,1070.78865,39.092115,2263.423378,...,True,True,"""White or Caucasian""",Easy,False,False,True,,"version 1.0, build 7",iPhone 5s (GSM)


In [17]:
# Preview the final negative table (features + participant metadata).
df_negative.head()

Unnamed: 0,healthCode,F0 (mean),F0 (std),F0 (min),F0 (max),F1 (mean),F1 (std),F2 (mean),F2 (std),F3 (mean),...,phone-usage,professional-diagnosis,race,smartphone,smoked,surgery,video-usage,years-smoking,appVersion,phoneInfo
0,ccad1533-2453-4507-9ba0-aaf95d00f7b9,83.850567,2.641466,75.248781,89.079894,719.573431,7.87777,1127.463523,11.440951,2625.862878,...,True,False,"""White or Caucasian""",Very easy,True,False,True,15.0,"version 1.0, build 7",iPhone 6 Plus
1,45373776-ea31-488b-87f3-5765b9cc6410,122.847479,1.320415,118.743152,127.374991,680.396535,42.310084,1105.255437,33.305413,2872.350496,...,True,False,"""East Asian""",Very easy,False,False,True,,"version 1.0, build 7",iPhone 6 Plus
2,987f7656-13bc-4b5f-9f45-ce799fbf9aa5,190.914957,5.124154,176.752711,214.073598,585.164931,45.010121,1592.637137,50.024945,2440.445599,...,True,False,"""Latino/Hispanic""",Very easy,False,False,True,,"version 1.0, build 7",iPhone 5s (GSM)
3,80398a1f-9dea-4ab4-90a6-49508a73ad63,121.522505,1.913514,115.677469,126.745879,458.828308,40.452886,1204.922638,35.467532,2443.631301,...,True,False,"""White or Caucasian""",Very easy,True,False,True,25.0,"version 1.0, build 7",iPhone 6 Plus
4,41bea500-2d4c-474a-aa57-de9629386feb,261.339769,4.446399,245.175805,273.849708,846.285413,90.024777,1540.967449,69.116387,3515.8891,...,True,False,"""White or Caucasian""",Very easy,False,False,True,,"version 1.0, build 7",iPhone 5 (GSM)
