In [1]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import os
import librosa
import glob

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

## File paths

In [2]:
# Input folders holding the .wav recordings for each class.
POSITIVE_FOLDER_PATH = "sample_audio_files/positive"
NEGATIVE_FOLDER_PATH = "sample_audio_files/negative"

# Output locations. Both end with "/" because later cells build paths by
# plain string concatenation (e.g. TIME_DATA_PATH + "negative/").
CSV_FILES_PATH = "csv_files/"
TIME_DATA_PATH = CSV_FILES_PATH + "time_data/"

os.makedirs(POSITIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(NEGATIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(CSV_FILES_PATH, exist_ok=True)
# The per-class time-data folders are the targets of the extraction cells;
# DataFrame.to_csv does not create missing parent directories, so make them here.
os.makedirs(TIME_DATA_PATH + "positive/", exist_ok=True)
os.makedirs(TIME_DATA_PATH + "negative/", exist_ok=True)

## Helper functions

In [9]:
# Create mapping DataFrame between audio files and healthCodes
def create_wav_dataframe(folder_path):
    """Map every .wav file in a folder to the healthCode taken from its file name.

    Args:
        folder_path: Directory to scan (non-recursively) for names ending in ".wav".

    Returns:
        pandas.DataFrame with one row per .wav file and two columns:
        "healthCode" (file name without its extension) and "wav_path"
        (folder_path joined with the file name). Rows are ordered by file
        name, and both columns are present even when no .wav file is found.
    """
    records = [
        {
            "healthCode": os.path.splitext(filename)[0],
            "wav_path": os.path.join(folder_path, filename),
        }
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(".wav")
    ]

    # Naming the columns explicitly keeps the schema for an empty folder, so
    # downstream lookups such as frame["healthCode"] do not raise KeyError.
    return pd.DataFrame(records, columns=["healthCode", "wav_path"])

# Tweaked slightly to save per-file CSVs that contain temporal (per-frame) data
# Extract F0 and formant (F1-F3) values over time from a folder containing .wav files
def get_formants_over_time(folder_path, df, output_path):
    """Write one per-frame F0/F1/F2/F3 CSV for every .wav file in a folder.

    For each ``*.wav`` directly inside ``folder_path`` the matching row of
    ``df`` is looked up by ``wav_path``. The audio is loaded with librosa,
    trimmed with ``clip_audio`` and analysed with Parselmouth. The result is
    saved as ``<output_path>/<healthCode>.csv`` with the columns ``time``,
    ``F0``, ``F1``, ``F2`` and ``F3``; formants are sampled at the pitch
    frame times so all columns share one time axis.

    Args:
        folder_path: Directory searched for ``*.wav`` files.
        df: DataFrame with at least ``wav_path`` and ``healthCode`` columns.
            If a ``gender`` column is present, "female" selects a formant
            ceiling of 5500; anything else (or a missing value) selects 5000.
        output_path: Directory that receives the CSV files. It is created
            when it does not exist yet.

    Returns:
        None. Progress and per-file errors are printed.

    Files without a matching row in ``df`` are skipped with a message. Any
    exception raised while processing a single file is printed and the loop
    continues with the next file (best-effort batch processing).
    """
    # DataFrame.to_csv does not create missing parent folders; without this
    # every file would fail and only surface as a printed error below.
    # An empty output_path means "current directory", which needs no creation.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Adjust the time step below for more formant frames!
    time_step = 0.01

    # Sorted so the processing (and log) order is reproducible across runs.
    wav_files = sorted(glob.glob(os.path.join(folder_path, "*.wav")))

    for wav in wav_files:
        try:
            # Match .wav file to its row in the DataFrame
            row = df.loc[df["wav_path"] == wav]
            if row.empty:
                print(f"No metadata found for: {wav}")
                continue

            # Gender decides the formant ceiling used for the Burg analysis
            health_code = row["healthCode"].values[0]
            gender_str = row["gender"].values[0].lower() if "gender" in row.columns and pd.notna(row["gender"].values[0]) else "unknown"
            gender_flag = 1 if gender_str == "female" else 0
            max_formant = 5500 if gender_flag == 1 else 5000

            # Load and trim audio (clip_audio returns a second value that is not needed here)
            audio, sr = librosa.load(wav, sr=None)
            trimmed_audio, _ = clip_audio(audio)
            sound = parselmouth.Sound(trimmed_audio, sampling_frequency=sr)

            # Get F0 over time; Parselmouth pitch frames are numbered from 1
            pitch = sound.to_pitch()
            frame_numbers = range(1, pitch.get_number_of_frames() + 1)
            f0_times = [pitch.get_time_from_frame_number(i) for i in frame_numbers]
            f0_values = [pitch.get_value_in_frame(i) for i in frame_numbers]

            # Get formants over time, sampled at the pitch frame times
            formants = sound.to_formant_burg(time_step, 5, max_formant, 0.025, 50)
            f1, f2, f3 = (
                [formants.get_value_at_time(number, t) for t in f0_times]
                for number in (1, 2, 3)
            )

            time_df = pd.DataFrame({
                "time": f0_times,
                "F0": f0_values,
                "F1": f1,
                "F2": f2,
                "F3": f3,
            })

            out_file = os.path.join(output_path, f"{health_code}.csv")
            time_df.to_csv(out_file, index=False)
            print(f"Saved: {out_file}")

        except Exception as e:
            # Deliberately broad: one bad recording must not abort the batch.
            print(f"Error processing {wav}: {e}")
            continue

In [4]:
# Build the healthCode -> wav_path mapping for each class folder.
positive_samples, negative_samples = (
    create_wav_dataframe(folder)
    for folder in (POSITIVE_FOLDER_PATH, NEGATIVE_FOLDER_PATH)
)

In [5]:
# Load the survey data and keep only the first row for each healthCode.
data = (
    pd.read_csv(CSV_FILES_PATH + "misc/data.csv")
    .drop_duplicates(subset="healthCode", keep="first")
)

In [6]:
# Sanity check: every positive and negative sample must have a healthCode in the survey data
assert all(
    samples["healthCode"].isin(data["healthCode"]).all()
    for samples in (positive_samples, negative_samples)
), "Not all healthCodes are present in data"

In [7]:
# Attach each sample's .wav path to its survey row (inner join on healthCode)
df_positive = data.merge(positive_samples, on="healthCode", how="inner")
df_negative = data.merge(negative_samples, on="healthCode", how="inner")

In [10]:
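# Positive-sample extraction is currently disabled; uncomment the call below to regenerate those CSVs.
# NOTE(review): the saved output under this cell shows paths without the "positive/" subfolder,
# so it appears to come from an earlier run with a different output path — confirm before relying on it.
# get_formants_over_time(POSITIVE_FOLDER_PATH, df_positive, TIME_DATA_PATH + "positive/")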

Saved: csv_files/time_data/43294479-32e0-4589-92ee-269b36a70e46.csv
Saved: csv_files/time_data/95fe2bb0-4886-4042-b63f-a93c1635d874.csv
Saved: csv_files/time_data/8a4790b3-3bf5-4273-b695-fe703347d399.csv
Saved: csv_files/time_data/ef026044-8bf1-4720-be95-e823c7a6e1df.csv
Saved: csv_files/time_data/ea66cb62-1e80-4a09-a98d-e589925c957c.csv
Saved: csv_files/time_data/1a38dca5-145a-413b-86cd-86fa9f6802ff.csv
Saved: csv_files/time_data/0b897bc9-4449-4fe8-8b91-58337ee1cb37.csv
Saved: csv_files/time_data/284ec98c-a194-4265-8525-75e4297deaea.csv
Saved: csv_files/time_data/c32c4258-4645-43a9-a423-605aeb904519.csv
Saved: csv_files/time_data/3e3dab8f-87e1-42c8-a091-9202fb089a03.csv
Saved: csv_files/time_data/d0ae0ee1-165b-4e5a-9caa-ad8a17688445.csv
Saved: csv_files/time_data/b4bb8c67-33ac-4333-b1aa-6b79babc40fc.csv
Saved: csv_files/time_data/205b0b4f-e990-49d7-b47e-9e2c2585a1bc.csv
Saved: csv_files/time_data/6134a5c5-2912-4cb0-87aa-10dee483b686.csv
Saved: csv_files/time_data/c03f7785-d690-400f-b5

In [13]:
# Extract per-frame F0/F1/F2/F3 for every negative-class .wav and save one
# CSV per healthCode under TIME_DATA_PATH + "negative/".
get_formants_over_time(NEGATIVE_FOLDER_PATH, df_negative, TIME_DATA_PATH + "negative/")

Saved: csv_files/time_data/negative/ccad1533-2453-4507-9ba0-aaf95d00f7b9.csv
Saved: csv_files/time_data/negative/45373776-ea31-488b-87f3-5765b9cc6410.csv
Saved: csv_files/time_data/negative/987f7656-13bc-4b5f-9f45-ce799fbf9aa5.csv
Saved: csv_files/time_data/negative/80398a1f-9dea-4ab4-90a6-49508a73ad63.csv
Saved: csv_files/time_data/negative/41bea500-2d4c-474a-aa57-de9629386feb.csv
Saved: csv_files/time_data/negative/a0519a95-7aff-4e13-81a6-a629c81e329d.csv
Saved: csv_files/time_data/negative/cea9168b-0db2-4921-bb1c-ef2d00006445.csv
Saved: csv_files/time_data/negative/d392b5fb-2c96-4ead-b97d-254d5d8af02a.csv
Saved: csv_files/time_data/negative/1eae1a5b-09b7-41ea-9f83-b7cd9f52bc33.csv
Saved: csv_files/time_data/negative/5fdfdf9d-0f20-431d-87bb-23dc98dfb01f.csv
Saved: csv_files/time_data/negative/d64134ef-4749-4f95-b881-8d16f3d770a6.csv
Saved: csv_files/time_data/negative/f18e84f0-fd4b-4c26-b34b-9238378ded22.csv
Saved: csv_files/time_data/negative/acbb828c-7f27-4098-a639-b6694d72a7e7.csv