In [1]:
# Adapted from Parselmouth documentation at https://github.com/YannickJadoul/Parselmouth
import parselmouth
import praatio

import pandas as pd
import numpy as np
import os
import librosa
import glob

from extract_features_utils import clip_audio, get_jitter, get_shimmer, get_harmonic_to_noise_ratio, get_f0, get_formants
from pydub import AudioSegment

## File paths

In [2]:
# Input folders holding the .wav recordings of each class.
POSITIVE_FOLDER_PATH = "sample_audio_files/positive"
NEGATIVE_FOLDER_PATH = "sample_audio_files/negative"

# Output locations. Both keep their trailing slash because other cells build
# paths from them by plain string concatenation.
CSV_FILES_PATH = "csv_files/"
TIME_DATA_PATH = CSV_FILES_PATH + "time_data/"

os.makedirs(POSITIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(NEGATIVE_FOLDER_PATH, exist_ok=True)
os.makedirs(CSV_FILES_PATH, exist_ok=True)
# The per-class time-series folders are the targets of the CSV export, which
# cannot write into a directory that does not exist yet.
os.makedirs(TIME_DATA_PATH + "positive/", exist_ok=True)
os.makedirs(TIME_DATA_PATH + "negative/", exist_ok=True)

## Helper functions

In [3]:
def create_wav_dataframe(folder_path):
    """Map every .wav file in ``folder_path`` to the healthCode in its file name.

    Args:
        folder_path: Directory to scan (non-recursively) for files whose name
            ends with ``.wav``.

    Returns:
        pandas.DataFrame with one row per .wav file and two columns:
        ``healthCode`` (the file name without its extension) and ``wav_path``
        (``folder_path`` joined with the file name). Both columns are present
        even when the folder holds no .wav files, and rows are ordered by
        file name.
    """
    records = [
        {
            "healthCode": os.path.splitext(filename)[0],
            "wav_path": os.path.join(folder_path, filename),
        }
        # os.listdir returns entries in arbitrary order; sort for reproducible output.
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(".wav")
    ]

    # Naming the columns keeps the schema stable for an empty folder, so that
    # downstream lookups of "healthCode" do not raise KeyError.
    return pd.DataFrame(records, columns=["healthCode", "wav_path"])

def get_formants_over_time(folder_path, df, output_path, time_step=0.01):
    """Write one CSV of F0 and F1-F3 tracks for each .wav file in ``folder_path``.

    Each ``*.wav`` file is matched to a row of ``df`` through its ``wav_path``.
    The audio is loaded at its native sampling rate, trimmed with
    ``clip_audio`` and analysed with Parselmouth. The resulting table has one
    row per pitch frame with the columns ``time``, ``F0``, ``F1``, ``F2`` and
    ``F3`` and is saved as ``<healthCode>.csv`` in ``output_path``.

    Files without a matching row are reported and skipped. Any error while
    processing a single file is printed and the loop moves on to the next
    file, so one bad recording does not abort the whole export.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.wav`` files.
        df: DataFrame with ``healthCode`` and ``wav_path`` columns. An optional
            ``gender`` column selects the formant ceiling: 5500 Hz when the
            value is "female" (case-insensitive), 5000 Hz otherwise, including
            when the column or value is missing.
        output_path: Directory that receives the CSV files; created if missing.
        time_step: Analysis step in seconds, applied to both the pitch and the
            formant analysis. Lower it to get more frames (rows) per file.
            NOTE(review): 0.01 s should equal Praat's automatic pitch step at
            the default 75 Hz pitch floor, so the default output is unchanged.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Without this, every to_csv call below fails on a fresh checkout and the
    # failure is only visible as a printed per-file error.
    os.makedirs(output_path, exist_ok=True)

    for wav in sorted(glob.glob(os.path.join(folder_path, "*.wav"))):
        try:
            # Match the .wav file to its metadata row.
            row = df.loc[df["wav_path"] == wav]
            if row.empty:
                print(f"No metadata found for: {wav}")
                continue

            health_code = row["healthCode"].values[0]

            # str() guards against non-string gender codes, which would
            # otherwise raise on .lower() and cause the file to be skipped.
            gender = row["gender"].values[0] if "gender" in row.columns else None
            gender_str = str(gender).strip().lower() if pd.notna(gender) else "unknown"
            max_formant = 5500 if gender_str == "female" else 5000

            # Load at the native sampling rate and trim; the second value
            # returned by clip_audio is not needed here.
            audio, sr = librosa.load(wav, sr=None)
            trimmed_audio, _ = clip_audio(audio)
            sound = parselmouth.Sound(trimmed_audio, sampling_frequency=sr)

            # F0 over time. Frame numbers are 1-based, hence the +1.
            pitch = sound.to_pitch(time_step=time_step)
            frame_numbers = range(1, pitch.get_number_of_frames() + 1)
            f0_times = [pitch.get_time_from_frame_number(i) for i in frame_numbers]
            f0_values = [pitch.get_value_in_frame(i) for i in frame_numbers]

            # Formants are sampled at the pitch frame times so that every row
            # of the table refers to a single instant.
            formants = sound.to_formant_burg(time_step, 5, max_formant, 0.025, 50)
            f1, f2, f3 = (
                [formants.get_value_at_time(number, t) for t in f0_times]
                for number in (1, 2, 3)
            )

            time_df = pd.DataFrame({
                "time": f0_times,
                "F0": f0_values,
                "F1": f1,
                "F2": f2,
                "F3": f3,
            })

            out_file = os.path.join(output_path, f"{health_code}.csv")
            time_df.to_csv(out_file, index=False)
            print(f"Saved: {out_file}")

        except Exception as e:
            # Deliberate best effort: report the failure and keep going.
            print(f"Error processing {wav}: {e}")
            continue

In [4]:
# Build the healthCode -> .wav path table for each class folder.
positive_samples, negative_samples = (
    create_wav_dataframe(class_folder)
    for class_folder in (POSITIVE_FOLDER_PATH, NEGATIVE_FOLDER_PATH)
)

In [5]:
# Load the survey table and keep only the first row recorded per healthCode.
data = (
    pd.read_csv(CSV_FILES_PATH + "misc/data.csv")
    .drop_duplicates(subset="healthCode", keep="first")
)

In [6]:
# Sanity check that all samples are in the dataset. The failure message names
# the affected class and lists (up to ten of) the healthCodes without a row.
missing_positive = positive_samples.loc[
    ~positive_samples["healthCode"].isin(data["healthCode"]), "healthCode"
]
missing_negative = negative_samples.loc[
    ~negative_samples["healthCode"].isin(data["healthCode"]), "healthCode"
]
assert missing_positive.empty, (
    f"{len(missing_positive)} positive healthCode(s) are not present in data: "
    f"{missing_positive.tolist()[:10]}"
)
assert missing_negative.empty, (
    f"{len(missing_negative)} negative healthCode(s) are not present in data: "
    f"{missing_negative.tolist()[:10]}"
)

In [7]:
# Attach each recording's .wav path to its survey row; the inner join keeps
# only healthCodes that appear in both tables.
df_positive = data.merge(positive_samples, how="inner", on="healthCode")
df_negative = data.merge(negative_samples, how="inner", on="healthCode")

In [8]:
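# Export step, left disabled: uncomment the calls below to (re)generate the
# per-recording time-series CSVs under TIME_DATA_PATH.
# get_formants_over_time(POSITIVE_FOLDER_PATH, df_positive, TIME_DATA_PATH + "positive/")
# get_formants_over_time(NEGATIVE_FOLDER_PATH, df_negative, TIME_DATA_PATH + "negative/")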