In [24]:
import os
import pandas as pd
import numpy as np
import librosa
from scipy.stats import variation

# paths
DATASET_CSV = "dataset.csv"
RECORDS_DIR = "records\records"  #this is the folder with all recorded files, with the all the original content

# loading the dataset
df = pd.read_csv(DATASET_CSV)

In [28]:
# first, extract group, subject ID, vowel, and repetition from code
counter=0
def parse_code(code):
    import re
    match = re.match(r"([A-Z]+)(\d+)([ai])(\d+)", code)
    if match:
        group, subject_id, vowel, repetition = match.groups()
        global counter
        counter=counter+1
        return group, int(subject_id), vowel, int(repetition)
    else:
        return None, None, None, None

df[['Group', 'SubjectID', 'Vowel', 'Repetition']] = df['record'].apply(
    lambda x: pd.Series(parse_code(x))
)
print(counter)
df.head()

249


Unnamed: 0,record,subject,group,vowel,jitter,shimmer,HNR,SHR,Group,SubjectID,Vowel,Repetition,Label,ImpulseJitterSTD,ImpulseShimmerCV,HasSubharmonics
88,MSA1a1,MSA1,MSA,a,3.927492,14.891001,7.819,0.357575,MSA,1,a,1,1.0,,,0
89,MSA1a2,MSA1,MSA,a,0.869565,4.19891,17.231,0.346014,MSA,1,a,2,1.0,,,0
90,MSA1i1,MSA1,MSA,i,0.983607,5.561001,22.601,0.0,MSA,1,i,1,1.0,,,0
91,MSA1i2,MSA1,MSA,i,0.671141,7.316001,21.113,0.0,MSA,1,i,2,1.0,,,0
92,MSA2a1,MSA2,MSA,a,1.704545,12.626999,12.537,0.695362,MSA,2,a,1,1.0,,,0


In [26]:
# add labels to all groups (0 is healthy)
def label(row):
    if row['Group'] == 'PD':
        return 2
    elif row['Group'] in ['MSA', 'PSP']:
        return 1
    elif row['Group'] == 'HC':
        return 
    else:
        return np.nan  
df['Label'] = df.apply(label, axis=1)
df = df.dropna(subset=['Label']) 

In [27]:
# add features from impulses.csv and subharmonics.csv from each file separately
def load_impulse_features(code):
    impulse_path = os.path.join(RECORDS_DIR, f"{code}_impulses.csv")
    if not os.path.exists(impulse_path):
        print(f"Missing: {impulse_path}")
        return pd.Series([np.nan, np.nan])
    df_imp = pd.read_csv(impulse_path, header=None, names=['Time', 'Amplitude'])
    times = df_imp['Time'].diff().dropna()
    amplitudes = df_imp['Amplitude']
    jitter_std = times.std()
    shimmer_cv = variation(amplitudes)
    return pd.Series([jitter_std, shimmer_cv])

def subharmonic_presence(code):
    path = os.path.join(RECORDS_DIR, f"{code}_subharmonics.csv")
    return int(os.path.exists(path))


df[['ImpulseJitterSTD', 'ImpulseShimmerCV']] = df['record'].apply(lambda x: pd.Series(load_impulse_features(x)))
df['HasSubharmonics'] = df['record'].apply(lambda x: pd.Series(subharmonic_presence(x)))
#old code, also did not work
#df[['ImpulseJitterSTD', 'ImpulseShimmerCV']] = df['record'].apply(load_impulse_features)
# df['HasSubharmonics'] = df['record'].apply(subharmonic_presence)

ecords\MSA1a1_impulses.csv
ecords\MSA1a2_impulses.csv
ecords\MSA1i1_impulses.csv
ecords\MSA1i2_impulses.csv
ecords\MSA2a1_impulses.csv
ecords\MSA2a2_impulses.csv
ecords\MSA2i1_impulses.csv
ecords\MSA2i2_impulses.csv
ecords\MSA3a1_impulses.csv
ecords\MSA3a2_impulses.csv
ecords\MSA3i1_impulses.csv
ecords\MSA3i2_impulses.csv
ecords\MSA4a1_impulses.csv
ecords\MSA4a2_impulses.csv
ecords\MSA4i1_impulses.csv
ecords\MSA4i2_impulses.csv
ecords\MSA5a1_impulses.csv
ecords\MSA5a2_impulses.csv
ecords\MSA5i1_impulses.csv
ecords\MSA5i2_impulses.csv
ecords\MSA6a1_impulses.csv
ecords\MSA6a2_impulses.csv
ecords\MSA6i1_impulses.csv
ecords\MSA6i2_impulses.csv
ecords\MSA7a1_impulses.csv
ecords\MSA7a2_impulses.csv
ecords\MSA7i1_impulses.csv
ecords\MSA7i2_impulses.csv
ecords\MSA8a1_impulses.csv
ecords\MSA8a2_impulses.csv
ecords\MSA8i1_impulses.csv
ecords\MSA8i2_impulses.csv
ecords\MSA9a1_impulses.csv
ecords\MSA9a2_impulses.csv
ecords\MSA9i1_impulses.csv
ecords\MSA9i2_impulses.csv
ecords\MSA10a1_impulses.csv


In [None]:
# extract features from clean waveform
def extract_audio_features(code):
    filepath = os.path.join(RECORDS_DIR, f"{code}_clean.wav")
    if not os.path.exists(filepath):
        return pd.Series([np.nan] * 5)
    y, sr = librosa.load(filepath)
    
    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=75, fmax=400)
    f0_clean = f0[~np.isnan(f0)]
    if len(f0_clean) == 0:
        return pd.Series([np.nan] * 5)
    f0_mean = np.mean(f0_clean)
    f0_std = np.std(f0_clean)
    f0_range = np.max(f0_clean) - np.min(f0_clean)

    hnr = librosa.effects.harmonic(y)
    hnr_ratio = np.mean(hnr)

    return pd.Series([f0_mean, f0_std, f0_range, hnr_ratio, len(f0_clean)])

df[['F0_mean', 'F0_std', 'F0_range', 'HNR_est', 'VoicedLen']] = df['record'].apply(lambda x: pd.Series(extract_audio_features(x)))

In [None]:

features = [
    'jitter', 'shimmer', 'HNR', 'SHR',
    'ImpulseJitterSTD', 'ImpulseShimmerCV', 'HasSubharmonics',
    'F0_mean', 'F0_std', 'F0_range', 'HNR_est', 'VoicedLen'
]

agg_df = df.groupby(['Group', 'SubjectID', 'Label'])[features].mean().reset_index()
agg_df.to_csv("processed_dataset_withhealthy.csv", index=False)

print('dataset produced')


dataset produced
