In [1]:
import os
import pandas as pd
import numpy as np
import librosa
from scipy.stats import variation

# paths
DATASET_CSV = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\dataset.csv"
RECORDS_DIR = "C:\\Users\\olasw\\OneDrive\\Documents\\Uni\\DSP\\data-science-project\\records\\records"
#this is the folder with all recorded files, with the all the original content

# loading the dataset
df = pd.read_csv(DATASET_CSV)

In [2]:
# first, extract group, subject ID, vowel, and repetition from code
counter=0
def parse_code(code):
    import re
    match = re.match(r"([A-Z]+)(\d+)([ai])(\d+)", code)
    if match:
        group, subject_id, vowel, repetition = match.groups()
        global counter
        counter=counter+1
        return group, int(subject_id), vowel, int(repetition)
    else:
        return None, None, None, None

df[['Group', 'SubjectID', 'Vowel', 'Repetition']] = df['record'].apply(
    lambda x: pd.Series(parse_code(x))
)
print(counter)
df.head()

337


Unnamed: 0,record,subject,group,vowel,jitter,shimmer,HNR,SHR,Group,SubjectID,Vowel,Repetition
0,HC1a1,HC1,HC,a,0.332226,3.225,22.942,0.0,HC,1,a,1
1,HC1a2,HC1,HC,a,0.344828,2.73,23.677,0.0,HC,1,a,2
2,HC1i1,HC1,HC,i,0.4329,1.567,20.838,0.0,HC,1,i,1
3,HC1i2,HC1,HC,i,0.383142,1.759999,22.681,0.0,HC,1,i,2
4,HC2a1,HC2,HC,a,0.412371,3.732,20.715,0.0,HC,2,a,1


In [3]:
# add labels to all groups (0 is healthy)
def label(row):
    if row['Group'] == 'PD':
        return 2
    elif row['Group'] in ['MSA', 'PSP']:
        return 1
    elif row['Group'] == 'HC':
        return 
    else:
        return np.nan  
df['Label'] = df.apply(label, axis=1)
df = df.dropna(subset=['Label']) 

In [4]:
# add features from impulses.csv and subharmonics.csv from each file separately
def load_impulse_features(code):
    impulse_path = os.path.join(RECORDS_DIR, f"{code}_impulses.csv")
    if not os.path.exists(impulse_path):
        print(f"Missing: {impulse_path}")
        return pd.Series([np.nan, np.nan])
    df_imp = pd.read_csv(impulse_path, header=None, names=['Time', 'Amplitude'])
    df_imp['Time'] = pd.to_numeric(df_imp['Time'], errors='coerce')
    df_imp['Amplitude'] = pd.to_numeric(df_imp['Amplitude'], errors='coerce')
    times = df_imp['Time'].diff().dropna()
    amplitudes = df_imp['Amplitude']
    jitter_std = times.std()
    shimmer_cv = variation(amplitudes)
    return pd.Series([jitter_std, shimmer_cv])

def subharmonic_presence(code):
    path = os.path.join(RECORDS_DIR, f"{code}_subharmonics.csv")
    return int(os.path.exists(path))


df[['ImpulseJitterSTD', 'ImpulseShimmerCV']] = df['record'].apply(lambda x: pd.Series(load_impulse_features(x)))
df['HasSubharmonics'] = df['record'].apply(lambda x: pd.Series(subharmonic_presence(x)))
#old code, also did not work
#df[['ImpulseJitterSTD', 'ImpulseShimmerCV']] = df['record'].apply(load_impulse_features)
# df['HasSubharmonics'] = df['record'].apply(subharmonic_presence)

In [7]:
#  extract features from clean waveform
def extract_audio_features(code):
    filepath = os.path.join(RECORDS_DIR, f"{code}_clean.wav")
    if not os.path.exists(filepath):
        print('path does not exist')
        return pd.Series([np.nan] * 5)
    y, sr = librosa.load(filepath, sr=None)
    print(sr)
    print("Y size", y.shape)
    print(np.max(y), np.min(y), np.mean(y))
    print(f"Duration: {len(y)/sr:.2f}s, Max amplitude: {np.max(np.abs(y))}")
    f0, voiced_flag, voiced_probs = librosa.pyin(y, sr=sr, fmin=45, fmax=5000)
    print(f0)
    f0_clean = f0[~np.isnan(f0)]
    if len(f0_clean) == 0:
        return pd.Series([np.nan] * 5)
    f0_mean = np.mean(f0_clean)
    f0_std = np.std(f0_clean)
    f0_range = np.max(f0_clean) - np.min(f0_clean)


    harmonic = librosa.effects.harmonic(y)
    percussive = librosa.effects.percussive(y)
    hnr_ratio = 10 * np.log10(np.sum(harmonic**2) / (np.sum(percussive**2) + 1e-6))


    return pd.Series([f0_mean, f0_std, f0_range, hnr_ratio, len(f0_clean)])

df[['F0_mean', 'F0_std', 'F0_range', 'HNR_est', 'VoicedLen']] = df['record'].apply(lambda x: pd.Series(extract_audio_features(x)))

48000
Y size (816315,)
0.5623474 -0.9000244 -5.019326e-06
Duration: 17.01s, Max amplitude: 0.9000244140625
[nan nan nan ... nan nan nan]
48000
Y size (521669,)
0.6690674 -0.9000244 0.00011205794
Duration: 10.87s, Max amplitude: 0.9000244140625


KeyboardInterrupt: 

In [None]:

features = [
    'jitter', 'shimmer', 'HNR', 'SHR',
    'ImpulseJitterSTD', 'ImpulseShimmerCV', 'HasSubharmonics',
    'F0_mean', 'F0_std', 'F0_range', 'HNR_est', 'VoicedLen'
]

agg_df = df.groupby(['Group', 'SubjectID', 'Label'])[features].mean().reset_index()
agg_df.to_csv("processed_dataset_withhealthy.csv", index=False)

print('dataset produced')


dataset produced
