In [1]:
# initialize
from tqdm import tqdm
from time import sleep

import glob
import parselmouth
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# filepath for the dataset
root = "/Users/leochoo/dev/GP2-dev/SVD"
# root = "./testSVD"

my_data_path = root + "/my_data"
healthy_path = root + "/healthy"
patho_path = root + "/pathological"

!ls {patho_path}



[1m[36mfunctional[m[m      [1m[36mhyperfunctional[m[m [1m[36morganic[m[m         [1m[36mpsychogenic[m[m


In [2]:
# include MFCC data as well

def get_voice_data(_path):
    # select .wav files only
    wav_files = glob.glob(_path + "/*.wav")

    n_list = []
    tone_list = []
    syllab_list = []

    j_list = []
    s_list = []
    h_list = []

    # for wav_file in wav_files:
    for wav_file in tqdm(wav_files): # tqdm shows the progress bar
        sound = parselmouth.Sound(wav_file) # sound object from wav file
        pitch = sound.to_pitch()
        pulses = parselmouth.praat.call([sound, pitch], "To PointProcess (cc)")

        # name analysis
        name = os.path.basename(wav_file).split(".")[0]  
        
        ## tone
        if "l" in name:
            tone_list.append("l")
        elif "n" in name:
            tone_list.append("n")
        elif "h" in name:
            tone_list.append("h")

        ## syllable
        if "a" in name:
            syllab_list.append("a")
        elif "i" in name:
            syllab_list.append("i")
        elif "u" in name:
            syllab_list.append("u")
        # jitter
        jitter_local = parselmouth.praat.call(pulses, "Get jitter (local)", 0.0, 0.0, 0.0001, 0.02, 1.3) * 100

        # shimmer
        shimmer_local = parselmouth.praat.call([sound, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        # HNR
        harmonicity = parselmouth.praat.call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
        hnr = parselmouth.praat.call(harmonicity, "Get mean", 0, 0)
        
        # Append to numpy array
        n_list.append(name)
        j_list.append(jitter_local)
        s_list.append(shimmer_local)
        h_list.append(hnr)

        # MFCC
        mfcc_object = sound.to_mfcc(number_of_coefficients=13)
        mfcc_arr = mfcc_object.to_array()
        mfcc_dic = {}
        for i in range(1,len(mfcc_arr)):
            mfcc_dic["MFCC-"+str(i)] = mfcc_arr[i]
        mfcc_df = pd.DataFrame.from_dict(mfcc_dic)

    # create dataframe
    df = pd.DataFrame({"Name":pd.Series(n_list),
                        "Type": np.nan,
                        "Tone": pd.Series(tone_list),
                        "Syllab": pd.Series(syllab_list),
                           "Jitter":pd.Series(j_list),
                           "Shimmer":pd.Series(s_list),
                           "HNR":pd.Series(h_list)})
    df["Type"]= _path.split("/")[-1] # identify type: my_data, healthy, functional etc...
    new_df = pd.concat([df, mfcc_df], axis=1, sort=False)
    new_df = new_df.dropna() # some data are missing jitter, shimmer, hnr for some reason it seems..?
    return new_df


In [None]:
healthy_df = get_voice_data(healthy_path)
functional_df = get_voice_data(patho_path + "/functional")
hyperfunctional_df = get_voice_data(patho_path + "/hyperfunctional")
organic_df = get_voice_data(patho_path + "/organic")
psychogenic_df = get_voice_data(patho_path + "/psychogenic")

100%|██████████| 3141/3141 [03:58<00:00, 13.16it/s]
100%|██████████| 1008/1008 [01:13<00:00, 13.70it/s]
 22%|██▏       | 418/1916 [00:31<01:53, 13.16it/s]

In [None]:
new_df = pd.concat(frames)
new_df = new_df.dropna()
new_df

In [None]:
new_df.to_csv ("./SVD_j_s_hnr_mfcc.csv", index = False, header=True)

## 2020-07-28 
It is mysterious how I'm getting a bunch of NaN when I run the functiosn above. It seems that when i run without MFCC, healthy data is fine but when i run with MFCC it is not okay.

I need to look into this. But for today, I will just cut out all the NaN values and proceed with it.