In [6]:
# this is the playground for fixing mfcc addition

# initialize
from tqdm import tqdm
from time import sleep

import glob
import parselmouth
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# needed for mfcc calculation
import statistics
import speechpy
from scipy.io import wavfile

In [7]:
# Process wav files to get Jitter, Shimmer, HNR, and MFCC

def get_voice_data(_path):
    # select .wav files only
    wav_files = glob.glob(_path + "/*.wav")
    _type = _path.split("/")[-1] # identify type: my_data, healthy, functional etc...
    
    # list to hold voice data before turning it into a dataframe
    data = []
    
    # for each audio file,
    for wav_file in tqdm(wav_files): # tqdm shows the progress bar
        sound = parselmouth.Sound(wav_file) # sound object from wav file
        pitch = sound.to_pitch()
        pulses = parselmouth.praat.call([sound, pitch], "To PointProcess (cc)")

        # name analysis
        name = os.path.basename(wav_file).split(".")[0]  

        ## tone
        tone = ""
        if "l" in name:
            tone = "l"
        elif "n" in name:
            tone = "n"
        elif "h" in name:
            tone = "h"

        ## syllable
        syllab = ""
        if "a" in name:
            syllab = "a"
        elif "i" in name:
            syllab = "i"
        elif "u" in name:
            syllab = "u"

        # jitter
        jitter = parselmouth.praat.call(pulses, "Get jitter (local)", 0.0, 0.0, 0.0001, 0.02, 1.3) * 100

        # shimmer
        shimmer = parselmouth.praat.call([sound, pulses], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

        # HNR
        harmonicity = parselmouth.praat.call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
        hnr = parselmouth.praat.call(harmonicity, "Get mean", 0, 0)

        # append a bit before adding mfcc
        data_row = [name, _type, tone, syllab, jitter, shimmer, hnr]

        # MFCC, d1, d2
        samplerate, wav_data = wavfile.read(wav_file)
        mfccs = speechpy.feature.mfcc(wav_data, samplerate, num_cepstral = 12)
        mfccs = mfccs.T # transform to handle wav_data easily 
        derivatives = speechpy.feature.extract_derivative_feature(mfccs) # this now looks like: [c#][frame#][[mfcc, d1, d2]]

        mfcc_list = []
        mfcc_d1 = []
        mfcc_d2 = []

        # for each coefficient,
        for i in range(0, len(derivatives)):
            mfcc_vars = derivatives[i].T # mfcc, d1, d2

            # take the average across the entire time frame
            mfcc = statistics.mean(mfcc_vars[0])
            d1 = statistics.mean(mfcc_vars[1])
            d2 = statistics.mean(mfcc_vars[2])

            # append to the list
            mfcc_list.append(mfcc)
            mfcc_d1.append(d1)
            mfcc_d2.append(d2)

        data_row = data_row + mfcc_list + mfcc_d1 + mfcc_d2

        # append to data
        data.append(data_row)
        
    return data

In [8]:
def analyze_svd(dataset_path):
    # set up dataframe info
    columns = ["Name", "Type", "Tone", "Syllab", "Jitter", "Shimmer", "HNR"]
    for i in range(0,12):
        columns.append("MFCC-"+str(i))
    for i in range(0,12):
        columns.append("MFCC-"+str(i)+"_d1")
    for i in range(0,12):
        columns.append("MFCC-"+str(i)+"_d2")
    
    healthy = get_voice_data(dataset_path + "/healthy")
    functional = get_voice_data(dataset_path + "/pathological/functional")
    hyperfunctional = get_voice_data(dataset_path + "/pathological/hyperfunctional")
    organic = get_voice_data(dataset_path + "/pathological/organic")
    psychogenic = get_voice_data(dataset_path + "/pathological/psychogenic")

    # Combine the results into one dataframe
    combined = healthy + functional + hyperfunctional + organic + psychogenic
    df = pd.DataFrame(combined, columns=columns)
    return df


In [9]:
# filepath for the test and train datasets
test_path = "/Users/leochoo/dev/VoiceDisorderSVM/data/SVD/test_audio"
train_path = "/Users/leochoo/dev/VoiceDisorderSVM/data/SVD/train_audio"

In [10]:
# generate voice report for test dataset
test_report = analyze_svd(test_path)
test_report.shape

100%|██████████| 18/18 [00:02<00:00,  7.37it/s]
100%|██████████| 18/18 [00:01<00:00,  9.86it/s]
100%|██████████| 18/18 [00:02<00:00,  6.54it/s]
100%|██████████| 18/18 [00:02<00:00,  7.71it/s]
100%|██████████| 27/27 [00:03<00:00,  8.61it/s]


(99, 43)

In [11]:
test_report

Unnamed: 0,Name,Type,Tone,Syllab,Jitter,Shimmer,HNR,MFCC-0,MFCC-1,MFCC-2,...,MFCC-2_d2,MFCC-3_d2,MFCC-4_d2,MFCC-5_d2,MFCC-6_d2,MFCC-7_d2,MFCC-8_d2,MFCC-9_d2,MFCC-10_d2,MFCC-11_d2
0,1-i_l,healthy,l,i,0.238779,0.019045,19.410768,19.726412,4.980335,0.364373,...,0.031771,0.950529,0.207430,-0.365490,-0.002616,0.095770,-0.169743,0.089624,-0.058871,-0.158737
1,2-u_h,healthy,h,u,0.349111,0.023441,28.655604,24.231631,14.833635,5.132610,...,0.452091,-0.004018,-0.117941,-0.168529,-0.045855,-0.026354,-0.046411,-0.096588,-0.154689,-0.104487
2,1-i_n,healthy,n,i,0.209544,0.007423,26.996682,22.370851,9.131600,1.832111,...,0.166658,0.585586,0.135321,-0.225097,-0.087052,0.182036,-0.260597,-0.084150,-0.037017,-0.158376
3,2-u_l,healthy,l,u,1.069854,0.041115,25.108378,22.546297,14.963159,5.392483,...,0.478775,0.235980,-0.043400,-0.060782,-0.008986,-0.034862,0.064217,0.006862,-0.062482,0.004279
4,2-u_n,healthy,n,u,0.413457,0.031538,24.573556,22.691574,14.178967,5.598100,...,0.509009,0.185690,-0.010915,-0.085739,-0.056782,-0.037253,0.028907,0.041877,-0.035037,0.023320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,151-a_l,psychogenic,l,a,0.100698,0.010602,32.236174,23.574362,15.892063,-2.114620,...,-0.179323,-0.369967,-0.108087,0.108928,0.135450,0.083520,-0.140160,-0.111718,-0.050962,0.014016
95,366-u_n,psychogenic,n,u,0.201356,0.015767,31.531414,22.511361,19.340193,8.954125,...,0.790288,0.167677,-0.153999,-0.169101,-0.167884,0.034443,0.004447,0.011665,-0.033949,-0.111250
96,741-a_h,psychogenic,h,a,0.281613,0.026882,28.802297,24.383547,15.528880,1.116797,...,0.104059,-0.186318,-0.146550,0.081214,0.197146,0.141567,-0.099794,-0.178773,-0.165446,-0.054257
97,366-u_l,psychogenic,l,u,0.389981,0.039966,23.895287,20.848472,14.195569,5.796031,...,0.518428,0.280540,0.100661,0.036428,-0.025210,0.028554,0.010087,0.016879,0.001489,-0.045411


In [12]:
# generate voice report for train dataset
train_report = analyze_svd(train_path)
train_report.shape

100%|██████████| 3141/3141 [06:03<00:00,  8.65it/s]
100%|██████████| 990/990 [01:48<00:00,  9.16it/s]
100%|██████████| 1898/1898 [03:14<00:00,  9.78it/s]
100%|██████████| 891/891 [01:26<00:00, 10.26it/s]
100%|██████████| 792/792 [01:11<00:00, 11.02it/s]


(7712, 43)

In [13]:
# Save the outputs to the processed data directory
test_report.to_csv ("./data/processed/test_SVD_j_s_hnr_mfcc_with_d1d2.csv", index = False, header=True)
print("Test data exported")
train_report.to_csv ("./data/processed/train_SVD_j_s_hnr_mfcc_with_d1d2.csv", index = False, header=True)
print("Train data exported")




Test data exported
Train data exported


In [8]:
# 20201105 
# so i recognized the problem with mfcc calculation so I'm re-doing it correctly.

# 1105 09:02 now generating new dataset with the correct average mfcc value. no d1 d2 included here.

In [None]:
# 1109 08:55 refactoring done and testing