In [33]:
# initialize
from tqdm import tqdm
from time import sleep

import glob
import parselmouth
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import statistics
import speechpy

In [113]:
wav_file = "/Users/leochoo/dev/VoiceDisorderSVM/data/SVD/test_audio/healthy/1-a_h.wav"
sound = parselmouth.Sound(wav_file) # sound object from wav file

In [114]:
# MFCC
mfcc_object = sound.to_mfcc(number_of_coefficients=12) # from 0 to 12
mfcc_object

<parselmouth.MFCC at 0x132ef8fb0>

In [115]:
mfcc_arr = mfcc_object.to_array()
len(mfcc_arr)
statistics.mean(mfcc_arr[0])

1740.965241853779

In [116]:
mfcc_dic = {}
for i in range(0,len(mfcc_arr)):
    mfcc_dic["MFCC-"+str(i)] = [statistics.mean(mfcc_arr[i])]
mfcc_df = pd.DataFrame.from_dict(mfcc_dic)

In [117]:
mfcc_df

Unnamed: 0,MFCC-0,MFCC-1,MFCC-2,MFCC-3,MFCC-4,MFCC-5,MFCC-6,MFCC-7,MFCC-8,MFCC-9,MFCC-10,MFCC-11,MFCC-12
0,1740.965242,396.361323,-116.619975,-21.260483,-203.293634,-91.003209,-165.913936,-92.061193,27.434481,12.731777,-78.369763,-37.069225,-9.464482


In [118]:
"""
    File type = "ooTextFile"
    Object class = "MFCC 1"

    xmin = 0 
    xmax = 1.99352 
    nx = 393 
    dx = 0.005
    x1 = 0.01676000000000005 
    fmin = 0 
    fmax = 4100 
    maximumNumberOfCoefficients = 39 
    
    
    
    
"""
# This result was generated by praat. Here we can see
# how much frame_length PRAAT used (dx = 0.005)
# as well as checking duration etc.

'\n    File type = "ooTextFile"\n    Object class = "MFCC 1"\n\n    xmin = 0 \n    xmax = 1.99352 \n    nx = 393 \n    dx = 0.005\n    x1 = 0.01676000000000005 \n    fmin = 0 \n    fmax = 4100 \n    maximumNumberOfCoefficients = 39 \n    \n    \n    \n    \n'

In [153]:
# another way to check duration etc.
import wave
import contextlib
fname = wav_file
with contextlib.closing(wave.open(fname,'r')) as f:
    frames = f.getnframes()
    print(frames)
    rate = f.getframerate()
    print(rate)
    duration = frames / float(rate)
    print(duration)

99676
50000
1.99352


In [154]:
1.99352 / (99676/393)

0.007859999999999999

In [233]:
# calculate mfcc in new way
# read sound file
from scipy.io import wavfile
samplerate, data = wavfile.read(wav_file)
mfcc2 = speechpy.feature.mfcc(data, samplerate, num_cepstral = 12)
mfcc2

array([[ 2.30788011e+01,  8.74297775e+00, -4.73649223e+00, ...,
        -9.84282599e-01, -7.10952178e-01, -1.06898944e+00],
       [ 2.29972951e+01,  1.48815732e+01, -2.43431791e-01, ...,
         1.21816204e+00, -9.13094974e-01, -5.63639552e-01],
       [ 2.30777944e+01,  1.15861942e+01, -9.19905308e-01, ...,
         1.56295563e-02, -1.24734982e+00, -1.87599291e+00],
       ...,
       [ 2.38572851e+01,  9.78821675e+00, -2.12699465e+00, ...,
        -1.19123742e+00, -1.11752969e+00, -1.43835095e+00],
       [ 2.39061312e+01,  1.04396195e+01, -1.55103684e+00, ...,
        -1.58721547e+00, -1.77412958e+00, -2.02998529e+00],
       [ 2.39312181e+01,  1.31300100e+01, -9.50688736e-01, ...,
        -9.65537263e-02, -9.75815037e-01, -1.81499980e+00]])

In [234]:
mfcc2 = mfcc2.T
mfcc2

array([[ 2.30788011e+01,  2.29972951e+01,  2.30777944e+01, ...,
         2.38572851e+01,  2.39061312e+01,  2.39312181e+01],
       [ 8.74297775e+00,  1.48815732e+01,  1.15861942e+01, ...,
         9.78821675e+00,  1.04396195e+01,  1.31300100e+01],
       [-4.73649223e+00, -2.43431791e-01, -9.19905308e-01, ...,
        -2.12699465e+00, -1.55103684e+00, -9.50688736e-01],
       ...,
       [-9.84282599e-01,  1.21816204e+00,  1.56295563e-02, ...,
        -1.19123742e+00, -1.58721547e+00, -9.65537263e-02],
       [-7.10952178e-01, -9.13094974e-01, -1.24734982e+00, ...,
        -1.11752969e+00, -1.77412958e+00, -9.75815037e-01],
       [-1.06898944e+00, -5.63639552e-01, -1.87599291e+00, ...,
        -1.43835095e+00, -2.02998529e+00, -1.81499980e+00]])

In [220]:
mfcc2_dic = {}
for i in range(0,len(mfcc2)):
    mfcc2_dic["MFCC-"+str(i)] = [statistics.mean(mfcc2[i])]
mfcc_df2 = pd.DataFrame.from_dict(mfcc2_dic)

In [221]:
mfcc_df2

Unnamed: 0,MFCC-0,MFCC-1,MFCC-2,MFCC-3,MFCC-4,MFCC-5,MFCC-6,MFCC-7,MFCC-8,MFCC-9,MFCC-10,MFCC-11
0,24.244754,12.166728,-1.612134,-3.714115,-4.39273,-3.172721,0.249254,1.260676,-1.067323,-0.009724,-0.904917,-1.127606


In [222]:
derivatives = speechpy.feature.extract_derivative_feature(mfcc2)
derivatives

array([[[ 2.30788011e+01,  6.91528839e+00,  2.07761724e+00],
        [ 2.29972951e+01,  6.93334028e+00,  2.07901441e+00],
        [ 2.30777944e+01,  6.92141606e+00,  2.08638746e+00],
        ...,
        [ 2.38572851e+01,  7.17685674e+00,  2.15380963e+00],
        [ 2.39061312e+01,  7.17936542e+00,  2.15380963e+00],
        [ 2.39312181e+01,  7.17936542e+00,  2.15380963e+00]],

       [[ 8.74297775e+00,  3.80539616e+00,  1.08530465e+00],
        [ 1.48815732e+01,  3.18280231e+00,  1.09796030e+00],
        [ 1.15861942e+01,  3.83512212e+00,  9.56024543e-01],
        ...,
        [ 9.78821675e+00,  3.66996394e+00,  1.18170090e+00],
        [ 1.04396195e+01,  3.93900299e+00,  1.18170090e+00],
        [ 1.31300100e+01,  3.93900299e+00,  1.18170090e+00]],

       [[-4.73649223e+00, -2.08324241e-01, -1.07772544e-01],
        [-2.43431791e-01, -7.48978536e-01, -2.90183155e-02],
        [-9.19905308e-01, -1.64373452e-01, -1.64223812e-01],
        ...,
        [-2.12699465e+00, -3.45241431e-01,

In [223]:
derivatives[0][0]

array([23.07880106,  6.91528839,  2.07761724])

In [224]:
derivatives.shape

(12, 197, 3)

In [225]:
derivatives[0]

array([[23.07880106,  6.91528839,  2.07761724],
       [22.99729513,  6.93334028,  2.07901441],
       [23.07779438,  6.92141606,  2.08638746],
       [23.12780423,  6.93436403,  2.09204241],
       [23.04317818,  6.96475528,  2.09876043],
       [23.15023105,  6.97783443,  2.10592243],
       [23.24866086,  7.00488495,  2.10801323],
       [23.26484174,  7.02716966,  2.11120002],
       [23.39200388,  7.02648131,  2.11904178],
       [23.43984638,  7.04275946,  2.12527837],
       [23.41248333,  7.07382916,  2.12891184],
       [23.50755566,  7.0894773 ,  2.13374555],
       [23.61536796,  7.09982054,  2.14035228],
       [23.6397025 ,  7.1188175 ,  2.14356248],
       [23.67925143,  7.14235267,  2.14644727],
       [23.75446178,  7.14663607,  2.15326837],
       [23.83453248,  7.1589183 ,  2.15979279],
       [23.81591413,  7.18688272,  2.1612508 ],
       [23.88663442,  7.2055226 ,  2.16540787],
       [23.99109639,  7.20349269,  2.17443514],
       [24.03206481,  7.22529301,  2.182

In [246]:
derivaties[2].T

array([[-4.73649223e+00, -2.43431791e-01, -9.19905308e-01,
        -3.28494002e+00,  8.20602751e-01, -7.24825634e-01,
        -3.58592035e+00, -2.01852638e-01, -1.25897212e+00,
        -5.67303985e+00, -1.37496531e+00, -7.41417609e-01,
        -4.88671828e-01, -3.07444572e+00, -3.71985386e+00,
        -1.96184293e+00, -3.88376182e+00, -1.83710633e+00,
        -1.03254430e+00, -1.91223373e+00, -3.65007424e+00,
        -1.04321469e+00, -1.54693490e+00, -2.95324544e+00,
        -3.00363373e+00, -2.11203842e+00, -1.70826021e+00,
        -2.96219216e+00, -3.76915106e+00, -2.71608121e+00,
        -7.00190932e-01,  2.75099695e-01, -1.67217423e+00,
        -1.85921513e+00, -6.53882697e-01,  5.01959772e-01,
        -1.55610694e+00, -1.77532504e+00, -4.20227081e-01,
         1.70867480e+00,  1.47914081e-01, -2.43849704e-01,
        -2.62206483e-01, -1.59080160e-01, -1.50701145e+00,
        -1.83244943e+00, -1.22643909e+00, -4.45750093e+00,
        -7.33683855e-01,  2.18670696e-01, -1.25050812e-0

In [227]:

n = 0
d1 = 0
d2 = 0
mfcc_n = {}
mfcc_d1 = {}
mfcc_d2 = {}
for i in range(0,len(derivatives)):
    ders = derivatives[i].T
    n = [statistics.mean(ders[0])]
    d1 = [statistics.mean(ders[1])]
    d2 = [statistics.mean(ders[2])]
    mfcc_n["MFCC-"+str(i)] = [n]
    mfcc_d1["MFCC-"+str(i)+"d1"] = [d1]
    mfcc_d2["MFCC-"+str(i)+"d2"] = [d2]

mfcc_n


{'MFCC-0': [[24.244754195669987]],
 'MFCC-1': [[12.166728063837073]],
 'MFCC-2': [[-1.6121335543152482]],
 'MFCC-3': [[-3.714115117099929]],
 'MFCC-4': [[-4.392730498418738]],
 'MFCC-5': [[-3.172721306054781]],
 'MFCC-6': [[0.2492538042448968]],
 'MFCC-7': [[1.2606760886796164]],
 'MFCC-8': [[-1.0673234737136623]],
 'MFCC-9': [[-0.009724236558704113]],
 'MFCC-10': [[-0.9049170610691147]],
 'MFCC-11': [[-1.1276057464309042]]}

In [243]:
mfcc_d1

{'MFCC-0_d1': [7.275672500766753],
 'MFCC-1_d1': [3.654920942035056],
 'MFCC-2_d1': [-0.47859291066175846],
 'MFCC-3_d1': [-1.113300654481982],
 'MFCC-4_d1': [-1.3179514660315423],
 'MFCC-5_d1': [-0.9518270581802245],
 'MFCC-6_d1': [0.07877585209045992],
 'MFCC-7_d1': [0.3824957185207147],
 'MFCC-8_d1': [-0.32085672098766765],
 'MFCC-9_d1': [-0.002900136404547636],
 'MFCC-10_d1': [-0.2719421379662991],
 'MFCC-11_d1': [-0.34068819680453927]}

In [244]:
mfcc_d2

{'MFCC-0_d2': [2.183353669700574],
 'MFCC-1_d2': [1.09744746120549],
 'MFCC-2_d2': [-0.14322411853392167],
 'MFCC-3_d2': [-0.334611926850708],
 'MFCC-4_d2': [-0.39698219745476404],
 'MFCC-5_d2': [-0.2865521160478647],
 'MFCC-6_d2': [0.023913258250628285],
 'MFCC-7_d2': [0.11513205243507159],
 'MFCC-8_d2': [-0.0972076580848311],
 'MFCC-9_d2': [-0.0009516497292659438],
 'MFCC-10_d2': [-0.08148646661585221],
 'MFCC-11_d2': [-0.10245783376917267]}

In [229]:
# first get mfcc -> then use that data to get derivatives

In [242]:
# calculate mfcc in new way
# read sound file
import statistics
import speechpy
from scipy.io import wavfile

samplerate, data = wavfile.read(wav_file)
mfcc = speechpy.feature.mfcc(data, samplerate, num_cepstral = 12)
mfcc = mfcc.T # transform to handle data easily
derivatives = speechpy.feature.extract_derivative_feature(mfcc)

n = 0
d1 = 0
d2 = 0
mfcc_n = {}
mfcc_d1 = {}
mfcc_d2 = {}
for i in range(0,len(derivatives)):
    ders = derivatives[i].T # transform to handle data easily
    n = [statistics.mean(ders[0])]
    d1 = [statistics.mean(ders[1])]
    d2 = [statistics.mean(ders[2])]
    mfcc_n["MFCC-"+str(i)] = n
    mfcc_d1["MFCC-"+str(i)+"_d1"] = d1
    mfcc_d2["MFCC-"+str(i)+"_d2"] = d2
    
    mfcc_n_df = pd.DataFrame.from_dict(mfcc_n)
    mfcc_d1_df = pd.DataFrame.from_dict(mfcc_d1)
    mfcc_d2_df = pd.DataFrame.from_dict(mfcc_d2)
mfcc_n_df

Unnamed: 0,MFCC-0,MFCC-1,MFCC-2,MFCC-3,MFCC-4,MFCC-5,MFCC-6,MFCC-7,MFCC-8,MFCC-9,MFCC-10,MFCC-11
0,24.244754,12.166728,-1.612134,-3.714115,-4.39273,-3.172721,0.249254,1.260676,-1.067323,-0.009724,-0.904917,-1.127606


In [239]:
mfcc_d1

{'MFCC-0_d1': [[7.275672500766753]],
 'MFCC-1_d1': [[3.654920942035056]],
 'MFCC-2_d1': [[-0.47859291066175846]],
 'MFCC-3_d1': [[-1.113300654481982]],
 'MFCC-4_d1': [[-1.3179514660315423]],
 'MFCC-5_d1': [[-0.9518270581802245]],
 'MFCC-6_d1': [[0.07877585209045992]],
 'MFCC-7_d1': [[0.3824957185207147]],
 'MFCC-8_d1': [[-0.32085672098766765]],
 'MFCC-9_d1': [[-0.002900136404547636]],
 'MFCC-10_d1': [[-0.2719421379662991]],
 'MFCC-11_d1': [[-0.34068819680453927]]}

In [240]:
mfcc_d2

{'MFCC-0_d2': [[2.183353669700574]],
 'MFCC-1_d2': [[1.09744746120549]],
 'MFCC-2_d2': [[-0.14322411853392167]],
 'MFCC-3_d2': [[-0.334611926850708]],
 'MFCC-4_d2': [[-0.39698219745476404]],
 'MFCC-5_d2': [[-0.2865521160478647]],
 'MFCC-6_d2': [[0.023913258250628285]],
 'MFCC-7_d2': [[0.11513205243507159]],
 'MFCC-8_d2': [[-0.0972076580848311]],
 'MFCC-9_d2': [[-0.0009516497292659438]],
 'MFCC-10_d2': [[-0.08148646661585221]],
 'MFCC-11_d2': [[-0.10245783376917267]]}