In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import researchpy as rp
import scipy.stats as stats
import statsmodels.stats.multicomp as mc

In [21]:
# Root folder of the vowel formant data. get_all_formants lists its sub-folders
# (one per label) and reads every file inside each of them.
# The trailing slash is kept so that plain string concatenation onto this path works.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATA_PATH="/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Vowel_Data/Formant/"

In [22]:
def get_formant(file_path):
    """Read one whitespace-delimited formant listing into a DataFrame.

    The file is read without a header row and its six columns are named
    ``time`` and ``F1`` .. ``F5``. Every column is parsed as ``float64`` and
    the literal token ``--undefined--`` is converted to NaN.

    Parameters
    ----------
    file_path : str or path-like
        Location of the formant file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns
        ``["time", "F1", "F2", "F3", "F4", "F5"]``.
    """
    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
    return pd.read_csv(
        file_path,
        sep=r"\s+",
        header=None,
        names=["time", "F1", "F2", "F3", "F4", "F5"],
        dtype=np.float64,
        na_values="--undefined--",
    )

def get_all_formants(path=DATA_PATH, target_label=0):
    """Build the formant feature table from the label folders under ``path``.

    Every sub-folder of ``path`` (taken in sorted order) is one label, encoded
    by its position in that order. Each file inside a folder is read with
    ``get_formant`` and extended with:

    * ``F1_diff2`` .. ``F5_diff2``: absolute second difference along the rows,
      ``|F[i] - 2*F[i-1] + F[i-2]|``;
    * ``disp(F5-F1)``, ``disp(F4-F3)``, ``disp(F5-F3)``, ``disp(F5-F4)``:
      differences between the named formant columns.

    Rows containing any NaN are dropped. That always removes the first two
    rows of each file (no second difference exists there) and also any row in
    which a formant value is missing.

    Parameters
    ----------
    path : str
        Folder containing one sub-folder per label.
    target_label : int or None, optional
        Encoded label whose rows are returned. The default ``0`` (first folder
        in sorted order) keeps the previous behaviour. ``None`` returns the
        rows of every label.

    Returns
    -------
    pandas.DataFrame
        The feature columns only; ``time`` and ``label`` are removed.

    Raises
    ------
    ValueError
        If no formant file is found in any sub-folder of ``path``.
    """
    diff2_columns = ['F1_diff2', 'F2_diff2', 'F3_diff2', 'F4_diff2', 'F5_diff2']
    labels = sorted(os.listdir(path))
    folder_frames = []  # one frame per label folder, concatenated once at the end

    for label_index, label in enumerate(labels):
        label_dir = os.path.join(path, label)
        formant_files = [os.path.join(label_dir, file) for file in sorted(os.listdir(label_dir))]

        file_frames = []
        for file in tqdm(formant_files, "Reading Formants of label -'{}'".format(label)):
            individual_formant = get_formant(file_path=file)

            # Second difference over consecutive rows. The slices are empty for
            # files shorter than three rows, so such files simply yield no rows.
            formants = individual_formant.loc[:, 'F1':'F5'].values
            diff2 = pd.DataFrame(
                np.abs(formants[2:] - 2 * formants[1:-1] + formants[:-2]),
                index=individual_formant.index[2:],
                columns=diff2_columns,
            )

            # Aligning on the index leaves NaN in the first two rows, which
            # dropna() then removes together with rows holding missing formants.
            with_diff2 = pd.concat([individual_formant, diff2], axis=1)
            file_frames.append(with_diff2.dropna())

        if not file_frames:
            continue  # empty label folder: nothing to add for this label

        folder_formant = pd.concat(file_frames, ignore_index=True)

        folder_formant['disp(F5-F1)'] = folder_formant['F5'] - folder_formant['F1']
        folder_formant['disp(F4-F3)'] = folder_formant['F4'] - folder_formant['F3']
        folder_formant['disp(F5-F3)'] = folder_formant['F5'] - folder_formant['F3']
        folder_formant['disp(F5-F4)'] = folder_formant['F5'] - folder_formant['F4']

        folder_formant['label'] = label_index  # integer encoding of the folder

        folder_frames.append(folder_formant)

    if not folder_frames:
        raise ValueError("No formant files found under {!r}".format(path))

    all_formant = pd.concat(folder_frames, ignore_index=True)

    # Filter once, after everything is loaded, instead of on every iteration.
    if target_label is not None:
        all_formant = all_formant.loc[all_formant['label'] == target_label]

    return all_formant.drop(labels=['time', 'label'], axis=1)

In [23]:
# Build the vowel feature table. Called without arguments, get_all_formants uses
# the path that was bound as its default when the function was defined and
# returns only the rows of label 0 (the first folder in sorted order).
vowel_formant_features=get_all_formants()
vowel_formant_features #bare expression: shown as this cell's output

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 136.08it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 111.22it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 133.76it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 126.66it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 106.43it/s]
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 102.20it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 135.49it/s]


Unnamed: 0,F1,F2,F3,F4,F5,F1_diff2,F2_diff2,F3_diff2,F4_diff2,F5_diff2,disp(F5-F1),disp(F4-F3),disp(F5-F3),disp(F5-F4)
0,670.76,1514.62,2428.13,3774.71,4719.93,20.25,21.72,96.00,2.56,64.24,4049.17,1346.58,2291.80,945.22
1,666.49,1524.23,2439.99,3764.21,4651.84,23.46,8.50,49.74,63.77,1.82,3985.35,1324.22,2211.85,887.63
2,667.40,1524.64,2409.82,3699.78,4696.84,5.18,9.20,42.03,53.93,113.09,4029.44,1289.96,2287.02,997.06
3,670.96,1467.74,2412.33,3627.49,4665.41,2.65,57.31,32.68,7.86,76.43,3994.45,1215.16,2253.08,1037.92
4,672.48,1450.46,2466.67,3667.72,4667.02,2.04,39.62,51.83,112.52,33.04,3994.54,1201.05,2200.35,999.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1479,0.00,903.93,1740.54,2612.79,4020.24,0.00,8.87,21.93,125.97,23.07,4020.24,872.25,2279.70,1407.45
1480,0.00,904.00,1684.15,2390.51,3964.65,0.00,0.16,43.51,65.67,60.71,3964.65,706.36,2280.50,1574.14
1481,811.92,1608.91,2075.06,3689.59,5331.79,811.92,704.84,447.30,1521.36,1422.73,4519.87,1614.53,3256.73,1642.20
1482,926.13,1614.04,2100.94,3922.63,5500.00,697.71,699.78,365.03,1066.04,1198.93,4573.87,1821.69,3399.06,1577.37


In [7]:
#rp.summary_cont(vowel_formant_features)
# stats.f_oneway(vowel_formant_features['F1'].tolist(),
#                vowel_formant_features['F2'].tolist(),
#                vowel_formant_features['F3'].tolist(),
#                vowel_formant_features['F4'].tolist(),
#                vowel_formant_features['F5'].tolist(),
#                vowel_formant_features['F1_diff2'].tolist(),
#                vowel_formant_features['F2_diff2'].tolist(),
#                vowel_formant_features['F3_diff2'].tolist(),
#                vowel_formant_features['F4_diff2'].tolist(),
#                vowel_formant_features['F5_diff2'].tolist(),
#                vowel_formant_features['disp(F5-F1)'].tolist(),
#                vowel_formant_features['disp(F4-F3)'].tolist(),
#                vowel_formant_features['disp(F5-F3)'].tolist(),
#                vowel_formant_features['disp(F5-F4)'].tolist()
#               )

In [24]:
#flatten the first fourteen feature columns, one column after another, into a single list
data=[
    value
    for column_position in range(14)
    for value in vowel_formant_features.iloc[:,column_position].tolist()
]

In [25]:
#group labels matching the flattened feature values: each feature name is repeated once per row of the table.
#The names are read from the frame's own columns (the same first fourteen, in the same order)
#instead of being typed by hand, so a label can never end up attached to the wrong column.
group=np.repeat(list(vowel_formant_features.columns[:14]), repeats=len(vowel_formant_features.index))

In [26]:
# Sanity check: there must be exactly one group label per value.
# This only compares lengths; it does not verify that labels and values are in the same order.
assert len(data)==len(group)

In [27]:
# Tukey HSD pairwise comparison of the feature groups.
# alpha=0.05 is the library default, written out so the level is visible here.
tukey = mc.pairwise_tukeyhsd(data, group, alpha=0.05)
print(tukey)

         Multiple Comparison of Means - Tukey HSD, FWER=0.05          
   group1      group2    meandiff  p-adj    lower      upper    reject
----------------------------------------------------------------------
         F1    F1_diff2  -507.3525  0.001  -553.4237  -461.2814   True
         F1          F2   541.3207  0.001   495.2495   587.3918   True
         F1    F2_diff2  -473.3003  0.001  -519.3715  -427.2292   True
         F1          F3  1901.8295  0.001  1855.7583  1947.9006   True
         F1    F3_diff2  -438.4433  0.001  -484.5144  -392.3721   True
         F1          F4  3089.5975  0.001  3043.5264  3135.6687   True
         F1    F4_diff2  -401.8783  0.001  -447.9495  -355.8072   True
         F1          F5  4054.0216  0.001  4007.9505  4100.0928   True
         F1    F5_diff2  -378.7628  0.001  -424.8339  -332.6916   True
         F1 disp(F4-F3)    604.041  0.001   557.9698   650.1121   True
         F1 disp(F5-F1)  3470.2946  0.001  3424.2234  3516.3657   True
      

In [7]:
# Suffix appended to the 'vowel' / 'word' prefixes when the Tukey result CSVs are written.
# NOTE(review): presumably "0" is the analysed label and "0.05" the FWER level - confirm.
save_specifier='0_tukey_hsd_df_0.05'

In [31]:
# The results table stores its header as the first row and the comparisons after it.
vowel_table_rows=tukey._results_table.data
tukey_hsd_df=pd.DataFrame(data=vowel_table_rows[1:], columns=vowel_table_rows[0])

# Write the vowel comparison table next to the other results.
vowel_csv_path='/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Result/New_without_CNN/vowel'+save_specifier+'.csv'
tukey_hsd_df.to_csv(vowel_csv_path)

In [2]:
# Re-binds DATA_PATH to the root folder of the word formant data. The get_all_formants
# definition that follows captures this value as its default `path`.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
DATA_PATH="/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Word_Data/Formant/"

In [3]:
def get_formant(file_path):
    """Read one whitespace-delimited formant listing into a DataFrame.

    The file is read without a header row and its six columns are named
    ``time`` and ``F1`` .. ``F5``. Every column is parsed as ``float64`` and
    the literal token ``--undefined--`` is converted to NaN.

    Parameters
    ----------
    file_path : str or path-like
        Location of the formant file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns
        ``["time", "F1", "F2", "F3", "F4", "F5"]``.
    """
    # sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
    return pd.read_csv(
        file_path,
        sep=r"\s+",
        header=None,
        names=["time", "F1", "F2", "F3", "F4", "F5"],
        dtype=np.float64,
        na_values="--undefined--",
    )

def get_all_formants(path=DATA_PATH, target_label=0):
    """Build the formant feature table from the label folders under ``path``.

    Every sub-folder of ``path`` (taken in sorted order) is one label, encoded
    by its position in that order. Each file inside a folder is read with
    ``get_formant`` and extended with:

    * ``F1_diff2`` .. ``F5_diff2``: absolute second difference along the rows,
      ``|F[i] - 2*F[i-1] + F[i-2]|``;
    * ``disp(F5-F1)``, ``disp(F4-F3)``, ``disp(F5-F3)``, ``disp(F5-F4)``:
      differences between the named formant columns.

    Rows containing any NaN are dropped. That always removes the first two
    rows of each file (no second difference exists there) and also any row in
    which a formant value is missing.

    Parameters
    ----------
    path : str
        Folder containing one sub-folder per label.
    target_label : int or None, optional
        Encoded label whose rows are returned. The default ``0`` (first folder
        in sorted order) keeps the previous behaviour. ``None`` returns the
        rows of every label.

    Returns
    -------
    pandas.DataFrame
        The feature columns only; ``time`` and ``label`` are removed.

    Raises
    ------
    ValueError
        If no formant file is found in any sub-folder of ``path``.
    """
    diff2_columns = ['F1_diff2', 'F2_diff2', 'F3_diff2', 'F4_diff2', 'F5_diff2']
    labels = sorted(os.listdir(path))
    folder_frames = []  # one frame per label folder, concatenated once at the end

    for label_index, label in enumerate(labels):
        label_dir = os.path.join(path, label)
        formant_files = [os.path.join(label_dir, file) for file in sorted(os.listdir(label_dir))]

        file_frames = []
        for file in tqdm(formant_files, "Reading Formants of label -'{}'".format(label)):
            individual_formant = get_formant(file_path=file)

            # Second difference over consecutive rows. The slices are empty for
            # files shorter than three rows, so such files simply yield no rows.
            formants = individual_formant.loc[:, 'F1':'F5'].values
            diff2 = pd.DataFrame(
                np.abs(formants[2:] - 2 * formants[1:-1] + formants[:-2]),
                index=individual_formant.index[2:],
                columns=diff2_columns,
            )

            # Aligning on the index leaves NaN in the first two rows, which
            # dropna() then removes together with rows holding missing formants.
            with_diff2 = pd.concat([individual_formant, diff2], axis=1)
            file_frames.append(with_diff2.dropna())

        if not file_frames:
            continue  # empty label folder: nothing to add for this label

        folder_formant = pd.concat(file_frames, ignore_index=True)

        folder_formant['disp(F5-F1)'] = folder_formant['F5'] - folder_formant['F1']
        folder_formant['disp(F4-F3)'] = folder_formant['F4'] - folder_formant['F3']
        folder_formant['disp(F5-F3)'] = folder_formant['F5'] - folder_formant['F3']
        folder_formant['disp(F5-F4)'] = folder_formant['F5'] - folder_formant['F4']

        folder_formant['label'] = label_index  # integer encoding of the folder

        folder_frames.append(folder_formant)

    if not folder_frames:
        raise ValueError("No formant files found under {!r}".format(path))

    all_formant = pd.concat(folder_frames, ignore_index=True)

    # Filter once, after everything is loaded, instead of on every iteration.
    if target_label is not None:
        all_formant = all_formant.loc[all_formant['label'] == target_label]

    return all_formant.drop(labels=['time', 'label'], axis=1)

In [4]:
# Build the word feature table. Called without arguments, get_all_formants uses
# the path that was bound as its default when the function was defined and
# returns only the rows of label 0 (the first folder in sorted order).
word_formant_features=get_all_formants()
word_formant_features  # bare expression: shown as this cell's output

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 60.55it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 65.46it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 70.73it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 118.53it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 88.94it/s] 
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 131.61it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 120.50it/s]


Unnamed: 0,F1,F2,F3,F4,F5,F1_diff2,F2_diff2,F3_diff2,F4_diff2,F5_diff2,disp(F5-F1),disp(F4-F3),disp(F5-F3),disp(F5-F4)
0,0.00,307.27,2010.72,2676.97,4235.34,0.00,7.94,24.45,40.70,5.05,4235.34,666.25,2224.62,1558.37
1,0.00,312.13,1980.29,2661.43,4244.88,0.00,20.87,173.72,27.63,9.39,4244.88,681.14,2264.59,1583.45
2,0.00,312.40,1804.43,2631.69,4248.25,0.00,4.59,145.43,14.20,6.17,4248.25,827.26,2443.82,1616.56
3,0.00,298.91,1612.50,2594.02,4241.80,0.00,13.76,16.07,7.93,9.82,4241.80,981.52,2629.30,1647.78
4,0.00,281.93,1290.49,2548.34,4234.39,0.00,3.49,130.08,8.01,0.96,4234.39,1257.85,2943.90,1686.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,248.01,926.63,2432.12,3463.34,4091.41,248.01,926.63,1431.77,686.95,399.46,3843.40,1031.22,1659.29,628.07
3036,350.28,1292.04,3031.83,3696.69,4232.64,145.74,561.22,780.86,427.05,277.15,3882.36,664.86,1200.81,535.95
3037,397.92,1321.53,3118.21,3709.37,4221.05,54.63,335.92,513.33,220.67,152.82,3823.13,591.16,1102.84,511.68
3038,411.76,1343.89,3156.41,3702.88,4299.79,33.80,7.13,48.18,19.17,90.33,3888.03,546.47,1143.38,596.91


In [5]:
#Stack every feature column into one long list (column after column) and take the
#group labels from the frame's own column names, so the labels cannot fall out of
#step with the values and no column count has to be hard-coded.
feature_names_w=list(word_formant_features.columns)

data_w=[]
for feature_name in feature_names_w:
    data_w.extend(word_formant_features[feature_name].tolist())

#each feature name is repeated once per row of the table
group_w=np.repeat(feature_names_w, repeats=len(word_formant_features.index))

#sanity check: exactly one group label per value
assert len(data_w)==len(group_w)

#pairwise Tukey HSD between the feature groups
tukey_w = mc.pairwise_tukeyhsd(endog=data_w,groups=group_w)
print(tukey_w)

         Multiple Comparison of Means - Tukey HSD, FWER=0.05          
   group1      group2    meandiff  p-adj    lower      upper    reject
----------------------------------------------------------------------
         F1    F1_diff2  -301.5943  0.001   -336.018  -267.1707   True
         F1          F2   647.3214  0.001   612.8977    681.745   True
         F1    F2_diff2  -214.7054  0.001   -249.129  -180.2817   True
         F1          F3  2145.3383  0.001  2110.9146   2179.762   True
         F1    F3_diff2  -140.9978  0.001  -175.4215  -106.5742   True
         F1          F4  3283.9247  0.001   3249.501  3318.3483   True
         F1    F4_diff2  -158.6981  0.001  -193.1218  -124.2744   True
         F1          F5  4371.3256  0.001  4336.9019  4405.7493   True
         F1    F5_diff2   -123.193  0.001  -157.6166   -88.7693   True
         F1 disp(F4-F3)    760.904  0.001   726.4803   795.3276   True
         F1 disp(F5-F1)  3993.6432  0.001  3959.2195  4028.0669   True
      

In [8]:
# The results table stores its header as the first row and the comparisons after it.
word_table_rows=tukey_w._results_table.data
tukey_hsd_df=pd.DataFrame(data=word_table_rows[1:], columns=word_table_rows[0])

# Write the word comparison table next to the other results.
word_csv_path='/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Result/New_without_CNN/word'+save_specifier+'.csv'
tukey_hsd_df.to_csv(word_csv_path)