In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import researchpy as rp
import scipy.stats as stats
import statsmodels.stats.multicomp as mc

In [110]:
DATA_PATH="/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Vowel_Data/Formant/"

In [111]:
def get_formant(file_path):
    formant=pd.read_csv(file_path,
                delim_whitespace=True,
                header=None,
                names=["time","F1","F2","F3","F4","F5"],
                dtype=np.float64,
                na_values="--undefined--")
    return formant

def get_all_formants(path=DATA_PATH):
    labels=labels=sorted(os.listdir(path))
    all_formant=pd.DataFrame() #empty dataframe to store all_formants
    
    for label_index,label in enumerate(labels):
        folder_formant=pd.DataFrame() #empty dataframe to store formants of indivuals vowels/words i.e. folders

        formant_files=[path + label + '/' + file for file in sorted(os.listdir(path+'/'+label))]
        for file in tqdm(formant_files,"Reading Formants of label -'{}'".format(label)):
            #print(file)
            individual_formant=get_formant(file_path=file)
  
            length=len(individual_formant)
            F_i=individual_formant.loc[2:,'F1':'F5'].values
            F_i1=individual_formant.loc[1:length-2,'F1':'F5'].values
            F_i2=individual_formant.loc[:length-3,'F1':'F5'].values
            assert len(F_i)==len(F_i1)==len(F_i2)
            individual_formant.loc[2:,['F1_diff2','F2_diff2','F3_diff2','F4_diff2','F5_diff2']]=np.abs(F_i-2*F_i1+F_i2)
            
            individual_formant=individual_formant.dropna()   #drop the first two rows as there are NaN values due to the difference
            
            folder_formant=folder_formant.append(individual_formant,ignore_index=True)
        
        folder_formant['disp(F5-F1)']=folder_formant['F5']-folder_formant['F1']
        folder_formant['disp(F4-F3)']=folder_formant['F4']-folder_formant['F3']
        folder_formant['disp(F5-F3)']=folder_formant['F5']-folder_formant['F3']
        folder_formant['disp(F5-F4)']=folder_formant['F5']-folder_formant['F4']
        
        folder_formant['label']=label_index  #new columns for encoding label
        
        all_formant=all_formant.append(folder_formant,ignore_index=True)  
        
        only_formant=all_formant.drop(labels=['time','label'],axis=1)
        
    return only_formant

In [112]:
vowel_formant_features=get_all_formants()
vowel_formant_features #print

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 74.65it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 98.97it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 91.88it/s] 
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 102.72it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 112.84it/s]
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 103.12it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 107.09it/s]


Unnamed: 0,F1,F2,F3,F4,F5,F1_diff2,F2_diff2,F3_diff2,F4_diff2,F5_diff2,disp(F5-F1),disp(F4-F3),disp(F5-F3),disp(F5-F4)
0,670.76,1514.62,2428.13,3774.71,4719.93,20.25,21.72,96.00,2.56,64.24,4049.17,1346.58,2291.80,945.22
1,666.49,1524.23,2439.99,3764.21,4651.84,23.46,8.50,49.74,63.77,1.82,3985.35,1324.22,2211.85,887.63
2,667.40,1524.64,2409.82,3699.78,4696.84,5.18,9.20,42.03,53.93,113.09,4029.44,1289.96,2287.02,997.06
3,670.96,1467.74,2412.33,3627.49,4665.41,2.65,57.31,32.68,7.86,76.43,3994.45,1215.16,2253.08,1037.92
4,672.48,1450.46,2466.67,3667.72,4667.02,2.04,39.62,51.83,112.52,33.04,3994.54,1201.05,2200.35,999.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13596,314.53,2646.94,3283.76,3986.25,4475.36,12.97,24.04,230.75,69.50,9.75,4160.83,702.49,1191.60,489.11
13597,300.73,2592.58,3240.90,4019.52,4490.93,0.05,57.83,204.80,21.91,42.54,4190.20,778.62,1250.03,471.41
13598,291.53,2523.56,3304.41,3979.67,4419.76,4.60,14.66,106.37,73.12,86.74,4128.23,675.26,1115.35,440.09
13599,293.56,2454.07,3125.72,3922.58,4395.65,11.23,0.47,242.20,17.24,47.06,4102.09,796.86,1269.93,473.07


In [7]:
#rp.summary_cont(vowel_formant_features)
# stats.f_oneway(vowel_formant_features['F1'].tolist(),
#                vowel_formant_features['F2'].tolist(),
#                vowel_formant_features['F3'].tolist(),
#                vowel_formant_features['F4'].tolist(),
#                vowel_formant_features['F5'].tolist(),
#                vowel_formant_features['F1_diff2'].tolist(),
#                vowel_formant_features['F2_diff2'].tolist(),
#                vowel_formant_features['F3_diff2'].tolist(),
#                vowel_formant_features['F4_diff2'].tolist(),
#                vowel_formant_features['F5_diff2'].tolist(),
#                vowel_formant_features['disp(F5-F1)'].tolist(),
#                vowel_formant_features['disp(F4-F3)'].tolist(),
#                vowel_formant_features['disp(F5-F3)'].tolist(),
#                vowel_formant_features['disp(F5-F4)'].tolist()
#               )

In [113]:
#appending all fourteen feature data in a single list one after another
data=[]
for i in range(14):
    data.extend(vowel_formant_features.iloc[:,i].tolist())

In [115]:
#corresponding to the above list, groups (labels) are being created. Every features has 13601 (vowel_formant_features.index) samples
group=np.repeat(['F1','F2','F3','F4','F5','F1''',"F2''","F3''","F4''","F5''",'F51','F43','F53','F54'], repeats=len(vowel_formant_features.index))

In [116]:
assert len(data)==len(group)

In [117]:
tukey = mc.pairwise_tukeyhsd(endog=data,groups=group)
print(tukey)

    Multiple Comparison of Means - Tukey HSD, FWER=0.05    
group1 group2  meandiff  p-adj   lower      upper    reject
-----------------------------------------------------------
    F1     F2   1289.513 0.001  1273.8261     1305.2   True
    F1   F2''   -85.5156 0.001  -101.2026   -69.8287   True
    F1     F3  2415.7242 0.001  2400.0372  2431.4112   True
    F1   F3''   -86.0052 0.001  -101.6922   -70.3182   True
    F1     F4  3406.7567 0.001  3391.0697  3422.4437   True
    F1   F4''   -91.1512 0.001  -106.8382   -75.4643   True
    F1    F43   777.0133 0.001   761.3263   792.7003   True
    F1     F5  4426.5829 0.001  4410.8959  4442.2699   True
    F1   F5''   -31.3895 0.001   -47.0765   -15.7025   True
    F1    F51  4038.8723 0.001  4023.1853  4054.5592   True
    F1    F53  1796.8395 0.001  1781.1525  1812.5265   True
    F1    F54    805.807 0.001     790.12   821.4939   True
    F2   F2'' -1375.0287 0.001 -1393.1424 -1356.9149   True
    F2     F3  1126.2111 0.001  1108.097

In [118]:
save_specifier='tukey_hsd_df_0.05'

In [119]:
tukey_hsd_df=pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
tukey_hsd_df.to_csv('/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Result/New_without_CNN/vowel_'+save_specifier+'.csv')

In [120]:
DATA_PATH="/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Word_Data/Formant/"

In [121]:
def get_formant(file_path):
    formant=pd.read_csv(file_path,
                delim_whitespace=True,
                header=None,
                names=["time","F1","F2","F3","F4","F5"],
                dtype=np.float64,
                na_values="--undefined--")
    return formant

def get_all_formants(path=DATA_PATH):
    labels=labels=sorted(os.listdir(path))
    all_formant=pd.DataFrame() #empty dataframe to store all_formants
    
    for label_index,label in enumerate(labels):
        folder_formant=pd.DataFrame() #empty dataframe to store formants of indivuals vowels/words i.e. folders

        formant_files=[path + label + '/' + file for file in sorted(os.listdir(path+'/'+label))]
        for file in tqdm(formant_files,"Reading Formants of label -'{}'".format(label)):
            #print(file)
            individual_formant=get_formant(file_path=file)
  
            length=len(individual_formant)
            F_i=individual_formant.loc[2:,'F1':'F5'].values
            F_i1=individual_formant.loc[1:length-2,'F1':'F5'].values
            F_i2=individual_formant.loc[:length-3,'F1':'F5'].values
            assert len(F_i)==len(F_i1)==len(F_i2)
            individual_formant.loc[2:,['F1_diff2','F2_diff2','F3_diff2','F4_diff2','F5_diff2']]=np.abs(F_i-2*F_i1+F_i2)
            
            individual_formant=individual_formant.dropna()   #drop the first two rows as there are NaN values due to the difference
            
            folder_formant=folder_formant.append(individual_formant,ignore_index=True)
        
        folder_formant['disp(F5-F1)']=folder_formant['F5']-folder_formant['F1']
        folder_formant['disp(F4-F3)']=folder_formant['F4']-folder_formant['F3']
        folder_formant['disp(F5-F3)']=folder_formant['F5']-folder_formant['F3']
        folder_formant['disp(F5-F4)']=folder_formant['F5']-folder_formant['F4']
        
        folder_formant['label']=label_index  #new columns for encoding label
        
        all_formant=all_formant.append(folder_formant,ignore_index=True)     
        only_formant=all_formant.drop(labels=['time','label'],axis=1)
        
    return only_formant

In [122]:
word_formant_features=get_all_formants()
word_formant_features

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 139.30it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 130.83it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 143.75it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 140.11it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 120.89it/s]
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 121.07it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 137.93it/s]


Unnamed: 0,F1,F2,F3,F4,F5,F1_diff2,F2_diff2,F3_diff2,F4_diff2,F5_diff2,disp(F5-F1),disp(F4-F3),disp(F5-F3),disp(F5-F4)
0,0.00,307.27,2010.72,2676.97,4235.34,0.00,7.94,24.45,40.70,5.05,4235.34,666.25,2224.62,1558.37
1,0.00,312.13,1980.29,2661.43,4244.88,0.00,20.87,173.72,27.63,9.39,4244.88,681.14,2264.59,1583.45
2,0.00,312.40,1804.43,2631.69,4248.25,0.00,4.59,145.43,14.20,6.17,4248.25,827.26,2443.82,1616.56
3,0.00,298.91,1612.50,2594.02,4241.80,0.00,13.76,16.07,7.93,9.82,4241.80,981.52,2629.30,1647.78
4,0.00,281.93,1290.49,2548.34,4234.39,0.00,3.49,130.08,8.01,0.96,4234.39,1257.85,2943.90,1686.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19122,426.76,1907.88,2593.99,3717.64,4051.64,15.79,30.36,165.73,236.89,100.84,3624.88,1123.65,1457.65,334.00
19123,427.44,1953.37,2824.18,3329.49,4139.11,5.11,16.25,152.80,197.89,44.99,3711.67,505.31,1314.93,809.62
19124,420.76,1962.80,2635.39,3325.78,4159.54,7.36,36.06,418.98,384.44,67.04,3738.78,690.39,1524.15,833.76
19125,380.70,1958.74,2398.14,3586.44,4098.24,33.38,13.49,48.46,264.37,81.73,3717.54,1188.30,1700.10,511.80


In [125]:
#appending all fourteen feature data in a single list one after another
data_w=[]
for i in range(14):
    data_w.extend(word_formant_features.iloc[:,i].tolist())
    
#corresponding to the above list, groups (labels) are being created.
group_w=np.repeat(['F1','F2','F3','F4','F5','F1''',"F2''","F3''","F4''","F5''",'F51','F43','F53','F54'], repeats=len(word_formant_features.index))
    
assert len(data_w)==len(group_w)

tukey_w = mc.pairwise_tukeyhsd(endog=data_w,groups=group_w)
print(tukey_w)

    Multiple Comparison of Means - Tukey HSD, FWER=0.05     
group1 group2  meandiff  p-adj    lower      upper    reject
------------------------------------------------------------
    F1     F2  1052.4352  0.001  1039.1404    1065.73   True
    F1   F2''   -28.5763  0.001   -41.8711   -15.2815   True
    F1     F3  2234.8151  0.001  2221.5203  2248.1099   True
    F1   F3''    -4.1574    0.9   -17.4522     9.1374  False
    F1     F4  3356.9195  0.001  3343.6247  3370.2143   True
    F1   F4''     -7.121 0.8478   -20.4157     6.1738  False
    F1    F43   873.2793  0.001   859.9845   886.5741   True
    F1     F5  4478.8665  0.001  4465.5717  4492.1613   True
    F1   F5''    35.0908  0.001     21.796    48.3856   True
    F1    F51  4090.7387  0.001  4077.4439  4104.0335   True
    F1    F53  1995.2263  0.001  1981.9315  2008.5211   True
    F1    F54   873.1218  0.001    859.827   886.4166   True
    F2   F2'' -1081.0115  0.001  -1096.363   -1065.66   True
    F2     F3  1182.3799

In [124]:
len(data_w)

267778

In [126]:
tukey_hsd_df=pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
tukey_hsd_df.to_csv('/home/rakibul/WORK/RESEARCH/Speech_Recognition/MAIN STUDY/GitHub Repo/Result/New_without_CNN/word_'+save_specifier+'.csv')