In [53]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers as opt
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import losses
import pandas as pd
#from sklearn.metrics import precision_score,recall_score,f1_score, cohen_kappa_score
from sklearn.metrics import roc_curve,auc,roc_auc_score

In [4]:
DATA_PATH="./Vowel_Data/Formant/"

In [5]:
seed=42

In [7]:
def get_formant(file_path):
    formant=pd.read_csv(file_path,
                delim_whitespace=True,
                header=None,
                names=["time","F1","F2","F3","F4","F5"],
                dtype=np.float64,
                na_values="--undefined--")
    return formant

def get_all_formants(path=DATA_PATH):
    labels=labels=sorted(os.listdir(path))
    all_formant=pd.DataFrame() #empty dataframe to store all_formants
    
    for label_index,label in enumerate(labels):
        folder_formant=pd.DataFrame() #empty dataframe to store formants of indivuals vowels/words i.e. folders

        formant_files=[path + label + '/' + file for file in sorted(os.listdir(path+'/'+label))]
        for file in tqdm(formant_files,"Reading Formants of label -'{}'".format(label)):
            #print(file)
            individual_formant=get_formant(file_path=file)
  
            length=len(individual_formant)
            F_i=individual_formant.loc[2:,'F1':'F5'].values
            F_i1=individual_formant.loc[1:length-2,'F1':'F5'].values
            F_i2=individual_formant.loc[:length-3,'F1':'F5'].values
            assert len(F_i)==len(F_i1)==len(F_i2)
            individual_formant.loc[2:,['F1_diff2','F2_diff2','F3_diff2','F4_diff2','F5_diff2']]=F_i-2*F_i1+F_i2
            
            individual_formant=individual_formant.dropna()   #drop the first two rows as there are NaN values due to the difference
            
            folder_formant=folder_formant.append(individual_formant,ignore_index=True)
        
        folder_formant['disp(F5-F1)']=folder_formant['F5']-folder_formant['F1']
        folder_formant['disp(F4-F3)']=folder_formant['F4']-folder_formant['F3']
        folder_formant['disp(F5-F3)']=folder_formant['F5']-folder_formant['F3']
        folder_formant['disp(F5-F4)']=folder_formant['F5']-folder_formant['F4']
        
        folder_formant['label']=label_index  #new columns for encoding label
        
        all_formant=all_formant.append(folder_formant,ignore_index=True)     
    return all_formant

def get_train_test():
    all_formants=get_all_formants()
    
    y=all_formants['label'].values
 
    only_formants=all_formants.drop(labels=['time','label'],axis=1)
    
    standard_formants=(only_formants-only_formants.mean())/only_formants.std()
    
    X=standard_formants.values   

    assert X.shape[0] == len(y)
    return X,y

In [8]:
get_all_formants()

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 63.87it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 97.72it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 104.40it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 68.31it/s] 
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 114.80it/s]
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 105.10it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 112.25it/s]


Unnamed: 0,time,F1,F2,F3,F4,F5,F1_diff2,F2_diff2,F3_diff2,F4_diff2,F5_diff2,disp(F5-F1),disp(F4-F3),disp(F5-F3),disp(F5-F4),label
0,0.038,670.76,1514.62,2428.13,3774.71,4719.93,-20.25,-21.72,96.00,-2.56,-64.24,4049.17,1346.58,2291.80,945.22,0
1,0.044,666.49,1524.23,2439.99,3764.21,4651.84,23.46,-8.50,-49.74,-63.77,-1.82,3985.35,1324.22,2211.85,887.63,0
2,0.050,667.40,1524.64,2409.82,3699.78,4696.84,5.18,-9.20,-42.03,-53.93,113.09,4029.44,1289.96,2287.02,997.06,0
3,0.056,670.96,1467.74,2412.33,3627.49,4665.41,2.65,-57.31,32.68,-7.86,-76.43,3994.45,1215.16,2253.08,1037.92,0
4,0.062,672.48,1450.46,2466.67,3667.72,4667.02,-2.04,39.62,51.83,112.52,33.04,3994.54,1201.05,2200.35,999.30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13596,0.331,314.53,2646.94,3283.76,3986.25,4475.36,-12.97,-24.04,-230.75,69.50,9.75,4160.83,702.49,1191.60,489.11,6
13597,0.338,300.73,2592.58,3240.90,4019.52,4490.93,-0.05,-57.83,204.80,-21.91,-42.54,4190.20,778.62,1250.03,471.41,6
13598,0.344,291.53,2523.56,3304.41,3979.67,4419.76,4.60,-14.66,106.37,-73.12,-86.74,4128.23,675.26,1115.35,440.09,6
13599,0.350,293.56,2454.07,3125.72,3922.58,4395.65,11.23,-0.47,-242.20,-17.24,47.06,4102.09,796.86,1269.93,473.07,6


In [9]:
X,y=get_train_test()

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 124.53it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 141.23it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 118.83it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 143.99it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 96.67it/s] 
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 114.85it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 137.82it/s]


In [10]:
y

array([0, 0, 0, ..., 6, 6, 6])

In [50]:
kfold=StratifiedKFold(n_splits=4,shuffle=True,random_state=seed)
cv_AUC=[]

for train,test in kfold.split(X,y):
    y_hot=to_categorical(y)
    
    model_v=Sequential()
    model_v.add(Dense(32, activation='tanh',input_shape=(X[train].shape[1],)))
    #model_v.add(Dense(32, activation='tanh'))
    model_v.add(Dense(16, activation='tanh'))
    model_v.add(Dense(7, activation='softmax'))
    
    model_v.compile(
        optimizer=opt.Adam(learning_rate=0.005),
        loss='categorical_crossentropy',
        metrics=[['accuracy','AUC']]
    )
    model_v.fit(X[train], y_hot[train], epochs=50, batch_size=X[train].shape[0],verbose=0)
    
    scores=model_v.evaluate(X[test],y_hot[test],verbose=0)
    cv_AUC.append(scores[2])
    print("%s: %.2f" % (model_v.metrics_names[2],scores[2]))
    
    yhat = model_v.predict(X[test], verbose=0)
    yhat_prob=np.argmax(yhat,axis=1)
    
    auc_score=[]
    for p in range(7):
        fpr, tpr, thresholds = roc_curve(y[test], yhat_prob, pos_label=p)
        auc_score.append(auc(fpr, tpr))
    print(auc_score)
    #print(X[test])
    #print(y[test])
    print(' ')
    
print("Overall AUC: %.2f (+/- %.2f)" % (np.mean(cv_AUC),np.std(cv_AUC)))

auc: 0.91
[0.1874178253404855, 0.12558598256543593, 0.42008801204607094, 0.5233951893691748, 0.6217325538016741, 0.8152847294036538, 0.6982497981258569]
 
auc: 0.90
[0.1796630772256329, 0.13263833374617184, 0.4726506455581613, 0.514978550024137, 0.6529227776533811, 0.8096027586206896, 0.6265344617187585]
 
auc: 0.90
[0.1363174844428387, 0.15383492959057005, 0.4584931116696421, 0.509919960455522, 0.6050689209957889, 0.8007537931034482, 0.7195785488501113]
 
auc: 0.91
[0.2054995777564407, 0.12763889975617335, 0.44185218570891666, 0.5229415007889884, 0.6373064991139187, 0.8110900000000001, 0.6487671891967444]
 
Overall AUC: 0.91 (+/- 0.00)


In [54]:
    y_hot=to_categorical(y)
    
    model_v=Sequential()
    model_v.add(Dense(32, activation='tanh',input_shape=(X.shape[1],)))
    #model_v.add(Dense(32, activation='tanh'))
    model_v.add(Dense(16, activation='tanh'))
    model_v.add(Dense(7, activation='softmax'))
    
    model_v.compile(
        optimizer=opt.Adam(learning_rate=0.005),
        loss='categorical_crossentropy',
        metrics=[['accuracy','AUC']]
    )
    model_v.fit(X, y_hot, epochs=50, batch_size=X.shape[0],verbose=0)
    
    #scores=model_v.evaluate(X[test],y_hot,verbose=0)
    #cv_AUC.append(scores[2])
    #print("%s: %.2f" % (model_v.metrics_names[2],scores[2]))
    
    yhat = model_v.predict(X, verbose=0)
    yhat_prob=np.argmax(yhat,axis=1)
    
    auc_score=[]
    for p in range(7):
        fpr, tpr, thresholds = roc_curve(y, yhat_prob, pos_label=p)
        auc_score.append(auc(fpr, tpr))
    print(auc_score)
    roc_auc_score(y,yhat_prob)
    #print(X[test])
    #print(y[test])
    print(' ')
    
#print("Overall AUC: %.2f (+/- %.2f)" % (np.mean(cv_AUC),np.std(cv_AUC)))

[0.15922501566598973, 0.15088168947751818, 0.4398112243750582, 0.5184491592491148, 0.6312579232023412, 0.813683777260581, 0.6721208983017265]


ValueError: multi_class must be in ('ovo', 'ovr')

In [48]:
for i in range(7):
    print(i)

0
1
2
3
4
5
6


In [32]:
np.argmax(yhat_prob,axis=1)

array([1, 1, 1, ..., 2, 2, 2])

In [21]:
auc_score

0.13588311548246162

In [22]:
yhat_classes

array([1, 0, 1, ..., 2, 2, 2])

In [97]:
save_specifier='only_5formants_32_16_tanh_softmax_adam_005_batchfull_seed42'

In [98]:
cv_data_df=pd.DataFrame(data={'acc':cv_acc,
                              'AUC':cv_AUC,
                              'F1':cv_f1,
                              'Kappa':cv_kappa}
                       )
cv_data_df.to_csv('./New_without_CNN/vowel_cv_data_df_'+save_specifier+'.csv')

In [99]:
DATA_PATH="./Word_Data/Formant/"

In [100]:
def get_formant(file_path):
    formant=pd.read_csv(file_path,
                delim_whitespace=True,
                header=None,
                names=["time","F1","F2","F3","F4","F5"],
                dtype=np.float64,
                na_values="--undefined--")
    return formant

def get_all_formants(path=DATA_PATH):
    labels=labels=sorted(os.listdir(path))
    all_formant=pd.DataFrame() #empty dataframe to store all_formants
    
    for label_index,label in enumerate(labels):
        folder_formant=pd.DataFrame() #empty dataframe to store formants of indivuals vowels/words i.e. folders

        formant_files=[path + label + '/' + file for file in sorted(os.listdir(path+'/'+label))]
        for file in tqdm(formant_files,"Reading Formants of label -'{}'".format(label)):
            #print(file)
            individual_formant=get_formant(file_path=file)
  
            length=len(individual_formant)
            F_i=individual_formant.loc[2:,'F1':'F5'].values
            F_i1=individual_formant.loc[1:length-2,'F1':'F5'].values
            F_i2=individual_formant.loc[:length-3,'F1':'F5'].values
            assert len(F_i)==len(F_i1)==len(F_i2)
            #individual_formant.loc[2:,['F1_diff2','F2_diff2','F3_diff2','F4_diff2','F5_diff2']]=F_i-2*F_i1+F_i2
            
            individual_formant=individual_formant.dropna()   #drop the first two rows as there are NaN values due to the difference
            
            folder_formant=folder_formant.append(individual_formant,ignore_index=True)
        
        #folder_formant['disp(F5-F1)']=folder_formant['F5']-folder_formant['F1']
        #folder_formant['disp(F4-F3)']=folder_formant['F4']-folder_formant['F3']
        #folder_formant['disp(F5-F3)']=folder_formant['F5']-folder_formant['F3']
        #folder_formant['disp(F5-F4)']=folder_formant['F5']-folder_formant['F4']
        
        folder_formant['label']=label_index  #new columns for encoding label
        
        all_formant=all_formant.append(folder_formant,ignore_index=True)     
    return all_formant

def get_train_test():
    all_formants=get_all_formants()
    
    y=all_formants['label'].values
 
    only_formants=all_formants.drop(labels=['time','label'],axis=1)
    
    standard_formants=(only_formants-only_formants.mean())/only_formants.std()
    
    X=standard_formants.values   

    assert X.shape[0] == len(y)
    return X,y

In [101]:
get_all_formants()

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 208.10it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 233.36it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 177.91it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 229.46it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 226.71it/s]
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 219.46it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 223.08it/s]


Unnamed: 0,time,F1,F2,F3,F4,F5,label
0,0.025,0.00,247.87,1699.69,2612.09,4240.09,0
1,0.031,0.00,281.54,1867.43,2664.88,4235.19,0
2,0.038,0.00,307.27,2010.72,2676.97,4235.34,0
3,0.044,0.00,312.13,1980.29,2661.43,4244.88,0
4,0.050,0.00,312.40,1804.43,2631.69,4248.25,0
...,...,...,...,...,...,...,...
19682,0.388,426.76,1907.88,2593.99,3717.64,4051.64,6
19683,0.394,427.44,1953.37,2824.18,3329.49,4139.11,6
19684,0.400,420.76,1962.80,2635.39,3325.78,4159.54,6
19685,0.406,380.70,1958.74,2398.14,3586.44,4098.24,6


In [102]:
X,y=get_train_test()

Reading Formants of label -'Formant1': 100%|██████████| 40/40 [00:00<00:00, 175.71it/s]
Reading Formants of label -'Formant2': 100%|██████████| 40/40 [00:00<00:00, 216.97it/s]
Reading Formants of label -'Formant3': 100%|██████████| 40/40 [00:00<00:00, 188.00it/s]
Reading Formants of label -'Formant4': 100%|██████████| 40/40 [00:00<00:00, 172.70it/s]
Reading Formants of label -'Formant5': 100%|██████████| 40/40 [00:00<00:00, 173.23it/s]
Reading Formants of label -'Formant6': 100%|██████████| 40/40 [00:00<00:00, 222.42it/s]
Reading Formants of label -'Formant7': 100%|██████████| 40/40 [00:00<00:00, 238.39it/s]


In [103]:
kfold=StratifiedKFold(n_splits=4,shuffle=True,random_state=seed)
cv_acc=[]
cv_AUC=[]
cv_f1=[]
cv_kappa=[]

for train,test in kfold.split(X,y):
    y_hot=to_categorical(y)
    
    model_w=Sequential()
    model_w.add(Dense(32, activation='tanh',input_shape=(X[train].shape[1],)))
    #model_w.add(Dense(32, activation='tanh'))
    model_w.add(Dense(16, activation='tanh'))
    model_w.add(Dense(7, activation='softmax'))
    
    model_w.compile(
        optimizer=opt.Adam(learning_rate=0.005),
        loss='categorical_crossentropy',
        metrics=[['accuracy','AUC']]
    )
    model_w.fit(X[train], y_hot[train], epochs=50, batch_size=X[train].shape[0],verbose=0)
    
    scores=model_w.evaluate(X[test],y_hot[test],verbose=0)
    
    print("%s: %.2f" % (model_w.metrics_names[1],scores[1]))
    print("%s: %.2f" % (model_w.metrics_names[2],scores[2]))
    
    yhat_classes = model_w.predict_classes(X[test], verbose=0)
    
    f1 = f1_score(y[test], yhat_classes,average='weighted')
    print('F1: %.2f' % f1)
    
    kappa = cohen_kappa_score(y[test], yhat_classes)
    print('Cohens kappa: %.2f' % kappa)
    
    cv_acc.append(scores[1])
    cv_AUC.append(scores[2])
    cv_f1.append(f1)
    cv_kappa.append(kappa)
    
    print(' ')
    
print("Overall acc: %.2f (+/- %.2f)" % (np.mean(cv_acc),np.std(cv_acc)))
print("Overall AUC: %.2f (+/- %.2f)" % (np.mean(cv_AUC),np.std(cv_AUC)))
print("Overall f1: %.2f (+/- %.2f)" % (np.mean(cv_f1),np.std(cv_f1)))
print("Overall kappa: %.2f (+/- %.2f)" % (np.mean(cv_kappa),np.std(cv_kappa)))

accuracy: 0.38
auc: 0.76
F1: 0.33
Cohens kappa: 0.27
 
accuracy: 0.35
auc: 0.76
F1: 0.30
Cohens kappa: 0.24
 
accuracy: 0.36
auc: 0.77
F1: 0.32
Cohens kappa: 0.25
 
accuracy: 0.35
auc: 0.75
F1: 0.30
Cohens kappa: 0.23
 
Overall acc: 0.36 (+/- 0.01)
Overall AUC: 0.76 (+/- 0.00)
Overall f1: 0.31 (+/- 0.01)
Overall kappa: 0.25 (+/- 0.01)


In [104]:
cv_data_df=pd.DataFrame(data={'acc':cv_acc,
                              'AUC':cv_AUC,
                              'F1':cv_f1,
                              'Kappa':cv_kappa}
                       )
cv_data_df.to_csv('./New_without_CNN/Word_cv_data_df_'+save_specifier+'.csv')