In [1]:
import os,librosa
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers as opt
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import losses
import pandas as pd
from sklearn.metrics import precision_score,recall_score,f1_score, cohen_kappa_score

In [2]:
# Experiment configuration shared by the cells below.
seed = 42      # random_state used for the StratifiedKFold shuffling
n_MFCC = 13    # number of MFCC coefficients requested from librosa
n_epochs = 50  # training epochs per cross-validation fold
# Feature column headers: MFCC_1 ... MFCC_<n_MFCC>
column_names = ['MFCC_{}'.format(idx) for idx in range(1, n_MFCC + 1)]

In [3]:
# Root folder of the vowel recordings; get_all_mfcc() lists its entries and uses each one as a class label.
# NOTE(review): absolute, machine-specific path - must be adjusted before running on another machine.
DATA_PATH="/home/rakibul/WORK/Work/Thesis/Speech_Recognition/Vowel_Word/GitHub Repo/Vowel_Data/Vowel/"

In [4]:
def get_mfcc(file_path):
    """Compute the MFCC features of a single audio file.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.

    Returns
    -------
    pandas.DataFrame
        The transposed MFCC matrix, with one column per coefficient named
        after the module-level ``column_names`` (``n_MFCC`` columns).
    """
    # mono=True down-mixes to one channel; sr=None asks librosa not to resample.
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    # Pass the signal by keyword: recent librosa releases no longer accept
    # the audio array as a positional argument.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_MFCC)
    # Transpose so that the coefficients become the columns of the frame.
    return pd.DataFrame(mfcc.T, columns=column_names)

def get_all_mfcc(path=DATA_PATH):
    """Collect the MFCC rows of every audio file below ``path``.

    Each sub-directory of ``path`` is treated as one class. Classes are
    numbered by their position in the alphabetically sorted list of
    sub-directories.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per label. The default is the
        value ``DATA_PATH`` had when this function was defined (Python binds
        default arguments at definition time).

    Returns
    -------
    pandas.DataFrame
        All MFCC rows returned by ``get_mfcc`` for every file, plus an
        integer ``label`` column. An empty DataFrame if no files were found.
    """
    # Ignore stray files (e.g. checkpoints, OS metadata) next to the label folders;
    # os.listdir() on them would raise.
    labels = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    label_frames = []  # one DataFrame per label, concatenated once at the end

    for label_index, label in enumerate(labels):
        label_dir = os.path.join(path, label)
        speech_files = [os.path.join(label_dir, file) for file in sorted(os.listdir(label_dir))]

        # Gather the per-file frames in a list; DataFrame.append was removed in
        # pandas 2.0 and growing a frame inside a loop is quadratic anyway.
        file_frames = [
            get_mfcc(file_path=file)
            for file in tqdm(speech_files, desc="Reading Speech of label -'{}'".format(label))
        ]
        if not file_frames:
            # Empty label folder: nothing to add (pd.concat rejects an empty list).
            continue

        folder_mfcc = pd.concat(file_frames, ignore_index=True)
        folder_mfcc['label'] = label_index  # integer-encoded class label
        label_frames.append(folder_mfcc)

    if not label_frames:
        return pd.DataFrame()
    return pd.concat(label_frames, ignore_index=True)

def get_train_test():
    """Build the standardized feature matrix and the label vector.

    Returns
    -------
    X : numpy.ndarray
        MFCC columns of ``get_all_mfcc()``, z-scored column by column.
    y : numpy.ndarray
        Values of the ``label`` column, one per row of ``X``.
    """
    frame_table = get_all_mfcc()

    y = frame_table['label'].values

    features = frame_table.drop(columns=['label'])
    # NOTE(review): mean/std are computed over every row returned here, i.e.
    # before any train/test split is made by the caller.
    column_mean = features.mean()
    column_std = features.std()
    X = ((features - column_mean) / column_std).values

    assert X.shape[0] == len(y)
    return X, y

In [5]:
# Load the vowel recordings: X is the standardized MFCC matrix, y the integer class labels.
X,y=get_train_test()

Reading Speech of label -'vowel1': 100%|██████████| 40/40 [00:00<00:00, 56.10it/s]
Reading Speech of label -'vowel2': 100%|██████████| 40/40 [00:00<00:00, 115.32it/s]
Reading Speech of label -'vowel3': 100%|██████████| 40/40 [00:00<00:00, 74.37it/s]
Reading Speech of label -'vowel4': 100%|██████████| 40/40 [00:00<00:00, 84.78it/s] 
Reading Speech of label -'vowel5': 100%|██████████| 40/40 [00:00<00:00, 59.67it/s]
Reading Speech of label -'vowel6': 100%|██████████| 40/40 [00:00<00:00, 75.43it/s]
Reading Speech of label -'vowel7': 100%|██████████| 40/40 [00:00<00:00, 56.41it/s]


In [6]:
# 4-fold stratified cross-validation of a small dense network on the vowel data.
# A fresh model is trained for every fold; accuracy and AUC come from Keras,
# weighted F1 and Cohen's kappa from scikit-learn.
# NOTE(review): `seed` only fixes the fold assignment; the Keras weight
# initialisation is not seeded, so exact scores may vary between runs.
# NOTE(review): the folds split the rows of X. If each recording contributes
# several rows (get_all_mfcc concatenates per-file frames), rows of the same
# recording can land in both train and test - confirm this is intended.
kfold=StratifiedKFold(n_splits=4,shuffle=True,random_state=seed)
cv_acc=[]
cv_AUC=[]
cv_f1=[]
cv_kappa=[]

# The one-hot targets do not depend on the fold, so encode them once.
y_hot=to_categorical(y)

for train,test in kfold.split(X,y):
    model_v=Sequential()
    model_v.add(Dense(64, activation='tanh',input_shape=(X.shape[1],)))
    model_v.add(Dense(32, activation='tanh'))
    model_v.add(Dense(16, activation='tanh'))
    # One output unit per class, derived from the one-hot encoding instead of a literal.
    model_v.add(Dense(y_hot.shape[1], activation='softmax'))

    model_v.compile(
        optimizer=opt.Adam(learning_rate=0.005),
        loss='categorical_crossentropy',
        metrics=['accuracy','AUC']
    )
    # batch_size equals the training-set size: one full-batch update per epoch.
    model_v.fit(X[train], y_hot[train], epochs=n_epochs, batch_size=len(train),verbose=0)

    # scores = [loss, accuracy, auc]
    scores=model_v.evaluate(X[test],y_hot[test],verbose=0)

    print("%s: %.2f" % (model_v.metrics_names[1],scores[1]))
    print("%s: %.2f" % (model_v.metrics_names[2],scores[2]))

    # predict_classes() is deprecated/removed; argmax over the softmax output is its replacement.
    yhat_classes = np.argmax(model_v.predict(X[test], verbose=0), axis=-1)

    f1 = f1_score(y[test], yhat_classes,average='weighted')
    print('F1: %.2f' % f1)

    kappa = cohen_kappa_score(y[test], yhat_classes)
    print('Cohens kappa: %.2f' % kappa)

    cv_acc.append(scores[1])
    cv_AUC.append(scores[2])
    cv_f1.append(f1)
    cv_kappa.append(kappa)

    print(' ')

print("Overall acc: %.2f (+/- %.2f)" % (np.mean(cv_acc),np.std(cv_acc)))
print("Overall AUC: %.2f (+/- %.2f)" % (np.mean(cv_AUC),np.std(cv_AUC)))
print("Overall f1: %.2f (+/- %.2f)" % (np.mean(cv_f1),np.std(cv_f1)))
print("Overall kappa: %.2f (+/- %.2f)" % (np.mean(cv_kappa),np.std(cv_kappa)))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
accuracy: 0.80
auc: 0.97
Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
F1: 0.79
Cohens kappa: 0.76
 
accuracy: 0.79
auc: 0.97
F1: 0.79
Cohens kappa: 0.76
 
accuracy: 0.80
auc: 0.97
F1: 0.79
Cohens kappa: 0.76
 
accuracy: 0.78
auc: 0.97
F1: 0.77
Cohens kappa: 0.74
 
Overall acc: 0.79 (+/- 0.01)
Overall AUC: 0.97 (+/- 0.00)
Overall f1: 0.79 (+/- 0.01)
Overall kappa: 0.76 (+/- 0.01)


In [7]:
# Suffix for the result file names: encodes preprocessing, MFCC count,
# layer sizes, activations, optimizer/learning rate, batch mode and epochs.
save_specifier = 'standard_{}mfcc_64_32_16_tanh_softmax_adam_0.005_batchfull{}epocs'.format(n_MFCC, n_epochs)

In [8]:
# Store the per-fold vowel scores (one row per fold) as a CSV file.
cv_data_df = pd.DataFrame(
    dict(acc=cv_acc, AUC=cv_AUC, F1=cv_f1, Kappa=cv_kappa)
)
cv_data_df.to_csv(
    '/home/rakibul/WORK/Work/Thesis/Speech_Recognition/Vowel_Word/GitHub Repo/Result/New_without_CNN/vowel_cv_data_df_'
    + save_specifier
    + '.csv'
)

In [9]:
# WORD: the cells below repeat the same pipeline on the word recordings.

In [10]:
# Switch the data root to the word recordings.
# The loader functions are re-defined in the next cell because their default
# `path` argument captured the previous DATA_PATH value at definition time.
# NOTE(review): absolute, machine-specific path - must be adjusted before running on another machine.
DATA_PATH="/home/rakibul/WORK/Work/Thesis/Speech_Recognition/Vowel_Word/GitHub Repo/Word_Data/Word/"

In [11]:
def get_mfcc(file_path):
    """Compute the MFCC features of a single audio file.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.

    Returns
    -------
    pandas.DataFrame
        The transposed MFCC matrix, with one column per coefficient named
        after the module-level ``column_names`` (``n_MFCC`` columns).
    """
    # mono=True down-mixes to one channel; sr=None asks librosa not to resample.
    wave, sr = librosa.load(file_path, mono=True, sr=None)
    # Pass the signal by keyword: recent librosa releases no longer accept
    # the audio array as a positional argument.
    mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_MFCC)
    # Transpose so that the coefficients become the columns of the frame.
    return pd.DataFrame(mfcc.T, columns=column_names)

def get_all_mfcc(path=DATA_PATH):
    """Collect the MFCC rows of every audio file below ``path``.

    Each sub-directory of ``path`` is treated as one class. Classes are
    numbered by their position in the alphabetically sorted list of
    sub-directories.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per label. The default is the
        value ``DATA_PATH`` had when this function was defined (Python binds
        default arguments at definition time).

    Returns
    -------
    pandas.DataFrame
        All MFCC rows returned by ``get_mfcc`` for every file, plus an
        integer ``label`` column. An empty DataFrame if no files were found.
    """
    # Ignore stray files (e.g. checkpoints, OS metadata) next to the label folders;
    # os.listdir() on them would raise.
    labels = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    label_frames = []  # one DataFrame per label, concatenated once at the end

    for label_index, label in enumerate(labels):
        label_dir = os.path.join(path, label)
        speech_files = [os.path.join(label_dir, file) for file in sorted(os.listdir(label_dir))]

        # Gather the per-file frames in a list; DataFrame.append was removed in
        # pandas 2.0 and growing a frame inside a loop is quadratic anyway.
        file_frames = [
            get_mfcc(file_path=file)
            for file in tqdm(speech_files, desc="Reading Speech of label -'{}'".format(label))
        ]
        if not file_frames:
            # Empty label folder: nothing to add (pd.concat rejects an empty list).
            continue

        folder_mfcc = pd.concat(file_frames, ignore_index=True)
        folder_mfcc['label'] = label_index  # integer-encoded class label
        label_frames.append(folder_mfcc)

    if not label_frames:
        return pd.DataFrame()
    return pd.concat(label_frames, ignore_index=True)

def get_train_test():
    """Build the standardized feature matrix and the label vector.

    Returns
    -------
    X : numpy.ndarray
        MFCC columns of ``get_all_mfcc()``, z-scored column by column.
    y : numpy.ndarray
        Values of the ``label`` column, one per row of ``X``.
    """
    frame_table = get_all_mfcc()

    y = frame_table['label'].values

    features = frame_table.drop(columns=['label'])
    # NOTE(review): mean/std are computed over every row returned here, i.e.
    # before any train/test split is made by the caller.
    column_mean = features.mean()
    column_std = features.std()
    X = ((features - column_mean) / column_std).values

    assert X.shape[0] == len(y)
    return X, y

In [12]:
# Load the word recordings; this overwrites the X and y of the vowel experiment.
X,y=get_train_test()

Reading Speech of label -'Word1': 100%|██████████| 40/40 [00:00<00:00, 78.07it/s]
Reading Speech of label -'Word2': 100%|██████████| 40/40 [00:00<00:00, 85.78it/s]
Reading Speech of label -'Word3': 100%|██████████| 40/40 [00:00<00:00, 110.15it/s]
Reading Speech of label -'Word4': 100%|██████████| 40/40 [00:00<00:00, 77.98it/s]
Reading Speech of label -'Word5': 100%|██████████| 40/40 [00:00<00:00, 90.36it/s]
Reading Speech of label -'Word6': 100%|██████████| 40/40 [00:00<00:00, 94.29it/s] 
Reading Speech of label -'Word7': 100%|██████████| 40/40 [00:00<00:00, 115.45it/s]


In [13]:
# 4-fold stratified cross-validation of a small dense network on the word data.
# A fresh model is trained for every fold; accuracy and AUC come from Keras,
# weighted F1 and Cohen's kappa from scikit-learn.
# NOTE(review): `seed` only fixes the fold assignment; the Keras weight
# initialisation is not seeded, so exact scores may vary between runs.
# NOTE(review): the folds split the rows of X. If each recording contributes
# several rows (get_all_mfcc concatenates per-file frames), rows of the same
# recording can land in both train and test - confirm this is intended.
kfold=StratifiedKFold(n_splits=4,shuffle=True,random_state=seed)
cv_acc=[]
cv_AUC=[]
cv_f1=[]
cv_kappa=[]

# The one-hot targets do not depend on the fold, so encode them once.
y_hot=to_categorical(y)

for train,test in kfold.split(X,y):
    model_w=Sequential()
    model_w.add(Dense(64, activation='tanh',input_shape=(X.shape[1],)))
    model_w.add(Dense(32, activation='tanh'))
    model_w.add(Dense(16, activation='tanh'))
    # One output unit per class, derived from the one-hot encoding instead of a literal.
    model_w.add(Dense(y_hot.shape[1], activation='softmax'))

    model_w.compile(
        optimizer=opt.Adam(learning_rate=0.005),
        loss='categorical_crossentropy',
        metrics=['accuracy','AUC']
    )
    # batch_size equals the training-set size: one full-batch update per epoch.
    model_w.fit(X[train], y_hot[train], epochs=n_epochs, batch_size=len(train),verbose=0)

    # scores = [loss, accuracy, auc]
    scores=model_w.evaluate(X[test],y_hot[test],verbose=0)

    print("%s: %.2f" % (model_w.metrics_names[1],scores[1]))
    print("%s: %.2f" % (model_w.metrics_names[2],scores[2]))

    # predict_classes() is deprecated/removed; argmax over the softmax output is its replacement.
    yhat_classes = np.argmax(model_w.predict(X[test], verbose=0), axis=-1)

    f1 = f1_score(y[test], yhat_classes,average='weighted')
    print('F1: %.2f' % f1)

    kappa = cohen_kappa_score(y[test], yhat_classes)
    print('Cohens kappa: %.2f' % kappa)

    cv_acc.append(scores[1])
    cv_AUC.append(scores[2])
    cv_f1.append(f1)
    cv_kappa.append(kappa)

    print(' ')

print("Overall acc: %.2f (+/- %.2f)" % (np.mean(cv_acc),np.std(cv_acc)))
print("Overall AUC: %.2f (+/- %.2f)" % (np.mean(cv_AUC),np.std(cv_AUC)))
print("Overall f1: %.2f (+/- %.2f)" % (np.mean(cv_f1),np.std(cv_f1)))
print("Overall kappa: %.2f (+/- %.2f)" % (np.mean(cv_kappa),np.std(cv_kappa)))

accuracy: 0.55
auc: 0.88
F1: 0.54
Cohens kappa: 0.47
 
accuracy: 0.57
auc: 0.89
F1: 0.56
Cohens kappa: 0.50
 
accuracy: 0.55
auc: 0.89
F1: 0.54
Cohens kappa: 0.47
 
accuracy: 0.54
auc: 0.88
F1: 0.54
Cohens kappa: 0.47
 
Overall acc: 0.55 (+/- 0.01)
Overall AUC: 0.89 (+/- 0.00)
Overall f1: 0.55 (+/- 0.01)
Overall kappa: 0.48 (+/- 0.01)


In [14]:
# Store the per-fold word scores (one row per fold) as a CSV file.
cv_data_df = pd.DataFrame(
    dict(acc=cv_acc, AUC=cv_AUC, F1=cv_f1, Kappa=cv_kappa)
)
cv_data_df.to_csv(
    '/home/rakibul/WORK/Work/Thesis/Speech_Recognition/Vowel_Word/GitHub Repo/Result/New_without_CNN/word_cv_data_df_'
    + save_specifier
    + '.csv'
)

In [16]:
# Print the layer/parameter overview of the vowel network.
# model_v is re-created in every CV fold, so this is the model of the last vowel fold.
model_v.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 64)                896       
_________________________________________________________________
dense_13 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_14 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_15 (Dense)             (None, 7)                 119       
Total params: 3,623
Trainable params: 3,623
Non-trainable params: 0
_________________________________________________________________
