In [2]:
import os
import numpy as np 
import tensorflow as tf
import matplotlib.pyplot as plt
import pathlib
import librosa.display
from tqdm import tqdm
from sklearn import metrics
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

try:
    from spela.spectrogram import Spectrogram 
    from spela.melspectrogram import Melspectrogram
except:
    !pip install spela
    from spela.spectrogram import Spectrogram 
    from spela.melspectrogram import Melspectrogram
    
tf.compat.v1.disable_eager_execution()
data_dir = r"C:/Users/ASUS/OneDrive - BUET/Desktop/SR_DSP/TestData"

In [3]:
# get wav paths
def get_wav_paths(speaker):
    """Return the sorted ``.wav`` file names found in one speaker's folder.

    Args:
        speaker: Folder suffix appended verbatim to ``data_dir``, including
            the leading slash (e.g. ``"/ID_16"``).

    Returns:
        list[str]: File names (not full paths) whose extension is ``.wav``
        (case-insensitive), in sorted order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be read.
    """
    speaker_path = data_dir + speaker
    # os.listdir yields every entry of the folder in arbitrary order. Keep only
    # the WAV recordings, so stray files never reach the decoder, and sort them
    # so the listing is the same on every run.
    return sorted(
        entry for entry in os.listdir(speaker_path)
        if entry.lower().endswith(".wav")
    )

In [4]:
# One file-name listing per speaker folder. The names on the left line up
# position-by-position with the folder suffixes passed to get_wav_paths below.
(
    id_16_path, id_19_path, id_20_path, id_21_path, id_29_path,
    id_32_path, id_33_path, id_35_path, id_41_path, id_45_path,
    id_46_path, id_49_path, id_53_path, id_56_path, id_57_path,
    id_59_path, id_64_path, id_01_path, id_02_path, id_04_path,
    id_05_path, id_06_path, id_07_path, id_10_path, id_11_path,
    id_12_path, id_13_path, id_25_path, id_26_path, id_36_path,
    id_39_path, id_42_path, id_43_path, id_44_path, id_48_path,
    id_61_path, id_62_path, id_63_path,
) = map(get_wav_paths, (
    "/ID_16", "/ID_19", "/ID_20", "/ID_21", "/ID_29",
    "/ID_32", "/ID_33", "/ID_35", "/ID_41", "/ID_45",
    "/ID_46", "/ID_49", "/ID_53", "/ID_56", "/ID_57",
    "/ID_59", "/ID_64", "/ID_01", "/ID_02", "/ID_04",
    "/ID_05", "/ID_06", "/ID_07", "/ID_10", "/ID_11",
    "/ID_12", "/ID_13", "/ID_25", "/ID_26", "/ID_36",
    "/ID_39", "/ID_42", "/ID_43", "/ID_44", "/ID_48",
    "/ID_61", "/ID_62", "/ID_63",
))

In [5]:
# load the data
def load_wav(wav_path, speaker, num_samples=44100):
    """Decode one WAV file into a mono clip of fixed length.

    Args:
        wav_path: File name of the recording inside the speaker's folder.
        speaker: Folder suffix appended verbatim to ``data_dir``, including
            the leading slash (e.g. ``"/ID_16"``).
        num_samples: Number of samples in the returned clip. The default of
            44100 matches the ``(1, 44100)`` input shape used by create_model.

    Returns:
        numpy.ndarray: The decoded audio with shape ``(1, num_samples)``.
    """
    wav_path = data_dir + speaker + "/" + wav_path
    # A fresh graph per call keeps decode ops from piling up in one shared
    # graph; leaving the `with` block closes the session, so no explicit
    # sess.close() is needed.
    with tf.compat.v1.Session(graph=tf.compat.v1.Graph()) as sess:
        wav_filename_placeholder = tf.compat.v1.placeholder(tf.compat.v1.string, [])
        wav_loader = tf.io.read_file(wav_filename_placeholder)
        # desired_samples makes the decoder return exactly num_samples samples
        # (shorter recordings are zero-padded, longer ones cropped), so the
        # reshape below cannot fail on a clip that is not exactly that long.
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=num_samples)
        decoded = sess.run(
            wav_decoder, feed_dict={wav_filename_placeholder: wav_path})
    # NOTE(review): decoded.sample_rate is not checked; this assumes every
    # recording shares one sample rate - confirm for new data.
    return decoded.audio.reshape((1, num_samples))

In [6]:
# create training data
def generate_training_data(speaker_paths, speaker, label):
    """Decode all recordings of one speaker and pair each with its class label.

    Args:
        speaker_paths: File names of the speaker's recordings.
        speaker: Folder suffix handed on to ``load_wav`` (e.g. ``"/ID_16"``).
        label: Class label attached to every recording of this speaker.

    Returns:
        tuple[list, list]: The decoded clips and a list of equal length that
        repeats ``label`` once per clip.
    """
    # tqdm renders one progress bar per speaker while the clips are decoded.
    wavs = [load_wav(file_name, speaker) for file_name in tqdm(speaker_paths)]
    labels = [label] * len(wavs)
    return wavs, labels

In [7]:
# Decode the clips of every speaker. A speaker's class label is its position
# (0-37) in the list handed to enumerate(), and the (wavs, labels) pairs on the
# left line up position-by-position with that list.
(
    (id_32_wavs, id_32_labels), (id_33_wavs, id_33_labels),
    (id_35_wavs, id_35_labels), (id_41_wavs, id_41_labels),
    (id_45_wavs, id_45_labels), (id_16_wavs, id_16_labels),
    (id_19_wavs, id_19_labels), (id_20_wavs, id_20_labels),
    (id_21_wavs, id_21_labels), (id_29_wavs, id_29_labels),
    (id_46_wavs, id_46_labels), (id_49_wavs, id_49_labels),
    (id_53_wavs, id_53_labels), (id_56_wavs, id_56_labels),
    (id_57_wavs, id_57_labels), (id_59_wavs, id_59_labels),
    (id_64_wavs, id_64_labels), (id_01_wavs, id_01_labels),
    (id_02_wavs, id_02_labels), (id_04_wavs, id_04_labels),
    (id_05_wavs, id_05_labels), (id_06_wavs, id_06_labels),
    (id_07_wavs, id_07_labels), (id_10_wavs, id_10_labels),
    (id_11_wavs, id_11_labels), (id_12_wavs, id_12_labels),
    (id_13_wavs, id_13_labels), (id_25_wavs, id_25_labels),
    (id_26_wavs, id_26_labels), (id_36_wavs, id_36_labels),
    (id_39_wavs, id_39_labels), (id_42_wavs, id_42_labels),
    (id_43_wavs, id_43_labels), (id_44_wavs, id_44_labels),
    (id_48_wavs, id_48_labels), (id_61_wavs, id_61_labels),
    (id_62_wavs, id_62_labels), (id_63_wavs, id_63_labels),
) = [
    generate_training_data(file_names, speaker, label)
    for label, (speaker, file_names) in enumerate([
        ("/ID_32", id_32_path), ("/ID_33", id_33_path),
        ("/ID_35", id_35_path), ("/ID_41", id_41_path),
        ("/ID_45", id_45_path), ("/ID_16", id_16_path),
        ("/ID_19", id_19_path), ("/ID_20", id_20_path),
        ("/ID_21", id_21_path), ("/ID_29", id_29_path),
        ("/ID_46", id_46_path), ("/ID_49", id_49_path),
        ("/ID_53", id_53_path), ("/ID_56", id_56_path),
        ("/ID_57", id_57_path), ("/ID_59", id_59_path),
        ("/ID_64", id_64_path), ("/ID_01", id_01_path),
        ("/ID_02", id_02_path), ("/ID_04", id_04_path),
        ("/ID_05", id_05_path), ("/ID_06", id_06_path),
        ("/ID_07", id_07_path), ("/ID_10", id_10_path),
        ("/ID_11", id_11_path), ("/ID_12", id_12_path),
        ("/ID_13", id_13_path), ("/ID_25", id_25_path),
        ("/ID_26", id_26_path), ("/ID_36", id_36_path),
        ("/ID_39", id_39_path), ("/ID_42", id_42_path),
        ("/ID_43", id_43_path), ("/ID_44", id_44_path),
        ("/ID_48", id_48_path), ("/ID_61", id_61_path),
        ("/ID_62", id_62_path), ("/ID_63", id_63_path),
    ])
]

100%|██████████| 5/5 [00:00<00:00, 29.53it/s]
100%|██████████| 5/5 [00:00<00:00, 319.26it/s]
100%|██████████| 5/5 [00:00<00:00, 160.23it/s]
100%|██████████| 5/5 [00:00<00:00, 159.99it/s]
100%|██████████| 5/5 [00:00<00:00, 225.33it/s]
100%|██████████| 5/5 [00:00<00:00, 160.23it/s]
100%|██████████| 5/5 [00:00<00:00, 159.72it/s]
100%|██████████| 5/5 [00:00<00:00, 132.43it/s]
100%|██████████| 5/5 [00:00<00:00, 160.03it/s]
100%|██████████| 5/5 [00:00<00:00, 159.69it/s]
100%|██████████| 5/5 [00:00<00:00, 226.21it/s]
100%|██████████| 5/5 [00:00<00:00, 320.08it/s]
100%|██████████| 5/5 [00:00<00:00, 160.04it/s]
100%|██████████| 5/5 [00:00<00:00, 159.89it/s]
100%|██████████| 5/5 [00:00<00:00, 319.97it/s]
100%|██████████| 5/5 [00:00<00:00, 160.04it/s]
100%|██████████| 5/5 [00:00<00:00, 160.03it/s]
100%|██████████| 5/5 [00:00<00:00, 132.39it/s]
100%|██████████| 5/5 [00:00<00:00, 159.82it/s]
100%|██████████| 5/5 [00:00<00:00, 160.25it/s]
100%|██████████| 5/5 [00:00<00:00, 225.05it/s]
100%|█████████

In [8]:
# Flatten the per-speaker clip lists into one dataset. Both comprehensions walk
# the speakers in the same order, so all_wavs[i] and all_labels[i] belong together.
all_wavs = [
    wav
    for speaker_wavs in (
        id_32_wavs, id_33_wavs, id_35_wavs, id_41_wavs, id_45_wavs,
        id_16_wavs, id_19_wavs, id_20_wavs, id_21_wavs, id_29_wavs,
        id_46_wavs, id_49_wavs, id_53_wavs, id_56_wavs, id_57_wavs,
        id_59_wavs, id_64_wavs, id_01_wavs, id_02_wavs, id_04_wavs,
        id_05_wavs, id_06_wavs, id_07_wavs, id_10_wavs, id_11_wavs,
        id_12_wavs, id_13_wavs, id_25_wavs, id_26_wavs, id_36_wavs,
        id_39_wavs, id_42_wavs, id_43_wavs, id_44_wavs, id_48_wavs,
        id_61_wavs, id_62_wavs, id_63_wavs,
    )
    for wav in speaker_wavs
]

all_labels = [
    label
    for speaker_labels in (
        id_32_labels, id_33_labels, id_35_labels, id_41_labels, id_45_labels,
        id_16_labels, id_19_labels, id_20_labels, id_21_labels, id_29_labels,
        id_46_labels, id_49_labels, id_53_labels, id_56_labels, id_57_labels,
        id_59_labels, id_64_labels, id_01_labels, id_02_labels, id_04_labels,
        id_05_labels, id_06_labels, id_07_labels, id_10_labels, id_11_labels,
        id_12_labels, id_13_labels, id_25_labels, id_26_labels, id_36_labels,
        id_39_labels, id_42_labels, id_43_labels, id_44_labels, id_48_labels,
        id_61_labels, id_62_labels, id_63_labels,
    )
    for label in speaker_labels
]

In [9]:
# Display name of each speaker, indexed by class label (0-37). The numeric
# suffix of every entry is the speaker's folder ID.
# NOTE(review): label 23 belongs to folder ID_10 but this entry used to read
# "Plabon_07", duplicating the suffix of "Rafi_07"; corrected to "_10" on the
# assumption that it was a typo - confirm against the speaker roster.
name_list = [
    "Ayan_32", "Razon_33", "Abir_35", "Indronil_41", "Sourav_45",
    "Ananna_16", "Redwan_19", "Shafin_20", "Shovon_21", "Samdani_29",
    "Rasel_46", "Humayom_49", "Saleh_53", "Shihab_56", "Prithu_57",
    "Fatin_59", "Sadat_64", "Mrinmoy_01", "Elin_02", "Aroni_04",
    "Nabila_05", "Subah_06", "Rafi_07", "Plabon_10", "Saleah_11",
    "Sabbir_12", "Toiyob_13", "Tauhid_25", "Murad_26", "Tonmoy_36",
    "Tajwar_39", "Monirul_42", "Fariza_43", "Shuvro_44", "Tanvir_48",
    "Swadesh_61", "Imtiaz_62", "Tamim_63",
]

In [10]:
# Split the dataset into training and testing sets.
# stratify keeps every speaker represented on both sides of the split, and the
# fixed random_state makes the split repeatable after a kernel restart.
train_wavs, test_wavs, train_labels, test_labels = train_test_split(
    all_wavs, all_labels, test_size=0.2, stratify=all_labels, random_state=42)
train_x, train_y = np.array(train_wavs), np.array(train_labels)
test_x, test_y = np.array(test_wavs), np.array(test_labels)

# Pin the one-hot width to the full label range. Left to itself,
# to_categorical sizes the encoding from the largest label present in the
# array it is given, so a split that misses the highest label would produce
# targets narrower than the model's output layer.
num_classes = max(all_labels) + 1
train_y = tf.keras.utils.to_categorical(train_y, num_classes=num_classes)
test_y = tf.keras.utils.to_categorical(test_y, num_classes=num_classes)

In [11]:
# create a model
def create_model(speech_feature, num_classes=38, learning_rate=3e-4):
    """Build and compile a small CNN speaker classifier over raw 1-second audio.

    The first layer converts the ``(1, 44100)`` waveform into a spectrogram
    inside the model; a single Conv2D/MaxPool stage and a softmax layer follow.

    Args:
        speech_feature: Front-end layer to use, either ``"spectrogram"`` or
            ``"melspectrogram"``.
        num_classes: Number of units in the softmax output layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        tf.keras.Sequential: The compiled model (categorical cross-entropy
        loss, accuracy metric).

    Raises:
        ValueError: If ``speech_feature`` is not one of the supported names.
    """
    # Reject unknown names up front: without a front-end layer the model would
    # have no input shape and would only fail later with an unrelated error.
    if speech_feature not in ("spectrogram", "melspectrogram"):
        raise ValueError(
            "speech_feature must be 'spectrogram' or 'melspectrogram', got %r"
            % (speech_feature,))

    model = tf.keras.Sequential()
    if speech_feature == "spectrogram":
        model.add(Spectrogram(n_dft=1024, n_hop=256, input_shape=(1, 44100),
                            return_decibel_spectrogram=True, power_spectrogram=2.0,
                            trainable_kernel=False, name='static_stft'))
    else:
        model.add(Melspectrogram(sr=44100, n_mels=128,n_dft=1024, n_hop=256,
                            input_shape=(1 , 44100),return_decibel_melgram=True,
                            trainable_kernel=False, name='melgram'))

    model.add(tf.keras.layers.Conv2D(64, (3, 3), activation="relu"))
    model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2)))

    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(num_classes, activation="softmax"))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model

In [12]:
# Build the CNN with the mel-spectrogram front end and print its layer summary
# (output shapes and parameter counts) before any training happens.
model = create_model("melspectrogram")
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
melgram (Melspectrogram)     (None, 128, 173, 1)       1116288   
_________________________________________________________________
conv2d (Conv2D)              (None, 126, 171, 64)      640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 63, 85, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 342720)            0         
_________________________________________________________________
dense (Dense)                (None, 38)                13023398  
Total params: 14,140,326
Trainable params: 14,140,326
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Re-create the mel-spectrogram model, so that re-running this cell starts from
# fresh weights, and fit it for 50 epochs.
# NOTE(review): the test split is passed as validation_data, and the same split
# is used again for the confusion matrix further down - the reported validation
# accuracy is therefore not from an independent hold-out set.
model = create_model("melspectrogram")
model.fit(x=train_x, y=train_y, epochs=50, validation_data=(test_x, test_y))

Train on 152 samples, validate on 38 samples
Epoch 1/50



Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
 32/152 [=====>........................] - ETA: 2s - loss: 676.5802 - accuracy: 0.1875

In [None]:
# Classify one recording from disk with the trained model.
z, sr = librosa.load(data_dir + "/ID_59/ID_59_3.wav", duration=1, sr=44100)

# Copy the samples into a zero-filled float32 buffer of the model's input
# length: a clip shorter than one second is padded instead of breaking the
# reshape, and the model receives a numeric array rather than an object one.
clip = np.zeros(44100, dtype=np.float32)
clip[:len(z)] = z[:44100]
z = clip.reshape(1, 1, 44100)  # (batch, 1, samples)

y = model.predict(z)
# Softmax outputs are probabilities that are rarely exactly 1.0, so casting
# them to int and searching for a 1 usually finds nothing. Take the most
# probable class instead.
pos = int(np.argmax(y[0]))
print(name_list[pos], float(y[0, pos]))

In [None]:
# Evaluate on the test split: turn one-hot targets and softmax outputs back
# into class indices, then tabulate and plot true vs. predicted speakers.
predictions = model.predict(test_x)

true_classes = np.argmax(test_y, axis=1)
predicted_classes = np.argmax(predictions, axis=1)
matrix = confusion_matrix(true_classes, predicted_classes)
print(matrix)
disp = ConfusionMatrixDisplay(confusion_matrix=matrix)
disp.plot()

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import time

fs = 44100  # Sample rate
seconds = 3  # Duration of recording
# Smallest softmax probability accepted as an identification; below it the
# user is asked to record again.
MIN_CONFIDENCE = 0.5

print("Start Speaking Now\n")
time.sleep(0.5)
print('Listening....Speak Now')
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=1)
sd.wait()  # Wait until recording is finished
print("Done Recording\n")

# Two one-second windows cut from the recording, each shaped
# (batch, 1, samples) to match the model input: test1 starts 20000 samples in,
# test2 at the very beginning.
test1 = np.array(myrecording[20000:20000+44100]).reshape(1, 1, 44100)
test2 = np.array(myrecording[0:44100]).reshape(1, 1, 44100)

y1 = model.predict(test1)
y2 = model.predict(test2)

# Softmax outputs are probabilities, so truncating them to int only ever
# matches a probability of exactly 1.0. Use the most probable class of each
# window instead and accept the first window that is confident enough.
for probabilities in (y1, y2):
    pos = int(np.argmax(probabilities[0]))
    confidence = float(probabilities[0, pos])
    print(pos, confidence)
    if confidence >= MIN_CONFIDENCE:
        print(name_list[pos])
        break
else:
    print("Input again")


