In [63]:
# !pip install praat-textgrids

In [1]:
import textgrids
import os
import sys
import librosa
from pathlib import Path
import scipy.io.wavfile as wav
from scipy.signal import stft, resample
from python_speech_features import mfcc
from tqdm.notebook import tqdm
import pyaudio
import numpy as np

In [2]:
label_path = Path('./Data/Annotation')
data_path = Path('./Data/Audio')
print(label_path)
print(data_path)

Data/Annotation
Data/Audio


In [3]:
dataset = {}
for root,dirs,files in os.walk(label_path):
    for file in files:
        label = os.path.join(root, file)
        data =os.path.join(root.replace("Annotation", "Audio"), file.replace(".TextGrid", ".wav"))
        dataset[label] = data

In [4]:
def resample_frames(signal, original_rate, desired_rate):
    if original_rate == desired_rate: return signal
    num_samples = int(signal.shape[0]*float(desired_rate)/original_rate)
    return resample(signal, num_samples)

In [5]:
frame_rate = 16_0000

In [6]:
mfcc_data = []
mfcc_labels = []
def distribute_data(label_file, data_file):
    grid = textgrids.TextGrid(label_file)
    zones = grid['silences']
    x, fs = wav.read(data_file)
    signal = resample_frames(fs, x, frame_rate)
    for zone in zones:
        start = int(frame_rate*zone.xmin)
        end = int(frame_rate*zone.xmax)
        short_signal = signal[start:end]
        c_mfcc = mfcc(short_signal, frame_rate, nfilt=26, numcep=13, winlen=512/frame_rate, winstep=256/frame_rate)
        mfcc_data.extend(c_mfcc)
        mfcc_labels.extend(len(c_mfcc)*[int(zone.text)])
        
    


for label, data in tqdm(dataset.items(), total=len(dataset)):
    # print(label)
    distribute_data(label , data)
    # break

  0%|          | 0/719 [00:00<?, ?it/s]

In [9]:
max_length = max(mfcc_data_individual.shape[0] for mfcc_data_individual in mfcc_data)
max_length
array = np.save('tight_mfcc_data.npy', mfcc_data)

In [9]:
def pad_mfcc_seq(mfcc_data, max_length):
    padded_array = []
    for item in tqdm(mfcc_data, total=len(mfcc_data)):
        num_frames, num_features = item.shape
        if (num_frames == 0):
            padded_feature = np.zeros((max_length, num_features, 1), dtype=item.dtype)
        else:
            # print(((0, max_length - num_frames), (0, 0)))
            padded_feature = np.array([np.pad(item, ((0, max_length - num_frames), (0, 0)), mode='constant')])
        padded_array.append(padded_feature)
    return padded_array
((0, 3680), (0, 0))

((0, 3680), (0, 0))

In [13]:
np.save('tight_labels.npy', mfcc_labels)

In [15]:
padded_array = np.array(padded_features)
np.save("padded_dataset.npy", padded_array)

In [22]:
label_array = np.expand_dims(mfcc_labels, (1,))
np.save("labels.npy", label_array)

### Training BiLSTM for VAD

In [2]:
from tensorflow.keras.layers import Bidirectional, Dropout, Dense, LSTM, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import load_model
import numpy as np

2024-10-23 19:44:14.596291: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-23 19:44:15.543296: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-23 19:44:18.948315: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
def build_biLSTM_VAD(in_shape):
    model = Sequential()
    model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=in_shape))
    model.add(BatchNormalization())
    
    model.add(Dropout(0.1))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
   
    model.add(Dense(32, activation='relu')) 
    model.add(Dropout(0.1))

    model.add(Bidirectional(LSTM(32)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(8, activation='relu'))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=0.0005), metrics=['accuracy'], loss='binary_crossentropy')
    return model

In [4]:
# data = np.load('dataset.npy')

In [14]:
flatten_data = np.load('tight_mfcc_data.npy')

In [17]:
flatten_data.shape
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(flatten_data)

In [23]:
normalized_data.shape
# flatten_data.shape

(1989517, 13)

In [24]:
final_dataset = normalized_data.reshape(flatten_data.shape)
np.save('normalized_tight_mfcc_data.npy', final_dataset)

In [5]:
X_train = np.load('normalized_tight_mfcc_data.npy')
y_train = np.load("tight_labels.npy")
# X_train = np.expand_dims(X_train, 1)
# np.save('normalized_tight_mfcc_data.npy', X_train)

In [10]:
X_train = X_train[:100000]
y_train = y_train[:100000]

In [11]:
model = build_biLSTM_VAD(X_train[0].shape)

In [12]:
y_train.shape

(1989517,)

In [13]:
model.summary()

In [14]:
epochs = 100
history = model.fit(X_train, y_train, validation_split=0.2, batch_size=32, epochs=10)

Epoch 1/10
[1m49738/49738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m452s[0m 9ms/step - accuracy: 0.8476 - loss: 0.3428 - val_accuracy: 0.8994 - val_loss: 0.2482
Epoch 2/10
[1m49738/49738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 9ms/step - accuracy: 0.8600 - loss: 0.3189 - val_accuracy: 0.9023 - val_loss: 0.2450
Epoch 3/10
[1m49738/49738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 8ms/step - accuracy: 0.8621 - loss: 0.3157 - val_accuracy: 0.8989 - val_loss: 0.2476
Epoch 4/10
[1m49738/49738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3537s[0m 71ms/step - accuracy: 0.8630 - loss: 0.3124 - val_accuracy: 0.9011 - val_loss: 0.2479
Epoch 5/10
[1m49738/49738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 9ms/step - accuracy: 0.8638 - loss: 0.3113 - val_accuracy: 0.9043 - val_loss: 0.2387
Epoch 6/10
[1m49738/49738[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m472s[0m 9ms/step - accuracy: 0.8643 - loss: 0.3104 - val_accuracy: 0.9024 - val_los

In [22]:
history.history['accuracy'], history.history['val_accuracy']

([0.8531150221824646,
  0.8605364561080933,
  0.8622328639030457,
  0.8629214763641357,
  0.8639512062072754,
  0.864428699016571,
  0.8647460341453552,
  0.8650180697441101,
  0.8652737736701965,
  0.8652844429016113],
 [0.8994104266166687,
  0.9023382663726807,
  0.8989052772521973,
  0.9010841846466064,
  0.9042909741401672,
  0.9023659229278564,
  0.9026147127151489,
  0.9031147956848145,
  0.8994581699371338,
  0.9019612669944763])

In [15]:
model.save('analysis_mono.h5')



In [3]:
model = load_model('analysis_mono.h5')



In [13]:
model.predict_on_batch(X_train[y_train == 0][100:200]) > 0.5

array([[False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [