1. Import and Install Dependencies

1.1 Install Dependencies

In [None]:
%pip install tensorflow matplotlib tensorflow-io

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
from scipy.io import wavfile
from collections import defaultdict, Counter
from scipy import signal
import numpy as np
import librosa
from sklearn import preprocessing
import random as rn
from keras.layers import Dense
from keras import Input
from tensorflow.keras.models import Model
from keras.utils import to_categorical
from keras.layers import Dense, TimeDistributed, Dropout, Bidirectional, GRU, BatchNormalization, Activation, LeakyReLU, LSTM, Flatten, RepeatVector, Permute, Multiply, Conv2D, MaxPooling2D
import tensorflow as tf
import random

2. Data Loading

In [None]:
# Avoid OOM errors by enabling GPU memory growth on every visible GPU.
# Uses the stable tf.config.list_physical_devices, matching the other cells.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Cap TensorFlow's allocation on the first GPU by exposing it as one logical
  # device with memory_limit=3072 (i.e. 3072 MB, not 1 GB).
  try:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=3072)])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized;
    # report the problem instead of aborting the notebook.
    print(e)

In [None]:
# Display the GPUs TensorFlow can see (an empty list means none are visible).
tf.config.list_physical_devices('GPU')


In [None]:
# Directory containing the .wav clips. The trailing slash is intentional: other
# cells build file paths by plain string concatenation (DATA_DIR + name).
DATA_DIR = os.path.join('data', 'train/')

In [None]:
# Load one reference clip at 16 kHz and report its basic properties.
sample_path = DATA_DIR + 'Q-01.wav'
wav, sr = librosa.load(sample_path, sr=16000)
n_samples = wav.shape[0]
print('sr:', sr)
print('wav shape:', wav.shape)
print('length:', n_samples/float(sr), 'secs')

In [None]:
# Raw waveform: the whole clip on top, the first 500 samples below.
# Separate axes keep the zoomed view visible (on shared axes it is hidden behind
# the full trace), and dropping print() avoids dumping Line2D reprs.
fig, (ax_full, ax_zoom) = plt.subplots(2, 1, figsize=(10, 6))
ax_full.plot(wav)
ax_full.set(title='Raw waveform', xlabel='Sample', ylabel='Amplitude')
ax_zoom.plot(wav[0:500])
ax_zoom.set(title='First 500 samples', xlabel='Sample', ylabel='Amplitude')
fig.tight_layout()
plt.show()

In [None]:
# Containers for the training features/targets of each feature type.
train_mlp = []
train_spectrograms = []
train_mel_spectrograms = []
train_mfccs = []
train_y = []

# Containers for the test features/targets of each feature type.
test_mlp = []
test_spectrograms = []
test_mel_spectrograms = []
test_mfccs = []
test_y = []


# Right-pad (or truncate) so that every audio clip ends up with the same length.
def pad1d(a, i):
    """Return 1-D array `a` cut to its first `i` items, or zero-padded at the end to length `i`."""
    if a.shape[0] > i:
        return a[0: i]
    return np.hstack((a, np.zeros(i - a.shape[0])))


def pad2d(a, i):
    """Return 2-D array `a` cut to its first `i` columns, or zero-padded on the right to `i` columns."""
    if a.shape[1] > i:
        return a[:, 0:i]
    filler = np.zeros((a.shape[0], i - a.shape[1]))
    return np.hstack((a, filler))


frame_length = 0.025
frame_stride = 0.0010

In [None]:
from sklearn.model_selection import train_test_split

def get_labels(filename):
  """Map a clip's filename to an integer class id based on its first character.

  'Q' -> 0, 'W' -> 1, 'E' -> 2, 'R' -> 3; any other first character -> 4.
  """
  prefix_to_label = {'Q': 0, 'W': 1, 'E': 2, 'R': 3}
  return prefix_to_label.get(filename[0], 4)


Spectrogram

In [None]:
def get_spectrogram(filename, sr=16000, n_frames=40):
    """Load an audio file and return its fixed-width magnitude spectrogram.

    Args:
        filename: Path of the audio file to load.
        sr: Sampling rate passed to ``librosa.load`` (default 16000).
        n_frames: Number of columns in the result (default 40). Wider
            spectrograms are truncated and narrower ones are zero-padded on
            the right by ``pad2d``.

    Returns:
        2-D array: the absolute value of ``librosa.stft`` of the signal, with
        exactly ``n_frames`` columns.
    """
    wav, _ = librosa.load(filename, sr=sr)
    magnitude = np.abs(librosa.stft(wav))
    return pad2d(magnitude, n_frames)

# Step 1: Create a dictionary where keys are labels and values are lists of spectrograms.
spectrograms_by_label = defaultdict(list)

# Step 2: For each .wav file in the directory, compute the spectrogram and label,
# and append the spectrogram to the list for that label.
# The listing is sorted because os.listdir() returns entries in arbitrary order,
# which would make the seeded train/test split differ between machines.
for filename in sorted(os.listdir(DATA_DIR)):
    # Check the extension itself rather than '.wav' appearing anywhere in the name.
    if not filename.endswith('.wav'):
        continue

    spectrogram = get_spectrogram(os.path.join(DATA_DIR, filename))
    label = get_labels(filename)

    spectrograms_by_label[label].append(spectrogram)


In [None]:
# Step 3 and 4: Split every label separately (80/20, fixed seed) so each class is
# represented in both sets, and keep the per-label splits in dictionaries.
train_spectrograms_by_label = {}
test_spectrograms_by_label = {}
train_labels_by_label = {}
test_labels_by_label = {}
for label, spectrograms in spectrograms_by_label.items():
    # Loop-local names on purpose: assigning to train_spectrograms/test_spectrograms
    # here would leave only the last label's split in those variables.
    label_train, label_test = train_test_split(spectrograms, test_size=0.2, random_state=42)
    train_spectrograms_by_label[label] = label_train
    test_spectrograms_by_label[label] = label_test
    train_labels_by_label[label] = [label] * len(label_train)
    test_labels_by_label[label] = [label] * len(label_test)

# Step 5: Merge the per-label splits into the arrays used for training and
# validation, visiting labels in a fixed order so features and targets line up.
labels_in_order = sorted(spectrograms_by_label)
train_spectrograms = np.stack(
    [spec for label in labels_in_order for spec in train_spectrograms_by_label[label]])
test_spectrograms = np.stack(
    [spec for label in labels_in_order for spec in test_spectrograms_by_label[label]])

# get_labels() yields class ids 0-4, so the targets are one-hot encoded over five
# classes, the format the categorical_crossentropy loss expects.
NUM_CLASSES = 5
train_y = to_categorical(
    [y for label in labels_in_order for y in train_labels_by_label[label]],
    num_classes=NUM_CLASSES)
test_y = to_categorical(
    [y for label in labels_in_order for y in test_labels_by_label[label]],
    num_classes=NUM_CLASSES)

Spectrogram feature training

In [None]:
# Add the trailing channel axis that Conv2D expects. Each sample is a 2-D
# spectrogram, so a batch without the channel axis is 3-D; the ndim guard keeps
# this cell idempotent (re-running it does not stack yet another axis).
train_spectrograms = np.asarray(train_spectrograms)
test_spectrograms = np.asarray(test_spectrograms)
if train_spectrograms.ndim == 3:
    train_spectrograms = np.expand_dims(train_spectrograms, -1)
if test_spectrograms.ndim == 3:
    test_spectrograms = np.expand_dims(test_spectrograms, -1)
print('train_spectograms shape:', train_spectrograms.shape)
print('test_spectograms shape:', test_spectrograms.shape)

In [None]:
# get_labels() produces class ids 0-4 (Q, W, E, R and "anything else"), so the
# classifier needs five output units, one per class.
NUM_CLASSES = 5

ip = Input(shape=train_spectrograms[0].shape)

# Three stacked conv/pool blocks. Every block consumes the previous block's
# output `m`; feeding `ip` into the later blocks would silently discard the
# earlier ones. padding='same' is needed once the blocks are chained: with the
# default 'valid' padding the 40-frame axis would shrink below the 4x4 kernel
# by the third block.
m = Conv2D(32, kernel_size=(4,4), padding='same', activation='relu')(ip)
m = MaxPooling2D(pool_size=(4,4), padding='same')(m)

m = Conv2D(32*2, kernel_size=(4,4), padding='same', activation='relu')(m)
m = MaxPooling2D(pool_size=(4,4), padding='same')(m)

m = Conv2D(32 * 3, kernel_size=(4, 4), padding='same', activation='relu')(m)
m = MaxPooling2D(pool_size=(4,4), padding='same')(m)

# Classifier head on the flattened feature maps.
m = Flatten()(m)

m = Dense(64, activation='relu')(m)

m = Dense(32, activation='relu')(m)

# Softmax output: one probability per class.
op = Dense(NUM_CLASSES, activation='softmax')(m)

model = Model(ip, op)

model.summary()


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 100 epochs in mini-batches of 32; the test split is passed as
# validation data so its loss/accuracy are recorded after every epoch.
validation_pair = (test_spectrograms, test_y)
history = model.fit(
    x=train_spectrograms,
    y=train_y,
    batch_size=32,
    epochs=100,
    verbose=1,
    validation_data=validation_pair,
)

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for key, curve_label in (('accuracy', 'Train Accuracy'),
                         ('val_accuracy', 'Validation Accuracy')):
    ax.plot(history.history[key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()

In [None]:
# Side-by-side training curves: loss on the left, accuracy (in percent) on the right.
metrics = history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16,6))

# Left panel: training and validation loss, with the y-axis anchored at zero.
loss_ax.plot(history.epoch, metrics['loss'])
loss_ax.plot(history.epoch, metrics['val_loss'])
loss_ax.legend(['loss', 'val_loss'])
loss_ax.set_ylim([0, max(loss_ax.get_ylim())])
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss [CrossEntropy]')

# Right panel: training and validation accuracy scaled to 0-100 %.
acc_ax.plot(history.epoch, 100*np.array(metrics['accuracy']))
acc_ax.plot(history.epoch, 100*np.array(metrics['val_accuracy']))
acc_ax.legend(['accuracy', 'val_accuracy'])
acc_ax.set_ylim([0, 100])
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy [%]')

In [None]:
# Classify the 'Q-30.wav' clip and plot the predicted class probabilities.
filename = (DATA_DIR + 'Q-30.wav')
spectrogram = get_spectrogram(filename)
# Shape the single clip like a training batch: (batch, frequency, time, channel).
spectrogram = spectrogram[np.newaxis, ..., np.newaxis]
prediction = model.predict(spectrogram)
# The model's output layer is already a softmax, so prediction[0] holds the
# probabilities; applying tf.nn.softmax again would flatten them.
plt.bar(["Q", "W", "E", "R", "S"], prediction[0])
plt.title('Q')
plt.show()

In [None]:
# Classify the 'W-30.wav' clip and plot the predicted class probabilities.
filename = (DATA_DIR + 'W-30.wav')
spectrogram = get_spectrogram(filename)
# Shape the single clip like a training batch: (batch, frequency, time, channel).
spectrogram = spectrogram[np.newaxis, ..., np.newaxis]
prediction = model.predict(spectrogram)
# The model's output layer is already a softmax, so prediction[0] holds the
# probabilities; applying tf.nn.softmax again would flatten them.
plt.bar(["Q", "W", "E", "R", "S"], prediction[0])
plt.title('W')
plt.show()

In [None]:
# Classify the 'E-30.wav' clip and plot the predicted class probabilities.
filename = (DATA_DIR + 'E-30.wav')
spectrogram = get_spectrogram(filename)
# Shape the single clip like a training batch: (batch, frequency, time, channel).
spectrogram = spectrogram[np.newaxis, ..., np.newaxis]
prediction = model.predict(spectrogram)
# The model's output layer is already a softmax, so prediction[0] holds the
# probabilities; applying tf.nn.softmax again would flatten them.
plt.bar(["Q", "W", "E", "R", "S"], prediction[0])
plt.title('E')
plt.show()

In [None]:
# Classify the 'R-30.wav' clip and plot the predicted class probabilities.
filename = (DATA_DIR + 'R-30.wav')
spectrogram = get_spectrogram(filename)
# Shape the single clip like a training batch: (batch, frequency, time, channel).
spectrogram = spectrogram[np.newaxis, ..., np.newaxis]
prediction = model.predict(spectrogram)
# The model's output layer is already a softmax, so prediction[0] holds the
# probabilities; applying tf.nn.softmax again would flatten them.
plt.bar(["Q", "W", "E", "R", "S"], prediction[0])
plt.title('R')
plt.show()

In [None]:
# Classify the 'S-30.wav' clip and plot the predicted class probabilities.
filename = (DATA_DIR + 'S-30.wav')
spectrogram = get_spectrogram(filename)
# Shape the single clip like a training batch: (batch, frequency, time, channel).
spectrogram = spectrogram[np.newaxis, ..., np.newaxis]
prediction = model.predict(spectrogram)
# The model's output layer is already a softmax, so prediction[0] holds the
# probabilities; applying tf.nn.softmax again would flatten them.
plt.bar(["Q", "W", "E", "R", "S"], prediction[0])
plt.title('S')
plt.show()