In [None]:
!pip install pydub
!pip install tensorflow-io
!pip install mutagen

In [2]:
import os
import pathlib
import re
import shutil
import mutagen
import math

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio

from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display
from pydub import AudioSegment
from mutagen.wave import WAVE
from string import ascii_lowercase

### GetCleanFile Class

In [3]:
class GetCleanFile:
  '''Download the raw archive and reorganise its content into a flat directory.

  Attributes:
    origin: URL passed to ``tf.keras.utils.get_file``.
    new_path: directory that holds the download and its extracted content.
    newer_path: directory that receives the renumbered sub-directories.
    train_path: training-data directory; empty until ``get_train_path`` runs.
  '''

  def __init__(self, origin, new_path, newer_path):
    '''Store the locations and create ``newer_path`` when it is missing.'''
    self.origin = origin
    self.new_path = new_path
    self.newer_path = newer_path
    self.train_path = ""

    # make new directory to contain organized sub-directory
    if not os.path.exists(self.newer_path):
      os.makedirs(self.newer_path)

  def __call__(self):
    '''Download and extract the archive unless ``new_path`` already exists.

    Returns:
      self, so that calls can be chained.
    '''
    if not pathlib.Path(self.new_path).exists():
      # The download goes to ./<last component of new_path>; this matches
      # new_path itself only when new_path is a direct child of the
      # current working directory.
      tf.keras.utils.get_file(
          'librispeech.zip',
          origin=self.origin,
          extract=True,
          cache_dir='.',
          cache_subdir=self.new_path.split('/')[-1])

    return self

  @staticmethod
  def _first_unmatched(names, pattern, where):
    '''Return the first name (in sorted order) that does not match ``pattern``.

    Raises:
      FileNotFoundError: every name in ``names`` matches ``pattern``.
    '''
    candidates = sorted(name for name in names if not re.match(pattern, name))
    if not candidates:
      raise FileNotFoundError(
          f'no entry in {where!r} besides names matching {pattern!r}')
    return candidates[0]

  def get_train_path(self):
    '''Locate the training-data directory and store it in ``self.train_path``.

    The extracted directory is the first entry of ``new_path`` that is not a
    ``.zip`` file; the training directory is the first entry inside it that is
    not a ``.TXT`` file.

    Returns:
      self, so that calls can be chained.
    '''
    extracted = self._first_unmatched(
        os.listdir(self.new_path), r'[\w]*\.zip', self.new_path)
    # Join on self.new_path: the instance must not depend on a module-level
    # variable that happens to be called ``new_path``.
    extracted_path = os.path.join(self.new_path, extracted)
    train_dir = self._first_unmatched(
        os.listdir(extracted_path), r'[\w]*\.TXT', extracted_path)
    self.train_path = os.path.join(extracted_path, train_dir)
    return self

  def get_subdirectory(self):
    '''Return every path exactly two levels below ``self.train_path``.'''
    return tf.io.gfile.glob(str(self.train_path)+'/*/*')

  def rename_and_move_dir(self, dir_names):
    '''Move each directory into ``newer_path``, renamed to its list index.

    Args:
      dir_names: paths of the directories to move; the i-th one ends up at
        ``<newer_path>/<i>``.

    Returns:
      self, so that calls can be chained.

    Raises:
      shutil.Error: the destination ``<newer_path>/<i>`` already exists.
    '''
    for index, source in enumerate(dir_names):
      target = os.path.join(self.newer_path, str(index))
      if os.path.exists(target):
        raise shutil.Error(f"Destination path '{target}' already exists")
      # Move straight to the final location; renaming inside the original
      # parent first could collide with a sibling that already has that name.
      shutil.move(source, target)
    return self

  def delete_directory(self):
    '''Delete the initial data directory (``new_path``) and all its content.'''
    shutil.rmtree(self.new_path)

  def clean_label(self, subdirs):
    '''Split each transcript file into one lower-cased text file per line.

    For a line ``<name> <words...>`` a file ``<name>.txt`` holding the
    lower-cased words is written next to the transcript; the transcript itself
    is deleted afterwards.

    Args:
      subdirs: paths of the transcript files to process.
    '''
    for transcript_path in subdirs:
      # the new files are written next to the transcript
      target_dir = os.path.dirname(transcript_path)

      with open(transcript_path, 'r') as transcript:
        for line in transcript:
          tokens = line.split()
          if not tokens:
            # blank line: nothing to name a file after
            continue
          content = ' '.join(tokens[1:]).lower()
          with open(os.path.join(target_dir, f'{tokens[0]}.txt'), 'w') as new_file:
            new_file.write(content)

      # delete the initial text file only after it has been closed
      os.remove(transcript_path)

### EncodingDecoding Class

In [4]:
class EncodingDecoding:
  '''Character <-> integer id conversion and WAV decoding helpers.'''

  def __init__(self):
    # the 26 lowercase ASCII letters
    self.char = list(ascii_lowercase)
    # complete vocabulary: space, apostrophe, then the letters; the position
    # of a symbol in this list is its integer id
    self.non_alpha = [" ", "'"] + self.char

  def encode_label(self, label):
    '''Map a string tensor of symbols to their integer ids.

    Symbols that are not part of the vocabulary are mapped to -1.
    '''
    symbols = tf.constant(self.non_alpha)
    ids = tf.constant(np.arange(len(self.non_alpha)))
    initializer = tf.lookup.KeyValueTensorInitializer(symbols, ids)
    lookup_table = tf.lookup.StaticHashTable(initializer, default_value=-1)
    return lookup_table.lookup(label)

  def decode_label(self,predicted_label):
    '''Map integer ids back to symbols and return the result via ``.numpy()``.

    Ids outside the vocabulary are mapped to the empty string.
    '''
    ids = tf.constant(np.arange(len(self.non_alpha)))
    symbols = tf.constant(self.non_alpha)
    initializer = tf.lookup.KeyValueTensorInitializer(ids, symbols)
    lookup_table = tf.lookup.StaticHashTable(initializer, default_value='')
    return lookup_table.lookup(predicted_label).numpy()

  def decode_audio(self, audio_binary):
    '''Decode WAV file content and drop the last axis of the samples.'''
    # the second element of the decode result is not used
    samples, _unused = tf.audio.decode_wav(audio_binary)
    # squeezing axis -1 presumably removes a single channel axis — TODO confirm
    return tf.squeeze(samples, axis=-1)

### AudioFileConversion Class

In [5]:
class AudioFileConversion:
  '''Replace FLAC files below a directory with WAV files (via pydub).'''

  def convert_flac_to_wav(self, src, dst):
    '''Read the FLAC file ``src`` and export it in WAV format to ``dst``.'''
    flac_audio = AudioSegment.from_file(src,format="flac")
    flac_audio.export(dst, format="wav")

  def file_conversion(self, path):
    '''Convert every ``.flac`` file in the sub-directories of ``path`` to WAV.

    Each converted file is written next to its source with a ``.wav``
    extension and the source is deleted afterwards. Files located directly in
    ``path`` and files without a ``.flac`` extension are left untouched.

    Args:
      path: root directory whose sub-directories are walked.
    '''
    for walk_index, (current_dir, _dirs, fnames) in enumerate(os.walk(path)):
      # os.walk yields ``path`` itself first; only its sub-directories are
      # processed
      if walk_index == 0:
        continue

      for fname in fnames:
        stem, extension = os.path.splitext(fname)
        # Only FLAC files are converted. Anything else (transcripts, WAV
        # files from an earlier run, ...) must be skipped: for an existing
        # .wav the source and destination would be the same path and the
        # final os.remove would delete the converted file.
        if extension.lower() != '.flac':
          continue

        # creating source path and destination path for the converted file
        src = os.path.join(current_dir, fname)
        dst = os.path.join(current_dir, stem + '.wav')

        # convert flac file format into wav file format
        self.convert_flac_to_wav(src, dst)

        # delete initial flac file
        os.remove(src)

### AudioDataProcessing Class

In [6]:
class AudioDataProcessing:
  '''Feature extraction (spectrogram, log-mel, MFCC, masking) for WAV clips.

  Attributes:
    path: root directory; WAV files are looked up as ``<path>/*/*.wav``.
    max_length: longest clip duration as reported by mutagen; 0 until
      ``get_max_length`` has run.
    sample_rate: sample rate used to convert durations to sample counts and
      to build the mel filter bank.
  '''

  def __init__(self, path):
    self.path = path
    self.max_length = 0
    self.sample_rate = 16000

  def get_max_length(self):
    '''Find the longest WAV file and store its length in ``self.max_length``.

    Returns:
      self, so that calls can be chained.

    Raises:
      ValueError: no WAV file matches ``<path>/*/*.wav``.
    '''
    wav_files = tf.io.gfile.glob(self.path+'/*/*.wav')
    if not wav_files:
      raise ValueError(f'no .wav files found under {self.path}/*/')
    self.max_length = max(WAVE(wav_file).info.length for wav_file in wav_files)

    return self

  def get_spectrogram(self, waveform):
    '''Zero-pad ``waveform`` to the maximum length and return its STFT magnitude.

    Args:
      waveform: audio samples; assumed to be a 1-D tensor — TODO confirm.

    Returns:
      The absolute value of the short-time Fourier transform of the padded
      waveform (frame length 1024, frame step 892).
    '''
    # Padding for files with less than max sample. The duration is a float,
    # so duration * sample_rate can land just below the true sample count;
    # round() instead of truncating, and never ask for a negative padding.
    max_sample = int(round(self.max_length * self.sample_rate))
    padding_length = tf.maximum(max_sample - tf.shape(waveform)[0], 0)
    zero_padding = tf.zeros([padding_length], dtype=tf.float32)

    # Concatenate audio with padding so that all audio clips will be of the
    # same length
    waveform = tf.cast(waveform, tf.float32)
    equal_length = tf.concat([waveform, zero_padding], 0)
    spectrogram = tf.signal.stft(
        equal_length, frame_length=1024,
        frame_step=892)

    return tf.abs(spectrogram)

  def get_log_mel_spectrograms(self, spectrogram):
    '''Extract a log mel spectrogram from a magnitude spectrogram.

    Uses 13 mel bins between 250 Hz and 8000 Hz. A comment left by the
    original author names "Spoken Word Recognition Using MFCC and Learning
    Vector Quantization", presumably the source of this setup — TODO confirm.
    '''
    num_spectrogram_bins = spectrogram.shape[-1]
    num_mel_bins, lower_edge_hertz, upper_edge_hertz = 13, 250, 8000
    weight = tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins,
                                                 self.sample_rate, lower_edge_hertz,
                                                 upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrogram,weight,1)
    # tensordot loses the static shape; restore it from the two operands
    mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
            weight.shape[-1:]))

    # the small offset keeps the logarithm finite for empty bins
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    return log_mel_spectrograms

  def get_mfcc(self, log_mel_spectrograms):
    '''Compute mel-frequency cepstral coefficients from log mel spectrograms.'''
    mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)
    return mfcc

  def spec_augment(self):
    '''Build a model that applies a frequency mask followed by a time mask.

    The mask parameter is drawn once per call of this method (an integer in
    [1, 100)) and shared by both masking layers.
    NOTE(review): newer tensorflow-io releases may expose these functions as
    ``tfio.audio.freq_mask`` / ``tfio.audio.time_mask`` — confirm against the
    installed version.
    '''
    param = np.random.randint(1,100)
    augmentation = tf.keras.Sequential([
       layers.Lambda(lambda x : tfio.experimental.audio.freq_mask(x, param)),
       layers.Lambda(lambda x : tfio.experimental.audio.time_mask(x, param))
    ])

    return augmentation

### GetWaveformLabel Class

In [7]:
class GetWaveformLabel(EncodingDecoding):
  '''Turns an (audio file, transcript file) pair into (waveform, label ids).'''

  def __init__(self):
    super().__init__()

  def get_waveform_label(self, audio_file, text_file):
    '''Load one example.

    Args:
      audio_file: path of the WAV file.
      text_file: path of the transcript file.

    Returns:
      A tuple ``(waveform, label)``: the decoded audio and the transcript
      encoded symbol by symbol.
    '''
    # raw file contents
    raw_audio = tf.io.read_file(audio_file)
    transcript = tf.io.read_file(text_file)

    # WAV bytes -> waveform tensor
    samples = super().decode_audio(raw_audio)

    # transcript -> single bytes -> integer ids
    encoded = super().encode_label(tf.strings.bytes_split(transcript))

    return samples, encoded

### GetProcessDataLabel Class

In [8]:
class GetProcessDataLabel(AudioDataProcessing):
  '''Runs the full feature pipeline on a (waveform, label) example.'''

  def __init__(self, path):
    super().__init__(path)

  def get_process_label(self, waveform, label):
    '''Convert a waveform into augmented MFCC features.

    The steps are: spectrogram -> frequency/time masking -> log mel
    spectrogram -> MFCC. The label is passed through unchanged.

    Args:
      waveform: decoded audio samples.
      label: encoded transcript.

    Returns:
      A tuple ``(features, label)``.
    '''
    # Scanning every WAV file is expensive; only do it when the maximum
    # length has not been determined yet.
    if not self.max_length:
      self.get_max_length()

    features = self.get_spectrogram(waveform)
    features = self.spec_augment()(features)
    features = self.get_log_mel_spectrograms(features)
    features = self.get_mfcc(features)
    return features, label

### Extract

In [9]:
def organize_file(origin, new_path, newer_path):
  '''Download the data set and lay it out as ``<newer_path>/<index>/...``.

  Args:
    origin: URL of the archive to download.
    new_path: directory used for the download; deleted once its content has
      been moved.
    newer_path: directory that receives the renumbered sub-directories.
  '''
  get_clean_file = GetCleanFile(origin,new_path,newer_path)
  dir_names = get_clean_file().get_train_path().get_subdirectory()
  get_clean_file.rename_and_move_dir(dir_names)
  get_clean_file.delete_directory()

  # split every transcript file into one text file per line
  transcript_files = tf.io.gfile.glob(newer_path + '/*/*.txt')
  get_clean_file.clean_label(transcript_files)

def flac_conversion(path):
  '''Run ``AudioFileConversion.file_conversion`` on ``path``.'''
  AudioFileConversion().file_conversion(path)

def preprocess_dataset(path):
  '''Build the batched feature/label dataset from the files below ``path``.

  Args:
    path: directory holding ``<index>/<name>.wav`` and ``<index>/<name>.txt``
      files.

  Returns:
    A ``tf.data.Dataset`` of (features, label) examples, shuffled with a
    buffer of 1024, repeated 5 times and batched by 128.

  Raises:
    ValueError: the number of WAV files differs from the number of text files.
  '''
  path = str(path)
  AUTOTUNE = tf.data.experimental.AUTOTUNE

  # Audio and transcript are paired by position after sorting, so both lists
  # must have the same length.
  audio_file = sorted(tf.io.gfile.glob(path + '/*/*.wav'))
  text_file = sorted(tf.io.gfile.glob(path + '/*/*.txt'))
  if len(audio_file) != len(text_file):
    raise ValueError(
        f'found {len(audio_file)} .wav files but {len(text_file)} .txt files '
        f'under {path}')
  list_ds = tf.data.Dataset.from_tensor_slices((audio_file, text_file)).cache()

  get_waveform_label = GetWaveformLabel()
  # use the directory that was passed in instead of a hard-coded location
  get_process_data_label = GetProcessDataLabel(path).get_max_length()

  waveform_ds = list_ds.map(get_waveform_label.get_waveform_label, num_parallel_calls=AUTOTUNE)
  waveform_ds = waveform_ds.cache()
  waveform_ds = waveform_ds.prefetch(AUTOTUNE)

  spectrogram_ds = waveform_ds.map(get_process_data_label.get_process_label, num_parallel_calls=AUTOTUNE)
  # NOTE(review): caching after the masking step presumably freezes the
  # augmented examples across the 5 repeats — confirm this is intended.
  spectrogram_ds = spectrogram_ds.cache()

  preprocess_ds = spectrogram_ds.shuffle(1024).repeat(5).batch(128)
  return preprocess_ds


In [10]:
# End-to-end run: download and reorganise the data, convert FLAC to WAV, then
# build the preprocessed dataset.
if __name__=="__main__":
  # archive to download
  origin = 'https://www.openslr.org/resources/12/train-clean-100.tar.gz'
  # directory for the download; organize_file deletes it once it is done
  new_path = './data'
  # directory holding the reorganised data used by all later steps
  newer_path = './DATA'
  organize_file(origin, new_path, newer_path)
  flac_conversion(newer_path)
  preprocess_ds = preprocess_dataset(newer_path)


Downloading data from https://www.openslr.org/resources/12/train-clean-100.tar.gz




AttributeError: ignored

### Test

In [None]:
# Play one clip inline.
# NOTE(review): `waveform` and `sample_rate` are not defined in any cell
# visible in this notebook — confirm where they come from before running.
display.Audio(waveform.numpy(),rate=sample_rate.numpy())

In [None]:
def plot_spectrogram(spectrogram, ax):
  '''Draw the logarithm of a 2-D spectrogram on ``ax`` with ``pcolormesh``.

  Args:
    spectrogram: 2-D array; it is transposed before plotting so that its
      first axis runs along the x-axis.
    ax: object providing ``pcolormesh`` (e.g. a matplotlib Axes).
  '''
  # Convert to frequencies to log scale and transpose so that the time is
  # represented in the x-axis (columns). The epsilon keeps the logarithm
  # finite for bins that are exactly zero (e.g. masked regions).
  log_spec = np.log(spectrogram.T + np.finfo(float).eps)
  height = log_spec.shape[0]
  width = log_spec.shape[1]
  X = np.linspace(0, np.size(spectrogram), num=width, dtype=int)
  Y = range(height)
  ax.pcolormesh(X, Y, log_spec)


# Plot a waveform (top) and its masked spectrogram (bottom).
# NOTE(review): `waveform`, `spectrogram` and `audio_processing` are not
# defined in any cell visible in this notebook — confirm where they come from.
fig, axes = plt.subplots(2, figsize=(12, 8))
timescale = np.arange(waveform.shape[0])
axes[0].plot(timescale, waveform.numpy())
axes[0].set_title('Waveform')

# apply the frequency/time masking before plotting
result = audio_processing.spec_augment()(spectrogram)
plot_spectrogram(result.numpy(), axes[1])
axes[1].set_title('Spectrogram')
plt.show()


In [None]:
#build network model using Keras
# NOTE(review): `waveform` and `label` are not defined in any cell visible in
# this notebook, and the input shape (16000,) / 2 output classes do not
# obviously match the MFCC features and character labels produced above —
# confirm before relying on this cell.

import tensorflow as tf
import tensorflow.keras as keras
# import from tensorflow.keras throughout; mixing it with the standalone
# `keras` package can combine incompatible objects
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(16000,)))
model.add(Dense(256, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adadelta(), metrics=['acc'])
# The empty `validation_data=(,)` argument was a syntax error and has been
# dropped; pass validation_data=(x_val, y_val) once a validation split exists.
model.fit(waveform, label, batch_size=124, epochs=20, verbose=1)
# TODO: evaluate on a held-out test set; no such set is visible here, so the
# scores below are computed on the data the model was fitted on.
sc = model.evaluate(waveform, label, verbose=0)
print('Loss for test: ', sc[0])
print('Accuracy: ', sc[1])
