<a href="https://colab.research.google.com/github/renardelyon/Pronunciation-Learning-with-Translator/blob/main/Speech_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pydub
!pip install tensorflow-io==0.17
!pip install mutagen

In [None]:
import os
import pathlib
import re
import shutil
import mutagen
import math

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio

from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow import keras
from IPython import display
from pydub import AudioSegment
from mutagen.wave import WAVE
from string import ascii_lowercase

### GetCleanFile Class

In [None]:
class GetCleanFile:
  '''Download a dataset archive and flatten its extracted directory tree.

  The methods are meant to be chained in this order (see
  ``TrainTestDataset.organize_file``): call the instance to download,
  ``get_path`` to locate the extracted data, ``get_subdirectory`` to list the
  leaf directories, ``rename_and_move_dir`` to move them under ``newer_path``,
  ``delete_directory`` to drop the download, and ``clean_label`` to split the
  transcript files.

  Attributes:
    origin: URL of the archive to download.
    new_path: directory the archive is downloaded and extracted into.
    newer_path: directory that receives the renumbered sub-directories;
      created by ``__init__`` when it does not exist yet.
    zip_file: file name the downloaded archive is stored under.
    train_path: set by ``get_path``; empty string until then.
  '''

  def __init__(self, origin, new_path, newer_path, zip_file):
    self.origin = origin
    self.new_path = new_path
    self.newer_path = newer_path
    self.zip_file = zip_file
    self.train_path = ""

    # make new directory to contain organized sub-directory
    # (makedirs also creates missing parent directories)
    if not os.path.exists(self.newer_path):
      os.makedirs(self.newer_path)

  def __call__(self):
    '''Download and extract the archive unless ``new_path`` already exists.

    Returns:
      self, so that calls can be chained.
    '''
    data_dir = pathlib.Path(self.new_path)

    if not data_dir.exists():
      tf.keras.utils.get_file(
          self.zip_file,
          origin = self.origin,
          extract = True,
          cache_dir = '.',
          cache_subdir = self.new_path.split('/')[-1])

    return self

  def get_path (self):
    '''Locate the extracted data directory and store it in ``train_path``.

    Takes the first entry of ``new_path`` that is not a ``.zip`` file, then
    the first entry inside it that is not a ``.TXT`` file.

    Returns:
      self, so that calls can be chained.

    Raises:
      IndexError: if either level contains no matching entry.
    '''
    entries = os.listdir(self.new_path)
    # raw strings + escaped dot: the pattern now really means "<word>.zip"
    extracted = [name for name in entries
                 if not re.match(r'[\w]*\.zip', name)][0]
    path = os.path.join(self.new_path, extracted)
    split_dir = [name for name in os.listdir(path)
                 if not re.match(r'[\w]*\.TXT', name)][0]
    self.train_path = os.path.join(path, split_dir)
    return self

  def get_subdirectory(self):
    '''including subdirectories and excluding upper directories'''
    return tf.io.gfile.glob(str(self.train_path)+'/*/*')

  def rename_and_move_dir(self, dir_names):
    '''Move every directory in ``dir_names`` to ``newer_path/<index>``.

    The index is the position of the directory in ``dir_names``. The
    directory is moved straight to its final name; renaming it in place first
    could collide with a sibling that is already called ``<index>``.

    Returns:
      self, so that calls can be chained.

    Raises:
      shutil.Error: if ``newer_path/<index>`` already exists.
    '''
    for index, src_dir in enumerate(dir_names):
      target = os.path.join(self.newer_path, str(index))
      if os.path.exists(target):
        raise shutil.Error(f"Destination path '{target}' already exists")
      shutil.move(src_dir, target)
    return self

  def delete_directory(self):
    '''delete initial data directory'''
    shutil.rmtree(self.new_path)

  def clean_label(self, subdirs):
    '''Split each transcript file into one text file per line.

    For every non-blank line the first token becomes the new file name
    (``<token>.txt``, written next to the transcript) and the remaining
    tokens, joined by single spaces and lower-cased, become its content.
    The original transcript file is deleted afterwards.

    Args:
      subdirs: iterable of transcript file paths.
    '''
    for subdir in subdirs:
      # the new files are written next to the transcript they come from
      new_subdir = os.path.dirname(subdir)

      with open(subdir, 'r') as f:
        lines = f.readlines()

      for line in lines:
        fields = line.split()
        if not fields:
          # blank line: there is no file name to take from it
          continue
        new_name = fields[0]
        content = ' '.join(fields[1:]).lower()
        file_subdir = os.path.join(new_subdir, f'{new_name}.txt')
        with open(file_subdir, 'w') as new_file:
          new_file.write(content)

      # delete initial text file (only after it has been closed)
      os.remove(subdir)

### EncodingDecoding Class

In [None]:
class EncodingDecoding:
  '''Character-level label codec plus a WAV decoding helper.

  The vocabulary is ``"-"``, ``" "``, ``"'"`` followed by the 26 lowercase
  ASCII letters; a character's id is its position in ``non_alpha``.

  Attributes:
    char: list of the lowercase ASCII letters.
    non_alpha: the full vocabulary, in id order.
  '''

  def __init__(self):
    self.char = list(ascii_lowercase)
    self.non_alpha = ["-"," ", "'"]
    self.non_alpha.extend(self.char)

  def encode_label(self, label):
    '''Map a tensor of single-character strings to int64 ids.

    Characters outside the vocabulary map to -1.
    '''
    keys_tensor = tf.constant(self.non_alpha)
    # Explicit int64: np.arange's default integer type depends on the
    # platform, while the label padding and the decode path use tf.int64.
    vals_tensor = tf.constant(np.arange(len(self.non_alpha), dtype=np.int64))

    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor),
        default_value=-1)

    return table.lookup(label)

  def decode_label(self,predicted_label):
    '''Map a tensor of int64 ids back to characters.

    Ids outside the vocabulary map to the empty string.

    Returns:
      The looked-up strings converted with ``.numpy()``.
    '''
    keys_tensor = tf.constant(np.arange(len(self.non_alpha), dtype=np.int64))
    vals_tensor = tf.constant(self.non_alpha)

    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor),
        default_value='')

    return table.lookup(predicted_label).numpy()

  def decode_audio(self, audio_binary):
    ''' decode wav file to float tensor'''
    waveform, _ = tf.audio.decode_wav(audio_binary)
    # drop the last axis of the decoded audio
    # NOTE(review): squeeze presumably requires single-channel audio - confirm
    return tf.squeeze(waveform,axis=-1)

### AudioFileConversion Class

In [None]:
class AudioFileConversion:
  '''Convert FLAC recordings to WAV files in place.'''

  def convert_flac_to_wav(self, src, dst):
    '''Read the FLAC file ``src`` and export it as a WAV file to ``dst``.'''
    flac_audio = AudioSegment.from_file(src,format="flac")
    flac_audio.export(dst, format="wav")

  def file_conversion(self, path):
    '''Convert every ``.flac`` file below ``path`` into a ``.wav`` file.

    Files that sit directly in ``path`` are left alone; only the directories
    below it are processed. Each converted file is written next to its source
    with the extension replaced by ``.wav`` and the source is then deleted.
    Files without a ``.flac`` extension are skipped, so running the
    conversion again leaves the ``.wav`` files that already exist untouched.

    Args:
      path: root directory to walk.
    '''
    for depth, (subdirs, _, fnames) in enumerate(os.walk(path)):
      if depth == 0:
        # the first os.walk entry is `path` itself
        continue

      for fname in fnames:
        stem, extension = os.path.splitext(fname)
        if extension.lower() != '.flac':
          continue

        # creating source path and destination path for the converted file
        src = os.path.join(subdirs, fname)
        dst = os.path.join(subdirs, stem + '.wav')

        # convert flac file format into wav file format
        self.convert_flac_to_wav(src, dst)

        # delete initial flac file
        os.remove(src)

### AudioDataProcessing Class

In [None]:
class AudioDataProcessing:
  '''Feature extraction: waveform -> spectrogram -> log-mel -> MFCC.

  Attributes:
    max_length: clip length that every waveform is cut or padded to; it is
      multiplied by ``sample_rate`` to get a sample count.
    sample_rate: sample rate used for the length computation and for the mel
      filter bank.
  '''

  def __init__(self):
    self.max_length = 25
    self.sample_rate = 16000

  def get_spectrogram(self, waveform):
    '''Create a magnitude spectrogram from an audio waveform.

    The waveform is cut to at most ``max_length * sample_rate`` samples and
    zero-padded up to exactly that length, so all clips yield spectrograms of
    the same size. Without the cut, a longer clip produced a negative padding
    size and the call failed.

    Args:
      waveform: audio samples. # assumes a 1-D tensor - TODO confirm

    Returns:
      Absolute value of the STFT (frame length 1024, frame step 892).
    '''
    max_sample = int(self.max_length * self.sample_rate)

    # cut over-long clips so that the padding size below is never negative
    waveform = tf.cast(waveform, tf.float32)[:max_sample]

    # Padding for files with less than max sample
    zero_padding = tf.zeros([max_sample] - tf.shape(waveform), dtype=tf.float32)

    # Concatenate audio with padding so that all audio clips will be of the
    # same length
    equal_length = tf.concat([waveform, zero_padding], 0)
    spectrogram = tf.signal.stft(
        equal_length, frame_length=1024,
        frame_step = 892)

    return tf.abs(spectrogram)

  # Reference noted by the author:
  # Spoken Word Recognition Using MFCC and Learning Vector Quantization
  def get_log_mel_spectrograms(self, spectrogram):
    '''Extract a log mel spectrogram from a magnitude spectrogram.

    Uses 13 mel bins between 250 Hz and 8000 Hz and adds 1e-6 before taking
    the logarithm so that empty bins do not produce ``-inf``.
    '''
    num_spectrogram_bins = spectrogram.shape[-1]
    num_mel_bins, lower_edge_hertz, upper_edge_hertz = 13, 250, 8000
    weight = tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins,
                                                 self.sample_rate, lower_edge_hertz,
                                                 upper_edge_hertz)
    mel_spectrograms = tf.tensordot(spectrogram,weight,1)
    # restore the static shape: input shape with the last axis replaced by
    # the number of mel bins
    mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
            weight.shape[-1:]))

    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    return log_mel_spectrograms

  def get_mfcc(self, log_mel_spectrograms):
    '''Compute mel-frequency cepstral coefficients from log mel spectrograms.'''
    mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)
    return mfcc

  def spec_augment(self):
    '''Build a model that applies frequency masking, then time masking.

    The masking parameter is drawn once per call of this method, as an
    integer in [1, 100), and shared by both masks.
    NOTE(review): when this runs inside a traced ``tf.data`` map function the
    draw presumably happens only once, at trace time - confirm.

    Returns:
      A ``tf.keras.Sequential`` to call on a spectrogram.
    '''
    param = np.random.randint(1,100)
    augmentation = tf.keras.Sequential([
       layers.Lambda(lambda x : tfio.experimental.audio.freq_mask(x, param)),
       layers.Lambda(lambda x : tfio.experimental.audio.time_mask(x, param))
    ])

    return augmentation

### GetWaveformLabel Class

In [None]:
class GetWaveformLabel(EncodingDecoding):
  '''Load one (audio file, transcript file) pair as (waveform, label ids).

  Attributes:
    max_length: fixed number of ids every label is cut or padded to.
  '''

  def __init__(self):
    super().__init__()
    self.max_length = 400

  def get_waveform_label(self, audio_file, text_file):
    '''Read and decode one example.

    Args:
      audio_file: path of a WAV file.
      text_file: path of the text file holding its transcript.

    Returns:
      ``(waveform, label)``: the decoded waveform and an int64 label of
      length ``max_length``. Transcripts longer than ``max_length``
      characters are cut; without the cut they produced a negative padding
      size and the call failed. Shorter ones are padded with 0, which is
      also the id of ``"-"`` in the vocabulary.
    '''
    # decode WAV audio file
    audio_data = tf.io.read_file(audio_file)
    waveform = self.decode_audio(audio_data)

    # read the transcript as one string tensor
    text = tf.io.read_file(text_file)

    # split char from whole string
    chars = tf.strings.bytes_split(text)

    # encode text file to numeric values, keeping at most max_length of them
    label = self.encode_label(chars)[:self.max_length]

    zero_padding = tf.zeros([self.max_length] - tf.shape(label), dtype=tf.int64)

    # Concatenate encode text with padding so that all encode text will be of the
    # same length
    label = tf.concat([label, zero_padding], 0)

    return waveform, label

### GetProcessDataLabel

In [None]:
class GetProcessDataLabel(AudioDataProcessing):
  '''Run the feature pipeline of ``AudioDataProcessing`` on one example.'''

  def __init__(self):
    super().__init__()

  def get_process_label(self, waveform, label, train):
    '''Turn a waveform into MFCC features and pass the label through.

    Args:
      waveform: audio samples handed to ``get_spectrogram``.
      label: returned unchanged.
      train: when truthy, the spectrogram is passed through the model built
        by ``spec_augment`` before the mel conversion.

    Returns:
      ``(mfcc, label)``.
    '''
    features = super().get_spectrogram(waveform)
    if train:
      augment = super().spec_augment()
      features = augment(features)
    log_mel = super().get_log_mel_spectrograms(features)
    return super().get_mfcc(log_mel), label

### Model

In [None]:
class TokenEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        num_vocab: input dimension of the token embedding table.
        maxlen: input dimension of the position embedding table.
        num_hid: output dimension of both embeddings.
    """

    def __init__(self, num_vocab=1000, maxlen=100, num_hid=64):
        super().__init__()
        self.emb = layers.Embedding(input_dim=num_vocab, output_dim=num_hid)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        # positions run from 0 up to the size of the last axis of the input
        seq_len = tf.shape(x)[-1]
        token_vectors = self.emb(x)
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        return token_vectors + self.pos_emb(position_ids)


class SpeechFeatureEmbedding(layers.Layer):
    """Three stacked stride-2 1-D convolutions applied to the input features.

    Args:
        num_hid: number of filters of every convolution.
        maxlen: input dimension of ``pos_emb``.
    """

    def __init__(self, num_hid=64, maxlen=100):
        super().__init__()
        # all three convolutions share the same hyper-parameters
        conv_options = dict(
            kernel_size=11, strides=2, padding="same", activation="relu"
        )
        self.conv1 = layers.Conv1D(num_hid, **conv_options)
        self.conv2 = layers.Conv1D(num_hid, **conv_options)
        self.conv3 = layers.Conv1D(num_hid, **conv_options)
        # created here but not used by call()
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid)

    def call(self, x):
        for conv in (self.conv1, self.conv2, self.conv3):
            x = conv(x)
        return x

In [None]:
class TransformerEncoder(layers.Layer):
    """Self-attention block followed by a two-layer feed-forward block.

    Each block is followed by dropout, a residual connection and layer
    normalisation.

    Args:
        embed_dim: ``key_dim`` of the attention layer and output width of the
            feed-forward block.
        num_heads: number of attention heads.
        feed_forward_dim: width of the hidden feed-forward layer.
        rate: dropout rate used after both blocks.
    """

    def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        feed_forward_layers = [
            layers.Dense(feed_forward_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        # attention sub-block with residual connection
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        attended_norm = self.layernorm1(inputs + attended)

        # feed-forward sub-block with residual connection
        transformed = self.dropout2(self.ffn(attended_norm), training=training)
        return self.layernorm2(attended_norm + transformed)

In [None]:
class TransformerDecoder(layers.Layer):
    """Decoder block: masked self-attention, encoder attention, feed-forward.

    Args:
        embed_dim: ``key_dim`` of both attention layers and output width of
            the feed-forward block.
        num_heads: number of attention heads.
        feed_forward_dim: width of the hidden feed-forward layer.
        dropout_rate: dropout rate applied after the encoder attention and
            after the feed-forward block. It used to be ignored in favour of
            a hard-coded 0.1, which is still the default. The dropout after
            the self-attention keeps its own fixed rate of 0.5.
    """

    def __init__(self, embed_dim, num_heads, feed_forward_dim, dropout_rate=0.1):
        super().__init__()
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.self_att = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.self_dropout = layers.Dropout(0.5)
        self.enc_dropout = layers.Dropout(dropout_rate)
        self.ffn_dropout = layers.Dropout(dropout_rate)
        self.ffn = keras.Sequential(
            [
                layers.Dense(feed_forward_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )

    def causal_attention_mask(self, batch_size, n_dest, n_src, dtype):
        """Masks the upper half of the dot product matrix in self attention.

        This prevents flow of information from future tokens to current token.
        1's in the lower triangle, counting from the lower right corner.

        Returns:
            A tensor of shape ``[batch_size, n_dest, n_src]`` cast to ``dtype``.
        """
        i = tf.range(n_dest)[:, None]
        j = tf.range(n_src)
        m = i >= j - n_src + n_dest
        mask = tf.cast(m, dtype)
        mask = tf.reshape(mask, [1, n_dest, n_src])
        # repeat the single mask once per batch element
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
        )
        return tf.tile(mask, mult)

    def call(self, enc_out, target):
        """Run the block.

        Args:
            enc_out: encoder output, used as value in the encoder attention.
            target: embedded target sequence.
                # assumes (batch, seq, dim) - TODO confirm

        Returns:
            The layer-normalised output of the feed-forward sub-block.
        """
        input_shape = tf.shape(target)
        batch_size = input_shape[0]
        seq_len = input_shape[1]

        # masked self-attention over the target sequence
        causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        target_att = self.self_att(target, target, attention_mask=causal_mask)
        target_norm = self.layernorm1(target + self.self_dropout(target_att))

        # attention of the target over the encoder output
        enc_att_out = self.enc_att(target_norm, enc_out)
        enc_out_norm = self.layernorm2(self.enc_dropout(enc_att_out) + target_norm)

        # position-wise feed-forward
        ffn_out = self.ffn(enc_out_norm)
        ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out))
        return ffn_out_norm

In [None]:
class Transformer(keras.Model):
    """Encoder-decoder transformer mapping audio features to token ids.

    Args:
        num_hid: width of the embeddings and of the transformer blocks.
        num_head: number of attention heads in every block.
        num_feed_forward: hidden width of the feed-forward sub-blocks.
        source_maxlen: passed to ``SpeechFeatureEmbedding`` as ``maxlen``.
        target_maxlen: passed to ``TokenEmbedding`` as ``maxlen``; also bounds
            the length of the sequence produced by ``generate``.
        num_layers_enc: number of ``TransformerEncoder`` blocks.
        num_layers_dec: number of ``TransformerDecoder`` blocks.
        num_classes: vocabulary size; width of the output layer.
    """

    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=100,
        target_maxlen=100,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=10,
    ):
        super().__init__()
        self.loss_metric = keras.metrics.Mean(name="loss")
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        self.enc_input = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_input = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = keras.Sequential(
            [self.enc_input]
            + [
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        # decoder blocks are stored as attributes dec_layer_0, dec_layer_1, ...
        for i in range(num_layers_dec):
            setattr(
                self,
                f"dec_layer_{i}",
                TransformerDecoder(num_hid, num_head, num_feed_forward),
            )

        self.classifier = layers.Dense(num_classes)

    def decode(self, enc_out, target):
        """Embed ``target`` and run it through every decoder block."""
        y = self.dec_input(target)
        for i in range(self.num_layers_dec):
            y = getattr(self, f"dec_layer_{i}")(enc_out, y)
        return y

    def call(self, inputs):
        """Return the classifier output for ``inputs = (source, target)``."""
        source = inputs[0]
        target = inputs[1]
        x = self.encoder(source)
        y = self.decode(x, target)
        return self.classifier(y)

    @property
    def metrics(self):
        return [self.loss_metric]

    def _masked_loss(self, batch):
        """Compute the compiled loss of one batch, ignoring positions with id 0.

        The decoder input is the target without its last token and the
        expected output is the target without its first token.
        """
        source = batch["source"]
        target = batch["target"]
        dec_input = target[:, :-1]
        dec_target = target[:, 1:]
        preds = self([source, dec_input])
        one_hot = tf.one_hot(dec_target, depth=self.num_classes)
        # positions whose expected id is 0 get zero weight in the loss
        mask = tf.math.logical_not(tf.math.equal(dec_target, 0))
        return self.compiled_loss(one_hot, preds, sample_weight=mask)

    def train_step(self, batch):
        """Processes one batch inside model.fit()."""
        with tf.GradientTape() as tape:
            loss = self._masked_loss(batch)
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def test_step(self, batch):
        """Processes one batch inside model.evaluate() / validation."""
        loss = self._masked_loss(batch)
        self.loss_metric.update_state(loss)
        return {"loss": self.loss_metric.result()}

    def generate(self, source, target_start_token_idx):
        """Performs inference over one batch of inputs using greedy decoding.

        Args:
            source: batch of input features for the encoder.
            target_start_token_idx: id every output sequence starts with.

        Returns:
            int32 tensor of shape ``(batch, target_maxlen)`` that holds the
            start token followed by the predicted ids.
        """
        bs = tf.shape(source)[0]
        enc = self.encoder(source)
        dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx
        for _ in range(self.target_maxlen - 1):
            dec_out = self.decode(enc, dec_input)
            token_ids = tf.argmax(
                self.classifier(dec_out), axis=-1, output_type=tf.int32
            )
            # keep only the prediction for the newest position
            next_token = tf.expand_dims(token_ids[:, -1], axis=-1)
            dec_input = tf.concat([dec_input, next_token], axis=-1)
        return dec_input

### Extract

In [None]:
class TrainTestDataset:
  '''Build a ``tf.data`` dataset of features and labels from a download.'''

  def organize_file(self, origin, new_path, newer_path, zip_file):
    '''Download the archive and reorganise it under ``newer_path``.

    Args:
      origin: URL of the archive.
      new_path: directory used for the download; removed afterwards.
      newer_path: directory that receives the renumbered sub-directories.
      zip_file: file name the archive is stored under.

    Returns:
      self, so that calls can be chained.
    '''
    get_clean_file = GetCleanFile(origin,new_path,newer_path, zip_file)
    dir_names = get_clean_file().get_path().get_subdirectory()
    get_clean_file.rename_and_move_dir(dir_names)
    get_clean_file.delete_directory()

    # split every transcript file into one text file per utterance
    transcripts = tf.io.gfile.glob(newer_path + '/*/*.txt')
    get_clean_file.clean_label(transcripts)
    return self

  def flac_conversion(self, path):
    '''Convert the audio files below ``path`` to WAV. Returns self.'''
    file_conversion = AudioFileConversion()
    file_conversion.file_conversion(path)
    return self

  def prepare_dataset(self, path):
    '''Pair the WAV and text files found one level below ``path``.

    Both file lists are sorted and matched by position.
    # presumably every .wav has a .txt with the same stem - verify

    Args:
      path: dataset root. This used to be ignored in favour of the notebook
        global ``newer_path``, so every dataset was built from that directory.

    Returns:
      A dataset of ``(audio_file, text_file)`` path pairs.
    '''
    audio_file = sorted(tf.io.gfile.glob(str(path) + '/*/*.wav'))
    text_file = sorted(tf.io.gfile.glob(str(path) + '/*/*.txt'))
    list_ds = tf.data.Dataset.from_tensor_slices((audio_file, text_file))
    return list_ds

  def preprocess_dataset(self, list_ds, train):
    '''Map path pairs to ``{"source": features, "target": label}`` dicts.

    Args:
      list_ds: dataset returned by ``prepare_dataset``.
      train: forwarded to ``GetProcessDataLabel.get_process_label``; enables
        the augmentation step when truthy.

    Returns:
      The cached and prefetched feature dataset.
    '''
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    get_waveform_label = GetWaveformLabel()
    get_process_data_label = GetProcessDataLabel()

    waveform_ds = list_ds.map(get_waveform_label.get_waveform_label,
                            num_parallel_calls=AUTOTUNE)
    waveform_ds = waveform_ds.prefetch(AUTOTUNE)

    spectrogram_ds = waveform_ds.map(
        lambda x,y : get_process_data_label.get_process_label(x, y, train),
        num_parallel_calls=AUTOTUNE)

    # NOTE(review): cache() sits after the augmentation map, so augmented
    # features are presumably reused across epochs - confirm this is intended
    ds = spectrogram_ds.map(lambda x, y: {"source": x, "target": y}).cache()
    ds = ds.prefetch(AUTOTUNE)

    return ds


In [None]:

# Training data: archive URL, download directory (removed once the data has
# been reorganised) and the directory that holds the reorganised files.
origin = 'https://www.openslr.org/resources/12/train-clean-100.tar.gz'
new_path = './data'
newer_path = './DATA'

# Validation data: same three settings for the test-clean archive.
val_origin = 'https://www.openslr.org/resources/12/test-clean.tar.gz'
val_new_path = './val_data'
val_newer_path = './VAL_DATA'

create_train_ds = TrainTestDataset()
create_test_ds = TrainTestDataset()

# Training pipeline: download + reorganise, FLAC -> WAV, pair the files,
# then extract features with augmentation enabled (train=True).
create_train_ds.organize_file(origin, new_path, newer_path, 'train.zip')
create_train_ds.flac_conversion(newer_path)
list_ds = create_train_ds.prepare_dataset(newer_path)
train_ds = create_train_ds.preprocess_dataset(list_ds, train=True)

# Validation pipeline: same steps with augmentation disabled (train=False).
create_test_ds.organize_file(val_origin, val_new_path, val_newer_path, 'test.zip')
create_test_ds.flac_conversion(val_newer_path)
val_list_ds = create_test_ds.prepare_dataset(val_newer_path)
val_ds = create_test_ds.preprocess_dataset(val_list_ds, train=False)

# Shuffle (buffer of 1024 examples) and batch the training set; the
# validation set is only batched.
train_ds = train_ds.shuffle(1024).batch(16)
val_ds = val_ds.batch(4)

Downloading data from https://www.openslr.org/resources/12/test-clean.tar.gz




### Test

In [None]:
def create_model():
  '''Build and compile the speech-recognition ``Transformer``.

  The model is compiled with an Adam optimizer and a categorical
  cross-entropy loss computed from logits with label smoothing 0.1.

  Returns:
    The compiled model.
  '''
  hyperparams = dict(
      num_hid=200,
      num_head=2,
      num_feed_forward=400,
      target_maxlen=400,
      num_layers_enc=4,
      num_layers_dec=1,
      num_classes=29,
  )
  transformer = Transformer(**hyperparams)

  smoothed_cross_entropy = tf.keras.losses.CategoricalCrossentropy(
      label_smoothing=0.1, from_logits=True,
  )
  transformer.compile(
      loss=smoothed_cross_entropy,
      optimizer=keras.optimizers.Adam(),
  )

  return transformer

Run the cells below to train the model.

In [None]:
# Location of the weight checkpoint written during training.
checkpoint_path = "./training/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that stores only the model weights at `checkpoint_path` and
# reports each save (verbose=1).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

In [None]:

# Train a freshly built model; the checkpoint callback saves its weights.
model = create_model()

epochs = 1

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    verbose=1,
    callbacks=[cp_callback],
)


Epoch 00001: saving model to ./training/cp.ckpt


If the model has not finished training, save the checkpoint files first, then run the code below to resume training from them.

In [None]:
# Rebuild the model and continue training from the saved weights.
model = create_model()

# Load the previously saved weights from the same path the checkpoint
# callback writes to, so the two locations cannot drift apart.
model.load_weights(checkpoint_path)

epochs=1

history = model.fit(train_ds,validation_data=val_ds,
                    epochs=epochs, verbose=1,
                    callbacks=[cp_callback])


Epoch 00001: saving model to ./training/cp.ckpt


In [None]:
# Persist the trained model under saved_model/my_model.
model.save('saved_model/my_model')





INFO:tensorflow:Assets written to: saved_model/my_model/assets


INFO:tensorflow:Assets written to: saved_model/my_model/assets


In [None]:
# Read one converted utterance from the training directory as raw bytes.
audio_data = tf.io.read_file('./DATA/0/445-123857-0000.wav')
audio_decoding = EncodingDecoding()
# Decode the WAV bytes into a float waveform with its last axis squeezed out.
waveform = audio_decoding.decode_audio(audio_data)

In [None]:
# Extract features with the same pipeline used for the datasets, without
# augmentation. `get_process_waveform` is not defined in this notebook, so
# the existing GetProcessDataLabel pipeline is used; the label slot is unused.
spectrogram, _ = GetProcessDataLabel().get_process_label(
    waveform, None, train=False)
# add the batch axis expected by the model
spectrogram = tf.expand_dims(spectrogram, axis=0)

In [None]:
def inference(model, spectrogram):
  '''Transcribe a single example with greedy decoding.

  Args:
    model: object exposing ``generate(source, target_start_token_idx)``.
    spectrogram: batch containing exactly one example (the batch axis is
      squeezed away after generation).

  Returns:
    The decoded transcript as a ``str``. Generation starts from token id 0.
  '''
  token_ids = model.generate(spectrogram,0)
  token_ids = tf.cast(tf.squeeze(token_ids, axis=0), dtype=tf.int64)

  # map the ids back to characters (an array of byte strings)
  pieces = EncodingDecoding().decode_label(token_ids)

  return b''.join(pieces).decode('utf-8')

In [None]:
# Decode the transcript of the example utterance prepared above.
inference(model, spectrogram)

'-he was the stood the station the stood the said the startion the stood the station the stand the start the station the stand the stand the stand the stand the stating the the stone ther the sare the stond the the the the the the the the the the the the the the the wan the the was the the the the the ast the the s the the t ont the ont wont wind st t s ast ent on  onthe w the the  s the s  t the o'