In [1]:
import os

from google.colab import drive

# Mount Google Drive under /content/drive (remounting if it is already mounted).
drive.mount('/content/drive', force_remount = True)

# Use the absolute path of the mounted drive: the previous relative path
# './drive/MyDrive' only resolved on the first run, because this cell itself
# changes the working directory.
os.chdir('/content/drive/MyDrive')

Mounted at /content/drive


#### import

In [2]:
!pip install params

Collecting params
  Downloading params-0.9.0-py3-none-any.whl.metadata (631 bytes)
Downloading params-0.9.0-py3-none-any.whl (11 kB)
Installing collected packages: params
Successfully installed params-0.9.0


In [3]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.13.0


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

import numpy as np
import matplotlib.pyplot
from IPython import display
from jiwer import wer
import params
import time


#### read data
define helper function

In [101]:
def read_TIMIT(path):
  '''
  Load the preprocessed TIMIT samples and separate features from labels.

  args:
    path: path of TIMIT data(mfcc features, phoneme labels); the file is
          expected to hold a sequence of dictionaries with keys
          'mfcc', 'phonemes', 'path'
  return:
    feats: list holding the 'mfcc' entry of every sample, in file order
    labels: list holding the 'phonemes' entry of every sample, in file order
  '''

  # NOTE(review): weights_only = False unpickles arbitrary Python objects,
  # so only load files that come from a trusted source.
  samples = torch.load(path, weights_only = False)

  feats = [sample['mfcc'] for sample in samples]
  labels = [sample['phonemes'] for sample in samples]
  return feats, labels


execute and review

In [102]:
os.listdir()[-2]

'timit_mfcc_data.pt'

In [103]:
# split train / dev data
path = r'timit_mfcc_data.pt'
feats, labels = read_TIMIT(path)

# fix the seed so that the train / dev partition is identical on every run
SPLIT_SEED = 42
train_feats, dev_feats, train_labels, dev_labels = train_test_split(feats, labels,
                                                                    random_state = SPLIT_SEED)

In [104]:
# check mfcc feature matrix dimension
print(f'MFCC feature matrix Shape(one audio sample):\t{train_feats[-1].shape}')
# check IPA repository
print(f'phoneme labels(one audio sample):\t{train_labels[-1]}')

# seems like 'h#' marks sos and eos

MFCC feature matrix Shape(one audio sample):	(80, 39)
phoneme labels(one audio sample):	['h#', 'dh', 'ix', 's', 'ih', 's', 'tcl', 't', 'ax', 'm', 'w', 'er', 'kcl', 'k', 's', 'q', 'eh', 'z', 'ix', 'n', 'ix', 'm', 'pcl', 'p', 'er', 's', 'epi', 'en', 'el', 'm', 'eh', 'kcl', 'k', 'ix', 'n', 'ih', 'z', 'ax', 'm', 'h#']


In [105]:
print('\n'.join([f'num_frames:\t{len(train_feats[0])}', f'num_labels:\t{len(train_labels[0])}']))

num_frames:	154
num_labels:	60


In [106]:
# mark max length for mfcc features and labels
max_len_feats = max([len(feat) for feat in feats])
max_len_labels = max([len(label) for label in labels])

## max_len = 100

#### create IPA dictionary

define helper function

In [107]:
def create_IPAdictionary(labels):
  '''
  Build the symbol -> index lookup from all symbols occurring in `labels`.

  Indices 0, 1 and 2 are reserved for '<blank>', 'h#' (begin / end marker)
  and 'epi' (epenthetic silence); every other symbol is numbered from 3
  upwards in sorted order, so no two symbols share an index.
  'pau' is dropped on purpose (the blank symbol substitutes it) and therefore
  has NO entry in the result: a label that still contains 'pau' cannot be
  looked up in the returned dictionary.

  args:
    labels: list of list
  return: ipa2idx
        dictionary of IPA_label: index
  '''
  reserved = {'<blank>': 0, 'h#': 1, 'epi': 2}
  dropped = {'pau'}

  ipas = set()
  for label in labels:
    ipas.update(label)

  # set difference instead of list.remove, so a symbol that happens to be
  # absent from the data does not raise
  ipas = sorted(ipas - dropped - set(reserved))

  # start after the reserved indices; an offset of 2 would hand index 2 to
  # both the first phone and 'epi'
  offset = len(reserved)
  ipa2idx = {ipa:(idx + offset) for idx, ipa in enumerate(ipas)}
  ipa2idx.update(reserved)

  return ipa2idx

execute and review

In [108]:
ipa2idx = create_IPAdictionary(labels)
print(*sorted(ipa2idx))

<blank> aa ae ah ao aw ax ax-h axr ay b bcl ch d dcl dh dx eh el em en eng epi er ey f g gcl h# hh hv ih ix iy jh k kcl l m n ng nx ow oy p pcl q r s sh t tcl th uh uw ux v w y z zh


In [109]:
print(f'the number of IPA labels in TIMIT:\t {len(ipa2idx)}')
print(f'Index of blank symbol <blank>":\t {ipa2idx["<blank>"]}')
print(f'Index of sos/eos label "h#":\t {ipa2idx["h#"]}')

the number of IPA labels in TIMIT:	 61
Index of blank symbol <blank>":	 0
Index of sos/eos label "h#":	 1


In [110]:
# compare phonemic / phonetic symbols between
# provided TIMIT phonecode and transcription data
# https://catalog.ldc.upenn.edu/docs/LDC96S32/PHONCODE.TXT

existing = set(ipa2idx.keys())

# symbol inventory copied from the TIMIT phonecode table linked above,
# grouped by the class noted at the end of each line
phonecode = set(['b', 'd', 'g', 'p', 't', 'k', 'dx', 'q', # stops
             'jh', 'ch', # affricates
             's', 'sh', 'z', 'zh', 'f', 'th', 'v', 'dh', # fricatives
             'm', 'n', 'ng', 'em', 'en', 'eng', 'nx', # nasals
             'l', 'r', 'w', 'y', 'hh', 'hv', 'el', # semivowels and glides
             'iy', 'ih', 'eh', 'ey', 'ae', 'aa', 'aw', 'ay', 'ah', 'ao', 'oy', 'ow', 'uh', 'uw', 'ux', 'er', 'ax', 'ix', 'axr', 'ax-h', # vowels
             'pau', 'epi', 'h#', '1', '2',# others
             # epi: epenthetic silence
             # h#: begin / end marker
             # 1: primary stress marker
             # 2: secondary stress marker
             ])

In [111]:
print(*sorted(existing.union(phonecode) - (existing & phonecode)))
print(*sorted((existing - phonecode)))
print(*sorted((phonecode - existing)))

1 2 <blank> bcl dcl gcl kcl pau pcl tcl
<blank> bcl dcl gcl kcl pcl tcl
1 2 pau


In [94]:
# review
# the data contains no stress markers ('1', '2') > not important to our model
# the data contains additional phone symbols: bcl, dcl, gcl, kcl, pcl, tcl

In [95]:
# handling bcl, dcl, gcl, kcl, pcl, tcl
# each one marks the closure interval of the corresponding stop (plosive)
# e.g. bcl: bilabial closure (of b), tcl: alveolar closure (of t)

handling diphthongs

In [96]:
# diphthongs
# ey 'bait'
# aw 'bout'
# ay 'bite'
# oy 'boy'
# ow 'boat'

diphthongs = ['ey', 'aw', 'ay', 'ow']
# ey, aw, ay, ow : just split
# oy : oh + y

In [97]:
import re

# Alternation of all diphthong symbols, longest first. The lookarounds anchor
# a match to a whole whitespace-delimited symbol, so a diphthong spelling that
# merely occurs inside a longer symbol is left untouched.
diphthong_regex = re.compile(r'(?<!\S)(?:'
                             + '|'.join(sorted(map(re.escape, diphthongs),
                                               key = len, reverse = True))
                             + r')(?!\S)')

def split_diphthongs(label):
  '''
  Split every diphthong symbol of one transcription into its characters.

  args:
    label: list of phone symbols of one audio sample
  return:
    list of symbols in which each symbol listed in `diphthongs` is replaced
    by its individual characters (e.g. 'ay' -> 'a', 'y'); all other symbols
    are kept as they are
  '''
  label = ' '.join(label)
  label_split = diphthong_regex.sub(lambda x: ' '.join(x.group()), label).split()
  return label_split


# additionally handle 'oy', which is rewritten as 'oh' + 'y' instead of being
# split per character; the lookarounds restrict the match to a whole
# whitespace-delimited symbol
oy_regex = re.compile(r'(?<!\S)oy(?!\S)')

def split_oy(label):
  '''
  Replace every 'oy' symbol of one transcription by the two symbols 'oh', 'y'.

  args:
    label: list of phone symbols of one audio sample
  return:
    list of symbols with each 'oy' expanded; all other symbols are kept
  '''
  label = ' '.join(label)
  oy_split = oy_regex.sub('oh y', label).split()
  return oy_split

execute and review

In [98]:
labels_split = []
for label in labels:
  label_split = split_diphthongs(label)
  label_split = split_oy(label_split)
  labels_split.append(label_split)


In [82]:
for i in range(len(labels)):
  if 'oy' in labels[i]:
    print(*labels[i])
    print(*labels_split[i])

h# d ah nx ae s epi m ih kcl k eh r iy ix nx ix q oy l iy r ae gcl g l ay kcl dh ae tcl h#
h# d ah nx ae s epi m ih kcl k eh r iy ix nx ix q oh y l iy r ae gcl g l a y kcl dh ae tcl h#
h# d ow n tcl ae s kcl m iy tcl t ix kcl k eh r iy q ax n oy l iy r ae gcl g l ay kcl k dh ae tcl h#
h# d o w n tcl ae s kcl m iy tcl t ix kcl k eh r iy q ax n oh y l iy r ae gcl g l a y kcl k dh ae tcl h#
h# d ow nx ae s kcl m ix dx ix kcl k ae r iy ix n oy l iy r ae gcl l ay kcl dh ae tcl h#
h# d o w nx ae s kcl m ix dx ix kcl k ae r iy ix n oh y l iy r ae gcl l a y kcl dh ae tcl h#
h# hh aw oy l iy dcl d ix y ux l ay kcl k y axr s ae l ix dcl d r ah s eng h#
h# hh a w oh y l iy dcl d ix y ux l a y kcl k y axr s ae l ix dcl d r ah s eng h#
h# d ow nx aw s kcl k m ih dx ix kcl k ae r ih y ix n oy l ix r ae gcl g l ay kcl dh ah q h#
h# d o w nx a w s kcl k m ih dx ix kcl k ae r ih y ix n oh y l ix r ae gcl g l a y kcl dh ah q h#
h# q ax kcl k aa n ax s er w el ix n dcl jh oy dh ix s sh eh l f ax sh dcl d

handling similar sounds

In [84]:
merge_ipa = {
    # marginal sounds
    'ax-h': 'ax',
    'bcl': 'b',
    'dcl': 'd',
    'gcl': 'g',
    'kcl': 'k',
    'pcl': 'p',
    'tcl': 't',

    'en': 'n',
    'em': 'm',
    'el': 'l',
    'eng': 'ng',

    ## /ɹ/ sound
    # 'axr': 'r' ? 'ɹ' ?
    # 'dx': 'r',
    # 'nx': 'r',
    # 'er': 'r', 'ɹ' ?

    # /h/ sound
    'hh': 'h',
}

execute and review

In [85]:
labels_merge = [[merge_ipa.get(symbol, symbol) for symbol in label] for label in labels]

In [87]:
print(*labels[0])
print(*labels_split[0])
print(*labels_merge[0])

h# sh uw w ax z hh ow l dx ix ng q aa nx uh hv ih z r aa kcl k w ax dh w ah n hv ae n dcl d h#
h# sh uw w ax z hh o w l dx ix ng q aa nx uh hv ih z r aa kcl k w ax dh w ah n hv ae n dcl d h#
h# sh uw w ax z h ow l dx ix ng q aa nx uh hv ih z r aa k k w ax dh w ah n hv ae n d d h#


In [None]:
## after merging, there may appear identical consecutive phones
## again merge or leave it as it is ?

In [88]:
# reassign labels
labels = labels_merge

#### Dataset + pad
define Dataset Class

In [None]:
class PhonemeASRDataset(Dataset):
  '''
  Dataset pairing one feature matrix with its index-encoded phone sequence.

  args:
    feats: sequence of per-sample feature matrices (anything accepted by torch.tensor)
    labels: sequence of per-sample lists of phone symbols, aligned with `feats`
    ipa2idx: dictionary of IPA_label: index used to encode the symbols
  '''
  def __init__(self, feats, labels, ipa2idx):
    super(PhonemeASRDataset, self).__init__()
    self.feats, self.labels = feats, labels
    self.ipa2idx = ipa2idx

  def __len__(self):
    return len(self.feats)

  def __getitem__(self, idx):
    '''
    return:
      tuple (feature tensor, label tensor); the feature tensor keeps the dtype
      torch.tensor infers from the stored features, the label tensor is
      torch.long. Raises KeyError for a symbol missing from the dictionary.
    '''
    feat, label = self.feats[idx], self.labels[idx]
    # encode with the dictionary handed to this dataset, not with whatever
    # global `ipa2idx` happens to exist in the notebook at call time
    label = [self.ipa2idx[ipa] for ipa in label]

    return torch.tensor(feat), torch.tensor(label, dtype = torch.long)

define padding function

In [None]:
def pad_collate(batch, pad_value_feat=0, pad_value_label=0):
    '''
      collate_fn for DataLoader: pads every sample of a batch to the longest
      feature matrix / label sequence found in that batch

    args:
      batch: a list of tuples (mfcc, label) of tensors
      pad_value_feat: fill value appended to the shorter feature matrices
      pad_value_label: fill value appended to the shorter label sequences
    return:
      padded_mfccs: batch x longest number of frames x trailing feature dims
      padded_labels: batch x longest label length
    '''

    feat_seqs, label_seqs = zip(*batch)

    # pad_sequence appends the fill value along the first (time) axis up to
    # the longest sequence and stacks the results batch-first
    padded_mfccs = torch.nn.utils.rnn.pad_sequence(
        list(feat_seqs), batch_first=True, padding_value=pad_value_feat)
    padded_labels = torch.nn.utils.rnn.pad_sequence(
        list(label_seqs), batch_first=True, padding_value=pad_value_label)

    return padded_mfccs, padded_labels



execute and review

In [None]:
train_ds = PhonemeASRDataset(train_feats, train_labels, ipa2idx = ipa2idx)
train_loader = DataLoader(train_ds, batch_size = 32, # can adjust
                          shuffle = True, collate_fn=pad_collate) # yields batch_size x max_len x num_feats as one training batch
## Task: should discover more about collate_fn keyword

In [None]:
a, b = next(iter(train_loader))
print(a.shape, b.shape)

torch.Size([32, 138, 39]) torch.Size([32, 54])


#### Model Architecture

define Loss Function

In [114]:
def CTCLoss(y_true, y_pred, input_length = None, label_length = None, blank = 0):
  '''
  calculate CTC loss with PyTorch (F.ctc_loss)

  The previous version called tf / keras, whose imports are commented out in
  this notebook. The calling convention of keras.backend.ctc_batch_cost is
  kept: batch-first inputs, probabilities as predictions, one loss per sample.

  args:
    y_true:
      gold labels for input sequence, integer tensor (batch, max_label_len),
      padded at the end with `blank`
    y_pred:
      predicted labels obtained from model, floating point tensor
      (batch, time, num_classes) holding per-frame class probabilities
      (e.g. softmax output); the logarithm is taken inside this function
    input_length:
      optional per-sample number of valid frames; defaults to the full time
      dimension of y_pred for every sample
    label_length:
      optional per-sample number of valid labels; defaults to the count of
      entries that differ from `blank`, so padding is not scored as labels
    blank:
      index of the CTC blank symbol (0 by default)
  return:
    tensor (batch, 1) with the negative log-likelihood of each sample

  '''

  batch_len, num_frames = y_pred.shape[0], y_pred.shape[1]

  if input_length is None:
    input_length = torch.full((batch_len,), num_frames, dtype = torch.long)
  else:
    input_length = torch.as_tensor(input_length, dtype = torch.long).reshape(-1)

  if label_length is None:
    # a CTC target never contains the blank itself, so every non-blank entry
    # of a row is a real label and everything else is padding
    label_length = (y_true != blank).sum(dim = 1)
  else:
    label_length = torch.as_tensor(label_length, dtype = torch.long).reshape(-1)

  # F.ctc_loss expects log-probabilities laid out as (time, batch, classes);
  # clamping keeps log() finite for probabilities that are exactly zero
  eps = torch.finfo(y_pred.dtype).eps
  log_probs = torch.log(y_pred.clamp_min(eps)).transpose(0, 1)

  loss = F.ctc_loss(log_probs, y_true, input_length, label_length,
                    blank = blank, reduction = 'none')

  return loss.unsqueeze(1)

define (Positional) Encoder

In [116]:
class SequenceWise(nn.Module):
  '''
  Apply `module` to every time step of a sequence batch at once.

  The leading two dimensions of a tensor input (or the packed steps of a
  PackedSequence) are flattened into one axis, `module` is applied to the
  resulting 2-D tensor, and the original layout is restored afterwards.
  '''
  def __init__(self, module):
    super(SequenceWise, self).__init__()
    self.module = module

  def forward(self, x):
    '''
    args:
      x: PackedSequence, or tensor whose first two dimensions are flattened
    return:
      same container type as the input, holding the output of `module`
    '''
    # dispatch on the input type explicitly; the former bare try / except
    # also swallowed genuine errors raised inside `module`
    if isinstance(x, nn.utils.rnn.PackedSequence):
      # x.data holds all packed steps as rows: (total steps, features)
      data = self.module(x.data)
      # keep the sort indices so the sequence can still be unpacked into the
      # original batch order
      return nn.utils.rnn.PackedSequence(data, x.batch_sizes,
                                         x.sorted_indices, x.unsorted_indices)

    t, n = x.size(0), x.size(1)
    # reshape (not view) so non-contiguous inputs, e.g. transposed ones, work
    x = x.reshape(t * n, -1)
    x = self.module(x)
    x = x.reshape(t, n, -1)
    return x

  def __repr__(self):
    tmpstr = self.__class__.__name__ + ' (\n'
    tmpstr += self.module.__repr__()
    tmpstr += ')'
    return tmpstr

define RNN Encoder

In [115]:
class RNNEncoder(nn.Module):
  '''
  Optional batch normalisation followed by one recurrent layer.

  args:
    input_size: number of features per time step
    hidden_size: hidden size of the recurrent layer (per direction)
    rnn_type: recurrent layer class, e.g. nn.GRU or nn.LSTM
    bidirectional: whether the recurrent layer runs in both directions
    batch_norm: whether to normalise the input features first
    dropout: forwarded to the recurrent layer; PyTorch applies it only
             between stacked layers, so it has no effect on this single layer
  '''
  def __init__(self, input_size, hidden_size, rnn_type = nn.GRU,
               bidirectional = True, batch_norm = True, dropout = 0.2):
    super(RNNEncoder, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.bidirectional = bidirectional
    self.batch_norm = SequenceWise(nn.BatchNorm1d(input_size)) if batch_norm else None
    # `rnn_type` is the constructor argument; `rnn.type` was an undefined name
    self.rnn = rnn_type(input_size = input_size, hidden_size = hidden_size,
                        bidirectional = bidirectional, dropout = dropout, bias = False)

  def forward(self, x):
    '''
    args:
      x: input sequence; the recurrent layer is built without batch_first,
         so a tensor is read as (time, batch, input_size)
    return:
      output of the recurrent layer for every time step, i.e. the last
      dimension is hidden_size (2 * hidden_size when bidirectional)
    '''
    if self.batch_norm is not None:
      x = self.batch_norm(x)
    # compact the weights before the recurrent call
    self.rnn.flatten_parameters()
    # return what the recurrent layer produced; the final hidden state is not needed here
    output, _ = self.rnn(x)
    return output

define CTC-RNN linkage

define CNN-RNN-CTC model

execute and review

#### call relevant dictionaries

In [None]:
# for model
# reverse lookup (index -> symbol) of the ipa2idx dictionary built above
idx2ipa = {idx:ipa for ipa, idx in ipa2idx.items()}

# for decoding
# TODO: fill in the eng -> yor mapping; it was left unfinished, so an empty
# dictionary is used to keep this cell runnable
eng2yor = {}
yor2eng = {yor:eng for eng, yor in eng2yor.items()}

#### train

define train function

In [None]:
def train_epochs(dataloader, model, num_epochs):
  '''
  Train `model` on the batches of `dataloader` for `num_epochs` epochs.

  Not implemented yet: the planned steps are listed below and the function
  raises until they are written, instead of being a definition without body.

  args:
    dataloader: iterable yielding the training batches
    model: model to be trained
    num_epochs: number of passes over `dataloader`
  raises:
    NotImplementedError: always, until the training loop is implemented
  '''
  # set model mode
  # loop
    # feed data
    # calculate loss
    # back propagation
    # parameter update
    # record loss
  # return the recorded losses (TODO confirm the intended return value)
  raise NotImplementedError('train_epochs is not implemented yet')

set up device

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

set settings

execute and review

#### evaluate

define evaluation function

set up device

set settings

execute and review