In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount= True)

import os
# Absolute path: the drive is mounted at /content/drive, and an absolute target keeps
# this cell re-runnable (a relative './drive/MyDrive' fails once the cwd has moved).
os.chdir('/content/drive/MyDrive')

Mounted at /content/drive


#### import

In [2]:
!pip install params
!pip install jiwer

Collecting params
  Downloading params-0.9.0-py3-none-any.whl.metadata (631 bytes)
Downloading params-0.9.0-py3-none-any.whl (11 kB)
Installing collected packages: params
Successfully installed params-0.9.0
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-4.0.0 rapidfuzz-3.13.0


In [321]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm # Import tqdm function specifically
import editdistance as ed

import subprocess
import numpy as np
import matplotlib.pyplot
from IPython import display
from jiwer import wer
import params
import time

#### read data
Define helper function

In [165]:
def read_TIMIT(path):
  '''
  Load the pre-processed TIMIT samples and split them into features and labels.

  args:
    path: path of the TIMIT data file (written with torch.save); it holds a
          sequence of dictionaries with keys 'mfcc', 'phonemes', 'path'
  return:
    feats: list with the 'mfcc' entry of every sample, in file order
    labels: list of lists; the 'phonemes' of every sample, each symbol
            stripped of surrounding whitespace
  '''
  # NOTE(review): weights_only=False unpickles arbitrary Python objects,
  # so only load files that come from a trusted source.
  samples= torch.load(path, weights_only= False)

  feats= [sample['mfcc'] for sample in samples]
  labels= [[phoneme.strip() for phoneme in sample['phonemes']] for sample in samples]
  return feats, labels


execute and review

In [166]:
os.listdir()[-3]

'timit_mfcc_data.pt'

In [203]:
# split train / dev data
path= r'timit_mfcc_data.pt'
feats, labels= read_TIMIT(path)

# fixed random_state so the split is the same on every Restart & Run All
train_feats, dev_feats, train_labels, dev_labels= train_test_split(feats, labels, random_state= 43)

In [204]:
# check mfcc feature matrix dimension
print(f'MFCC feature matrix Shape(one audio sample):\t{train_feats[-1].shape}')
# check IPA repository
print(f'phoneme labels(one audio sample):\t{train_labels[-1]}')

# seems like 'h#' marks sos and eos

MFCC feature matrix Shape(one audio sample):	(92, 39)
phoneme labels(one audio sample):	['h#', 't', 'er', 'bcl', 'b', 'y', 'ih', 'l', 'eh', 'n', 'tcl', 't', 'ay', 'dcl', 'd', 'z', 'r', 'ow', 'z', 'ax', 'z', 'epi', 'm', 'ah', 'tcl', 'ch', 'ix', 'z', 'f', 'ih', 'f', 'tcl', 't', 'iy', 'f', 'iy', 'tcl', 't', 'h#']


In [205]:
print('\n'.join([f'num_frames:\t{len(train_feats[0])}', f'num_labels:\t{len(train_labels[0])}']))

num_frames:	141
num_labels:	52


In [206]:
# longest MFCC matrix (in frames) and longest label sequence over the whole corpus
max_len_feats= max(map(len, feats))
max_len_labels= max(map(len, labels))

## max_len= 100

#### create IPA dictionary

Define helper function

In [223]:
def create_IPAdictionary(labels):
  '''
  Build the symbol-to-index table used to encode label sequences.

  Index 0 is reserved for the blank symbol '<blank>', 1 for 'h#'
  (begin / end marker) and 2 for 'epi' (epenthetic silence). Every other
  symbol found in `labels` gets an index starting at 3, in sorted order,
  so no two symbols share an index.

  args:
    labels: list of list of phoneme symbols
  return: ipa2idx
        dictionary of IPA_label: index
  '''
  reserved= {'<blank>': 0, 'h#': 1, 'epi': 2}

  ipas= set()
  for label in labels:
    ipas.update(label)

  # reserved symbols get their fixed index below; the blank symbol substitutes
  # 'pau', so 'pau' gets no index of its own. Set difference does not fail
  # when one of these symbols is absent from the data.
  ipas -= set(reserved) | {'pau'}

  ipa2idx= dict(reserved)
  # start after the reserved indices; starting at 2 would collide with 'epi'
  for idx, ipa in enumerate(sorted(ipas), start= len(reserved)):
    ipa2idx[ipa]= idx

  return ipa2idx

execute and review

In [208]:
ipa2idx= create_IPAdictionary(labels)
print(*sorted(ipa2idx))

# 'pau' is represented by the CTC blank. CTC targets must not contain the blank
# index, so pauses are dropped from the label sequences instead of being
# rewritten to '<blank>' (which would encode to index 0 inside the targets).
labels= [[symbol for symbol in label if symbol != 'pau'] for label in labels]

<blank> aa ae ah ao aw ax ax-h axr ay b bcl ch d dcl dh dx eh el em en eng epi er ey f g gcl h# hh hv ih ix iy jh k kcl l m n ng nx ow oy p pcl q r s sh t tcl th uh uw ux v w y z zh


In [209]:
print(f'the number of IPA labels in TIMIT:\t {len(ipa2idx)}')
print(f'Index of blank symbol <blank>":\t {ipa2idx["<blank>"]}')
print(f'Index of sos/eos label "h#":\t {ipa2idx["h#"]}')

the number of IPA labels in TIMIT:	 61
Index of blank symbol <blank>":	 0
Index of sos/eos label "h#":	 1


In [210]:
# compare phonemic / phonetic symbols between
# provided TIMIT phonecode and transcription data
# https://catalog.ldc.upenn.edu/docs/LDC96S32/PHONCODE.TXT

existing= set(ipa2idx.keys())

phonecode= set(['b', 'd', 'g', 'p', 't', 'k', 'dx', 'q', # stops
             'jh', 'ch', # affricates
             's', 'sh', 'z', 'zh', 'f', 'th', 'v', 'dh', # frcatives
             'm', 'n', 'ng', 'em', 'en', 'eng', 'nx', # nasals
             'l', 'r', 'w', 'y', 'hh', 'hv', 'el', # semivowels and glides
             'iy', 'ih', 'eh', 'ey', 'ae', 'aa', 'aw', 'ay', 'ah', 'ao', 'oy', 'ow', 'uh', 'uw', 'ux', 'er', 'ax', 'ix', 'axr', 'ax-h', # vowels
             'pau', 'epi', 'h#', '1', '2',# others # epi: epenthetic silence # h# : begin/ end marker # 1 : primary stress marker # 2 : secondary stress marker
             ])

In [211]:
print(*sorted(existing.union(phonecode) - (existing & phonecode)))
print(*sorted((existing - phonecode)))
print(*sorted((phonecode - existing)))

1 2 <blank> bcl dcl gcl kcl pau pcl tcl
<blank> bcl dcl gcl kcl pcl tcl
1 2 pau


In [212]:
# review
# in data, there is no stress markers > not important to our model
# in data, there is additional phone symbols, bcl, dcl, gcl, kcl, pcl, tcl

In [213]:
# handling bcl, dcl, gcl, kcl, pcl, tcl
# these symbols mark the closure interval of the corresponding stop (plosive)
# e.g. bcl : bilabial closure, tcl : alveolar closure

handling diphthongs in labels

In [214]:
# diphthongs
# ey 'bait'
# aw 'bout'
# ay 'bite'
# oy 'boy'
# ow 'boat'

diphthongs= ['ey', 'aw', 'ay', 'ow']
# ey, aw, ay, ow : just split
# oy : oh + y

In [215]:
import re

diphthong_regex= re.compile('|'.join(sorted(map(re.escape, diphthongs),
                                              key= len, reverse= True)))

def split_diphthongs(label):
  '''
  Spell every diphthong matched by `diphthong_regex` as its separate letters.

  args:
    label: list of phoneme symbols
  return:
    list of symbols where each matched diphthong is replaced by its characters
  '''
  def spell_out(match):
    # 'ay' -> 'a y'
    return ' '.join(match.group())

  joined= ' '.join(label)
  return diphthong_regex.sub(spell_out, joined).split()


# additionally handle 'oy'
oy_regex= re.compile('oy')

def split_oy(label):
  '''
  Replace every 'oy' with the two symbols 'oh' and 'y'.

  args:
    label: list of phoneme symbols
  return:
    list of symbols with each 'oy' rewritten as 'oh', 'y'
  '''
  pieces= []
  for symbol in label:
    # rewrite inside the symbol, then split in case the rewrite added a space
    pieces.extend(oy_regex.sub('oh y', symbol).split())
  return pieces

execute and review

In [216]:
labels_split= []
for label in labels:
  label_split= split_diphthongs(label)
  label_split= split_oy(label_split)
  labels_split.append(label_split)


In [217]:
for i in range(len(labels)):
  if 'oy' in labels[i]:
    print(*labels[i])
    print(*labels_split[i])

h# d ah nx ae s epi m ih kcl k eh r iy ix nx ix q oy l iy r ae gcl g l ay kcl dh ae tcl h#
h# d ah nx ae s epi m ih kcl k eh r iy ix nx ix q oh y l iy r ae gcl g l a y kcl dh ae tcl h#
h# d ow n tcl ae s kcl m iy tcl t ix kcl k eh r iy q ax n oy l iy r ae gcl g l ay kcl k dh ae tcl h#
h# d o w n tcl ae s kcl m iy tcl t ix kcl k eh r iy q ax n oh y l iy r ae gcl g l a y kcl k dh ae tcl h#
h# d ow nx ae s kcl m ix dx ix kcl k ae r iy ix n oy l iy r ae gcl l ay kcl dh ae tcl h#
h# d o w nx ae s kcl m ix dx ix kcl k ae r iy ix n oh y l iy r ae gcl l a y kcl dh ae tcl h#
h# hh aw oy l iy dcl d ix y ux l ay kcl k y axr s ae l ix dcl d r ah s eng h#
h# hh a w oh y l iy dcl d ix y ux l a y kcl k y axr s ae l ix dcl d r ah s eng h#
h# d ow nx aw s kcl k m ih dx ix kcl k ae r ih y ix n oy l ix r ae gcl g l ay kcl dh ah q h#
h# d o w nx a w s kcl k m ih dx ix kcl k ae r ih y ix n oh y l ix r ae gcl g l a y kcl dh ah q h#
h# q ax kcl k aa n ax s er w el ix n dcl jh oy dh ix s sh eh l f ax sh dcl d

handling similar sounds in labels

In [218]:
merge_ipa= {
    # marginal sounds
    'ax-h': 'ax',
    'bcl': 'b',
    'dcl': 'd',
    'gcl': 'g',
    'kcl': 'k',
    'pcl': 'p',
    'tcl': 't',

    'en': 'n',
    'em': 'm',
    'el': 'l',
    'eng': 'ng',

    ## /ɹ/ sound
    # 'axr': 'r' ? 'ɹ' ?
    # 'dx': 'r',
    # 'nx': 'r',
    # 'er': 'r', 'ɹ' ?

    # /h/ sound
    'hh': 'h',
}

execute and review

In [219]:
labels_merge= [[merge_ipa.get(symbol, symbol) for symbol in label] for label in labels_split]

In [220]:
print(*labels[0])
print(*labels_split[0])
print(*labels_merge[0])

h# sh uw w ax z hh ow l dx ix ng q aa nx uh hv ih z r aa kcl k w ax dh w ah n hv ae n dcl d h#
h# sh uw w ax z hh o w l dx ix ng q aa nx uh hv ih z r aa kcl k w ax dh w ah n hv ae n dcl d h#
h# sh uw w ax z h o w l dx ix ng q aa nx uh hv ih z r aa k k w ax dh w ah n hv ae n d d h#


In [221]:
## after merging, there may appear identical consecutive phones
## again merge or leave it as it is ?

## if dcl and d > d
## if dcl was by itself > d

In [224]:
# redefine labels and the dictionary
labels= labels_merge
ipa2idx= create_IPAdictionary(labels)

# again split train / dev dataset
# fixed random_state so the split is the same on every Restart & Run All
train_feats, dev_feats, train_labels, dev_labels= train_test_split(feats, labels, random_state= 43)

#### Dataset + pad
define Dataset Class

In [225]:
class PhonemeASRDataset(Dataset):
  '''
  Dataset pairing one feature matrix with its index-encoded phoneme labels.

  args:
    feats: sequence of per-utterance feature matrices
    labels: sequence of per-utterance lists of phoneme symbols
    ipa2idx: dictionary of IPA_label: index used to encode the labels
  '''
  def __init__(self, feats, labels, ipa2idx):
    super(PhonemeASRDataset, self).__init__()
    self.feats, self.labels= feats, labels
    self.ipa2idx= ipa2idx

  def __len__(self):
    return len(self.feats)

  def __getitem__(self, idx):
    '''
    return:
      (features as a tensor, label indices as a torch.long tensor)
    '''
    feat, label= self.feats[idx], self.labels[idx]
    # encode with the table handed to this dataset, not with whatever the
    # notebook-level `ipa2idx` happens to be when the item is fetched
    label= [self.ipa2idx[ipa] for ipa in label]

    return torch.tensor(feat), torch.tensor(label, dtype= torch.long)

define padding function

In [306]:
def pad_collate(batch, pad_value_feat= 0, pad_value_label= 0):
    '''
      collate_fn for DataLoader: pads every sample of a batch to the longest
      feature matrix / label sequence found in that batch.

    args:
      batch: a list of tuples (mfcc, label)
      pad_value_feat: value used to fill the added feature frames
      pad_value_label: value used to fill the added label positions
    return:
      padded_mfccs, padded_labels, input_lengths, target_lengths
      (the two length tensors hold the unpadded sizes, dtype torch.long)
    '''
    mfccs, labels= zip(*batch)

    # unpadded sizes, reused for both the padding amounts and the length tensors
    n_frames= [mfcc.shape[0] for mfcc in mfccs]
    n_symbols= [label.shape[0] for label in labels]
    longest_feat, longest_label= max(n_frames), max(n_symbols)

    # features grow along the first (time) axis only; the last axis is left untouched
    padded_mfccs= torch.stack([
        F.pad(mfcc, (0, 0, 0, longest_feat - n), value= pad_value_feat)
        for mfcc, n in zip(mfccs, n_frames)
    ])
    padded_labels= torch.stack([
        F.pad(label, (0, longest_label - n), value= pad_value_label)
        for label, n in zip(labels, n_symbols)
    ])

    input_lengths= torch.tensor(n_frames, dtype= torch.long)
    target_lengths= torch.tensor(n_symbols, dtype= torch.long)

    return padded_mfccs, padded_labels, input_lengths, target_lengths



execute and review

In [308]:
train_ds= PhonemeASRDataset(train_feats, train_labels, ipa2idx= ipa2idx)
train_loader= DataLoader(train_ds, batch_size= 32, # can adjust
                          shuffle= True, collate_fn= pad_collate) # yields batch_size x max_len x num_feats as one training batch
## Task: should discover more about collate_fn keyword

In [309]:
a, b, _, _= next(iter(train_loader))
print(a.shape, b.shape)

torch.Size([32, 161, 39]) torch.Size([32, 69])


#### Model Architecture

In [358]:
class ResidualBlock(nn.Module):
  '''
  Two Conv1d -> BatchNorm1d -> PReLU stages plus a shortcut connection.

  args:
    in_channels: channels of the input (B, in_channels, T)
    out_channels: channels produced by both convolutions
    kernel_size: convolution width; padding is kernel_size // 2, so an odd
                 kernel_size keeps the sequence length when stride is 1
    feature_map: unused, kept so existing calls keep working
    stride: temporal stride of the block (applied once, in the first convolution)
  '''
  def __init__(self, in_channels, out_channels,
               kernel_size= 15, feature_map= 50, stride= 1):
    super(ResidualBlock, self).__init__()

    padding = kernel_size // 2
    self.conv1= nn.Conv1d(in_channels, out_channels,
                           kernel_size, stride=stride, padding=padding)
    # stride 1 here: striding both convolutions would shorten the main path
    # twice while the shortcut is shortened at most once
    self.conv2= nn.Conv1d(out_channels, out_channels,
                           kernel_size, stride=1, padding=padding)
    self.norm1= nn.BatchNorm1d(out_channels)
    self.norm2= nn.BatchNorm1d(out_channels)
    self.PReLU= nn.PReLU() # one shared slope parameter for both activations

    # the shortcut has to reproduce both the channel count and the temporal
    # stride of the main path, otherwise the addition in forward() fails
    if in_channels !=  out_channels or stride != 1:
      self.shortcut= nn.Sequential(
          nn.Conv1d(in_channels= in_channels, out_channels= out_channels,
                    kernel_size= 1, stride= stride, ),
          nn.BatchNorm1d(out_channels)
      )
    else:
      self.shortcut= nn.Identity()

  def forward(self, x):
    '''
    input: (B x in_channels x T)
    output: (B x out_channels x T') with T' set by the stride
    '''
    identity= self.shortcut(x)

    out= self.PReLU(self.norm1(self.conv1(x)))
    out= self.PReLU(self.norm2(self.conv2(out)))

    return out + identity

In [359]:
class ResidualNet(nn.Module):
  '''
  Convolutional front-end: a Conv1d + BatchNorm1d stem followed by five
  ResidualBlocks that keep the channel count.

  args:
    in_channels: channels of the input (B, in_channels, T)
    out_channels: channels produced by the stem and kept by every block
    kernel_size: convolution width of the stem and of every residual block
    stride: temporal stride of the stem (the residual blocks use stride 1)
  '''
  def __init__(self, in_channels= 39, out_channels= 50,
               kernel_size= 15, stride= 1):
    super(ResidualNet, self).__init__()

    self.stem= nn.Sequential(
        nn.Conv1d(in_channels= in_channels, out_channels= out_channels,
                  kernel_size= kernel_size, stride= stride, padding= kernel_size // 2),
      nn.BatchNorm1d(num_features= out_channels)
    )

    # kernel_size is forwarded so the configured width applies to the whole
    # network, not just the stem (the block default equals the default here)
    self.layer1= ResidualBlock(in_channels= out_channels, out_channels= out_channels, kernel_size= kernel_size)
    self.layer2= ResidualBlock(in_channels= out_channels, out_channels= out_channels, kernel_size= kernel_size)
    self.layer3= ResidualBlock(in_channels= out_channels, out_channels= out_channels, kernel_size= kernel_size)
    self.layer4= ResidualBlock(in_channels= out_channels, out_channels= out_channels, kernel_size= kernel_size)
    self.layer5= ResidualBlock(in_channels= out_channels, out_channels= out_channels, kernel_size= kernel_size)


  def forward(self, x):
    '''
    input: (B x in_channels x T)
    output: (B x out_channels x T') after the stem and the five blocks
    '''
    out= self.stem(x)
    for layer in (self.layer1, self.layer2, self.layer3, self.layer4, self.layer5):
      out= layer(out)
    return out

define RNN Encoder

In [360]:
class biLSTM(nn.Module):
  '''
  (Bi)LSTM encoder followed by dropout on its output sequence.

  args:
    input_size: feature size of every time step
    hidden_size: hidden size Dh of each direction
    bidirectional: run a forward and a backward LSTM when True
    num_layers: number of stacked LSTM layers
    batch_first: when True the input / output layout is (B x N x F)
    dropout: dropout between LSTM layers and on the returned sequence
    ipa2idx: unused, kept so existing calls keep working
  '''
  def __init__(self, input_size= 50, hidden_size= 170, bidirectional= True,
               num_layers= 2,
               batch_first= True, dropout= 0.2, ipa2idx= None
               ):
    super(biLSTM, self).__init__()
    self.input_size= input_size
    self.hidden_size= hidden_size
    self.bidirectional= bidirectional
    # batch_first must reach nn.LSTM: its default is False, which would make
    # the LSTM read the batch axis of a (B x N x F) input as the time axis
    self.rnn= nn.LSTM(input_size= input_size, hidden_size= hidden_size,
                        bidirectional= bidirectional, dropout= dropout,
                       num_layers= num_layers, bias= False,
                       batch_first= batch_first)
    self.dropout= nn.Dropout(dropout)

  def forward(self, x):
    '''
    input: (B x N x F) when batch_first is True
    output: (B x N x num_directions * Dh); the final hidden / cell states
            of the LSTM are discarded
    '''
    out, _= self.rnn(x)
    out= self.dropout(out)

    return out

define CTC model

In [361]:
class CTCModel(nn.Module):
  '''
  ResidualNet (CNN) -> biLSTM -> dense layers -> log-softmax over the labels.

  args:
    cnn_param: dict with the sub-dicts 'ResBlock' and 'ResNet', each holding
               'in_channels', 'out_channels', 'kernel_size', 'stride'
    rnn_param: dict with the sub-dict 'biLSTM' holding 'input_size',
               'hidden_size', 'bidirectional', 'num_layers', 'batch_first', 'dropout'
    ipa2idx: dictionary of IPA_label: index; its size is the number of classes
    dropout: stored on the model, not used by any layer defined here
  raises:
    ValueError: when cnn_param or rnn_param is not a dict
  '''
  def __init__(self, cnn_param, rnn_param, ipa2idx, dropout= 0.2):
    # inherit nn.Module __init__
    super(CTCModel, self).__init__()

    # assign properties
    if not isinstance(cnn_param, dict):
      raise ValueError('cnn_param need to be a dict to contain all params of cnn!')
    if not isinstance(rnn_param, dict):
      raise ValueError('rnn_param need to be a dict to contain all params of rnn!')

    self.cnn_param= cnn_param
    self.rnn_param= rnn_param
    self.num_class= len(ipa2idx)
    self.num_directions= 2 if rnn_param['biLSTM']['bidirectional'] else 1
    self.dropout= dropout

    # assign ResNet(CNN) module
    ResBlock= cnn_param['ResBlock']
    ResNet= cnn_param['ResNet']

    # NOTE(review): self.ResBlock is never called in forward(); it is kept so
    # the parameter names in state_dict() stay the same as before
    self.ResBlock= ResidualBlock(in_channels= ResBlock['in_channels'], out_channels= ResBlock['out_channels'],
                                  kernel_size= ResBlock['kernel_size'], stride= ResBlock['stride'])
    self.cnn= ResidualNet(in_channels= ResNet['in_channels'], out_channels= ResNet['out_channels'],
                      kernel_size= ResNet['kernel_size'], stride= ResNet['stride'])

    # assign biLSTM module
    input_size= rnn_param['biLSTM']['input_size']
    hidden_size= rnn_param['biLSTM']['hidden_size']
    bidirectional= rnn_param['biLSTM']['bidirectional']
    num_layers= rnn_param['biLSTM']['num_layers']
    batch_first= rnn_param['biLSTM']['batch_first']
    dropout= rnn_param['biLSTM']['dropout']

    self.rnn= biLSTM(input_size= input_size, hidden_size= hidden_size,
                      bidirectional= bidirectional, num_layers= num_layers,
                      batch_first= batch_first, dropout= dropout, ipa2idx= ipa2idx)


    # define FC layer
    self.fc= nn.Sequential(
        nn.Linear(in_features= self.num_directions * hidden_size, out_features= hidden_size, bias= False),
        nn.ReLU(),
        nn.Linear(in_features= hidden_size, out_features= self.num_class ),
    )

    # log-softmax over the class axis (last axis of the (B, T, C) output);
    # normalising over any other axis does not give per-frame label
    # distributions, which is what CTC loss and greedy decoding expect
    self.log_softmax= nn.LogSoftmax(dim= -1)


  def forward(self, x):
    '''
    args :
      x : B x N x D (batch, frames, features per frame)
    return
      log-softmax over labels for every frame : B x N' x num_class
    '''
    # Resnet Encoder Block: Conv1d wants channels first, the LSTM wants them last
    x = x.transpose(1, 2)                  # (B, D, N)
    out_cnn= self.cnn(x).transpose(1, 2)   # (B, N', C)

    # RNN Encoder Block
    out_rnn= self.rnn(out_cnn)

    # Dense Layer Block: nn.Linear acts on the last axis, no flattening needed
    out= self.fc(out_rnn)
    out= self.log_softmax(out)

    return out

define decoder function

In [362]:
def decode_ctc(log_probs, input_lengths, blank=0):
  '''
  Greedy CTC decoding: take the best class per frame, collapse repeated
  frames and drop blanks.

  args:
    log_probs: per-frame scores, class axis last (B, T, C)
    input_lengths: number of frames to read for every sequence in the batch
    blank: class index treated as the blank symbol
  return:
    list (one entry per sequence) of lists of class indices
  '''
  best_path = log_probs.argmax(dim=-1)  # (B, T)
  decoded = []
  for row_idx in range(best_path.size(0)):
      row = best_path[row_idx]
      collapsed, last = [], blank
      for t in range(input_lengths[row_idx]):
          current = row[t].item()
          is_new = current != last
          # keep a frame only when it starts a new, non-blank run
          if is_new and current != blank:
              collapsed.append(current)
          last = current
      decoded.append(collapsed)
  return decoded

In [363]:
def evaluate_PER(model, data_loader):
  '''
  Phoneme error rate of greedy CTC decoding over a data loader.

  args:
    model: model returning per-frame log-probabilities (B, T, C)
    data_loader: yields (features, padded labels, input lengths, target lengths)
  return
    total edit distance / total number of target symbols
    (0.0 when the loader holds no target symbols)

  The model is switched to eval mode and left there.
  '''
  model.eval()
  total_edits, total_length = 0, 0
  # no gradients are needed here; this keeps evaluation cheap even when the
  # caller does not wrap the call in torch.no_grad()
  with torch.no_grad():
    for x, y, input_lens, target_lens in data_loader:
      x = x.to(SETTING["device"])

      log_probs = model(x)  # (B, T, C)
      # lengths and targets are only read on the host, so they stay off the device
      pred_seqs = decode_ctc(log_probs, input_lens.tolist())  # List of predictions
      target_seqs = [seq[:n] for seq, n in zip(y.tolist(), target_lens.tolist())]

      for pred, target in zip(pred_seqs, target_seqs):
        total_edits += ed.eval(target, pred)
        total_length += len(target)
  return total_edits / total_length if total_length > 0 else 0.0


#### Train

define utility codes

In [364]:
def save_checkpoint(model, optimizer, filename= 'checkpoint.pth.tar'):
  '''
  Write the current model and optimizer states to `filename`.

  args:
    model: model on training
    optimizer: optimizer on optimizing
    filename: the path to save ongoing training states of model and optimizer
  '''
  checkpoint= dict(
      state_dict= model.state_dict(),
      optimizer= optimizer.state_dict(),
  )
  print("= > Saving checkpoint")
  torch.save(checkpoint, filename)


def load_checkpoint(checkpoint, model, optimizer):
  '''
  Restore model and optimizer from a checkpoint dictionary.

  args:
    checkpoint: dictionary with the keys "state_dict" (model state) and
                "optimizer" (optimizer state)
    model: instantiated model
    optimizer: instantiated optimizer
  '''
  print("= > Loading checkpoint")
  # model first, optimizer second
  model_state= checkpoint["state_dict"]
  model.load_state_dict(model_state)
  optimizer_state= checkpoint["optimizer"]
  optimizer.load_state_dict(optimizer_state)

class EarlyStopping:
  '''
  Signals a stop once the monitored score failed to improve by more than
  `delta` for `patience` calls in a row.

  args:
    patience: number of consecutive non-improving calls to tolerate
    delta: an improvement must be larger than this to count
    mode: "max" when a larger score is better; any other value treats a
          smaller score as better
  '''
  def __init__(self, patience=  10, delta= 1e-5, mode= 'max'):
    self.mode= mode
    self.delta= delta
    self.patience= patience
    self.best_score= None
    self.counter= 0
    self.early_stop= False

  def __call__(self, current):
    '''
    Record `current` and return True when training should stop.
    The first call only stores the score.
    '''
    if self.best_score is None:
      self.best_score= current
      return False

    if self.mode ==  "max":
      improvement= current - self.best_score
    else:
      improvement= self.best_score - current

    if improvement <=  self.delta:
      # not enough progress: one more strike
      self.counter +=  1
      if self.counter >=  self.patience:
        self.early_stop= True
    else:
      self.best_score, self.counter= current, 0

    return self.early_stop

define train function

In [365]:
def train_fn(data_loader, model, optimizer, loss_fn):
  '''
  Run one training epoch.

  args:
    data_loader: yields (features, padded labels, input lengths, target lengths)
    model: model returning per-frame log-probabilities (B, T, C)
    optimizer: optimizer stepping the model parameters
    loss_fn: CTC loss taking (log_probs, targets, input_lengths, target_lengths)
  return:
    mean of the per-batch loss values (0.0 when the loader yields no batch)
  '''
  model.train()
  total_loss= []

  # iterate the loader that was passed in, not the notebook-level train_loader
  inner_loop= tqdm(data_loader, desc= 'Batch', leave= True)
  for x, y, input_lengths, target_lengths in inner_loop:
    x, y= x.to(SETTING["device"]), y.to(SETTING["device"])
    input_lengths, target_lengths = input_lengths.to(SETTING["device"]), target_lengths.to(SETTING["device"])

    log_probs = model(x) # (B, T, C)
    log_probs = log_probs.transpose(0, 1) # CTCLoss requires (T, B, C)

    # flatten target labels to 1D, dropping the padding of every sequence
    y = torch.cat([y[i][:target_lengths[i]] for i in range(y.size(0))])

    loss = loss_fn(log_probs, y, input_lengths, target_lengths)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    inner_loop.set_postfix(loss= loss.item())
    total_loss.append(loss.item())

  return sum(total_loss) / len(total_loss) if total_loss else 0.0

define evaluation function

In [366]:
# import Nikita's code?

```
"seed" : 43,
"learning_rate": 2e-5,
"device": "cuda:0" if torch.cuda.is_available() else "cpu",
"batch_size": 64,
"weight_decay": 1e-4,
"num_epochs": 70,
"num_workers": 2,
"pin_memory": True,
```
-------------------------------------------
```
"load_model": True
(if saved model exists)
"load_model_file": "./drive/MyDrive/ResLSTM.path.tar",
```

In [367]:
#------------------------------- Training Settings -------------------------------#
SETTING= {
    "seed": 43,
    "learning_rate": 2e-5,
    "device": "cuda:0" if torch.cuda.is_available() else "cpu",
    "batch_size": 64,
    "weight_decay": 1e-4,
    "num_epochs": 70,
    "num_workers": 2,
    "pin_memory": True,
    "load_model": False,
    "load_model_file": "./drive/MyDrive/ResLSTM.path.tar",
    "patience": 10,
#    "feat_dir": directory of features
#    "label_dir": directory of labels

}

torch.manual_seed(SETTING["seed"])


#------------------------------- DataLoader -------------------------------#

train_ds= PhonemeASRDataset(train_feats, train_labels, ipa2idx= ipa2idx)
dev_ds= PhonemeASRDataset(dev_feats, dev_labels, ipa2idx= ipa2idx)
# test_ds = ... (placeholder: no test set is defined yet)

train_loader= DataLoader(train_ds,
                          batch_size= SETTING["batch_size"],
                          shuffle= True, collate_fn= pad_collate,
                         num_workers= SETTING["num_workers"],
                         pin_memory= SETTING["pin_memory"]) # yields batch_size x max_len x num_feats as one training batch
dev_loader= DataLoader(dev_ds,
                      batch_size= SETTING["batch_size"],
                       shuffle= False, collate_fn= pad_collate,
                       num_workers= SETTING["num_workers"],
                      pin_memory= SETTING["pin_memory"])

# the monitored score is the dev PER, an error rate: lower is better, so the
# stopper must run in "min" mode ("max" would count every improvement as a miss)
early_stopping =  EarlyStopping(patience= SETTING["patience"], delta= 0.001, mode =  "min")

In [368]:

#------------------------------- CONFIG -------------------------------#
cnn_param= {
    'ResNet': {
        'in_channels': 39,
        'out_channels': 50,
        'kernel_size': 15,
        'stride': 1
},
    'ResBlock': {
        'in_channels': 50,
        'out_channels': 50,
        'kernel_size' : 15,
        'stride': 1
    },
}

rnn_param= {
    'biLSTM': {
        'input_size': 50,
        'hidden_size': 170,
        'bidirectional': True,
        'num_layers': 2,
        'batch_first': True,
        'batch_norm': True,
        'ipa2idx': ipa2idx,
        'dropout': 0.25}
}

#------------------------------- Model & Optimizer -------------------------------#
model= CTCModel(cnn_param= cnn_param, rnn_param= rnn_param,
                   ipa2idx= ipa2idx)
# train_fn / evaluate_PER move every batch to SETTING["device"], so the model
# has to live on the same device (done before the optimizer is created)
model= model.to(SETTING["device"])

optimizer= optim.Adam(model.parameters(),
                       lr= SETTING["learning_rate"],
                       weight_decay= SETTING["weight_decay"],
                       )
loss_fn = nn.CTCLoss(blank= 0, reduction= 'sum')

if SETTING["load_model"]:
  # map_location lets a checkpoint written on GPU be restored on a CPU-only runtime
  checkpoint= torch.load(SETTING["load_model_file"], map_location= SETTING["device"])
  load_checkpoint(checkpoint, model, optimizer)



In [369]:
# review
print(model)

CTCModel(
  (ResBlock): ResidualBlock(
    (conv1): Conv1d(50, 50, kernel_size=(15,), stride=(1,), padding=(7,))
    (conv2): Conv1d(50, 50, kernel_size=(15,), stride=(1,), padding=(7,))
    (norm1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (norm2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (PReLU): PReLU(num_parameters=1)
    (shortcut): Identity()
  )
  (cnn): ResidualNet(
    (stem): Sequential(
      (0): Conv1d(39, 50, kernel_size=(15,), stride=(1,), padding=(7,))
      (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (layer1): ResidualBlock(
      (conv1): Conv1d(50, 50, kernel_size=(15,), stride=(1,), padding=(7,))
      (conv2): Conv1d(50, 50, kernel_size=(15,), stride=(1,), padding=(7,))
      (norm1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (norm2): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True,

execute and review

In [None]:
losses, PER_list, PER_list_train= [], [], []
outer_loop= tqdm(range(SETTING["num_epochs"]), desc= "Epoch", position= 0)
eval_interval= 5

for epoch in outer_loop:
  avg_loss= train_fn(train_loader, model, optimizer, loss_fn)
  losses.append(avg_loss)

  if epoch % eval_interval == 0:
    with torch.no_grad():
      PER = evaluate_PER(model, dev_loader)
      PER_train = evaluate_PER(model, train_loader)
  else:
    # between evaluations the latest scores are carried forward; epoch 0
    # always evaluates, so both lists are non-empty here
    PER, PER_train = PER_list[-1], PER_list_train[-1]

  PER_list.append(PER)
  PER_list_train.append(PER_train)

  outer_loop.set_postfix({
      "Loss": f"{avg_loss:.6f}",
      "Train PER": f"{PER_train:.6f}",
      "PER": f"{PER:.6f}"
  })

  # called every epoch, so a carried-forward PER counts as "no improvement"
  # and patience is measured in epochs, not in evaluations
  if early_stopping(PER):
    print(f"Early stopping at epoch {epoch}")
    break

Epoch:   0%|          | 0/70 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]

Batch:   0%|          | 0/55 [00:00<?, ?it/s]