In [1]:
import warnings
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from mpvn.data.grad.lit_data_module import LightningGradDataModule
from mpvn.metric import WordErrorRate, CharacterErrorRate
from mpvn.model import *

from mpvn.configs import DictConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Checkpointing: rank checkpoints by `valid_loss` (lower is better) and keep
# the best three inside the `checkpoint` directory.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoint",
    filename="mpvn-{epoch:02d}-{valid_loss:.2f}-{valid_per:.2f}-{valid_acc}",
    monitor="valid_loss",
    mode="min",
    save_top_k=3,
)

# Early stopping: watches the same `valid_loss` quantity, minimising it, with
# patience=5 and no minimum improvement threshold.
early_stop_callback = EarlyStopping(
    monitor="valid_loss",
    mode="min",
    patience=5,
    min_delta=0.00,
    verbose=False,
)

# TensorBoard event files are written under `tensorboard/<name>/`.
logger = TensorBoardLogger(
    save_dir="tensorboard",
    name="Pronunciation for Vietnamese",
)

In [3]:
# Project configuration object; supplies `seed` and `max_epochs` below.
configs = DictConfig()

# Seed the RNGs before anything stochastic is constructed so runs are repeatable.
pl.seed_everything(configs.seed)
# Blanket-silences every Python warning from this point on in the session.
warnings.filterwarnings('ignore')

# Data module built from the same configuration; the vocabulary it exposes is
# needed later to size and configure the model.
data_module = LightningGradDataModule(configs)
vocab = data_module.get_vocab()

# CPU-only trainer. `accelerator='cpu'` alone selects the device, so the
# redundant `gpus=0` argument is intentionally not passed: `gpus` is deprecated
# in Lightning 1.7 and removed in 2.0, where passing it raises a TypeError.
# `logger`, `checkpoint_callback` and `early_stop_callback` come from the
# callbacks/logger cell earlier in the notebook.
trainer = pl.Trainer(
    accelerator='cpu',
    logger=logger,
    max_epochs=configs.max_epochs,
    callbacks=[checkpoint_callback, early_stop_callback],
)

Global seed set to 1
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [4]:
# Instantiate the Conformer-RNN model; the output layer is sized to the
# vocabulary returned by the data module.
# NOTE(review): `per_metric` is given a WordErrorRate instance although
# CharacterErrorRate is imported as well — confirm this is the intended metric.
model = ConformerRNNModel(
    vocab=vocab,
    num_classes=len(vocab),
    per_metric=WordErrorRate(vocab),
    configs=configs,
)

In [5]:
# Run the test loop over the data module's test set.
# NOTE(review): no `ckpt_path` is passed and no checkpoint load is visible in
# this notebook — confirm the model is not being evaluated with fresh weights.
trainer.test(model=model, datamodule=data_module)

Testing DataLoader 0:   0%|          | 0/582 [00:00<?, ?it/s]tensor([1, 1, 0, 1, 0, 1])
Testing DataLoader 0:   0%|          | 1/582 [00:00<03:23,  2.85it/s]tensor([1])
Testing DataLoader 0:   0%|          | 2/582 [00:00<03:04,  3.14it/s]tensor([1, 0, 1, 1, 1, 1])
Testing DataLoader 0:   1%|          | 3/582 [00:00<02:28,  3.90it/s]tensor([1, 1, 1, 1, 0])
Testing DataLoader 0:   1%|          | 4/582 [00:00<02:00,  4.78it/s]tensor([0, 1, 0, 0, 0, 1, 0, 0, 0, 1])
Testing DataLoader 0:   1%|          | 5/582 [00:00<01:45,  5.49it/s]tensor([1, 0, 1])
Testing DataLoader 0:   1%|          | 6/582 [00:00<01:33,  6.18it/s]tensor([1, 0, 1, 1])
Testing DataLoader 0:   1%|          | 7/582 [00:01<01:39,  5.77it/s]tensor([1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1])
Testing DataLoader 0:   1%|▏         | 8/582 [00:01<01:37,  5.91it/s]tensor([1, 1, 1, 1])
Testing DataLoader 0:   2%|▏         | 9/582 [00:01<01:28,  6.44it/s]tensor([1, 1, 1, 1, 0, 1])
Testing DataLoader 0:   2%|▏         | 10/582 [00:01<0

In [None]:
# Display the model's `df` attribute. The recorded output is a per-utterance
# table with columns utt_id, phones, phones_predict, score, score_predict, per,
# accuracy, f1, precision and recall.
# NOTE(review): presumably filled in by the `trainer.test` run above — confirm.
model.df

Unnamed: 0,utt_id,phones,phones_predict,score,score_predict,per,accuracy,f1,precision,recall
0,2022-11-19-HangXomTay_13,sɛ-maɜj-kuə4-mi2ɲ-bi6-hɔ4ŋ<e>,sɛ-maɜj-kuə4-mi2ɲ-bi6-hɔ4ŋ<e>,0 0 0 0 0 0,0 0 1 0 0 0,1.312500,0.833333,0.909091,1.0,0.833333
1,2022-12-12-Saleem_55,vaː2<e>,vaː2<e>,0,0,1.000000,1.000000,1.000000,1.0,1.000000
2,2022-11-20-EthanKellyUcViet_38,saɜw-myəj-tʃiɛ6w-mo6t̪-taːɜŋ-aːɜ<e>,saɜw-myəj-tʃiɛ6w-mo6t̪-taːɜŋ-aːɜ<e>,0 0 0 0 0 0,0 1 0 1 0 0,1.294118,0.666667,0.800000,1.0,0.666667
3,2022-11-19-HangXomTay_2_7,həːn-saːɜ-si6-mo6t̪-t̪iɜ<e>,həːn-saːɜ-si6-mo6t̪-t̪iɜ<e>,0 0 0 0 0,0 0 0 1 0,1.307692,0.800000,0.888889,1.0,0.800000
4,2022-12-11-TraiTimChangTraiNhat-trym_37,ɗe4-kɔɜ-te4-t̪i2m-hiɛ4w-ɲiɛ2w-həːn-ve2-viɛ6t̪-...,ɗe4-kɔɜ-te4-t̪i2m-hiɛ4w-ɲiɛ2w-həːn-ve2-viɛ6t̪-...,0 0 0 0 0 0 0 0 0 0,1 0 1 1 0 0 0 0 0 0,1.296296,0.700000,0.823529,1.0,0.700000
...,...,...,...,...,...,...,...,...,...,...
577,2022-12-12-Saleem_54,ɗuɜŋ-zo2j<e>,ɗuɜŋ-zo2j<e>,0 0,0 0,1.142857,1.000000,1.000000,1.0,1.000000
578,2022-12-11-TraiTimChangTraiNhat-trym_1,mi2ɲ-ɗaːŋ-ɣi-laː6j-t̪y6-zəːɜj-tiɛ6w<e>,mi2ɲ-ɗaːŋ-ɣi-laː6j-t̪y6-zəːɜj-tiɛ6w<e>,0 0 0 0 0 0 0,0 0 0 0 0 0 0,1.250000,1.000000,1.000000,1.0,1.000000
579,2022-11-19-aNcariRoom_81,t̪aː6j-saːw-eɲ-əː4-viɛ6t̪-naːm<e>,t̪aː6j-saːw-eɲ-əː4-viɛ6t̪-naːm<e>,0 0 0 0 0 0,1 0 0 0 0 0,1.312500,0.833333,0.909091,1.0,0.833333
580,2022-12-12-AFRO-trym_59,kaːɜj-zə1w-ku5ŋ-laː2-ma2w-vaː2ŋ<e>,kaːɜj-zə1w-ku5ŋ-laː2-ma2w-vaː2ŋ<e>,0 0 0 0 0 0,0 0 0 0 1 0,1.277778,0.833333,0.909091,1.0,0.833333


In [None]:
# Show the `fc` submodule of the model's word decoder. The recorded repr is a
# Sequential of Linear(144 -> 72), Tanh, Linear(72 -> 2).
model.word_decoder.fc

Sequential(
  (0): Linear(in_features=144, out_features=72, bias=True)
  (1): Tanh()
  (2): Linear(in_features=72, out_features=2, bias=True)
)

In [None]:
import librosa

# Sanity-check a single recording: decode it at a 16 kHz sampling rate and
# display the (samples, sampling_rate) pair that librosa returns.
audio_path = 'Data/label/Audio/2022-12-12-AFRO-trym/2022-12-12-AFRO-trym_55.wav'
librosa.load(path=audio_path, sr=16000)

(array([-0.1716919 , -0.12313843, -0.16488647, ..., -0.01550293,
        -0.01315308,  0.00091553], dtype=float32),
 16000)

In [3]:
import torch

# Demonstration: element-wise addition needs broadcast-compatible shapes, so
# adding a length-3 tensor to a length-2 tensor fails. The error is caught and
# displayed rather than left to abort the notebook, so "Restart & Run All"
# still reaches the cells that follow.
add_error = None
try:
    torch.tensor([1, 2, 3]) + torch.tensor([1, 2])
except RuntimeError as err:
    add_error = str(err)
    print(f"RuntimeError: {add_error}")

RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 0

In [10]:
# Demonstration: concatenating along dim 0 requires every other dimension to
# agree, so stacking a (1, 3) tensor on a (1, 2) tensor fails. The error is
# caught and displayed rather than left to abort the notebook.
concat_error = None
try:
    torch.concat([torch.tensor([[1, 2, 3]]), torch.tensor([[1, 2]])], dim=0)
except RuntimeError as err:
    concat_error = str(err)
    print(f"RuntimeError: {concat_error}")

RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 3 but got size 2 for tensor number 1 in the list.