In [1]:
import sys
# Append the parent directory (relative to the working directory) to the module
# search path; presumably this is what lets the project imports in later cells
# resolve — confirm against the repository layout.
sys.path.append("..")

In [2]:
# Checkpoint file handed to load_model() further down.
CKPT_PATH = "../checkpoints/last.ckpt"
# Tar archive passed to VLSP2020TarDataset for the VLSP2020 validation run.
DATA_PATH = "../data/vlsp2020_val_set.tar"

# Load the model from checkpoint

After fine-tuning with [train.py](../finetuning/train.py), the checkpoint is saved to [checkpoints/last.ckpt](../checkpoints/).

In [3]:
import torch
from transformers import (
    Wav2Vec2ForPreTraining,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
)
from finetuning.wav2vec2 import SpeechRecognizer


def load_model(
    ckpt_path: str,
    model_name: str = "nguyenvulebinh/wav2vec2-base-vietnamese-250h",
):
    """Restore the fine-tuned ``SpeechRecognizer`` from a checkpoint file.

    The wav2vec2 backbone, tokenizer and feature extractor are first created
    from the pretrained ``model_name`` and then passed to
    ``SpeechRecognizer.load_from_checkpoint`` together with ``ckpt_path``.

    Args:
        ckpt_path: Path to the checkpoint file to load.
        model_name: Identifier given to each ``from_pretrained`` call.
            Defaults to the model that was previously hard-coded here, so
            existing callers are unaffected.

    Returns:
        The object returned by ``SpeechRecognizer.load_from_checkpoint``.
    """
    wav2vec2 = Wav2Vec2ForPreTraining.from_pretrained(model_name)
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_name)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

    # The three components are supplied as keyword arguments alongside the
    # checkpoint path; presumably they are not stored in the checkpoint's
    # hyperparameters and must be rebuilt here — TODO confirm.
    model = SpeechRecognizer.load_from_checkpoint(
        ckpt_path,
        wav2vec2=wav2vec2,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
    )

    return model


# Build the recognizer once; the validation and inference cells below reuse it.
recognizer = load_model(CKPT_PATH)


Some weights of the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h were not used when initializing Wav2Vec2ForPreTraining: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vietnamese-250h and are newly initialized: ['quantizer.codevectors', 'project_hid.bias', 'project_hid.weight', 'quantizer.weight_proj.bias', 'project_q.bias', 'project_q.weight', 'quantizer.weight_proj.weight']
You should proba

# Validation

## VLSP2020 Dataset

In [None]:
from src.datamodule import VLSP2020TarDataset
from src.datamodule.vlsp2020 import get_dataloader

# Build the dataset from the tar archive at DATA_PATH.
dts = VLSP2020TarDataset(DATA_PATH).load()
# return_transcript=True presumably makes each batch carry the reference
# transcripts needed for the CER/WER metrics — TODO confirm in get_dataloader.
val_loader = get_dataloader(dts, batch_size=2, return_transcript=True, num_workers=2)

In [None]:
from pytorch_lightning import Trainer

# Release cached GPU memory held by PyTorch before starting validation.
torch.cuda.empty_cache()
# Run validation on a single GPU; the metrics reported in the output are
# val/cer and val/wer.
metrics = Trainer(accelerator="gpu", devices=1).validate(model=recognizer, dataloaders=val_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val/cer            0.07352624833583832
         val/wer            0.15021772682666779
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


## VIVOS Dataset

In [None]:
from datasets import load_dataset

# Load only the "test" split of the VIVOS dataset.
dataset = load_dataset("vivos", split="test")

Downloading builder script:   0%|          | 0.00/6.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.53k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.00k [00:00<?, ?B/s]

Downloading and preparing dataset vivos/default to /root/.cache/huggingface/datasets/vivos/default/1.1.0/4872f55990894df809a7d41dc64c483303a4a6f495c50b6e434632190c1b5eeb...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11660 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/760 [00:00<?, ? examples/s]

Dataset vivos downloaded and prepared to /root/.cache/huggingface/datasets/vivos/default/1.1.0/4872f55990894df809a7d41dc64c483303a4a6f495c50b6e434632190c1b5eeb. Subsequent calls will reuse this data.


In [None]:
# Drop the columns the evaluation does not use. Only names that are still
# present are removed, so this cell can be re-run on the already-trimmed
# dataset instead of failing on the missing columns.
unused_columns = [
    name for name in ("speaker_id", "path") if name in dataset.column_names
]
if unused_columns:
    dataset = dataset.remove_columns(unused_columns)

# Request torch-formatted items from the dataset.
dataset = dataset.with_format("torch")
dataset

Dataset({
    features: ['audio', 'sentence'],
    num_rows: 760
})

In [None]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """Collate dataset items into ``(transcripts, waveforms)`` lists.

    Each item is read as ``item['audio']['array']`` for the waveform and
    ``item['sentence']`` for the transcript, which is lower-cased. Nothing is
    padded or stacked; both results are plain lists in batch order.
    """
    # Read the audio first, then the sentence, item by item.
    pairs = [(item['audio']['array'], item['sentence'].lower()) for item in batch]

    waveforms = [audio for audio, _ in pairs]
    transcripts = [text for _, text in pairs]

    return transcripts, waveforms

# Batch the VIVOS test set with the custom collate_fn, which returns each batch
# as (transcripts, waveforms) lists.
# NOTE: this rebinds val_loader, replacing the VLSP2020 loader created earlier.
val_loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=2,
    collate_fn=collate_fn
)

In [None]:
# Release cached GPU memory held by PyTorch before starting validation.
torch.cuda.empty_cache()
# Same single-GPU validation as for VLSP2020, now on the VIVOS loader.
# Trainer comes from the import in the earlier validation cell.
metrics = Trainer(accelerator="gpu", devices=1).validate(model=recognizer, dataloaders=val_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
         val/cer            0.04655877500772476
         val/wer            0.1258229911327362
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


# Inference

In [4]:
import requests
import torchaudio
from IPython.display import Audio, display


def get_audio(path: str, target_sample_rate: int = 16000):
    """Load an audio clip from a local path or an HTTP(S) URL and resample it.

    Args:
        path: Filesystem path of the audio file, or a URL starting with
            ``"http"`` to stream it from.
        target_sample_rate: Sample rate in Hz of the returned waveform.
            Defaults to 16000, the value that was previously hard-coded.

    Returns:
        The waveform returned by ``torchaudio.load``, resampled to
        ``target_sample_rate``.

    Raises:
        requests.HTTPError: If ``path`` is a URL and the server responds with
            an error status.
    """
    if path.startswith("http"):
        with requests.get(path, stream=True) as r:
            # Surface a failed download as an HTTP error here rather than
            # handing the error response body to the audio decoder.
            r.raise_for_status()
            waveform, sample_rate = torchaudio.load(r.raw)
    else:
        waveform, sample_rate = torchaudio.load(path)

    resample = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
    waveform = resample(waveform)

    return waveform


def play_audio(waveform: torch.Tensor, sample_rate: int = 16000):
    """Show an inline audio player for ``waveform`` in the notebook output.

    Args:
        waveform: Tensor holding the audio samples; it is moved to the CPU and
            converted to a NumPy array before being given to ``Audio``.
        sample_rate: Playback rate in Hz passed to ``Audio``.

    Returns:
        Whatever ``IPython.display.display`` returns for the player.
    """
    samples = waveform.cpu().numpy()
    player = Audio(samples, rate=sample_rate)
    return display(player)


In [10]:
# Fetch a sample clip over HTTP; get_audio() also resamples it.
waveform = get_audio("https://github.com/NTT123/Vietnamese-Text-To-Speech-Dataset/blob/master/022878.wav?raw=true")
print(waveform.shape)

# Put the model in eval mode before running inference.
recognizer.eval()
play_audio(waveform)
# Print the output of predict() for the clip.
print(recognizer.predict(waveform))

torch.Size([1, 148980])


['ừ thế mày đã nuôi được bố mẹ mày bữa nào chưa hay xưa này vẫn báo hải cơm chà áo mèm mãi mấy hôm thấy ông đơ mặt không thèm nói mày lại làm gà']


In [6]:
import sounddevice as sd

# Recording parameters; 16 kHz matches the default rate used by play_audio().
sample_rate = 16_000
duration = 5 # seconds
print("start")
# Record `duration` seconds of single-channel audio from the input device.
mydata = sd.rec(int(sample_rate * duration), samplerate=sample_rate, channels=1, blocking=True)

# NOTE(review): rec() is already called with blocking=True, so this wait()
# looks redundant — confirm and drop if so.
sd.wait()
# Transpose the recording; presumably from (frames, channels) to
# (channels, frames), the layout of the waveform from get_audio() — TODO confirm.
mydata = torch.from_numpy(mydata).T
play_audio(mydata)

print(recognizer.predict(mydata))


start


['xin chào tôi là vũ huy hoàng']
