In [1]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
5. Restart the runtime (Runtime -> Restart Runtime) for any upgraded packages to take effect
"""
# If you're using Google Colab and not running locally, run this cell.

## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install unidecode


## Install NeMo
BRANCH = 'r1.4.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml
!pip install matplotlib==3.1.3
"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=a06a7b5cbc5c2c63cb78be9ce3333079a983bf1fd65a024ae5e26a780e880a0b
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.28-4ubuntu0.18.04.2).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
The following additional packages will be installed:
  libmagic-mgc libmagic1 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa
  libsox-fmt-base libsox3
Suggested packages:
  file libsox-fmt-all
The following NEW packages will be installed:
  libmagic-mgc libmagic1 libopenco

'\nRemember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!\nAlternatively, you can uncomment the exit() below to crash and restart the kernel, in the case\nthat you want to use the "Run All Cells" (or similar) option.\n'

In [1]:
# Папка, где будет размещаться датасет Golos
data_dir = '/content'
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import glob
import os
import subprocess
import tarfile
import wget

# Загрузка датасета Golos
def load(golos_url, fname):
  if not os.path.exists(os.path.join(data_dir, fname)):
      golos_path = wget.download(golos_url, os.path.join(data_dir, fname))
      print(f"Dataset downloaded at: {golos_path}")
  else:
      print("Tarfile already exists.")
      golos_path = os.path.join(data_dir, fname)
  return golos_path

golos_path = load("https://sc.link/Kqr", "test.tar")

if not os.path.exists(os.path.join(data_dir, '/test/')):
    tar = tarfile.open(golos_path)
    tar.extractall(path=data_dir)

Dataset downloaded at: /content/test.tar


In [3]:
! head /content/test/crowd/manifest.jsonl

{"id": "e632f7d39c15e7edfc665b91e6f2071f", "audio_filepath": "files/e632f7d39c15e7edfc665b91e6f2071f.wav", "text": "\u0430\u0444\u0438\u043d\u0430 \u0432\u043e\u0441\u043f\u0440\u043e\u0438\u0437\u0432\u0435\u0434\u0438 \u043c\u0443\u0437\u044b\u043a\u0443 \u0432\u043f\u0435\u0440\u0435\u043c\u0435\u0448\u043a\u0443", "duration": 4.9}
{"id": "5db5df8bb9e3b6660b2a04b34d4a355d", "audio_filepath": "files/5db5df8bb9e3b6660b2a04b34d4a355d.wav", "text": "\u043d\u0430\u0439\u0442\u0438 \u0441\u0435\u0440\u0438\u0430\u043b \u0433\u0440\u0438\u0433\u043e\u0440\u0438\u0439 \u0440", "duration": 3.652}
{"id": "2c471aedc6979109f28cd53c58f8c4fb", "audio_filepath": "files/2c471aedc6979109f28cd53c58f8c4fb.wav", "text": "\u043f\u0440\u044f\u043c\u043e\u0439 \u044d\u0444\u0438\u0440 \u0430\u043f\u043b \u043c\u0430\u043d\u0447\u0435\u0441\u0442\u0435\u0440 \u044e\u043d\u0430\u0439\u0442\u0435\u0434 \u0442\u043e\u0442\u0442\u0435\u043d\u0445\u044d\u043c", "duration": 4.34175}
{"id": "756a137ee9debde4a008a

In [4]:
# NeMo's "core" package
import nemo
!pip install torchtext==0.10.1
!pip install torchaudio==0.9.1

# NeMo's ASR collection - this collections contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr

Collecting torchtext==0.10.1
  Downloading torchtext-0.10.1-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 5.2 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
Successfully installed torchtext-0.10.1
Collecting torchaudio==0.9.1
  Downloading torchaudio-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 5.3 MB/s 
Installing collected packages: torchaudio
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 0.10.0+cu111
    Uninstalling torchaudio-0.10.0+cu111:
      Successfully uninstalled torchaudio-0.10.0+cu111
Successfully installed torchaudio-0.9.1


[NeMo W 2021-11-30 13:33:59 optimizers:47] Apex was not found. Using the lamb optimizer will error out.
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

[NeMo W 2021-11-30 13:34:02 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text_dali._AudioTextDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.


## Исползование предобученной модели


Коллекция для распознавания речи в NeMo содержит готовые блоки, которые можно использовать чтобы тренировать и использовать свою модель. Кроме этого существет ряд предобученных моделей, которые можно просто скачать и исползовать. Давайте скачаем и инициализируем готовую модель QuartzNet15x5, обученную на открытом датасете Golos.



In [48]:
load("https://sc.link/ZMv", "QuartzNet15x5_golos.nemo")

asr_model = nemo_asr.models.EncDecCTCModel.restore_from(os.path.join(data_dir, "QuartzNet15x5_golos.nemo"))

Tarfile already exists.


[NeMo W 2021-11-30 16:09:27 modelPT:131] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: train/golos_and_mcv.jsonl
    sample_rate: 16000
    labels:
    - ' '
    - а
    - б
    - в
    - г
    - д
    - е
    - ж
    - з
    - и
    - й
    - к
    - л
    - м
    - н
    - о
    - п
    - р
    - с
    - т
    - у
    - ф
    - х
    - ц
    - ч
    - ш
    - щ
    - ъ
    - ы
    - ь
    - э
    - ю
    - я
    batch_size: 64
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    num_workers: 20
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    parser: ru
    
[NeMo W 2021-11-30 16:09:27 modelPT:138] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a

[NeMo I 2021-11-30 16:09:27 features:262] PADDING: 16
[NeMo I 2021-11-30 16:09:27 features:279] STFT using torch
[NeMo I 2021-11-30 16:09:28 save_restore_connector:143] Model EncDecCTCModel was successfully restored from /content/QuartzNet15x5_golos.nemo.


In [6]:
# --- Building Test Manifest Files --- #
import json

# Function to build a manifest
def build_manifest(manifest_rel, manifest_abs):
    manifest_path = os.path.split(os.path.abspath(manifest_rel))[0]
    with open(manifest_rel, 'r') as fin:
        with open(manifest_abs, 'w') as fout:
            for line in fin:
                metadata = json.loads(line)
                metadata["audio_filepath"]=os.path.join(manifest_path, metadata["audio_filepath"])
                json.dump(metadata, fout)
                fout.write('\n')
                
# Building Manifests
print("******")
train_rel = os.path.join(data_dir, 'test/farfield/manifest.jsonl')
train_abs = os.path.join(data_dir, 'test/farfield/farfield.jsonl')
if not os.path.isfile(train_abs):
  build_manifest(train_rel, train_abs)
test_manifest = train_abs
print("test_manifest", test_manifest)


******
test_manifest /content/test/farfield/farfield.jsonl


In [7]:
# ! head /content/test/farfield/manifest.jsonl
! head /content/test/farfield/farfield.jsonl

{"id": "58b586f67f5e634506e215df5996b82e", "audio_filepath": "/content/test/farfield/files/58b586f67f5e634506e215df5996b82e.wav", "text": "\u0434\u0436\u043e\u0439 \u0445\u0432\u0430\u0442\u0438\u0442", "duration": 1.696625}
{"id": "26093ef7a8c5ec6a3c586a6a929c1bd3", "audio_filepath": "/content/test/farfield/files/26093ef7a8c5ec6a3c586a6a929c1bd3.wav", "text": "\u0441\u0430\u043b\u044e\u0442 \u0432\u044b\u0437\u043e\u0432 \u0441\u0432\u0435\u0442\u043b\u0430\u043d\u0435 \u0432\u0430\u0441\u0438\u043b\u044c\u0435\u0432\u043d\u0435 \u043d\u0438\u043a\u043e\u043b\u0435\u043d\u043a\u043e", "duration": 3.1720625}
{"id": "b42d13ab3dc159fac39a1120e8011f66", "audio_filepath": "/content/test/farfield/files/b42d13ab3dc159fac39a1120e8011f66.wav", "text": "\u0441\u0430\u043b\u044e\u0442 \u0445\u0432\u0430\u0442\u0438\u0442", "duration": 1.5226875}
{"id": "7e55843e745a239b35212e2caba77239", "audio_filepath": "/content/test/farfield/files/7e55843e745a239b35212e2caba77239.wav", "text": "\u0434\u0436\

### Задаем модель при помощи YAML конфиг файла

Для обучения мы создадим модель *Jasper_4x1*, в которой будет `K=4` блоков, один (`R=1`) под-блок и декодер *greedy CTC*, используя конфиг файл в `./configs/config.yaml`.


Ниже приведен конфиг файл, давайте рассмотрим его и найдем части описанной архитектуры Jasper. Модель (model) содержит поле под названием `encoder` с под-полем `jasper` который состоит из списка полей. Каждое поле в списке задает конфигурацию блока в нашей моделе. Каждый блок выглядит примерно так:

```
- filters: 128
  repeat: 1
  kernel: [11]
  stride: [2]
  dilation: [1]
  dropout: 0.2
  residual: false
  separable: true
  se: true
  se_context_size: -1
```


Первый элемент в спике соответствует первому блоку в Jasper архитектуре.

Параметры обучающего и тестового датасета в полях (`train_ds`) и (`validation_ds`)

Конфиг в формате YAML позволяем легко и в читаемой форме читать и модифицировать модель без необходимости менять код пррограммы.

In [8]:
! cat ./configs/config.yaml

name: &name "QuartzNet15x5"
sample_rate: &sample_rate 16000
repeat: &repeat 1
dropout: &dropout 0.0
separable: &separable true
labels: &labels [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
         "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]

model:
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    batch_size: 32
    trim_silence: True
    max_duration: 16.7
    shuffle: True
    is_tarred: False
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"

  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: *labels
    batch_size: 32
    shuffle: False

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.02
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    features: &n_mels 64
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    stft_conv: fa

In [9]:
# --- Config Information ---#
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML
config_path = './configs/config.yaml'

yaml = YAML(typ='safe')
with open(config_path) as f:
    params = yaml.load(f)
    
print(params)

{'name': 'QuartzNet15x5', 'sample_rate': 16000, 'repeat': 1, 'dropout': 0.0, 'separable': True, 'labels': [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'"], 'model': {'train_ds': {'manifest_filepath': '???', 'sample_rate': 16000, 'labels': [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'"], 'batch_size': 32, 'trim_silence': True, 'max_duration': 16.7, 'shuffle': True, 'is_tarred': False, 'tarred_audio_filepaths': None, 'tarred_shard_strategy': 'scatter'}, 'validation_ds': {'manifest_filepath': '???', 'sample_rate': 16000, 'labels': [' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'"], 'batch_size': 32, 'shuffle': False}, 'preprocessor': {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'normal

In [10]:
print(params['model']['optim'])

{'name': 'novograd', 'lr': 0.01, 'betas': [0.8, 0.5], 'weight_decay': 0.001, 'sched': {'name': 'CosineAnnealing', 'monitor': 'val_loss', 'reduce_on_plateau': False, 'warmup_steps': None, 'warmup_ratio': None, 'min_lr': 0.0, 'last_epoch': -1}}


In [12]:
train_path = load("https://sc.link/1Z3", "farfield.tar")
tar = tarfile.open(data_dir + '/farfield.tar')
tar.extractall(path=data_dir)

Dataset downloaded at: /content/farfield.tar


In [11]:
%rm -rf test/crowd

In [13]:
%rm -rf farfield.tar
%rm -rf test.tar

In [18]:
import json

original_farfield_texts_files = {}
def build_manifest(manifest_rel, manifest_abs):
    manifest_path = os.path.split(os.path.abspath(manifest_rel))[0][:-8]
    print(manifest_path)
    with open(manifest_rel, 'r') as fin:
        with open(manifest_abs, 'w') as fout:
            for line in fin:
                metadata = json.loads(line)
                if 'farfield' in metadata['audio_filepath']:
                    if metadata['id'] == '375e547e51a71f9e18b811dd89244baf':
                        continue  
                    metadata["audio_filepath"]=os.path.join(manifest_path[:-6] + '/train', metadata["audio_filepath"][:-4] + 'wav')
                    original_farfield_texts_files[metadata["audio_filepath"]] = metadata["text"]
                    json.dump(metadata, fout)
                    fout.write('\n')

train_manifest = data_dir + '/drive/MyDrive/manifest.jsonl'

build_manifest(train_manifest, data_dir + '/train/farfield/manifest_abs.jsonl')

/content/drive


In [19]:
!head "/content/train/farfield/manifest_abs.jsonl"

{"id": "855e01d6ba9a4aa59950d62037b87709", "audio_filepath": "/content/train/farfield/855e01d6ba9a4aa59950d62037b87709.wav", "text": "\u0430\u0444\u0438\u043d\u0430 \u0445\u043e\u0442\u0435\u043b\u0430 \u043d\u043e\u043c\u0435\u0440 \u0442\u0435\u043b\u0435\u0444\u043e\u043d\u0430 \u043f\u043e\u043c\u0435\u043d\u044f\u0442\u044c \u043a\u0430\u0440\u0442\u044b", "duration": 5.4596875}
{"id": "f6075fae0c592183f107cf781117d9b5", "audio_filepath": "/content/train/farfield/f6075fae0c592183f107cf781117d9b5.wav", "text": "\u0441\u0430\u043b\u044e\u0442 \u0445\u043e\u0447\u0443 \u043f\u043e\u0433\u043e\u0432\u043e\u0440\u0438\u0442\u044c \u0441 \u0434\u0436\u043e\u0439", "duration": 4.15125}
{"id": "2599a6cb55795b38ddd80b849d8670bc", "audio_filepath": "/content/train/farfield/2599a6cb55795b38ddd80b849d8670bc.wav", "text": "\u0434\u0436\u043e\u0439 \u043a\u0430\u043a\u0430\u044f \u0432\u0430\u043b\u044e\u0442\u0430 \u0432 \u043a\u0430\u043c\u0431\u043e\u0434\u0436\u0435", "duration": 3.9185625}

In [20]:
files_to_transcribe = []
transcriptions = []

for file in original_farfield_texts_files.keys():
    files_to_transcribe.append(file)
transcriptions = asr_model.transcribe(paths2audio_files=files_to_transcribe)


for file_name, transcription in zip(files_to_transcribe, transcriptions):
    print(f"Audio in {file_name} was recognized as: {transcription}")

Transcribing:   0%|          | 0/31001 [00:00<?, ?it/s]

[NeMo W 2021-11-30 13:59:52 patch_utils:50] torch.stft() signature has been updated for PyTorch 1.7+
    Please update PyTorch to remain compatible with later versions of NeMo.
    To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
      return torch.floor_divide(self, other)
    


[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Audio in /content/train/farfield/7e389e9d11b24371542a19a509ec05c2.wav was recognized as: афина как стать участником бонусов спасибо
Audio in /content/train/farfield/ab72114912741b803274d430950b298e.wav was recognized as: афина сколько денег на счету у меня осталось
Audio in /content/train/farfield/baa82a1081afb4db33fa6c41aeaa2148.wav was recognized as: джой мне нужно нарастить ресницы запиши плиз на ближайшее время
Audio in /content/train/farfield/1d51dcbe6d6c3a7c7141b4e4a364c5ba.wav was recognized as: джой поставь пожалуйста будильник на семь часов утра
Audio in /content/train/farfield/52fb1e71dd009cefd6117d79b12850c0.wav was recognized as: джой первая номинация леонардо дикаприо на премию
Audio in /content/train/farfield/315c59d26108d1a0572c37a96a4f5208.wav was recognized as: афина как подключить услугу спасибо бонусы
Audio in /content/train/farfield/a11e0f0388fde9bfbe71ffa2ffd47278.wav was recognized a

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Audio in /content/train/farfield/f1ef7e52737601eef35cbebbbea77114.wav was recognized as: открой приложение банка и переведи тридцать рублей на номер карты семнадцать семьдесят
Audio in /content/train/farfield/aba5fdaf5869f05f456ce6dc39bf7dbf.wav was recognized as: возвращение блудного сына
Audio in /content/train/farfield/ee5bfca3da0db4e79b31536bc4811bf0.wav was recognized as: салют чем увлекается мальцев
Audio in /content/train/farfield/89b7fc8085c7f8c89daede00a1ae81bf.wav was recognized as: афина в каком ближайшем банкомате можно осуществить внесение наличных денег на карту
Audio in /content/train/farfield/f95bb95a9456f97eaca62bc75bf64138.wav was recognized as:  
Audio in /content/train/farfield/abc4a50d2a4ee44b1f1732e34da3501a.wav was recognized as: сбер какие виды германские премена ты знаешь
Audio in /content/train/farfield/7508223dea49a055cbf4d7293784156b.wav was recognized as: афина какая площадь у

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Audio in /content/train/farfield/8b0948a8d34f3b58c787ef33461c6be5.wav was recognized as: какими популярными цитатами из отечественных фильмов в
Audio in /content/train/farfield/2632db47a3eb7da03399fd438bf6a22c.wav was recognized as: афина что создавала практика
Audio in /content/train/farfield/53e4814cefb96104629dc3faa8b57201.wav was recognized as: афина кто отец светланы керсановой
Audio in /content/train/farfield/d0b7ffeba110c6d2a54f7c0576c685a6.wav was recognized as: афина на каком языке говорят в хеттское царство
Audio in /content/train/farfield/399ba06f2cb3aecef38378927bde4210.wav was recognized as: афина почему никогда не помнишь момент тогда заснул
Audio in /content/train/farfield/8b6a0c0a32056c6c693957e222692121.wav was recognized as: сбер у тебя есть пара или ты свободна
Audio in /content/train/farfield/5fad40a6458b0fec25084ca98271d6e8.wav was recognized as: драма титаник
Audio in /content/train/

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Audio in /content/train/farfield/a445afe01c94714f1e253bc14e6d4997.wav was recognized as: сбер выключи будильник если он зазвонит в пятнадцать часов
Audio in /content/train/farfield/14e3d01775a35a03b2bd76c838a8f9d2.wav was recognized as: ты одинок можно с тобой познакомиться
Audio in /content/train/farfield/21f500fede0e2db79be9b5751625355e.wav was recognized as: ангар восемнадцать
Audio in /content/train/farfield/994fe90bedcd3ef4aa85a9c601b215f7.wav was recognized as: джой позвони с видео контакту
Audio in /content/train/farfield/37c86c482227dcb8526f3832815b8191.wav was recognized as: джой сколько литров в декалитре
Audio in /content/train/farfield/5301f7a18401dbc486f8bb5155512cc0.wav was recognized as: салют а в течение ближайшего часа пойдет ли снег
Audio in /content/train/farfield/0a85674568293b2d29465339d5c35298.wav was recognized as: джой пожалуйста оплати сотовую связь моего друга
Audio in /content/train/farfield/f4d10a0a5a7402dc2eaba5ab45e0dad1.wav was recognized as: найти фильм 

In [21]:
transcriptions_from_model = {}
for id, file_name in enumerate(files_to_transcribe):
    transcriptions_from_model[file_name] = transcriptions[id]

In [22]:
transcriptions_from_model

{'/content/train/farfield/855e01d6ba9a4aa59950d62037b87709.wav': 'афина хотела номер телефона поменять карты',
 '/content/train/farfield/f6075fae0c592183f107cf781117d9b5.wav': 'салют хочу поговорить с джой',
 '/content/train/farfield/2599a6cb55795b38ddd80b849d8670bc.wav': 'джой какая валюта в камбодже',
 '/content/train/farfield/5f7483035b2f26b773b0d3fb125b779d.wav': 'салют я могу что нибудь сделать',
 '/content/train/farfield/4c95a12b71e1262ac937d2fe6a803906.wav': 'джой смотреть операцию ы и другие приключения шурика онлайн',
 '/content/train/farfield/90655e9a042c4e79c424cd7dd82b37a6.wav': 'салют на каком языке создана кавка на пляже',
 '/content/train/farfield/14b5e99616d446c3887c8927403632a0.wav': 'сбер чем платить на канарах',
 '/content/train/farfield/d71107319855ef21249be2ab0edea9c2.wav': 'афина на пару минут перемотай вперед',
 '/content/train/farfield/566e4f7d788d755e2fa76ab57837d7dd.wav': 'афина площадь королевства тонга',
 '/content/train/farfield/40bf19adace8d787cd3c63784ed0

In [23]:
!pip install Levenshtein
import Levenshtein as lev

Collecting Levenshtein
  Downloading Levenshtein-0.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[K     |████████████████████████████████| 110 kB 5.3 MB/s 
Installing collected packages: Levenshtein
Successfully installed Levenshtein-0.16.0


In [27]:
bad_farfield_texts_files = []
for file_id in original_farfield_texts_files.keys():
    Ratio = lev.ratio(original_farfield_texts_files[file_id], transcriptions_from_model[file_id])
    if Ratio < 0.93:
        bad_farfield_texts_files.append(file_id.split('/')[-1][:-4])

print(len(bad_farfield_texts_files))


795


In [28]:
def build_bad_transcribtions_manifest(manifest_rel, manifest_abs):
    manifest_path = os.path.split(os.path.abspath(manifest_rel))[0]
    print(manifest_path)
    with open(manifest_rel, 'r') as fin:
        with open(manifest_abs, 'w') as fout:
            for line in fin:
                metadata = json.loads(line)
                if metadata['id'] in bad_farfield_texts_files:
                    metadata["audio_filepath"]=os.path.join(manifest_path, metadata["audio_filepath"])
                    json.dump(metadata, fout)
                    fout.write('\n')


build_bad_transcribtions_manifest("/content/train/farfield/manifest_abs.jsonl", "/content/train/farfield/train_bad_manifest.jsonl")
train_bad_manifest = "/content/train/farfield/train_bad_manifest.jsonl"

/content/train/farfield


In [49]:
from omegaconf import DictConfig
params_model = asr_model._cfg

# --- Config Information ---#
# try:
#     from ruamel.yaml import YAML
# except ModuleNotFoundError:
#     from ruamel_yaml import YAML
# config_path = '/usr/local/lib/python3.7/dist-packages/nemo/collections/asr/models/configs/'

# yaml = YAML(typ='safe')
# with open(config_path) as f:
#     params = yaml.load(f)
# print(params)
params_model['train_ds']['parser']='base'
params_model['validation_ds']['parser']='base'
params_model['test_ds']['parser']='base'
params_model['train_ds']['manifest_filepath'] = train_bad_manifest
params_model['train_ds']['batch_size']=16
params_model['validation_ds']['manifest_filepath'] = test_manifest
params_model['validation_ds']['batch_size']=8
params_model['test_ds']['manifest_filepath'] = test_manifest
params_model['test_ds']['batch_size']=8

In [50]:
import copy
new_opt = copy.deepcopy(params_model['optim'])
new_opt['lr'] = 0.0007
asr_model.setup_optimization(optim_config=DictConfig(new_opt))
# And then you can invoke trainer.fit(first_asr_model)

[NeMo W 2021-11-30 16:10:03 modelPT:436] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2021-11-30 16:10:03 modelPT:544] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        eps: 1e-08
        grad_averaging: False
        lr: 0.0007
        weight_decay: 0.001
    )


[NeMo W 2021-11-30 16:10:03 lr_scheduler:605] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


(Novograd (
 Parameter Group 0
     amsgrad: False
     betas: [0.9, 0.98]
     eps: 1e-08
     grad_averaging: False
     lr: 0.0007
     weight_decay: 0.001
 ), None)

In [51]:
asr_model.setup_training_data(train_data_config=params_model['train_ds'])
asr_model.setup_test_data(test_data_config=params_model['test_ds'])
# Point to the new validation data for fine-tuning
asr_model.setup_validation_data(val_data_config=params_model['validation_ds'])

[NeMo I 2021-11-30 16:10:07 collections:173] Dataset loaded with 792 files totalling 0.78 hours
[NeMo I 2021-11-30 16:10:07 collections:174] 3 files were filtered totalling 0.01 hours


      cpuset_checked))
    


[NeMo I 2021-11-30 16:10:07 collections:173] Dataset loaded with 1916 files totalling 1.41 hours
[NeMo I 2021-11-30 16:10:07 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2021-11-30 16:10:07 collections:173] Dataset loaded with 1916 files totalling 1.41 hours
[NeMo I 2021-11-30 16:10:07 collections:174] 0 files were filtered totalling 0.00 hours


In [52]:
# Bigger batch-size = bigger throughput
# params['model']['validation_ds']['batch_size'] = 16
# Setup the test data loader and make sure the model is on GPU
# first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])
asr_model.cuda()

# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.
# WER is computed as numerator/denominator.
# We'll gather all the test batches' numerators and denominators.
wer_nums = []
wer_denoms = []

# Loop over all test batches.
# Iterating over the model's `test_dataloader` will give us:
# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)
# See the AudioToCharDataset for more details.
for test_batch in asr_model.test_dataloader():
        test_batch = [x.cuda() for x in test_batch]
        targets = test_batch[2]
        targets_lengths = test_batch[3]        
        log_probs, encoded_len, greedy_predictions = asr_model(
            input_signal=test_batch[0], input_signal_length=test_batch[1]
        )
        # Notice the model has a helper object to compute WER
        asr_model._wer.update(greedy_predictions, targets, targets_lengths)
        _, wer_num, wer_denom = asr_model._wer.compute()
        asr_model._wer.reset()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())

        # Release tensors from GPU memory
        del test_batch, log_probs, targets, targets_lengths, encoded_len, greedy_predictions

# We need to sum all numerators and denominators first. Then divide.
print(f"WER = {sum(wer_nums)/sum(wer_denoms)}")

WER = 0.7424242424242424


In [53]:
import pytorch_lightning as pl
trainer = pl.Trainer(gpus=1, max_epochs=6, precision=16)

Using native 16bit precision.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [54]:
trainer.fit(asr_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[NeMo W 2021-11-30 16:11:14 modelPT:436] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2021-11-30 16:11:14 modelPT:544] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        eps: 1e-08
        grad_averaging: False
        lr: 0.0007
        weight_decay: 0.001
    )


[NeMo W 2021-11-30 16:11:14 lr_scheduler:605] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !

  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConvASREncoder                    | 18.9 M
2 | decoder           | ConvASRDecoder                    | 34.9 K
3 | loss              | CTCLoss                           | 0     
4 | spec_augmentation | SpectrogramAugmentation           | 0     
5 | _wer              | WER                               | 0     
------------------------------------------------------------------------
18.9 M    Trainable params
0         Non-trainable params
18.9 M    Total params
75.718    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

      cpuset_checked))
    


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [55]:
asr_model.cuda()

# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.
# WER is computed as numerator/denominator.
# We'll gather all the test batches' numerators and denominators.
wer_nums = []
wer_denoms = []

# Loop over all test batches.
# Iterating over the model's `test_dataloader` will give us:
# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)
# See the AudioToCharDataset for more details.
for test_batch in asr_model.test_dataloader():
        test_batch = [x.cuda() for x in test_batch]
        targets = test_batch[2]
        targets_lengths = test_batch[3]        
        log_probs, encoded_len, greedy_predictions = asr_model(
            input_signal=test_batch[0], input_signal_length=test_batch[1]
        )
        # Notice the model has a helper object to compute WER
        asr_model._wer.update(greedy_predictions, targets, targets_lengths)
        _, wer_num, wer_denom = asr_model._wer.compute()
        asr_model._wer.reset()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())

        # Release tensors from GPU memory
        del test_batch, log_probs, targets, targets_lengths, encoded_len, greedy_predictions

# We need to sum all numerators and denominators first. Then divide.
print(f"WER = {sum(wer_nums)/sum(wer_denoms)}")

WER = 0.7044858029109997
