<a href="https://colab.research.google.com/github/indra622/AIAcademy_SpeechRecognition/blob/main/nemo_playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NeMo Installation

In [None]:
!pip install nemo_toolkit['all']

# Pre-trained model 

Model: Conformer-large (https://www.isca-speech.org/archive/interspeech_2020/gulati20_interspeech.html)

Dataset: LibriSpeech (https://openslr.org/12/)

Tokenizer: Byte-pair Encoding by Sentencepiece (https://github.com/google/sentencepiece)



## Model load

In [None]:
import nemo.collections.asr as nemo_asr

In [None]:
# Instantiate the pre-trained CTC/BPE Conformer model by its published name.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
  model_name="stt_en_conformer_ctc_large_ls",
)

## Tokenizer and Vocabulary

### Tokenizer



In [None]:
# Reach through the model's tokenizer wrapper to the inner tokenizer object,
# then show how it splits a sample phrase into subword pieces.
tokenizer = asr_model.tokenizer.tokenizer
sample_pieces = tokenizer.encode_as_pieces('hello world')
print(sample_pieces)

['▁he', 'll', 'o', '▁w', 'or', 'l', 'd']


### output units

vocab: 1 + 127 + 1 = 129 output units (`<unk>` + 127 BPE tokens + CTC blank)


In [None]:
# List of subword tokens exposed by the model's tokenizer wrapper.
# It is printed in full here and reused below to write tokens.txt.
vocab = asr_model.tokenizer.vocab
print(vocab)

['<unk>', 'e', 's', '▁', 't', 'a', 'o', 'i', '▁the', 'd', 'l', 'n', '▁a', 'm', 'y', 'u', '▁s', 'p', 'ed', 'c', '▁and', 're', '▁to', '▁of', 'r', 'w', 'ing', '▁w', 'h', '▁p', '▁c', 'er', 'f', 'k', 'ar', '▁in', '▁f', '▁b', 'g', 'an', 'in', '▁i', 'en', '▁he', 'le', '▁g', 'or', 'll', 'b', '▁be', 'ro', 'st', 'on', '▁d', 'v', 'ly', 'ce', 'ur', 'es', '▁that', '▁o', 'us', '▁was', '▁it', '▁th', 've', 'ch', 'un', 'al', '▁t', '▁ma', 'ri', '▁you', '▁on', 'ver', 'ent', '▁for', '▁re', 'ra', "'", '▁his', 'ir', 'ter', '▁with', '▁her', 'it', 'th', '▁mo', '▁me', '▁ha', '▁e', '▁as', 'tion', '▁had', '▁not', '▁no', '▁do', 'ther', '▁but', '▁st', '▁she', '▁is', 'igh', '▁ho', '▁lo', 'ng', '▁him', '▁an', 'ck', 'j', 'ugh', '▁de', '▁li', '▁mi', '▁la', '▁my', '▁con', '▁have', '▁this', '▁which', 'q', '▁up', '▁said', '▁from', '▁who', '▁ex', 'x', 'z']


In [None]:
# Write the token table, one "<token> <index>" entry per line.
# The vocabulary contains non-ASCII pieces (e.g. '▁'), so the encoding is pinned to
# UTF-8 instead of relying on the platform default (matches the lexicon.txt writer).
with open('tokens.txt', 'w', encoding='utf-8') as f:
  for index, token in enumerate(vocab):
    f.write(f"{token} {index}\n")

### words

`word.raw`: a list of all words in the language model vocabulary (LibriSpeech dataset), one word per line


In [None]:
# Load the word list: one entry per line, with the line terminators stripped.
# The encoding is given explicitly so the result does not depend on the platform default.
with open('word.raw', 'r', encoding='utf-8') as f:
  wlist = f.read().splitlines()


In [None]:
# Preview only the first few entries; the full list is far too long to display usefully.
wlist[:10]

In [None]:
# Write the word table, one "<word> <index>" entry per line.
# UTF-8 is set explicitly so the file is identical on every platform.
with open('words.txt', 'w', encoding='utf-8') as f:
  for index, word in enumerate(wlist):
    f.write(f"{word} {index}\n")

## Lexicon

### wordpiece

In [None]:
# Tokenize every word into its subword pieces, keeping the order of wlist.
pieces = [tokenizer.encode_as_pieces(word) for word in wlist]

### Lexicon format

lexicon: each line maps a word to its wordpiece token sequence (special symbols are excluded)

In [None]:
# Pair each word with the subword pieces produced for it.
lexicon = [(word, word_pieces) for word, word_pieces in zip(wlist, pieces)]

In [None]:
# Emit one "<word> <piece> <piece> ..." line per lexicon entry.
# The [1:-1] slice drops the first and last entries (special symbol removal);
# NOTE(review): this assumes word.raw starts and ends with exactly one special symbol — confirm.
with open('lexicon.txt', "w", encoding="utf-8") as lexicon_file:
  lexicon_file.writelines(
    f"{word} {' '.join(tokens)}\n" for word, tokens in lexicon[1:-1]
  )

In [None]:
# Number of (word, pieces) pairs, including the entries skipped when writing lexicon.txt.
len(lexicon)

976868

# Test set logits

## Test set load

Librispeech test-clean set

HuggingFace URL: https://huggingface.co/datasets/kresnik/librispeech_asr_test



In [None]:
!pip install datasets
from datasets import load_dataset
ds = load_dataset("kresnik/librispeech_asr_test", "clean")
test_ds = ds['test']
fl = test_ds['file']

## logit extraction

Extract log-probabilities for the Hugging Face test set using the NeMo model.

In [None]:
# Run the model over every test file with logprobs=True to get
# per-utterance log-probabilities.
r = asr_model.transcribe(
  fl,
  logprobs=True,
)

## save logits

save logits as pt file

In [None]:
import torch

# Convert each per-utterance result into a torch tensor, preserving order.
out_list = [torch.tensor(utt_logprobs) for utt_logprobs in r]

In [None]:
# Preview only the first few tensors instead of dumping the whole list.
out_list[:3]

In [None]:
# Serialize the whole list of tensors into a single file named 'logits.pt'.
torch.save(out_list, 'logits.pt')

In [None]:
# Shape of the first utterance's tensor.
# NOTE(review): presumably (time frames, output units) — confirm against the model.
out_list[0].shape

torch.Size([303, 129])

## save reference

save reference text

In [None]:
# Write the reference transcripts, one utterance per line, in dataset order.
# UTF-8 is set explicitly so the output does not depend on the platform default encoding.
with open('ref.txt', 'w', encoding='utf-8') as f:
  for transcript in test_ds['text']:
    f.write(transcript + '\n')