## **Wav2Vec2 boosted with n-gram Language Model**

In [47]:
import re
import numpy as np
import torch

from IPython.display import display, HTML
import IPython.display as ipd

from datasets import load_dataset, Audio

from transformers import AutoProcessor, Wav2Vec2ProcessorWithLM, pipeline

import pyctcdecode
from pyctcdecode import build_ctcdecoder

### Generate KenLM statistical n-gram model

In [11]:
# Read the training split of the local "data_gtts" dataset. Later cells pull
# the transcriptions out of it and write them to a text file.
print("Loading data...")

dataset = load_dataset(
    "data_gtts",
    data_dir="./data_gtts",
    split="train",
)

Using custom data configuration default-data_dir=.%2Fdata_gtts
Found cached dataset data_gtts (I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68)


Loading data...


In [12]:
# Clean data and extract transcription
# Raw string: the pattern contains regex escapes (\-, \;, \:) that are not valid
# Python string escapes, so a plain literal triggers an invalid-escape warning.
chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'

def extract_text(batch):
    """Add a normalized ``text`` field derived from ``batch["transcription"]``.

    The transcription is lower-cased and every character matched by
    ``chars_to_ignore_regex`` is removed.

    Args:
        batch: Mapping with a string stored under the ``"transcription"`` key.

    Returns:
        The same ``batch`` object, with ``batch["text"]`` set.
    """
    transcription = batch["transcription"]
    batch["text"] = re.sub(chars_to_ignore_regex, "", transcription.lower())
    return batch

# Run the cleaning step over every row and drop all of the original columns,
# which leaves only the newly added "text" column.
dataset = dataset.map(
    extract_text,
    remove_columns=dataset.column_names,
)

dataset

Loading cached processed dataset at I:/Repos/HFdatasets/data_gtts/default-data_dir=.%2Fdata_gtts/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68\cache-e3a5e81e4a3e1a58.arrow


Dataset({
    features: ['text'],
    num_rows: 1000
})

In [14]:
# Combine all strings (separated by single spaces) and save them to one file.
# The encoding is set explicitly so the bytes written do not depend on the
# platform's default text encoding.
with open("transcription_text.txt", "w", encoding="utf-8") as file:
    file.write(" ".join(dataset["text"]))

In [15]:
# Set to False to skip rewriting the ARPA file.
correct_arpa = True

# Copy "5gram.arpa" to "5gram_correct.arpa" while adding an end-of-sentence token:
#   * the unigram count in the header ("ngram 1=N") is incremented by one, and
#   * the first line containing "<s>" is written twice, the second time with
#     "<s>" replaced by "</s>".
# NOTE(review): "5gram.arpa" is not created anywhere in this notebook; presumably
# it was built with KenLM from "transcription_text.txt" — confirm.
if correct_arpa:
    with open("5gram.arpa", "r", encoding="utf-8") as read_file, \
            open("5gram_correct.arpa", "w", encoding="utf-8") as write_file:
        has_added_eos = False
        for line in read_file:
            if not has_added_eos and "ngram 1=" in line:
                # Rewrite only the number after the last "=". Replacing the count
                # across the whole line would also hit the "1" in "ngram 1=" when
                # the count itself is "1" (giving "ngram 2=2").
                head, sep, tail = line.rpartition("=")
                count = tail.strip()
                write_file.write(head + sep + tail.replace(count, f"{int(count) + 1}", 1))
            elif not has_added_eos and "<s>" in line:
                # Keep the "<s>" entry and add a matching "</s>" entry after it.
                write_file.write(line)
                write_file.write(line.replace("<s>", "</s>"))
                has_added_eos = True
            else:
                write_file.write(line)

### Combine Wav2Vec2 with the n-gram language model

In [35]:
# Load the dataset again (no split is selected this time) and have the "audio"
# column resampled to 16 kHz.
# NOTE(review): data_dir is an absolute, machine-specific path — confirm it is
# intended rather than a path relative to the notebook.
dataset_tmp = load_dataset(
    "data_gtts",
    data_dir="I:/Repos/STT_FineTune/nats/data_gtts",
).cast_column("audio", Audio(sampling_rate=16000))

Using custom data configuration default-9b41ac5bfd8c70c2
Found cached dataset data_gtts (I:/Repos/HFdatasets/data_gtts/default-9b41ac5bfd8c70c2/0.1.0/99611922a2fe30672e990db44b070dc747a16dd2cb691d0d2c33dc670a2e3b68)


  0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
# Load the processor saved under "./model_wav2vec2". Its feature_extractor and
# tokenizer attributes are reused by later cells.
processor = AutoProcessor.from_pretrained("./model_wav2vec2")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# Prepare the vocabulary for pyctcdecode: take the tokenizer's token -> id
# mapping, order it by id, and lower-case every token.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = {
    token.lower(): token_id
    for token, token_id in sorted(vocab_dict.items(), key=lambda entry: entry[1])
}

In [21]:
# Create the pyctcdecode beam-search decoder. Labels are the vocabulary tokens
# in id order; the language model is read from the corrected ARPA file.
decoder = build_ctcdecoder(
    labels=list(sorted_vocab_dict),
    kenlm_model_path="5gram_correct.arpa",
)

Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Only 70 unigrams passed as vocabulary. Is this small or artificial data?


In [26]:
# Bundle the existing feature extractor and tokenizer with the LM decoder into
# a single processor, then save it to "./model_wav2vec2_lm/".
processor_with_lm = Wav2Vec2ProcessorWithLM(
    decoder=decoder,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)
processor_with_lm.save_pretrained("./model_wav2vec2_lm/")

In [48]:
# Reload the saved LM processor from disk.
# NOTE(review): the recorded run of this cell raised an ImportError about
# pyctcdecode, and processor_tmp is not used by any later cell shown here.
processor_tmp = Wav2Vec2ProcessorWithLM.from_pretrained("./model_wav2vec2_lm/")

ImportError: 
Wav2Vec2ProcessorWithLM requires the pyctcdecode library but it was not found in your environment. You can install it with pip:
`pip install pyctcdecode`. Please note that you may need to restart your runtime after installation.


In [44]:
# Build a speech-recognition pipeline from "./model_wav2vec2_lm/" and show the
# pipeline object's attribute dictionary.
pipe = pipeline(
    'automatic-speech-recognition',
    model='./model_wav2vec2_lm/',
)
vars(pipe)

Only 69 unigrams passed as vocabulary. Is this small or artificial data?


{'task': 'automatic-speech-recognition',
 'model': Wav2Vec2ForCTC(
   (wav2vec2): Wav2Vec2Model(
     (feature_extractor): Wav2Vec2FeatureEncoder(
       (conv_layers): ModuleList(
         (0): Wav2Vec2GroupNormConvLayer(
           (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
           (activation): GELUActivation()
           (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
         )
         (1): Wav2Vec2NoLayerNormConvLayer(
           (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
           (activation): GELUActivation()
         )
         (2): Wav2Vec2NoLayerNormConvLayer(
           (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
           (activation): GELUActivation()
         )
         (3): Wav2Vec2NoLayerNormConvLayer(
           (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
           (activation): GELUActivation()
         )
         (4): Wav2Vec2NoLayerNormConvLayer(
     

In [45]:
# Inference on one randomly chosen test example.
idx = np.random.randint(0, len(dataset_tmp['test']))

# Fetch the row once and reuse it: every dataset_tmp["test"][idx] lookup builds
# the row again (including its audio), so repeated indexing is wasted work.
sample = dataset_tmp["test"][idx]
sample_audio = sample["audio"]

# Play the clip at the sampling rate stored alongside the waveform.
display(ipd.Audio(data=np.asarray(sample_audio["array"]), autoplay=False, rate=sample_audio["sampling_rate"]))

# Print label
print(f"Label: {sample['transcription'].lower()}")

# Print prediction
output = pipe(sample_audio["array"])
output

Label: astraeus four quebec victor fly heading one one fife degrees


{'text': 'astraeus four quebec victor fly heading one one fife degrees'}