In [None]:
from pathlib import Path
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2ProcessorWithLM
import re
from utils import load_custom_dataset, fix_arpa_file
from pyctcdecode import build_ctcdecoder
from datasets import load_dataset

In [None]:
# --- Dataset selection -------------------------------------------------------
dataset_name = "yale_econ251"
dataset_size = "normal" # or 'tiny'

# --- Folder layout -----------------------------------------------------------
# The project root is taken to be the parent of the current working directory.
base_directory = Path.cwd().parent
data_dir = base_directory / 'data'

# Only the exact value "tiny" selects the reduced folders; every other value
# falls back to the full-size lectures/transcripts folders.
use_tiny = dataset_size == "tiny"
inputs_dir = data_dir / 'inputs' / dataset_name
audio_dir = inputs_dir / ('lectures-tiny' if use_tiny else 'lectures')
transcripts_dir = inputs_dir / ('transcripts-tiny' if use_tiny else 'transcripts')

# Predictions are grouped per dataset.
predictions_dir = data_dir / 'predictions' / dataset_name

###  Extract all text from dataset

In [None]:
# Build the dataset from the audio and transcript folders chosen in the config cell.
# NOTE(review): load_custom_dataset comes from the local `utils` module; the cells
# below rely on the result exposing .map(), .column_names and a "sentence" field
# per example — confirm against utils.
dataset = load_custom_dataset(audio_dir, transcripts_dir)

In [None]:
# Punctuation and special symbols stripped from the transcripts before they are
# used as language-model training text. Change this set to match the characters
# that were ignored when your acoustic model was fine-tuned.
# Raw string: the regex escapes (\-, \;, \:) are not valid Python string escapes,
# so a plain literal triggers an "invalid escape sequence" warning on newer
# Python versions. The compiled pattern is unchanged.
chars_to_ignore_regex = r'[,?.!\-\;\:"“%‘”�—’…–]'


def extract_text(batch):
    """Add a normalised ``"txt"`` field to a single dataset example.

    Args:
        batch: Mapping with a ``"sentence"`` key holding the transcript text.

    Returns:
        The same ``batch`` object (modified in place), where ``batch["txt"]`` is
        the lowercased sentence with every character matched by
        ``chars_to_ignore_regex`` removed.

    Raises:
        KeyError: If ``batch`` has no ``"sentence"`` entry.
    """
    text = batch["sentence"]
    batch["txt"] = re.sub(chars_to_ignore_regex, "", text.lower())
    return batch


In [None]:
# Run extract_text over every example and drop all original columns, leaving
# only the cleaned "txt" field that extract_text adds.
# NOTE(review): this rebinds `dataset`, so re-running this cell without first
# reloading the dataset would presumably fail because the "sentence" column
# has been removed — confirm.
dataset = dataset.map(extract_text, remove_columns=dataset.column_names)

In [None]:
# Save the whole cleaned corpus as one space-separated text file; this file is
# the training input for the language model, and `arpa_file` is where the
# resulting n-gram model will be written.
full_text_file = data_dir / 'inputs' / dataset_name / "full_text.txt"
arpa_file = data_dir / 'inputs' / dataset_name / "5gram.arpa"
# Explicit UTF-8: without it the platform default encoding is used, which can
# fail or produce a differently-encoded file for non-ASCII transcript text.
with open(full_text_file, "w", encoding="utf-8") as file:
    file.write(" ".join(dataset["txt"]))

### Create the ARPA file

In [None]:
# create the arpa file
!../kenlm/build/bin/lmplz -o 5 <{full_text_file} > {arpa_file} --skip_symbols

In [None]:
# Produce a corrected copy of the generated ARPA file in the same inputs folder.
# NOTE(review): what exactly gets corrected is defined in utils.fix_arpa_file —
# check there before relying on specifics.
arpa_file_correct = data_dir.joinpath('inputs', dataset_name, "5gram_correct.arpa")
fix_arpa_file(arpa_file, arpa_file_correct)

### Create the processor with the language model

In [None]:
# Checkpoint whose processor (feature extractor + tokenizer) is reused by the
# cells below; the same name is later passed to the ASR pipeline as the model.
model_name = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
# Token -> id vocabulary of the tokenizer, re-ordered by ascending token id so
# that the label list built from it follows the tokenizer's id order.
# NOTE(review): the LM corpus is lowercased in extract_text; confirm the
# tokenizer vocabulary uses the same casing.
vocab_dict = processor.tokenizer.get_vocab()
sorted_vocab_dict = dict(
    sorted(vocab_dict.items(), key=lambda token_and_id: token_and_id[1])
)

In [None]:
# Labels for the CTC decoder: the vocabulary tokens in their sorted (id) order.
ctc_labels = list(sorted_vocab_dict)

# Beam-search decoder backed by the corrected KenLM ARPA model.
decoder = build_ctcdecoder(labels=ctc_labels, kenlm_model_path=str(arpa_file_correct))

In [None]:
# Bundle the original feature extractor and tokenizer with the LM-backed
# decoder into a single processor object.
lm_processor_parts = {
    "feature_extractor": processor.feature_extractor,
    "tokenizer": processor.tokenizer,
    "decoder": decoder,
}
processor_with_lm = Wav2Vec2ProcessorWithLM(**lm_processor_parts)

### Predict using the model with the language model

In [None]:
from transformers import pipeline
import torch

# Run on the first GPU when CUDA is available; otherwise fall back to the CPU
# (device=-1) instead of failing on machines without a GPU.
asr_device = 0 if torch.cuda.is_available() else -1

# Speech-recognition pipeline that decodes with the KenLM-backed CTC decoder.
pipe = pipeline("automatic-speech-recognition",
                model=model_name,
                tokenizer=processor_with_lm.tokenizer,
                feature_extractor=processor_with_lm.feature_extractor,
                decoder=decoder, device=asr_device)

In [None]:
import time

# Sub-folder for this model's predictions: the last component of the model
# identifier (e.g. "facebook/wav2vec2-base" -> "wav2vec2-base"). Using [-1]
# also works for identifiers that contain no "/".
model_dir = model_name.split('/')[-1]
chunk_length = 10  # passed to the pipeline as chunk_length_s (seconds)

# Lecture numbers to transcribe. Extend the list (e.g. '07', '08', ..., '26')
# to process more lectures.
selected_files = ['06']

# Create the output folder up front so a missing directory cannot make the
# write fail after the expensive recognition step.
output_dir = predictions_dir / model_dir
output_dir.mkdir(parents=True, exist_ok=True)

for audio_file in sorted(audio_dir.glob('*.mp3')):
    # The lecture number is the second "_"-separated part of the file stem;
    # files that do not follow that naming scheme are skipped.
    stem_parts = audio_file.stem.split('_')
    if len(stem_parts) < 2:
        continue
    file_no = stem_parts[1]
    if file_no not in selected_files:
        continue

    file_name = 'pred_' + file_no + "_" + dataset_size + '_with_lm'
    out_file = (output_dir / file_name).with_suffix('.txt')

    # Existing predictions are never overwritten, so check before running the
    # recognition rather than after it.
    if out_file.is_file():
        print(f"{out_file} already exists")
        continue

    print(f"Extracting text from speech for {audio_file}")
    start = time.time()
    output = pipe(str(audio_file), chunk_length_s=chunk_length)
    text = output['text']
    time_taken = time.time() - start
    print(f"Speech recognition took {time_taken} seconds")

    print('Saving text to file')
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(text)
    # Report success only once the file has actually been written.
    print(f"Saved at {out_file}")