In [13]:
!pip install https://github.com/kpu/kenlm/archive/master.zip pyctcdecode

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Using cached https://github.com/kpu/kenlm/archive/master.zip


In [1]:
import json
import random
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union
import jiwer
import numpy as np
import torch
from datasets import Dataset, Audio
from datasets import load_metric
# from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import Wav2Vec2ProcessorWithLM

ModuleNotFoundError: No module named 'jiwer'

In [2]:
base_directory = Path.cwd().parent

dataset_name = "yale/econ251"
data_dir = base_directory / 'data'
# audio_dir = data_dir / 'inputs' / dataset_name / 'lectures'
audio_dir = data_dir / 'inputs' / dataset_name / 'lectures-tiny'
predictions_dir = data_dir / 'predictions' / dataset_name

# transcripts_dir = data_dir / 'inputs' / dataset_name / 'transcripts'
transcripts_dir = data_dir / 'inputs' / dataset_name / 'transcripts-tiny'

In [3]:
txt_files = [str(text_file) for text_file in transcripts_dir.glob('*.txt') if
             'tiny' in str(text_file)]
# txt_files = sorted(txt_files)[:7]

mp3_files = [str(audio_file) for audio_file in audio_dir.glob('*.mp3') if
             'tiny' in str(audio_file)]
# mp3_files = sorted(mp3_files)[:7]

data_dict = {
    'mp3': mp3_files,
    'txt': txt_files,
}

dataset = Dataset.from_dict(data_dict, split="all")
dataset = dataset.train_test_split(test_size=0.2)
dataset = dataset.cast_column("mp3", Audio(sampling_rate=16_000))

In [4]:

# chars_to_ignore_regex = '[\,\?\.\!\-\;\:"]'
# chars_to_ignore_regex = '[\,\?\.\!\-\;\:\½"]'

# ignore_list = ['½', 'à', 'â', 'é', 'ï', '–', '—', '‘', '’', '“', '”', '…<', '=', '>',
#                '$', '%', '&', '(', ')', '+', '/', '0', '1', '2', '3', '4', '5', '6',
#                '7', '8', '9']
# '%': 'percent',
# '$': 'dollar',
# '+': 'plus',
# '-': 'minus',
# '½': 'half',

chars_to_ignore_regex = "[\,\?\.\!\-\;\:\"½+-0123456789&%$()=><…—–\n]"
#
replace_dict = {
    'à': 'a',
    'â': 'a',
    'é': 'e',
    'ï': 'i',
    '”': '"',
    '“': '"',
    '‘': "'",
    '’': "'",
}


def retrieve_text(batch):
    # load the contents of the file as a string
    txt_file = batch["txt"]
    with open(txt_file, 'r') as f:
        text = f.read()

    for k, v in replace_dict.items():
        text = text.replace(k, v)

    # text = re.sub('[\n]', ' ', text)

    # text = re.sub(chars_to_replace_1, '"', text)

    # do some processing
    batch["txt"] = re.sub(chars_to_ignore_regex, ' ', text).lower()
    return batch


dataset = dataset.map(retrieve_text)

dataset["train"][0]["txt"][:50]


  0%|          | 0/20 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ex/s]

' okay  but now i want to move to the next topic  w'

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['mp3', 'txt'],
        num_rows: 20
    })
    test: Dataset({
        features: ['mp3', 'txt'],
        num_rows: 6
    })
})

In [6]:
audio_sample = dataset["train"][2]
audio_sample["txt"].lower()

" so we're talking now about mortgages and how to value them  and if you remember now a mortgage so the first mortgages  by the way  that we know of  come from babylonian times  it's not like some american invented the mortgage or something   this was             years old and we have on these cuneiform tablets these mortgages  and so the idea of a mortgage is you make a promise  you back your promise with collateral  so if you don't keep the promise they can take your house  and there's some way of getting out of the promise because everybody knows the collateral  you might want to leave the home  and then you have to have some way of dissolving the promise because the promise involves many payments over time   so it's making a promise  backing it with collateral  and finding a way to dissolve the promise at prearranged terms in case you want to end it by prepaying  and that prepaying is called the refinancing option  and because there's a refinancing option it makes the mortgage a mu

In [7]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-100h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-100h")

Some weights of the model checkpoint at facebook/wav2vec2-base-100h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.mask_time_emb_vector']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-100h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
inputs = processor(audio_sample["mp3"]["array"], sampling_rate=audio_sample["mp3"]["sampling_rate"], return_tensors="pt")

In [None]:
with torch.no_grad():
    logits = model(**inputs).logits


In [None]:
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)

no_lm = transcription[0].lower()

## NGRAM

In [11]:
processor = Wav2Vec2ProcessorWithLM.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")

In [12]:
" ".join(sorted(processor.tokenizer.get_vocab()))

"' </s> <pad> <s> <unk> A B C D E F G H I J K L M N O P Q R S T U V W X Y Z |"

In [13]:
transcription = processor.batch_decode(logits.numpy()).text
with_lm = transcription[0].lower()

In [16]:
gt = """Time to start. So this class and the next class and a half are going to be about Fisher’s theory of present value and the interest rate, and then we’re going to move to uncertainty.

So up until now what we’ve done is we found out first, if you know the whole economic system, how to solve for equilibrium. To figure out from the primitives of people’s tastes, their impatience, the technology, the economy, how to figure out the real rate of interest if provided there is no uncertainty in the world and people can forecast what is going to happen later.

We’ve found that once you’ve done that, the price of every asset, if people are rational and looking forward to the future, the price of every asset is going to be the present value of the future payments of the asset. So if you think of the payments as real payments, which is what Fisher always recommended, you discount by the real interest rate. If you think of them as cash payments then you discount by the nominal interest rate. So every asset corresponds to its present value of its dividends either discounted by the real rate or the nominal rate."""


In [21]:
transformation = jiwer.Compose([
    jiwer.ToUpperCase(),
    jiwer.RemoveWhiteSpace(replace_by_space=True),
    jiwer.RemoveMultipleSpaces(),
    jiwer.RemovePunctuation(),
    jiwer.ReduceToListOfListOfWords(word_delimiter=" ")
])

In [22]:
print(jiwer.compute_measures(gt,
                             no_lm,
                             truth_transform=transformation,
                             hypothesis_transform=transformation))

{'wer': 1.3349753694581281, 'mer': 0.9033333333333333, 'wil': 0.9861904761904762, 'wip': 0.013809523809523808, 'hits': 29, 'substitutions': 174, 'deletions': 0, 'insertions': 97}


In [23]:
print(jiwer.compute_measures(gt,
                             with_lm,
                             truth_transform=transformation,
                             hypothesis_transform=transformation))

{'wer': 1.29064039408867, 'mer': 0.903448275862069, 'wil': 0.9866825208085612, 'wip': 0.013317479191438763, 'hits': 28, 'substitutions': 175, 'deletions': 0, 'insertions': 87}
