# Finetuning

In [1]:
import json
import random
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np
import torch
from datasets import Dataset, Audio
from datasets import load_metric
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2ForCTC, TrainingArguments, Trainer


### Loading the dataset

In [2]:
base_directory = Path.cwd().parent

dataset_name = "yale/econ251"
data_dir = base_directory / 'data'
# audio_dir = data_dir / 'inputs' / dataset_name / 'lectures'
audio_dir = data_dir / 'inputs' / dataset_name / 'lectures-tiny'
predictions_dir = data_dir / 'predictions' / dataset_name

# transcripts_dir = data_dir / 'inputs' / dataset_name / 'transcripts'
transcripts_dir = data_dir / 'inputs' / dataset_name / 'transcripts-tiny'

In [3]:
txt_files = [str(text_file) for text_file in transcripts_dir.glob('*.txt') if
             'tiny' in str(text_file)]
txt_files = sorted(txt_files)[:7]

mp3_files = [str(audio_file) for audio_file in audio_dir.glob('*.mp3') if
             'tiny' in str(audio_file)]
mp3_files = sorted(mp3_files)[:7]

data_dict = {
    'mp3': mp3_files,
    'txt': txt_files,
}

dataset = Dataset.from_dict(data_dict, split="all")
dataset = dataset.train_test_split(test_size=0.2)
dataset = dataset.cast_column("mp3", Audio(sampling_rate=16_000))

In [4]:
mp3_files

['/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/01-tiny.mp3',
 '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/02-tiny.mp3',
 '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/03-tiny.mp3',
 '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/04-tiny.mp3',
 '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/05-tiny.mp3',
 '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/06_tiny.mp3',
 '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/07_tiny.mp3']

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['mp3', 'txt'],
        num_rows: 5
    })
    test: Dataset({
        features: ['mp3', 'txt'],
        num_rows: 2
    })
})

In [6]:
dataset['train']['mp3']

[{'path': '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/07_tiny.mp3',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00393769,
         -0.00591562, -0.00895704], dtype=float32),
  'sampling_rate': 16000},
 {'path': '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/03-tiny.mp3',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00150798,
         -0.00490163, -0.00127959], dtype=float32),
  'sampling_rate': 16000},
 {'path': '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/04-tiny.mp3',
  'array': array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
         -5.2574551e-04, -1.7570147e-04,  7.2206960e-05], dtype=float32),
  'sampling_rate': 16000},
 {'path': '/home/hvwesten/Projects/uni/ASR/asr-transcribe-lecture/data/inputs/yale/econ251/lectures-tiny/01-tiny.mp3',
  'array': array([0.        , 0.        , 0.

### Creating the tokenizer

First we remove special characters and normalize text

In [7]:

# chars_to_ignore_regex = '[\,\?\.\!\-\;\:"]'
# chars_to_ignore_regex = '[\,\?\.\!\-\;\:\½"]'

# ignore_list = ['½', 'à', 'â', 'é', 'ï', '–', '—', '‘', '’', '“', '”', '…<', '=', '>',
#                '$', '%', '&', '(', ')', '+', '/', '0', '1', '2', '3', '4', '5', '6',
#                '7', '8', '9']
# '%': 'percent',
# '$': 'dollar',
# '+': 'plus',
# '-': 'minus',
# '½': 'half',

chars_to_ignore_regex = "[\,\?\.\!\-\;\:\"½+-0123456789&%$()=><…—–\n]"
#
replace_dict = {
    'à': 'a',
    'â': 'a',
    'é': 'e',
    'ï': 'i',
    '”': '"',
    '“': '"',
    '‘': "'",
    '’': "'",
}


def retrieve_text(batch):
    # load the contents of the file as a string
    txt_file = batch["txt"]
    with open(txt_file, 'r') as f:
        text = f.read()

    for k, v in replace_dict.items():
        text = text.replace(k, v)

    # text = re.sub('[\n]', ' ', text)

    # text = re.sub(chars_to_replace_1, '"', text)

    # do some processing
    batch["txt"] = re.sub(chars_to_ignore_regex, ' ', text).lower()
    return batch


dataset = dataset.map(retrieve_text)

dataset["train"][0]["txt"][:50]


  0%|          | 0/5 [00:00<?, ?ex/s]

  0%|          | 0/2 [00:00<?, ?ex/s]

"i think i'm going to start  so  so far where have "

We then extract all the characters in the text to use in our output

In [8]:
# def extract_all_chars(batch):
#     all_text = " ".join(batch["txt"])
#     vocab = list(set(all_text))
#     return {"vocab": [vocab], "all_text": [all_text]}
#
#
# vocabs = dataset.map(extract_all_chars,
#                      batched=True, batch_size=-1,
#                      keep_in_memory=True,
#                      remove_columns=dataset.column_names["train"])
#
# vocab_list = list(
#     set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
#
# vocab_dict = {v: k for k, v in enumerate(vocab_list)}
#
# print(sorted(vocab_dict, key=lambda x: x[0]))
#
# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]
#
#
# # TEMP FIX
# # vocab_dict["x"] = len(vocab_dict)
#
#
# vocab_dict["[UNK]"] = len(vocab_dict)
# vocab_dict["[PAD]"] = len(vocab_dict)
# print(len(vocab_dict))
#
#
# with open('../data/inputs/yale/econ251/vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)


In [9]:
# tokenizer = Wav2Vec2CTCTokenizer("../data/inputs/yale/econ251/vocab_alt.json",
#                                  unk_token="[UNK]", pad_token="[PAD]",
#                                  word_delimiter_token="|")
tokenizer = Wav2Vec2CTCTokenizer("../data/inputs/yale/econ251/vocab.json",
                                 unk_token="[UNK]", pad_token="[PAD]",
                                 word_delimiter_token="|")

### Creating the feature extractor and combine with tokenizer into processor

In [10]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=16000,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=False)
#                                              return_attention_mask=True)

In [11]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                              tokenizer=tokenizer)

### Preprocess data

In [12]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(dataset["train"])-1)

print(dataset["train"][rand_int]["txt"])
ipd.Audio(data=np.asarray(dataset["train"][rand_int]["mp3"]["array"]),
          autoplay=False, rate=16000)


now  the course  just to summarize again  the course is the standard financial theory course that was made popular over the last ten years in a bunch of business schools  and those guys who developed this material basically thought that markets were great and finance was almost a separate part could be walled off from much of economics  so here at yale we've never taught finance that way   we've always taught it as a part of economics and the crisis recently  i think  has made it clear that that's probably the way one should really think about the problem  so it's become very fashionable now to say that financial theorists had everything all wrong and to ask how it is that they got everything all wrong  why didn't they anticipate the crash  and the two standard critiques of standard financial economics are a  it didn't allow for psychology  and you'll hear about that from shiller next semester  and b  it didn't take into account collateral  and it was all done in a very special case  a

In [13]:
#rand_int = random.randint(0, len(dataset["train"]))
print("Target text [:100]:", dataset["train"][rand_int]["txt"][:100])
print("Input array shape:",
      np.asarray(dataset["train"][rand_int]["mp3"]["array"]).shape)
print("Sampling rate:", dataset["train"][rand_int]["mp3"]["sampling_rate"])

Target text [:100]: now  the course  just to summarize again  the course is the standard financial theory course that wa
Input array shape: (1104249,)
Sampling rate: 16000


In [14]:
def prepare_dataset(batch):
    audio = batch["mp3"] # load and resample data, this takes the longest time

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio["array"], sampling_rate=audio[
        "sampling_rate"]).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["txt"]).input_ids

    return batch

In [15]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset
                      .column_names["train"], num_proc=2)

    

#0:   0%|          | 0/3 [00:00<?, ?ex/s]

#1:   0%|          | 0/2 [00:00<?, ?ex/s]

    

#0:   0%|          | 0/1 [00:00<?, ?ex/s]

#1:   0%|          | 0/1 [00:00<?, ?ex/s]

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 2
    })
})

### Training & evaluation

In [17]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [18]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [19]:
wer_metric = load_metric("wer")

In [20]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # print('before', pred.label_ids)
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    # print('after', pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [21]:
model = Wav2Vec2ForCTC.from_pretrained(
#     "facebook/wav2vec2-base-960h",
    "facebook/wav2vec2-base",
#     'facebook/wav2vec2-large-robust',
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
#     vocab_size = processor.tokenizer.vocab_size
)
model.config.ctc_zero_infinity = True
# vocab_size = len(processor.tokenizer)
# vocab_size = processor.tokenizer.vocab_size
#model.freeze_feature_extractor()
model.freeze_feature_encoder()

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_q.weight', 'project_q.bias', 'quantizer.weight_proj.weight', 'quantizer.codevectors', 'project_hid.bias', 'project_hid.weight', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [22]:
training_args = TrainingArguments(
    output_dir="../output/tiny-model",
    push_to_hub=False,
    group_by_length=True,
    per_device_train_batch_size=1, # (2 if 5tiny) max 1 to fit in memory
    evaluation_strategy="steps", #"epoch"
    num_train_epochs=500,#250,#500,#30,50
    warmup_steps=200,  #1000,
    eval_steps=100, # 500,
    logging_steps=100, #500,
    save_steps=100, # 500,
    # save_total_limit=2
    fp16=True,
    gradient_checkpointing=True,
    learning_rate=3e-4,  # was 1e-4 -- finetune these parameters
    weight_decay=0.005,  # finetune these parameters
    #
    gradient_accumulation_steps=2, # use this for fitting in memory
    eval_accumulation_steps=2, # use this for fitting in memory
    optim="adamw_torch" # use the pytorch adam implementation
)

In [23]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [24]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics= compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [25]:
trainer.train()

***** Running training *****
  Num examples = 5
  Num Epochs = 500
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 1000


Step,Training Loss,Validation Loss,Wer
100,4.6863,2.830871,1.0
200,3.5472,2.82483,1.0
300,3.5421,2.828549,1.0
400,3.5397,2.829017,1.0
500,3.5394,2.830775,1.0
600,3.5385,2.827633,1.0
700,3.5383,2.827776,1.0
800,3.5383,2.828296,1.0
900,3.5373,2.83071,1.0
1000,3.5371,2.827964,1.0


***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
Saving model checkpoint to ../output/tiny-model/checkpoint-100
Configuration saved in ../output/tiny-model/checkpoint-100/config.json
Model weights saved in ../output/tiny-model/checkpoint-100/pytorch_model.bin
Feature extractor saved in ../output/tiny-model/checkpoint-100/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
Saving model checkpoint to ../output/tiny-model/checkpoint-200
Configuration saved in ../output/tiny-model/checkpoint-200/config.json
Model weights saved in ../output/tiny-model/checkpoint-200/pytorch_model.bin
Feature extractor saved in ../output/tiny-model/checkpoint-200/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 2
  Batch size = 8
Saving model checkpoint to ../output/tiny-model/checkpoint-300
Configuration saved in ../output/tiny-model/checkpoint-300/config.json
Model weights saved in ../output/tiny-model/checkpoint-300/pytorch_

TrainOutput(global_step=1000, training_loss=3.6544224243164063, metrics={'train_runtime': 5322.0885, 'train_samples_per_second': 0.47, 'train_steps_per_second': 0.188, 'total_flos': 1.3500520427656453e+18, 'train_loss': 3.6544224243164063, 'epoch': 499.8})

***** Running training *****
  Num examples = 4
  Num Epochs = 1000
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 2000


Step,Training Loss,Validation Loss,Wer
100,4.0095,2.824042,1.0
200,2.8423,2.82641,1.0
300,2.8404,2.826035,1.0
400,2.8393,2.831851,1.0
500,2.837,2.826387,1.0
600,2.8364,2.826151,1.0
700,2.8379,2.824481,1.0
800,2.8368,5.039119,1.0
900,2.8478,2.831783,1.0
1000,2.8371,2.827201,1.0


***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to ../output/tiny-model/checkpoint-100
Configuration saved in ../output/tiny-model/checkpoint-100/config.json
Model weights saved in ../output/tiny-model/checkpoint-100/pytorch_model.bin
Feature extractor saved in ../output/tiny-model/checkpoint-100/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to ../output/tiny-model/checkpoint-200
Configuration saved in ../output/tiny-model/checkpoint-200/config.json
Model weights saved in ../output/tiny-model/checkpoint-200/pytorch_model.bin
Feature extractor saved in ../output/tiny-model/checkpoint-200/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1
  Batch size = 8
Saving model checkpoint to ../output/tiny-model/checkpoint-300
Configuration saved in ../output/tiny-model/checkpoint-300/config.json
Model weights saved in ../output/tiny-model/checkpoint-300/pytorch_

KeyboardInterrupt: 

In [32]:
save_name = '17-05_1835'
trainer.save_model(f'../output/tiny-model/{save_name}')
processor.save_pretrained(f'../output/tiny-model/{save_name}')

Saving model checkpoint to ../output/tiny-model/17-05_1835
Configuration saved in ../output/tiny-model/17-05_1835/config.json
Model weights saved in ../output/tiny-model/17-05_1835/pytorch_model.bin
Feature extractor saved in ../output/tiny-model/17-05_1835/preprocessor_config.json
Feature extractor saved in ../output/tiny-model/17-05_1835/preprocessor_config.json
tokenizer config file saved in ../output/tiny-model/17-05_1835/tokenizer_config.json
Special tokens file saved in ../output/tiny-model/17-05_1835/special_tokens_map.json


In [33]:
trainer.state.log_history

[{'loss': 3.1857,
  'learning_rate': 9.020000000000001e-05,
  'epoch': 50.0,
  'step': 100},
 {'eval_loss': 2.8738222122192383,
  'eval_wer': 1.0,
  'eval_runtime': 0.7047,
  'eval_samples_per_second': 1.419,
  'eval_steps_per_second': 1.419,
  'epoch': 50.0,
  'step': 100},
 {'loss': 2.8255,
  'learning_rate': 8.020000000000001e-05,
  'epoch': 100.0,
  'step': 200},
 {'eval_loss': 2.8551204204559326,
  'eval_wer': 1.0,
  'eval_runtime': 0.7004,
  'eval_samples_per_second': 1.428,
  'eval_steps_per_second': 1.428,
  'epoch': 100.0,
  'step': 200},
 {'loss': 2.8287, 'learning_rate': 7.02e-05, 'epoch': 150.0, 'step': 300},
 {'eval_loss': 2.872225284576416,
  'eval_wer': 1.0,
  'eval_runtime': 0.7034,
  'eval_samples_per_second': 1.422,
  'eval_steps_per_second': 1.422,
  'epoch': 150.0,
  'step': 300}]

In [None]:
# CUDA out of memory. Tried to allocate 13.23 GiB (GPU 0; 7.93 GiB total capacity; 757.87 MiB already allocated; 6.16 GiB free; 800.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Evaluation

In [51]:
from transformers import AutoModelForCTC, Wav2Vec2Processor

model_dir = "./output/tiny-model/checkpoint-200" #+ '17-05_1835'


model2 = AutoModelForCTC.from_pretrained(model_dir)
processor2 = Wav2Vec2Processor.from_pretrained(model_dir)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model2.to(device)


loading configuration file ./output/tiny-model/checkpoint-200/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "./output/tiny-model/checkpoint-200",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": true,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_norm": "group",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_

OSError: Can't load tokenizer for './output/tiny-model/checkpoint-200'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './output/tiny-model/checkpoint-200' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [None]:
dataset

In [26]:
def map_to_result(batch):
    with torch.no_grad():
        input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

results = dataset["test"].map(map_to_result, remove_columns=dataset["test"]
                        .column_names)

  0%|          | 0/2 [00:00<?, ?ex/s]

In [27]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test WER: 1.000


In [28]:
print('fill')

fill


In [29]:
results['pred_str']

['', '']

In [56]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 4
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 1
    })
})

In [30]:
results['text']

["i think i'm going to start  so this is really the beginning of the finance part of the course  so far we've reviewed general equilibrium  which i said fisher invented or reinvented in order to do finance  and as you remember the main conclusions from general equilibrium are first that the market functioning by itself without interference from the outside  in other words a situation of laissez faire  leads to allocations that are pareto efficient   so they're in some sense good for the economy and good for the society  they don't maximize total welfare  that's not even a well defined thing as we saw last time because how can you measure  how can you add one person's utility to another  it doesn't even make sense  so economists at first were wrong to think of that as the criterion for good allocations  but there's another better definition of efficiency that pareto invented  called pareto efficiency  and the free market achieves pareto efficiency at least if there are no externalities 

In [32]:
# model.to("cuda")

with torch.no_grad():
    logits = model(torch.tensor(dataset["test"][:1]["input_values"],
                                device="cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)

# convert ids to tokens
" ".join(processor.tokenizer.convert_ids_to_tokens(pred_ids[0].tolist()))

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 2
    })
})

In [34]:
results['text']

["i think i'm going to start  so this is really the beginning of the finance part of the course  so far we've reviewed general equilibrium  which i said fisher invented or reinvented in order to do finance  and as you remember the main conclusions from general equilibrium are first that the market functioning by itself without interference from the outside  in other words a situation of laissez faire  leads to allocations that are pareto efficient   so they're in some sense good for the economy and good for the society  they don't maximize total welfare  that's not even a well defined thing as we saw last time because how can you measure  how can you add one person's utility to another  it doesn't even make sense  so economists at first were wrong to think of that as the criterion for good allocations  but there's another better definition of efficiency that pareto invented  called pareto efficiency  and the free market achieves pareto efficiency at least if there are no externalities 

In [35]:
# model.to("cuda")

with torch.no_grad():
    logits = model(torch.tensor(dataset["test"][:1]["input_values"],
                                device="cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)

# convert ids to tokens
" ".join(processor.tokenizer.convert_ids_to_tokens(pred_ids[0].tolist()))

'[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA