In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
 
# Load the processor and the CTC model from one and the same checkpoint.
# NOTE(review): loading emits a warning that the checkpoint's tokenizer class is
# 'Wav2Vec2PhonemeCTCTokenizer' while 'Wav2Vec2CTCTokenizer' is the class being
# used here -- confirm the decoded output is what you expect.
_CHECKPOINT = "facebook/wav2vec2-lv-60-espeak-cv-ft"
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2PhonemeCTCTokenizer'. 
The class this function is called from is 'Wav2Vec2CTCTokenizer'.


In [2]:
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["path"]`` and attach it as a 16 kHz waveform.

    Args:
        batch: a mapping with a ``"path"`` entry pointing at an audio file
            readable by ``torchaudio.load``.

    Returns:
        The same ``batch`` object, with two entries added:
        ``"audio"`` (the first channel of the file as a NumPy array) and
        ``"sampling_rate"`` (always 16000).
    """
    # Imported lazily so the function stays self-contained when it is shipped
    # to `Dataset.map` workers.
    import torchaudio

    target_rate = 16_000
    speech_array, sampling_rate = torchaudio.load(batch["path"])

    # The stored sampling rate is always 16 kHz, so the samples must really be
    # at 16 kHz: resample whenever the file was recorded at a different rate.
    if sampling_rate != target_rate:
        speech_array = torchaudio.functional.resample(
            speech_array, sampling_rate, target_rate
        )

    # Only the first channel is kept.
    batch["audio"] = speech_array[0].numpy()
    batch["sampling_rate"] = target_rate
    return batch

In [3]:
from pathlib import Path
# Root directory of the audio corpora; other corpus paths are built relative to it.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
_BASE = Path("/media/phonetics/asr_data_irish/audio/")

In [4]:
# Directory holding the WAV files of the "mul_mo_sceal_fein" corpus.
MSF = _BASE / "mul_mo_sceal_fein" / "wav"

In [None]:
# Collect every WAV file in the corpus directory as a plain string path.
# `Path.glob` yields entries in arbitrary (filesystem) order, so sort them to
# make the dataset row order -- and everything derived from it -- reproducible.
msf_files = sorted(str(f) for f in MSF.glob("*.wav"))

In [None]:
from datasets import Dataset
# Build a dataset with a single "path" column, one row per WAV file.
dataset = Dataset.from_dict(dict(path=msf_files))

In [None]:
# Load the audio for every row: `speech_file_to_array_fn` reads the file at
# `path` and adds the `audio` and `sampling_rate` entries to each example.
dataset = dataset.map(speech_file_to_array_fn)

  0%|          | 0/3634 [00:00<?, ?ex/s]

In [None]:
# Move the model to the GPU when one is available, otherwise stay on the CPU
# (a bare `model.cuda()` raises on machines without CUDA).
# `.to()` returns the module, so the cell still displays the model repr.
model.to("cuda" if torch.cuda.is_available() else "cpu")

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureExtractor(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (4): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,)

In [None]:
# Run the processor over the whole `audio` column in a single call, returning
# PyTorch tensors. `padding=True` is requested so the clips can be stacked into
# one tensor; presumably this pads to the longest clip -- TODO confirm.
# NOTE(review): the result is bound to `inputs`; the tensor itself is
# `inputs.input_values`.
inputs = processor(dataset["audio"], sampling_rate=16_000, return_tensors="pt", padding=True)

In [None]:
# Run CTC inference over the pre-processed `inputs` and decode the predictions.
#
# The tensors are read from `inputs` (no bare `input_values` name exists), and
# each chunk is moved to whatever device the model lives on. Work is done in
# mini-batches so the whole corpus does not have to fit through a single
# forward pass.
batch_size = 8
all_input_values = inputs.input_values
# The processor may or may not return an attention mask, depending on the
# checkpoint's feature-extractor configuration; pass it on when present so
# padded positions are ignored.
all_attention_mask = inputs.get("attention_mask")

transcription = []
for start in range(0, all_input_values.shape[0], batch_size):
    stop = start + batch_size
    input_values = all_input_values[start:stop].to(model.device)
    attention_mask = (
        all_attention_mask[start:stop].to(model.device)
        if all_attention_mask is not None
        else None
    )

    # retrieve logits
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    # take argmax and decode (on the CPU, so decoding never touches GPU tensors)
    predicted_ids = torch.argmax(logits, dim=-1).cpu()
    transcription.extend(processor.batch_decode(predicted_ids))

In [None]:
# Write one "<path>\t<transcription>" line per audio file.
# The file is opened once, outside the loop: reopening it in "w" mode for each
# item truncates it every time and leaves only the last line.
# `transcription` is a list of decoded strings in the same order as the dataset
# rows, so each one is paired with its source path via `zip`.
with open("w2v.out", "w", encoding="utf-8") as outf:
    for path, text in zip(dataset["path"], transcription):
        outf.write(f"{path}\t{text}\n")

In [None]:
def remove_long_common_voicedata(dataset, max_seconds=6, sampling_rate=16000):
    """Drop every example whose audio is ``max_seconds`` long or longer.

    Args:
        dataset: an object exposing ``to_pandas()`` and ``from_pandas(df)``;
            its frame must contain an ``input_values`` column of sequences.
        max_seconds: length limit in seconds; only examples strictly shorter
            than this are kept.
        sampling_rate: samples per second of ``input_values``. Defaults to
            16000, the rate the data was resampled to for wav2vec training.

    Returns:
        A new dataset built with ``dataset.from_pandas`` from the kept rows.
    """
    frame = dataset.to_pandas()

    # Measure each example without adding a helper column to the frame, so no
    # existing column can be overwritten and nothing has to be dropped again.
    lengths = frame["input_values"].apply(len)
    max_length = max_seconds * sampling_rate

    # Re-number the surviving rows so the old row index is not carried over
    # into the rebuilt dataset.
    kept = frame[lengths < max_length].reset_index(drop=True)

    # Convert back to a dataset for use in the trainer.
    return dataset.from_pandas(kept)