In [36]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from datasets import load_dataset
import torch
 
# load model and processor
#processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
#model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
processor = Wav2Vec2Processor.from_pretrained("jimregan/wav2vec2-large-xls-r-300m-irish-colab")
model = Wav2Vec2ForCTC.from_pretrained("jimregan/wav2vec2-large-xls-r-300m-irish-colab")

Downloading:   0%|          | 0.00/214 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

In [5]:
def speech_file_to_array_fn(batch):
    import torchaudio
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["audio"] = speech_array[0].numpy()
    batch["sampling_rate"] = 16_000
    #batch["target_text"] = batch["sentence"]
    return batch

In [6]:
from pathlib import Path
_BASE = Path("/media/phonetics/asr_data_irish/audio/")

In [7]:
MSF = _BASE / "mul_mo_sceal_fein" / "wav"

In [8]:
msf_files = [str(f) for f in MSF.glob("*.wav")]

In [9]:
from datasets import Dataset
dataset = Dataset.from_dict({
    "path": msf_files,
})

In [10]:
dataset = dataset.map(speech_file_to_array_fn)

  0%|          | 0/3634 [00:00<?, ?ex/s]

In [12]:
filt = remove_long_common_voicedata(dataset)

In [13]:
filt

Dataset({
    features: ['path', 'audio', 'sampling_rate', '__index_level_0__'],
    num_rows: 2992
})

In [38]:
model.cuda()

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureExtractor(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (1): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (2): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (3): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (4): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,)

In [27]:
def evaluate(batch):
  import torch
  inputs = processor(batch["audio"], sampling_rate=16_000, return_tensors="pt", padding=True)

  with torch.no_grad():
    logits = model(inputs.input_values.to("cuda"), attention_mask=inputs.attention_mask.to("cuda")).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_strings"] = processor.batch_decode(pred_ids)
  return batch

In [39]:
result = filt.map(evaluate, batched=True, batch_size=8)

  0%|          | 0/374 [00:00<?, ?ba/s]

In [40]:
with open("w2v.out2", "w") as outf:
    for t in result:
        outf.write(f"{t['path']}\t{t['pred_strings']}\n")

In [11]:
# https://discuss.huggingface.co/t/wav2vec2-0-memory-issue/4868/8
def remove_long_common_voicedata(dataset, max_seconds=10):
  #convert pyarrow table to pandas
  dftest = dataset.to_pandas()

  #find out length of input_values
  dftest['len'] = dftest['audio'].apply(len)

  #for wav2vec training we already resampled to 16khz
  #remove data that is longer than max_seconds (6 seconds ideal)
  maxLength = max_seconds * 16000
  dftest = dftest[dftest['len'] < maxLength]
  dftest = dftest.drop('len', 1)

  #convert back to pyarrow table to use in trainer
  dataset = dataset.from_pandas(dftest)

  #directly remove do not wait for gc
  del dftest

  return dataset