In [8]:
import json
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import DatasetDict, load_dataset, load_metric
from glob import glob
from tqdm import tqdm


In [9]:
def audio_to_array(batch: DatasetDict) -> DatasetDict:
    audio_array, sampling_rate = torchaudio.load(batch['path'])
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16_000)
    batch['audio'] = resampler(audio_array).squeeze().numpy()
    return batch


In [10]:
PATH = 'indonesian-nlp/wav2vec2-large-xlsr-indonesian'

processor = Wav2Vec2Processor.from_pretrained(PATH)
model = Wav2Vec2ForCTC.from_pretrained(PATH)
model.to('cuda')

prefix_path = '..\\malay_youtube'

label_paths = glob(pathname=f'{prefix_path}\\*.txt')
audio_paths = glob(pathname=f'{prefix_path}\\*.wav')

assert len(label_paths) == len(audio_paths), 'Number of labels and audios must be equal'

dataset = []

for l, a in tqdm(zip(label_paths, audio_paths)):
    with open(l) as f:
        dataset.append({
            'label': f.readlines(),
            'path': a
            })

json_path = f'{prefix_path}\\malay_youtube.json'


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
16it [00:00, 8022.58it/s]


In [11]:
with open(json_path, 'w') as f:
    for d in dataset:
        json.dump(d, f)


In [12]:
inference_dataset = load_dataset('json', data_files=json_path)


Using custom data configuration default-323ba8c376595263


Downloading and preparing dataset json/default to C:\Users\liana\.cache\huggingface\datasets\json\default-323ba8c376595263\0.0.0\da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to C:\Users\liana\.cache\huggingface\datasets\json\default-323ba8c376595263\0.0.0\da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
inference_dataset = inference_dataset.map(audio_to_array)


  0%|          | 0/16 [00:00<?, ?ex/s]

In [14]:
def evaluate(batch: DatasetDict) -> DatasetDict:
    inputs = processor(batch['audio'], sampling_rate=16_000, return_tensors='pt', padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values.to('cuda'), attention_mask=inputs.attention_mask.to('cuda')).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch['pred'] = processor.batch_decode(pred_ids)
    return batch


In [15]:
results = inference_dataset.map(evaluate)


  0%|          | 0/16 [00:00<?, ?ex/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ..\aten\src\ATen\native\BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


In [16]:
metric = load_metric('wer')
print(f'WER: {metric.compute(predictions=results["train"]["pred"], references=results["train"]["label"])}')


WER: 1.444954128440367


In [17]:
results['train'].remove_columns('audio').to_json('eval_results.json')

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

8256