In [2]:
import os
# Restrict which GPU indices are exposed to CUDA. This is set here, ahead of
# the `import torch` further down in this cell, so the setting is already in
# the environment when torch is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # or "0,1" for multiple GPUs

import collections
# Compatibility shim: the `collections.Container` alias was removed in
# Python 3.10 (it now lives only in `collections.abc`). Re-create it when it is
# missing. NOTE(review): presumably one of the packages imported below still
# references `collections.Container` - confirm which one and drop the shim once
# that dependency is updated.
if not hasattr(collections, "Container"):
    import collections.abc
    collections.Container = collections.abc.Container
# import transformers
from transformers import AutoTokenizer, BertModel
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM, TrainingArguments, Trainer
from datasets import load_dataset, load_metric, ClassLabel, Audio, Dataset
import random
import pandas as pd
# import math
import numpy as np
# import librosa
import os
import torch
# from pydub import AudioSegment
# from IPython.display import display, HTML
import re
import json
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import wandb
import argparse
# import types
from customCTCwithASD import *
import sys

  if not hasattr(collections, "Container"):


Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [38]:
# Character class of punctuation stripped from transcripts before tokenization.
# Written as a raw string so the backslashes reach the regex engine unchanged;
# the previous non-raw literal relied on invalid string escapes such as "\,"
# and "\?", which newer Python versions warn about.
chars_to_ignore_regex = r'[,?.!\-;:"*()\'_]'


def remove_special_characters(batch):
    """Strip punctuation from ``batch["text"]`` and lowercase it.

    Args:
        batch: Mapping with a ``"text"`` string entry (one dataset example).

    Returns:
        The same ``batch`` object, with ``batch["text"]`` replaced in place by
        the cleaned, lowercased text. All other keys are left untouched.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

def prepare_dataset(batch):
    """Turn one example into model inputs and label ids.

    Reads ``batch["audio"]`` (a dict providing ``"array"`` and
    ``"sampling_rate"``) and ``batch["text"]``, and uses the module-level
    ``processor`` to fill in:

    * ``batch["input_values"]`` - first (only) entry of the processor's
      ``input_values`` for the audio array.
    * ``batch["labels"]`` - ``input_ids`` produced for the text while the
      processor is inside its ``as_target_processor()`` context.

    Returns the same ``batch`` object with the two keys added.
    """
    waveform = batch["audio"]
    features = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    # The processor returns a batch of size one; keep just that single item so
    # the mapped column holds one sequence per example.
    batch["input_values"] = features.input_values[0]
    with processor.as_target_processor():
        encoded_text = processor(batch["text"])
        batch["labels"] = encoded_text.input_ids
    return batch

In [40]:
def _collect_segment_files(data_dir):
    """Walk ``data_dir`` and gather wav-file rows and transcript rows.

    Returns a ``(wav_rows, text_rows)`` pair where ``wav_rows`` holds
    ``(segment_id, wav_file, path, source)`` tuples and ``text_rows`` holds
    ``(segment_id, text)`` tuples.
    """
    # NOTE(review): dirname() followed by basename() yields the last directory
    # name only when `data_dir` ends with a path separator (as in
    # ".../Rundkast/"); without the trailing separator it yields the parent
    # directory's name instead. Kept as-is to preserve existing segment ids.
    source = os.path.basename(os.path.dirname(data_dir))
    # The only difference between sources is how transcripts are stored.
    if source == "Rundkast":  # to modify depending on Rundkast cuts folder name
        text_ext, text_encoding = ".txt", "utf-8"
    else:
        # "utf-8-sig" drops a leading BOM if the file has one.
        text_ext, text_encoding = ".txt-utf8", "utf-8-sig"

    wav_rows = []
    text_rows = []
    for root, _dirs, files in os.walk(data_dir, topdown=True):
        for fn in files:
            segment_id = source + "_" + os.path.splitext(fn)[0]
            file_path = os.path.join(root, fn)
            if fn.endswith(".wav"):
                wav_rows.append((segment_id, fn, file_path, source))
            elif fn.endswith(text_ext):
                with open(file_path, encoding=text_encoding) as text_file:
                    text_rows.append((segment_id, text_file.read()))
    return wav_rows, text_rows


def load_dataset_from_files(data_dir_list:list[str], split_ratio=0.1, csv_export=True,
                            csv_export_dir=None, seed=None, num_proc=4):
    """Build train/test datasets from directories of wav files and transcripts.

    For every directory, wav files and transcript files are paired on a
    segment id of the form ``"<source>_<file stem>"``. The pairing is an inner
    join, so a wav without a transcript (or the reverse) is dropped. The
    transcripts are cleaned with ``remove_special_characters``, the data is
    split into train/test, and the audio is decoded at 16 kHz and passed
    through ``prepare_dataset``.

    Args:
        data_dir_list: Directories to scan recursively.
        split_ratio: ``test_size`` passed to ``train_test_split``.
        csv_export: When true and ``csv_export_dir`` is given, write
            ``train_set.csv`` and ``dev_set.csv`` with the raw split.
        csv_export_dir: Target directory for the CSV copies. With the default
            ``None`` nothing is written, matching the previous behaviour where
            the export lines were disabled.
        seed: Optional seed for the train/test split. ``None`` keeps the
            previous unseeded (non-reproducible) split.
        num_proc: Number of worker processes used for preprocessing.

    Returns:
        ``(raw_dataset, dataset)``: the split dataset with the original columns
        and cleaned text, and the preprocessed split holding only the columns
        produced by ``prepare_dataset``.

    Raises:
        ValueError: If ``data_dir_list`` is empty or no wav/transcript pair
            could be matched.
    """
    if not data_dir_list:
        raise ValueError("data_dir_list is empty: no directories to load from")

    frames = []
    for data_dir in data_dir_list:
        wav_rows, text_rows = _collect_segment_files(data_dir)
        df_wav = pd.DataFrame(
            wav_rows, columns=["segment_id", "wav_file", "path", "source"]
        ).set_index("segment_id")
        df_text = pd.DataFrame(
            text_rows, columns=["segment_id", "text"]
        ).set_index("segment_id")
        frames.append(df_wav.merge(df_text, left_index=True, right_index=True))

    # concat to full dataframe and convert to Dataset with special characters removed
    full_dataset_df = pd.concat(frames)
    if full_dataset_df.empty:
        raise ValueError(
            "No matching wav/transcript pairs found in: " + ", ".join(map(str, data_dir_list))
        )
    raw_dataset = Dataset.from_pandas(full_dataset_df)
    raw_dataset = raw_dataset.map(remove_special_characters)

    # split dataset
    raw_dataset = raw_dataset.train_test_split(test_size=split_ratio, seed=seed)

    # save copy of dataset
    if csv_export and csv_export_dir is not None:
        os.makedirs(csv_export_dir, exist_ok=True)
        raw_dataset["train"].to_pandas().to_csv(os.path.join(csv_export_dir, "train_set.csv"))
        raw_dataset["test"].to_pandas().to_csv(os.path.join(csv_export_dir, "dev_set.csv"))

    # loading audio: the "path" strings become an "audio" column decoded at 16 kHz
    dataset = raw_dataset.rename_column("path", "audio")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

    # preprocess dataset, keeping only the columns prepare_dataset adds
    dataset = dataset.map(prepare_dataset,
                          remove_columns=dataset.column_names["train"],
                          num_proc=num_proc)
    return raw_dataset, dataset

In [37]:
# Identifier handed to `from_pretrained` (looks like a Hugging Face Hub model id).
model_name = "NbAiLab/nb-wav2vec2-300m-bokmaal"
# `processor` is read as a global by `prepare_dataset`, so this cell has to run
# before `load_dataset_from_files` is called.
processor = Wav2Vec2Processor.from_pretrained(model_name)

In [41]:
print("Loading dataset direct from data dir to pandas dataframe")

# Directories to scan. The trailing slash matters: `load_dataset_from_files`
# derives the source name ("Rundkast") from the directory part of this path.
data_dir_list = ["../../datasets/NordTrans_TUL/train_small/Rundkast/"]

# 10% of the examples go to the "test" split; the rest to "train".
raw_dataset, dataset = load_dataset_from_files(data_dir_list, split_ratio=0.1, csv_export=True)

# Show the split sizes and columns before and after preprocessing.
print(raw_dataset)
print(dataset)

Loading dataset direct from data dir to pandas dataframe


100%|██████████| 7500/7500 [00:00<00:00, 20779.89ex/s]
#0:   0%|          | 0/1688 [00:00<?, ?ex/s]
[A




[A[A
#0:   7%|▋         | 112/1688 [00:00<00:04, 332.93ex/s]

[A[A
#0:  14%|█▍        | 233/1688 [00:00<00:02, 593.52ex/s]

[A[A
#0:  21%|██▏       | 359/1688 [00:00<00:01, 792.44ex/s]

[A[A
#0:  29%|██▊       | 482/1688 [00:00<00:01, 922.16ex/s]

[A[A
#0:  36%|███▌      | 604/1688 [00:00<00:01, 1009.46ex/s]

[A[A
#0:  43%|████▎     | 725/1688 [00:00<00:00, 1067.76ex/s]

[A[A
#0:  50%|█████     | 847/1688 [00:01<00:00, 1113.19ex/s]

[A[A
#0:  58%|█████▊    | 971/1688 [00:01<00:00, 1150.72ex/s]

#0:  65%|██████▍   | 1091/1688 [00:01<00:00, 609.09ex/s]
[A

#0:  73%|███████▎  | 1228/1688 [00:01<00:00, 747.24ex/s]
[A

#0:  80%|████████  | 1357/1688 [00:01<00:00, 859.20ex/s]
[A

#0:  88%|████████▊ | 1488/1688 [00:01<00:00, 961.55ex/s]
[A

#0:  96%|█████████▌| 1622/1688 [00:01<00:00, 1054.28ex/s]
[A

#1: 100%|██████████| 1688/1688 [00:01<00:00, 861.57ex/s] 
#0: 100

DatasetDict({
    train: Dataset({
        features: ['wav_file', 'path', 'source', 'text', 'segment_id'],
        num_rows: 6750
    })
    test: Dataset({
        features: ['wav_file', 'path', 'source', 'text', 'segment_id'],
        num_rows: 750
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 6750
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 750
    })
})


In [42]:
# Number of tokens known to the processor's tokenizer (displayed as cell output).
len(processor.tokenizer)

34