In [1]:
from datasets import load_dataset, Audio, concatenate_datasets
import pandas as pd
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
from datasets import Dataset, load_dataset
import soundfile as sf
import torch
import re
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\*]'
# chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'
def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch


def prepare_dataset(batch):
    audio = batch["audio"]
    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch


def load_dataset_from_files(data_dir_list:list[str], csv_export_dir:str, split_ratio=0.1, csv_export=True):
    frames = []
    for path in data_dir_list:
        source = os.path.basename(os.path.dirname(path))
        wavfile_data = []
        textfile_data = []
        for (root, dirs, files) in os.walk(path, topdown=True):
            if source == "Rundkast":  # to modify depending on Rundkast cuts folder name
                for fn in files:
                    if fn.endswith(".wav"):
                        wav_id = source + "_" + os.path.splitext(fn)[0]
                        path = os.path.join(root, fn)
                        wavfile_data.append((wav_id, fn, path, source))
                    elif fn.endswith(".txt"):
                        text_id = source + "_" + os.path.splitext(fn)[0]
                        with open(os.path.join(root, fn), encoding="utf-8") as text_file:
                            text = text_file.read()
                        textfile_data.append((text_id, text))
            else:
                for fn in files:
                    if fn.endswith(".wav"):
                        wav_id = source + "_" + os.path.splitext(fn)[0]
                        path = os.path.join(root, fn)
                        wavfile_data.append((wav_id, fn, path, source))
                    elif fn.endswith(".txt-utf8"):
                        text_id = source + "_" + os.path.splitext(fn)[0]
                        with open(os.path.join(root, fn), encoding="utf-8-sig") as text_file:
                            text = text_file.read()
                        textfile_data.append((text_id, text))
        df_wav = pd.DataFrame(wavfile_data, columns=["segment_id", "wav_file", "path", "source"])
        df_wav = df_wav.set_index("segment_id")
        df_text = pd.DataFrame(textfile_data, columns=["segment_id", "text"])
        df_text = df_text.set_index("segment_id")
        dataset_df = df_wav.merge(df_text, left_index=True, right_index=True)
        frames.append(dataset_df)
    # concat to full dataframe and convert to Dataset with special characters removed
    full_dataset_df = pd.concat(frames)
    raw_dataset = Dataset.from_pandas(full_dataset_df)
    raw_dataset = raw_dataset.map(remove_special_characters)
    # split dataset
    raw_dataset = raw_dataset.train_test_split(test_size=split_ratio)
    # save copy of dataset
    if csv_export is True:
        df_train = pd.DataFrame(raw_dataset["train"])
        df_train.to_csv(os.path.join(csv_export_dir, "train_set.csv"))
        df_dev = pd.DataFrame(raw_dataset["test"])
        df_dev.to_csv(os.path.join(csv_export_dir, "dev_set.csv"))
    # loading audio
    dataset = raw_dataset.cast_column("path", Audio())
    dataset = dataset.rename_column("path", "audio")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
    # preprocess dataset
    # dataset = dataset.map(prepare_dataset,
    #                       remove_columns=dataset.column_names["train"],
    #                       num_proc=4)
    return raw_dataset, dataset

In [3]:
data_dir_list = ["../../datasets/NordTrans_TUL/train_small/Stortinget/",
                 "../../datasets/NordTrans_TUL/train_small/NRK/",
                 "../../datasets/NordTrans_TUL/train_small/Rundkast/"]

# data_dir_list = ["../../datasets/NordTrans_TUL/train_small/Stortinget/"]

In [4]:
csv_export_dir = "./code_trial/"

raw_dataset, dataset = load_dataset_from_files(data_dir_list, csv_export_dir, split_ratio=0.1, csv_export=True)

100%|██████████| 24300/24300 [00:01<00:00, 21981.72ex/s]


In [5]:
dataset.map(remove_special_characters)

100%|██████████| 21870/21870 [00:01<00:00, 14693.13ex/s]
100%|██████████| 2430/2430 [00:00<00:00, 15024.86ex/s]


DatasetDict({
    train: Dataset({
        features: ['wav_file', 'audio', 'source', 'text', 'segment_id'],
        num_rows: 21870
    })
    test: Dataset({
        features: ['wav_file', 'audio', 'source', 'text', 'segment_id'],
        num_rows: 2430
    })
})

In [6]:
model_name = "NbAiLab/nb-wav2vec2-300m-bokmaal"
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)

dataset = dataset.map(prepare_dataset,
                        remove_columns=dataset.column_names["train"],
                        num_proc=4)


Please use `allow_patterns` and `ignore_patterns` instead.
Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 1665.07it/s]

#2:   0%|          | 0/5467 [00:00<?, ?ex/s]




#0:   2%|▏         | 123/5468 [00:01<00:32, 166.99ex/s]
[A

#0:   5%|▍         | 249/5468 [00:01<00:15, 344.40ex/s]
[A

#0:   7%|▋         | 373/5468 [00:01<00:09, 512.85ex/s]
[A

#0:   9%|▉         | 498/5468 [00:01<00:07, 668.24ex/s]
[A

#0:  11%|█▏        | 621/5468 [00:01<00:06, 798.23ex/s]
[A

#0:  14%|█▎        | 749/5468 [00:01<00:05, 916.63ex/s]
[A

#0:  16%|█▌        | 878/5468 [00:01<00:04, 1013.68ex/s]
[A

[A[A
[A

#0:  18%|█▊        | 1001/5468 [00:02<00:09, 476.13ex/s]
#0:  21%|██        | 1141/5468 [00:02<00:07, 611.93ex/s]

[A[A
#0:  24%|██▎       | 1286/5468 [00:02<00:05, 757.02ex/s]

[A[A
#0:  26%|██▌       | 1424/5468 [00:02<00:04, 880.03ex/s]

[A[A
#0:  29%|██▊       | 1568/5468 [00:02<00:03, 1002.50ex/s]

[A[A
#0:  31%|███▏      | 1711/5468 [00:02<00:03, 1103.90ex/s]

[A[A


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 21870
    })
    test: Dataset({
        features: ['input_values', 'labels'],
        num_rows: 2430
    })
})

In [6]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [7]:
vocabs = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names["train"])

100%|██████████| 1/1 [00:00<00:00,  2.61ba/s]
100%|██████████| 1/1 [00:00<00:00, 26.59ba/s]


In [8]:
vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'2': 0,
 'b': 1,
 '1': 2,
 'k': 3,
 'è': 4,
 "'": 5,
 '3': 6,
 'ó': 7,
 'r': 8,
 'v': 9,
 't': 10,
 'u': 11,
 'l': 12,
 'å': 13,
 'p': 14,
 'm': 15,
 'æ': 16,
 'f': 17,
 'w': 18,
 'c': 19,
 'o': 20,
 'í': 21,
 'n': 22,
 ' ': 23,
 '–': 24,
 '4': 25,
 'ü': 26,
 'd': 27,
 'g': 28,
 'z': 29,
 'i': 30,
 'e': 31,
 '`': 32,
 'q': 33,
 'ö': 34,
 'y': 35,
 'ä': 36,
 'a': 37,
 'x': 38,
 '6': 39,
 'á': 40,
 'h': 41,
 'ø': 42,
 'é': 43,
 'j': 44,
 '9': 45,
 'ò': 46,
 's': 47}

In [61]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [62]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

52


In [14]:
processor.tokenizer.vocab_size

32

In [15]:
for i in range(32):
    print(i, processor.tokenizer.convert_ids_to_tokens(i))

0 |
1 a
2 b
3 c
4 d
5 e
6 f
7 g
8 h
9 i
10 j
11 k
12 l
13 m
14 n
15 o
16 p
17 q
18 r
19 s
20 t
21 u
22 v
23 w
24 x
25 y
26 z
27 å
28 æ
29 ø
30 [UNK]
31 [PAD]
