# Notebook for preprocessing Wikipedia (Czech) dataset

In [1]:
import os
import os.path as osp
import yaml
import phonemizer
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, load_from_disk, concatenate_datasets
from pebble import ProcessPool
from concurrent.futures import TimeoutError

In [2]:
# Number of CPUs available to this job. PBS_NUM_PPN is read from the
# environment when present; when it is missing or empty (e.g. the notebook is
# run outside a PBS job) fall back to the CPU count reported by the OS, and to
# 1 if even that is unavailable, instead of failing with a KeyError.
N_CPUS = int(os.environ.get("PBS_NUM_PPN") or os.cpu_count() or 1)
print(f"> Number of CPUs: {N_CPUS}")

> Number of CPUs: 1


In [3]:
# TextCleaner comes from the project-local `text_utils` module. This cell
# instantiates it and prints how many symbols it defines, followed by the
# symbol list itself.
from text_utils import TextCleaner
text_cleaner = TextCleaner()
print(f'Symbols: {len(text_cleaner)}\n{text_cleaner.symbols}')

Symbols: 178
['$', ';', ':', ',', '.', '!', '?', '¡', '¿', '—', '…', '"', '«', '»', '“', '”', ' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ɑ', 'ɐ', 'ɒ', 'æ', 'ɓ', 'ʙ', 'β', 'ɔ', 'ɕ', 'ç', 'ɗ', 'ɖ', 'ð', 'ʤ', 'ə', 'ɘ', 'ɚ', 'ɛ', 'ɜ', 'ɝ', 'ɞ', 'ɟ', 'ʄ', 'ɡ', 'ɠ', 'ɢ', 'ʛ', 'ɦ', 'ɧ', 'ħ', 'ɥ', 'ʜ', 'ɨ', 'ɪ', 'ʝ', 'ɭ', 'ɬ', 'ɫ', 'ɮ', 'ʟ', 'ɱ', 'ɯ', 'ɰ', 'ŋ', 'ɳ', 'ɲ', 'ɴ', 'ø', 'ɵ', 'ɸ', 'θ', 'œ', 'ɶ', 'ʘ', 'ɹ', 'ɺ', 'ɾ', 'ɻ', 'ʀ', 'ʁ', 'ɽ', 'ʂ', 'ʃ', 'ʈ', 'ʧ', 'ʉ', 'ʊ', 'ʋ', 'ⱱ', 'ʌ', 'ɣ', 'ɤ', 'ʍ', 'χ', 'ʎ', 'ʏ', 'ʑ', 'ʐ', 'ʒ', 'ʔ', 'ʡ', 'ʕ', 'ʢ', 'ǀ', 'ǁ', 'ǂ', 'ǃ', 'ˈ', 'ˌ', 'ː', 'ˑ', 'ʼ', '̝', '̊', 'ʰ', 'ʱ', 'ʲ', 'ʷ', 'ˠ', '˞', '↓', '↑', '→', '↗', '↘', "'", '̩', '̃', 'ᵻ']


In [4]:
# YAML configuration file; this notebook reads its 'dataset_params' and
# 'data_folder' entries.
CONFIG_PATH = 'configs/config.yml'
# Language code handed to the espeak phonemizer backend.
LANG = 'cs'
# Input text file, loaded with load_dataset('text', ...).
DATASET = '../BERT_cs_phn_ipa/vety.phn.ipa.txt'
ROOT_DIR = "./wiki_phoneme" # root directory under which each processed shard is saved as shard_<i>
# Number of shards the dataset is split into for processing.
NUM_SHARDS = 100
MAX_WORKERS = N_CPUS # size of the process pool; taken from the detected CPU count, override to cap parallelism

In [5]:
def process_ph_dataset(phone_sent, tokenizer):
    """Split a phonetic sentence into phonetic words and look up their IDs.

    Args:
        phone_sent: phonetic sentence from a phonetically transcribed dataset.
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)` (the Hugging Face tokenizer API).

    Returns:
        dict with
            'input_ids': list of phonetic word IDs, one per phonetic word,
            'phonemes':  list of phonetic words.
    """
    ph_words = tokenizer.tokenize(phone_sent)
    # Map the tokens straight to vocabulary IDs. Calling `encode(w)[0]` per
    # token would tokenize every token a second time, would return the special
    # token for tokenizers that prepend one, and would raise IndexError when a
    # token encodes to nothing.
    inp_ids = list(tokenizer.convert_tokens_to_ids(ph_words))
    assert len(inp_ids) == len(ph_words)
    return {'input_ids': inp_ids, 'phonemes': ph_words}

# Process shard: add phonetic word IDs and phonetic words to the dataset
def process_shard(i):
    """Tokenize shard `i` of the dataset and save it to `{ROOT_DIR}/shard_{i}`.

    Relies on the notebook globals `dataset`, `tokenizer`, `ROOT_DIR` and
    `NUM_SHARDS`. A shard whose output directory already exists is skipped, so
    the function can be called again to resume an interrupted run.

    The shard is first written to a staging directory next to ROOT_DIR and
    only moved to its final location once `save_to_disk` has finished.
    Otherwise a worker killed in the middle of saving would leave a partial
    `shard_{i}` directory behind that later runs would treat as completed.

    Args:
        i: index of the shard to process, 0 <= i < NUM_SHARDS.
    """
    import shutil  # only needed here; kept local so the cell is self-contained

    directory = f'{ROOT_DIR}/shard_{i}'
    if osp.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=NUM_SHARDS, index=i)
    processed_dataset = shard.map(lambda t: process_ph_dataset(t['text'], tokenizer), remove_columns=['text'])

    # Staging area is a sibling of ROOT_DIR so that it is never picked up when
    # the finished shards inside ROOT_DIR are listed.
    staging_root = osp.normpath(ROOT_DIR) + '.tmp'
    staging = osp.join(staging_root, f'shard_{i}')
    if osp.exists(staging):
        # Leftover from an earlier run that was interrupted while saving.
        shutil.rmtree(staging)
    os.makedirs(staging_root, exist_ok=True)
    processed_dataset.save_to_disk(staging)

    # Publish the finished shard.
    os.makedirs(ROOT_DIR, exist_ok=True)
    shutil.move(staging, directory)

### Initializing phonemizer and tokenizer

# Load the YAML configuration; the context manager closes the file handle,
# which a bare open() passed to safe_load would leave open.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)

# espeak-based phonemizer for LANG: punctuation is preserved, stress marks are
# not emitted, and language-switch flags are removed from the output.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language=LANG,
    preserve_punctuation=True,
    with_stress=False,
    language_switch='remove-flags',
)

In [6]:
# Load the YAML configuration; the context manager closes the file handle,
# which a bare open() passed to safe_load would leave open.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)
# The tokenizer name/path is taken from the 'dataset_params' section of the config.
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer'])

### Process dataset

In [7]:
# Load the input file with the 'text' builder and request the 'train' split
# directly, so `dataset` is the split itself rather than a dict of splits.
dataset = load_dataset('text', data_files=DATASET, split='train')
# A hub dataset can be substituted here instead, for example:
# dataset = load_dataset("wikipedia", "20220301.en")['train']

In [8]:
# dataset.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer), remove_columns=['text'])
# dataset = dataset.select(range(100))

In [9]:
# Display the loaded dataset's summary (features and number of rows).
dataset

Dataset({
    features: ['text'],
    num_rows: 524472
})

#### Note: You may need to run the following cell several times to process all shards, because some shards can fail or be killed when they exceed the timeout. Shards that have already been saved are skipped on a rerun. If shards keep timing out, increase the `timeout` value so that more of them finish before being killed.


In [10]:
# Process all shards in a pool of worker processes.
# `pool.map` returns a future; its `result()` is an iterator that yields the
# results in submission order and re-raises, on `next()`, whatever error the
# corresponding task ended with. Consuming it is the only way to learn that a
# shard failed or timed out; otherwise such errors pass unnoticed.
with ProcessPool(max_workers=MAX_WORKERS) as pool:
    shard_results = pool.map(process_shard, range(NUM_SHARDS), timeout=None).result()
    for shard_index in range(NUM_SHARDS):
        try:
            next(shard_results)
        except StopIteration:
            break
        except TimeoutError:
            print('Shard %d timed out and was killed; rerun this cell to retry it.' % shard_index)
        except Exception as error:
            # Includes errors raised inside process_shard and workers that died.
            print('Shard %d failed: %r' % (shard_index, error))

Processing shard 0 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 1 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 2 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 3 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 4 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 5 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 6 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 7 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 8 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 9 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 10 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 11 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 12 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 13 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 14 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 15 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 16 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 17 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 18 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 19 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 20 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 21 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 22 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 23 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 24 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 25 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 26 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 27 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 28 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 29 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 30 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 31 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 32 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 33 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 34 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 35 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 36 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 37 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 38 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 39 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 40 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 41 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 42 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 43 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 44 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 45 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 46 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 47 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 48 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 49 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 50 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 51 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 52 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 53 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 54 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 55 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 56 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 57 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 58 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 59 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 60 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 61 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 62 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 63 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 64 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 65 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 66 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 67 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 68 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 69 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 70 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 71 ...


Saving the dataset (0/1 shards):   0%|          | 0/5245 [00:00<?, ? examples/s]

Processing shard 72 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 73 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 74 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 75 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 76 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 77 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 78 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 79 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 80 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 81 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 82 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 83 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 84 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 85 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 86 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 87 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 88 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 89 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 90 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 91 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 92 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 93 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 94 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 95 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 96 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 97 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 98 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

Processing shard 99 ...


Saving the dataset (0/1 shards):   0%|          | 0/5244 [00:00<?, ? examples/s]

### Collect all shards to form the processed dataset

In [11]:
def _shard_order(name):
    """Sort key: 'shard_<n>' directories by numeric n, anything else last by name."""
    suffix = name.rpartition('_')[2]
    if suffix.isdigit():
        return (0, int(suffix), name)
    return (1, 0, name)


# os.listdir returns entries in arbitrary order; sort them so that the order of
# the concatenated dataset is the same on every run.
output = sorted(
    (d for d in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, d))),
    key=_shard_order,
)
datasets = []
failed = []
for o in output:
    directory = f'{ROOT_DIR}/{o}'
    try:
        shard = load_from_disk(directory)
    except Exception as error:
        # Keep going, but say so: a silently skipped shard would produce an
        # incomplete dataset without any hint of what went wrong.
        failed.append(o)
        print(f'{o} could NOT be loaded: {error!r}')
        continue
    datasets.append(shard)
    print(f'{o} loaded')

if failed:
    print(f'WARNING: {len(failed)} of {len(output)} directories failed to load: {failed}')

shard_0 loaded
shard_1 loaded
shard_2 loaded
shard_3 loaded
shard_4 loaded
shard_5 loaded
shard_6 loaded
shard_7 loaded
shard_8 loaded
shard_9 loaded
shard_10 loaded
shard_11 loaded
shard_12 loaded
shard_13 loaded
shard_14 loaded
shard_15 loaded
shard_16 loaded
shard_17 loaded
shard_18 loaded
shard_19 loaded
shard_20 loaded
shard_21 loaded
shard_22 loaded
shard_23 loaded
shard_24 loaded
shard_25 loaded
shard_26 loaded
shard_27 loaded
shard_28 loaded
shard_29 loaded
shard_30 loaded
shard_31 loaded
shard_32 loaded
shard_33 loaded
shard_34 loaded
shard_35 loaded
shard_36 loaded
shard_37 loaded
shard_38 loaded
shard_39 loaded
shard_40 loaded
shard_41 loaded
shard_42 loaded
shard_43 loaded
shard_44 loaded
shard_45 loaded
shard_46 loaded
shard_47 loaded
shard_48 loaded
shard_49 loaded
shard_50 loaded
shard_51 loaded
shard_52 loaded
shard_53 loaded
shard_54 loaded
shard_55 loaded
shard_56 loaded
shard_57 loaded
shard_58 loaded
shard_59 loaded
shard_60 loaded
shard_61 loaded
shard_62 loaded
sh

In [12]:
# Merge every loaded shard into a single dataset. Note that this rebinds
# `dataset`, which until this cell referred to the raw text dataset.
dataset = concatenate_datasets(datasets)
# Persist the merged dataset to the folder configured under 'data_folder'.
dataset.save_to_disk(config['data_folder'])
print(f'Dataset saved to {config["data_folder"]}')

Saving the dataset (0/1 shards):   0%|          | 0/524472 [00:00<?, ? examples/s]

Dataset saved to datasets/cz-phon-sentences.processed


In [13]:
# Check the merged dataset: its number of rows is expected to match the raw
# dataset loaded at the start, since every shard is mapped row by row.
dataset

Dataset({
    features: ['input_ids', 'phonemes'],
    num_rows: 524472
})

### Test the dataset with dataloader


In [14]:
# build_dataloader comes from the project-local `dataloader` module.
from dataloader import build_dataloader

# Build a loader over the processed dataset with a small batch size and
# num_workers=0 (presumably to keep this check lightweight); it receives the
# same 'dataset_params' config section that named the tokenizer.
train_loader = build_dataloader(dataset, batch_size=4, num_workers=0, dataset_config=config['dataset_params'])

In [15]:
# Fetch the first batch from the loader. `next(iter(...))` avoids the needless
# enumerate index and does not rebind `_`, which IPython uses for the last output.
words, labels, phonemes, input_lengths, masked_indices = next(iter(train_loader))

In [16]:
# First sample of the batch's `words` tensor (presumably word-level token IDs
# repeated along the phoneme positions; confirm in the project dataloader).
words[0]

tensor([  1817,   1817,   1817,   1817,   1817,   1817, 291686,     15,     15,
        291686,      8,      8, 291686,     88,     88,     88,     88, 291686,
            77,     77,     77,     77, 291686,  58674,  58674,  58674,  58674,
         58674,  58674,  58674,  58674, 291686,  10744,  10744,  10744,  10744,
         10744,  10744,  10744,  10744,  10744,  10744, 291686,      7, 291686,
        141155, 141155, 141155, 141155, 141155, 141155, 141155, 141155, 141155,
        141155, 141155, 291686,      8,      8, 291686, 282857, 282857, 282857,
        282857, 282857, 282857, 291686,      5, 291686,     48,     48,     48,
            48,     48,     48, 291686,     19,     19, 291686,    724,    724,
           724,    724,    724,    724,    724,    724,    724,    724,    724,
           724, 291686,    382,    382,    382,    382,    382,    382,    382,
           382, 291686,  30248,  30248,  30248,  30248,  30248,  30248,  30248,
         30248,  30248,  30248,  30248, 

In [17]:
# First sample of `labels` (presumably the original phoneme-symbol indices
# used as prediction targets; confirm in the project dataloader).
labels[0]

tensor([ 57,  58,  62, 131,  43,  56,  16,  44,  51,  16,  61,  47,  16,  62,
         47,  46,  51,  16,  55, 114,  47,  54,  16,  56,  43,  58,  60,  43,
         64,  46,  63,  16,  61,  58,  57,  54,  47,  50,  56,  57, 135,  62,
         16,  43,  16,  56,  47,  61,  66,  57,  64,  43, 158,  64,  43,  62,
         16,  61,  47,  16,  68,  43,  54,  57,  44,  51,  16,   3,  16,  53,
         62,  47,  60,  43, 158,  16,  64,  47,  16,  61,  53,  63,  62,  47,
         62, 131,  56,  57,  61,  45,  51,  16,  68,  56,  43,  55,  47,  56,
         43, 158,  16,  58,  60,  57,  60,  63, 158,  61,  62,  43, 158, 114,
         51, 158,  16,  55,  47, 158,  46,  51,  52,  51, 158,  16,  46,  57,
         58,  60,  43,  53,  62,  51,  62,  61,  53,  47, 158,  16,  58,  57,
         54,  51,  62,  51,  53,  51,  16,   3,  16,  62,  61,  57, 147,  16,
         52,  47,  16,  58,  60, 161, 162,  51,  56,  47, 102,  55,  47,  56,
        131,  51, 158,  55,  16,  61,  62,  47, 102, 114,  47,  

In [18]:
# First sample of `phonemes` (presumably the model input in which some
# positions have been masked or replaced; compare with labels[0], and confirm
# in the project dataloader).
phonemes[0]

tensor([ 29,  29,  29,  29,  29,  29,  16,  44,  51,  16,  61,  47,  16,  62,
         47,  46,  51,  16,  55, 114,  47,  54,  16,  29,  29,  29,  29,  29,
         29,  29,  29,  16,  29,  29,  29,  29,  29,  29,  29,  29,  29,  29,
         16,  29,  16,  29,  29,  29,  29,  29,  29,  29,  29,  29,  29,  29,
         16,  61,  47,  16,  68,  43,  54,  57,  44,  51,  16,   3,  16,  53,
         62,  47,  60,  43, 158,  16,  64,  47,  16,  61,  53,  63,  62,  47,
         62, 131,  56,  57,  61,  45,  51,  16,  68,  56,  43,  55,  47,  56,
         43, 158,  16,  58,  60,  57,  60,  63, 158,  61,  62,  43, 158, 114,
         51, 158,  16,  55,  47, 158,  46,  51,  52,  51, 158,  16,  46,  57,
         58,  60,  43,  53,  62,  51,  62,  61,  53,  47, 158,  16,  58,  57,
         54,  51,  62,  51,  53,  51,  16,   3,  16,  62,  61,  57, 147,  16,
         52,  47,  16,  29,  29,  29,  29,  29,  29,  29,  29,  29,  29,  29,
         29,  29,  29,  29,  16,  61,  62,  47, 102, 114,  47,  