# Notebook for preprocessing Wikipedia (Czech) dataset

In [None]:
import os
import os.path as osp
import yaml
import phonemizer
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, load_from_disk, concatenate_datasets
from pebble import ProcessPool
from concurrent.futures import TimeoutError

In [None]:
# Use the core count exported in the PBS job environment (PBS_NUM_PPN) when it
# is set; otherwise fall back to the CPU count reported by the OS (at least 1)
# so the notebook also runs outside a PBS job instead of raising KeyError.
N_CPUS = int(os.environ.get("PBS_NUM_PPN") or os.cpu_count() or 1)
print(f"> Number of CPUs: {N_CPUS}")

In [None]:
# Instantiate TextCleaner (from the text_utils module, presumably project-local)
# and print its length, labelled as the symbol count, followed by its `symbols`
# attribute as a quick sanity check.
from text_utils import TextCleaner
text_cleaner = TextCleaner()
print(f'Symbols: {len(text_cleaner)}\n{text_cleaner.symbols}')

In [None]:
# YAML config file; the cells below read the tokenizer name, the dataset
# parameters and the output data folder from it.
CONFIG_PATH = 'configs/config.yml'
# Language code passed to the espeak phonemizer backend.
LANG = 'cs'
# Input text file, loaded with the `text` dataset builder.
DATASET = '../BERT_cs_phn_ipa/vety.phn.ipa.txt'
ROOT_DIR = "./wiki_phoneme" # root directory under which each processed shard is saved as shard_<i>
# Number of shards the dataset is split into for parallel processing.
NUM_SHARDS = 100
MAX_WORKERS = N_CPUS # number of worker processes in the pool; set it to the number of CPU cores available

In [None]:
def process_ph_dataset(phone_sent, tokenizer):
    """Split a phonetic sentence into phonetic words and map each one to an ID.

    Args:
        phone_sent: Phonetic sentence from a phonetically transcribed dataset.
        tokenizer: Object providing ``tokenize(text)`` (returns the phonetic
            words) and ``encode(word)`` (returns a sequence of IDs).

    Returns:
        Dict with two parallel lists: ``'input_ids'`` (one ID per phonetic
        word) and ``'phonemes'`` (the phonetic words themselves).
    """
    tokens = tokenizer.tokenize(phone_sent)

    ids = []
    for token in tokens:
        # Only the first ID returned by encode() is kept for each word.
        # NOTE(review): if this tokenizer prepends special tokens in encode(),
        # index 0 would be that special token - confirm against the tokenizer.
        ids.append(tokenizer.encode(token)[0])

    # Sanity check: exactly one ID per phonetic word.
    assert len(ids) == len(tokens)
    return dict(input_ids=ids, phonemes=tokens)

# Process shard: add phonetic word IDs and phonetic words to the dataset
def process_shard(i):
    """Tokenize shard ``i`` of the global ``dataset`` and save it to disk.

    Reads the module-level ``dataset``, ``tokenizer``, ``ROOT_DIR`` and
    ``NUM_SHARDS``. The result is written to ``{ROOT_DIR}/shard_{i}``; if that
    directory already exists the shard is treated as done and skipped.

    The shard is first saved to a temporary sibling directory and only renamed
    to its final name once the save has finished. An interrupted run therefore
    never leaves a half-written ``shard_{i}`` directory that later runs would
    skip as "already exists".

    Args:
        i: Index of the shard to process (0 <= i < NUM_SHARDS).

    Returns:
        None.
    """
    import shutil  # local import: only needed to clean up stale temporary output

    directory = f'{ROOT_DIR}/shard_{i}'
    if osp.exists(directory):
        print(f"Shard {i} already exists!")
        return

    print(f'Processing shard {i} ...')
    shard = dataset.shard(num_shards=NUM_SHARDS, index=i)
    processed_dataset = shard.map(
        lambda t: process_ph_dataset(t['text'], tokenizer),
        remove_columns=['text'],
    )

    tmp_directory = f'{directory}.tmp'
    # Drop leftovers of a previous attempt that was interrupted mid-save.
    shutil.rmtree(tmp_directory, ignore_errors=True)
    os.makedirs(ROOT_DIR, exist_ok=True)
    processed_dataset.save_to_disk(tmp_directory)
    # Same parent directory, so the rename publishes the shard in one step.
    os.replace(tmp_directory, directory)

### Initializing phonemizer and tokenizer

# Load the YAML config; the context manager closes the file handle right after
# parsing instead of leaving it open until garbage collection.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)

# espeak backend for LANG: punctuation is preserved, stress marks are not
# emitted, and language-switch flags are removed from the output.
# NOTE(review): no visible cell uses global_phonemizer (the input file is
# already phonemized) - confirm whether it is still needed.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language=LANG,
    preserve_punctuation=True,
    with_stress=False,
    language_switch='remove-flags',
)

In [None]:
# Load the YAML config (closing the file handle right after parsing) and
# instantiate the tokenizer named under dataset_params.tokenizer.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer'])

### Process dataset

In [None]:
# Load the text file at DATASET with the `text` builder and take its 'train'
# split; process_shard reads (and then removes) the 'text' column of each row.
dataset = load_dataset('text', data_files=DATASET)['train']
# Alternative source kept for reference:
# dataset = load_dataset("wikipedia", "20220301.en")['train'] # you can use another version of this dataset

In [None]:
# dataset.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer), remove_columns=['text'])
# dataset = dataset.select(range(100))

In [None]:
# Display the loaded dataset's summary as the cell output.
dataset

#### Note: You may need to run the following cell several times to process all shards, because some shards can fail. If shards are killed by the timeout before they finish, increase the `timeout` value (it is currently `None`, i.e. no limit) so that more shards complete on each run.


In [None]:
# Run process_shard for every shard index in a pool of worker processes.
# The map results are consumed one by one so that a timeout or an exception in
# a worker is reported here instead of being silently dropped with the future.
with ProcessPool(max_workers=MAX_WORKERS) as pool:
    future = pool.map(process_shard, range(NUM_SHARDS), timeout=None)
    results = future.result()
    # Results arrive in submission order, so the loop index is the shard index.
    for shard_index in range(NUM_SHARDS):
        try:
            next(results)
        except StopIteration:
            break
        except TimeoutError:
            print(f'Shard {shard_index} timed out')
        except Exception as error:
            # Also covers workers that died unexpectedly; keep going so the
            # remaining shards are still processed.
            print(f'Shard {shard_index} failed: {error!r}')

### Collect all shards to form the processed dataset

In [None]:
# Load every shard directory found under ROOT_DIR. The listing is sorted so the
# load order (and therefore the order of the concatenated dataset) is the same
# on every run; os.listdir alone returns entries in arbitrary order.
output = sorted(d for d in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, d)))
datasets = []
for o in output:
    directory = f'{ROOT_DIR}/{o}'
    try:
        shard = load_from_disk(directory)
    except Exception as error:
        # Best effort: an unreadable directory is skipped, but reported so a
        # missing shard does not go unnoticed.
        print(f'{o} could not be loaded, skipping: {error!r}')
        continue
    datasets.append(shard)
    print(f'{o} loaded')

In [None]:
# Merge all loaded shards into one dataset and save it to the folder named by
# the config's `data_folder` entry. Note that this rebinds `dataset`.
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print(f'Dataset saved to {config["data_folder"]}')

In [None]:
# Display the concatenated dataset to check its size.
dataset

### Test the dataset with dataloader


In [None]:
# Build a data loader over the concatenated dataset with batch_size=4 and
# num_workers=0, passing the config's `dataset_params` section as dataset_config.
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=4, num_workers=0, dataset_config=config['dataset_params'])

In [None]:
# Pull the first batch from the loader for inspection. next(iter(...)) fetches
# it directly, without a throwaway enumerate index that would overwrite `_`.
words, labels, phonemes, input_lengths, masked_indices = next(iter(train_loader))

In [None]:
# Show the first element of `words` from the sampled batch.
words[0]

In [None]:
# Show the first element of `labels` from the sampled batch.
labels[0]

In [None]:
# Show the first element of `phonemes` from the sampled batch.
phonemes[0]