# Notebook for preprocessing Wikipedia (English) dataset

### Initilizing phonemizer and tokenizer

In [1]:
import yaml
import sys

# Set path to compatible transformers 4.33.3 library
sys.path.insert(0, '/storage/plzen4-ntis/home/jmatouse/.local/transformers-4.33.3/lib/python3.10/site-packages')

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [2]:
from phonemize import phonemize

In [3]:
import phonemizer
global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True,  with_stress=True)

In [4]:
from transformers import TransfoXLTokenizer
tokenizer = TransfoXLTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to

### Process dataset

In [5]:
from datasets import load_dataset
dataset = load_dataset("wikipedia", "20220301.en")['train'] # you can use other version of this dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [6]:
dataset = dataset.select(range(100))

In [7]:
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

In [8]:
import os
num_shards = 10

def process_shard(i):
    directory = root_directory + "/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = shard.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer), remove_columns=['text'])
    if not os.path.exists(directory):
        os.makedirs(directory)
    processed_dataset.save_to_disk(directory)

In [9]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [10]:
max_workers = 1 # change this to the number of CPU cores your machine has 

with ProcessPool(max_workers=max_workers) as pool:
    pool.map(process_shard, range(num_shards), timeout=60)

Processing shard 0 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 1 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 2 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 3 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 4 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 5 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 6 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 7 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 8 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Processing shard 9 ...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

### Collect all shards to form the processed dataset

In [11]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

shard_0 loaded
shard_1 loaded
shard_2 loaded
shard_3 loaded
shard_4 loaded
shard_5 loaded
shard_6 loaded
shard_7 loaded
shard_8 loaded
shard_9 loaded


In [12]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset saved to wikipedia_20220301.en.processed


In [13]:
# check the dataset size
dataset

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 100
})

In [25]:
dataset[0]

{'id': '12',
 'url': 'https://en.wikipedia.org/wiki/Anarchism',
 'title': 'Anarchism',
 'input_ids': [66637,
  23,
  8,
  470,
  4161,
  5,
  955,
  16,
  23,
  22792,
  4,
  1795,
  5,
  13142,
  73,
  26029,
  2,
  35962,
  1345,
  4,
  11661,
  3,
  66637,
  2170,
  17,
  1,
  12296,
  4,
  1,
  185,
  2,
  34,
  29,
  3154,
  6,
  35,
  9247,
  2,
  23314,
  2,
  5,
  13047,
  3,
  147,
  8,
  6070,
  214,
  1260,
  1625,
  955,
  2,
  624,
  15,
  1,
  26305,
  214,
  4,
  1,
  470,
  8134,
  2,
  29,
  23,
  841,
  244,
  1525,
  128834,
  5,
  25511,
  22514,
  18,
  1,
  25511,
  1625,
  22,
  25511,
  13584,
  21,
  4,
  1,
  8363,
  955,
  2,
  5,
  54,
  8,
  586,
  1875,
  2920,
  19,
  962,
  1260,
  13466,
  5,
  13584,
  3,
  21481,
  1122,
  7,
  7661,
  338,
  3094,
  40020,
  178,
  97,
  1,
  2872,
  4,
  3094,
  800,
  2,
  18756,
  2,
  47,
  22835,
  3,
  486,
  1,
  2135,
  4,
  4402,
  28669,
  1878,
  2,
  31708,
  1703,
  1795,
  44,
  3018,
  3,
  341,
  9088

### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [14]:
from simple_loader import FilePathDataset, build_dataloader

file_data = FilePathDataset(dataset)
# loader = build_dataloader(file_data, num_workers=32, batch_size=128)
loader = build_dataloader(file_data, num_workers=1, batch_size=4)

In [15]:
special_token = config['dataset_params']['word_separator']

In [16]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

100%|██████████| 25/25 [00:01<00:00, 22.11it/s]


In [17]:
# get each token's lower case

lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    if word.lower() != word:
        t = tokenizer.encode([word.lower()])[0]
        lower_tokens.append(t)
    else:
        lower_tokens.append(t)

  0%|          | 0/33846 [00:00<?, ?it/s]2024-09-25 14:04:42.245201: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9360] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-25 14:04:42.245260: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 14:04:42.245292: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1537] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|██████████| 33846/33846 [00:03<00:00, 8517.66it/s] 


In [18]:
lower_tokens = (list(set(lower_tokens)))

In [19]:
# redo the mapping for lower number of tokens

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word = word.lower()
    new_t = tokenizer.encode([word.lower()])[0]
    token_maps[t] = {'word': word, 'token': lower_tokens.index(new_t)}

100%|██████████| 33846/33846 [00:03<00:00, 10587.79it/s]


In [20]:
len(token_maps)

33846

In [24]:
token_maps

{1: {'word': 'the', 'token': 0},
 2: {'word': ',', 'token': 1},
 3: {'word': '.', 'token': 2},
 4: {'word': 'of', 'token': 3},
 5: {'word': 'and', 'token': 4},
 6: {'word': 'to', 'token': 5},
 7: {'word': 'in', 'token': 6},
 8: {'word': 'a', 'token': 7},
 9: {'word': '=', 'token': 8},
 10: {'word': '"', 'token': 9},
 11: {'word': 'was', 'token': 10},
 262155: {'word': 'oxoacid', 'token': 11},
 13: {'word': 'the', 'token': 0},
 14: {'word': "'s", 'token': 12},
 15: {'word': 'on', 'token': 13},
 16: {'word': 'that', 'token': 14},
 17: {'word': 'for', 'token': 15},
 18: {'word': 'as', 'token': 16},
 19: {'word': 'with', 'token': 17},
 20: {'word': 'by', 'token': 18},
 21: {'word': ')', 'token': 19},
 22: {'word': '(', 'token': 20},
 23: {'word': 'is', 'token': 21},
 24: {'word': '<unk>', 'token': 22},
 25: {'word': 'his', 'token': 23},
 26: {'word': 'from', 'token': 24},
 27: {'word': 'at', 'token': 25},
 28: {'word': 'were', 'token': 26},
 29: {'word': 'it', 'token': 27},
 30: {'word': '

In [21]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [22]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=4, num_workers=0, dataset_config=config['dataset_params'])

177


In [23]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

['ðə', 'ˈɑːɹdwʊlf', '(', 'pɹˈoʊɾəlz', 'kɹɪstˈɑːɾə', ')', 'ɪz', 'æn', 'ˌɪnsɛktˈɪvɚɹəs', 'mˈæməl', 'ˈɪn', 'ðə', 'fˈæmɪli', 'hˈaɪiːnˌɪdiː', ',', 'nˈeɪɾɪv', 'tuː', 'ˈiːst', 'ænd', 'sˈʌðɚn', 'ˈæfɹɪkə', '.', 'ɪts', 'nˈeɪm', 'mˈiːnz', '"', 'ˈɜːθ', '-', 'wˈʊlf', '"', 'ˈɪn', 'ˈæfɹɪkˌɑːnz', 'ænd', 'dˈʌtʃ', '.', 'ɪt', 'ɪz', 'ˈɔːlsoʊ', 'kˈɔːld', '"', 'mɑːnhˈɑːɹ', '-', 'dʒˈækəl', '"', '(', 'ˈæfɹɪkˌɑːnz', 'fɔːɹ', '"', 'mˈeɪn', '-', 'dʒˈækəl', '"', ')', ',', '"', 'tˈɜːmaɪt', '-', 'ˈiːɾɪŋ', 'haɪˈiːnə', '"', 'ænd', '"', 'sˈaɪvət', 'haɪˈiːnə', ',', '"', 'bˈeɪst', 'ˈɔn', 'ɪts', 'hˈæbɪt', 'ʌv', 'sᵻkɹˈiːɾɪŋ', 'sˈʌbstənsᵻz', 'fɹʌm', 'ɪts', 'ˈeɪnəl', 'ɡlˈænd', ',', 'ɐ', 'kˌæɹɪktɚɹˈɪstɪk', 'ʃˈɛɹd', 'wɪð', 'ðɪ', 'ˈæfɹɪkən', 'sˈaɪvət', '.', 'ʌnlˈaɪk', 'mˈɛni', 'ʌv', 'ɪts', 'ɹˈɛlətˌɪvz', 'ˈɪn', 'ðɪ', 'ˈɔːɹdɚ', 'kˈɑːɹnɪvˌoːɹə', ',', 'ðɪ', 'ˈɑːɹdwʊlf', 'dˈʌz', 'nˈɑːt', 'hˈʌnt', 'lˈɑːɹdʒ', 'ˈænɪməlz', '.', 'ɪt', 'ˈiːts', 'ˈɪnsɛkts', 'ænd', 'ðɛɹ', 'lˈɑːɹviː', ',', 'mˈeɪnli', 'tˈɜːmaɪts', ';', 'wˈʌn', 'ˈɑːɹdwʊlf', 'k