# Notebook for preprocessing Wikipedia (Cantonese) dataset

### Initializing phonemizer and tokenizer

In [None]:
import yaml

# Path of the YAML configuration that drives every later cell.
config_path = "Configs/config_yue.yml" # you can change it to anything else

# Read the config inside a context manager so the file handle is closed as soon as
# parsing finishes; the encoding is pinned so non-ASCII values load the same on every platform.
with open(config_path, encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [None]:
from phonemize import phonemize
import ToJyutping

In [None]:
from transformers import AutoTokenizer
# The tokenizer name is read from the config; any other pre-trained tokenizer can be used instead,
# e.g. AutoTokenizer.from_pretrained("hon9kon9ize/bert-large-cantonese")
tokenizer_name = config['dataset_params']['tokenizer']
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
def phonemeizer(text):
    """Run `text` through ToJyutping.get_jyutping and return its result unchanged."""
    jyutping = ToJyutping.get_jyutping(text)
    return jyutping


# Quick sanity check on a string mixing Latin letters, punctuation and Chinese characters.
phonemize("hello ! 你好》啊嗎？加崙", phonemeizer, tokenizer)

### Process dataset

In [None]:
from datasets import load_dataset

# Load the Wikipedia dump and keep only its 'train' split (you can use another version of this dataset).
wiki_splits = load_dataset("wikipedia", "20220301.zh-yue")
dataset = wiki_splits['train']

In [None]:
# Convert the dataset loaded so far into a pandas DataFrame.
# NOTE(review): this runs before the `dataset.map(...)` phonemization cell, so the frame
# presumably has no 'phonemes' column — confirm before reusing `df` in later cells.
df = dataset.to_pandas()

In [None]:
import os

# Root directory for the multiprocess map cache and the phoneme vocabulary file.
root_directory = "./wiki_phoneme" # set up root directory for multiprocessor processing

# Actually create the directory: later cells write files into it and should not
# depend on it already existing.
os.makedirs(root_directory, exist_ok=True)

In [None]:
# Phonemize every article with 16 worker processes. The 'text' column is dropped from the
# result and the output is cached in an Arrow file under the root directory.
phonemized_cache = f"{root_directory}/phonemized_dataset.arrow"
dataset = dataset.map(
    lambda row: phonemize(row['text'], phonemeizer, tokenizer),
    remove_columns=['text'],
    num_proc=16,
    cache_file_name=phonemized_cache,
)

In [None]:
# Persist the processed dataset to the folder named in the config and report where it went.
data_folder = config['data_folder']
dataset.save_to_disk(data_folder)

print('Dataset saved to %s' % data_folder)

In [None]:

from datasets import load_from_disk

# Reload the dataset from the folder named in the config.
saved_folder = config['data_folder']
dataset = load_from_disk(saved_folder)

In [None]:
# Rebuild the DataFrame from the current (phonemized) dataset. The `df` created earlier
# in the notebook was taken before the phonemization step, so it cannot be relied on to
# contain the 'phonemes' column read below.
df = dataset.to_pandas()

# Flatten the per-row phoneme sequences into one series. explode() yields NaN for rows
# whose sequence is empty, and a NaN mixed with strings would make the sort below fail,
# so those placeholders are dropped first.
unique_phonemes = df['phonemes'].explode().dropna().unique()
phoneme_vocab = sorted(set(unique_phonemes))

# One vocabulary entry per line; the encoding is explicit so non-ASCII entries are
# written the same way on every platform.
with open(f"{root_directory}/phoneme_vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(phoneme_vocab))

In [None]:
# check the dataset size: a bare expression on the last line of a notebook cell is shown as the cell output
dataset

### Remove unnecessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains many tokens that are not used in our dataset, so we need to remove them. We also want to predict words in lowercase because case does not matter much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch.

In [None]:
from simple_loader import FilePathDataset, build_dataloader

# Wrap the dataset in FilePathDataset and build a batched loader over it.
file_data = FilePathDataset(dataset)
loader = build_dataloader(
    file_data,
    batch_size=128,
    num_workers=32,
)

In [None]:
# Word-separator setting read from the `dataset_params` section of the config.
# NOTE(review): `special_token` is not referenced again in the visible part of this
# notebook — confirm it is still needed.
special_token = config['dataset_params']['word_separator']

### Test the dataset with dataloader


In [None]:
from dataloader import build_dataloader
import yaml
from datasets import load_from_disk
from transformers import AutoTokenizer
from text_utils import symbols

# Smoke test: reload config, dataset and tokenizer, then pull a single batch from the
# training dataloader and print its first sample.
config_path = "Configs/config_yue.yml" # you can change it to anything else

# Context manager so the config file handle is closed after parsing; encoding pinned
# so non-ASCII values load the same on every platform.
with open(config_path, encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

dataset = load_from_disk(config['data_folder'])
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to
train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])
# NOTE(review): this sets `token_maps` on the loader object itself — confirm that is
# the object which reads it.
train_loader.token_maps = {}

# Only the first batch is needed, so take it straight from the loader's iterator.
words, labels, phonemes, input_lengths, masked_indices = next(iter(train_loader))

# Decode the first sample's token ids back to text, and map its labels to symbols.
print(tokenizer.decode(words[0]))
print([symbols[i] for i in labels[0]])