# Notebook for preprocessing Wikipedia (English) dataset

### Initilizing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
config = yaml.safe_load(open(config_path))

In [2]:
import phonemizer
from phonemizer.backend import EspeakBackend

import sys
import os

import os
import os
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = '/home/lemoi18/local/espeak-ng/lib/libespeak-ng.so.1.52.0.1'
global_phonemizer = phonemizer.backend.EspeakBackend(language='nb', preserve_punctuation=True,  with_stress=True)

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')  # You can change this to any other Norwegian tokenizer if needed

In [5]:
!pip install pandas singleton-decorator datasets accelerate nltk phonemizer sacremoses pebble

Collecting singleton-decorator
  Downloading singleton-decorator-1.0.0.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting pebble
  Downloading Pebble-5.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading mul

### Process dataset

In [3]:
from datasets import load_dataset
from huggingface_hub import login

# Perform the login with your Hugging Face API token
login('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')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/lemoi18/.cache/huggingface/token
Login successful


In [7]:
dataset = load_dataset("NbAiLab/NCC")

Downloading builder script:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/54.2k [00:00<?, ?B/s]

The repository for NbAiLab/NCC contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/NbAiLab/NCC.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0/46 [00:00<?, ?files/s]

Downloading data:   0%|          | 0.00/400M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/92 [00:00<?, ?it/s]

In [8]:
root_directory = "./NbAiLab_phoneme" # set up root directory for multiprocessor processing

In [19]:
!pip install -U sentence-transformers



Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [4]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
import re

# Load the model and tokenizer
model = SentenceTransformer('NbAiLab/nb-sbert-base')
tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-sbert-base')

# Define text
text = "This is a Norwegian boy. Dette er en norsk gutt."

# Tokenize the text
tokens = tokenizer.tokenize(text)

# Convert tokens back to string for splitting
tokenized_text = tokenizer.convert_tokens_to_string(tokens)

# Split the text into sentences using a simple regex for punctuation
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', tokenized_text)

# Print the individual sentences
print("Individual sentences:")
for sentence in sentences:
    print(sentence)

# Encode the sentences using the model
embeddings = model.encode(sentences)
print("Embeddings:")
print(embeddings)


Individual sentences:
This is a Norwegian boy.
Dette er en norsk gutt.
Embeddings:
[[ 1.4627721   0.48626402 -0.02692403 ...  1.173989    0.6043484
  -0.15306477]
 [ 1.6063936  -0.54897845 -0.20847474 ...  1.3207005   0.8890352
   0.2163613 ]]


In [18]:
# Test the tokenizer
text = "Dette er en norsk gutt"
tokenized_text = tokenizer.tokenize(text,padding=True, truncation=True)
print(f"Tokenized text: {tokenized_text}")

# Test the phonemizer separately
phonemized_text = global_phonemizer.phonemize([text], strip=True)
print(f"Phonemized text: {phonemized_text}")

#t = tokenizer.encode([tokenized_text])[0]
#t


Tokenized text: ['Dette', 'er', 'en', 'norsk', 'gut', '##t']
Phonemized text: ['dˌɛtːa ˌɛːr eːn nˈɔrʃk ɡˈʉtː']


In [16]:
from datasets import Dataset
import string
from phonemize import phonemize

# Sample data
data_chunk = Dataset.from_dict({'text': ["Dette er den første testen.", "This is the second sentance in english."]})

# Minimal phonemize function for debugging
def phonemize_debug(text, global_phonemizer, tokenizer):
    text = text.lower()  # Simplified normalization
    words = tokenizer.tokenize(text,padding=True, truncation=True)
    phonemes = [global_phonemizer.phonemize([word], strip=True)[0] if word not in string.punctuation else word for word in words]
    return {'words': words, 'phonemes': phonemes}

# Map function with minimal processing
processed_dataset = data_chunk.map(
    lambda t: phonemize_debug(t['text'], global_phonemizer, tokenizer),
    remove_columns=['text']
)

print(processed_dataset)
def reconstruct_words(tokens):
    words = []
    current_word = ""
    for token in tokens:
        if token.startswith("##"):
            current_word += token[2:]
        else:
            if current_word:
                words.append(current_word)
            current_word = token
    if current_word:
        words.append(current_word)
    return words

def phonemize_debug(text, global_phonemizer, tokenizer):
    try:
        print(f"Original text: {text}")
        text = text.lower()
        print(f"Normalized text: {text}")
        
        words = tokenizer.tokenize(text)
        print(f"Tokenized words: {words}")

        words = reconstruct_words(words)
        print(f" reconstructed Tokenized words: {words}")

        
        phonemes = [global_phonemizer.phonemize([word], strip=True)[0] if word not in string.punctuation else word for word in words]
        print(f"Phonemes: {phonemes}")
        
        return {'words': words, 'phonemes': phonemes}
    except Exception as e:
        print(f"Error: {e}")
        raise e

processed_dataset = data_chunk.map(
    lambda t: phonemize_debug(t['text'], global_phonemizer, tokenizer),
    remove_columns=['text']
)


word = "setning"
if word in tokenizer.get_vocab():
    print(f"'{word}' is in the vocabulary.")
else:
    print(f"'{word}' is not in the vocabulary.")


tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
encoded_tokens= tokenizer.encode(tokens[0])
print("Tokens:", tokens)
print("Token IDs:", token_ids)
print("encoded tokens:", encoded_tokens)

sample_text = "Dette er en test."
result = phonemize(sample_text)
print(result)



Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Dataset({
    features: ['words', 'phonemes'],
    num_rows: 2
})


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Original text: Dette er den første testen.
Normalized text: dette er den første testen.
Tokenized words: ['dette', 'er', 'den', 'første', 'teste', '##n', '.']
 reconstructed Tokenized words: ['dette', 'er', 'den', 'første', 'testen', '.']
Phonemes: ['dˈɛtːa', 'ˌɛːr', 'dˈeːn', 'fˈœrʃta', 'tˈɛstən', '.']
Original text: This is the second sentance in english.
Normalized text: this is the second sentance in english.
Tokenized words: ['this', 'is', 'the', 'second', 'sent', '##ance', 'in', 'engl', '##ish', '.']
 reconstructed Tokenized words: ['this', 'is', 'the', 'second', 'sentance', 'in', 'english', '.']
Phonemes: ['thˈiːs', 'ˈiːs', 'thˈeː', 'sˈeːkʊnn', 'sˈɛntɑnka', 'iːn', 'ˈɛŋlɪsh', '.']
'setning' is not in the vocabulary.
Tokens: ['ø']
Token IDs: [278]
encoded tokens: [101, 278, 102]
dɛt ɜːɹ ɛn tɛst 


In [None]:
import yaml
import os
import sys
from pyspark.sql import SparkSession
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from transformers import AutoTokenizer
from phonemizer.backend import EspeakBackend
from phonemizer import phonemize
from concurrent.futures import ProcessPoolExecutor
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

# Load config
config_path = "Configs/config.yml"  # you can change it to anything else
config = yaml.safe_load(open(config_path))

# Initialize environment variables and paths
sys.path.insert(0, '/home/lemoi18/StyleTTS2/local/lib/python3.10/dist-packages/transformer_engine-1.4.0.dev0+a950061-py3.10-linux-x86_64.egg')
os.environ["PYTHONPATH"] = "/home/lemoi18/StyleTTS2/local/lib/python3.10/dist-packages/transformer_engine-1.4.0.dev0+a950061-py3.10-linux-x86_64.egg:" + os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] = "/home/lemoi18/StyleTTS2/local/lib/python3.10/dist-packages/accelerate-0.29.0.dev0-py3.10.egg:" + os.environ.get("PYTHONPATH", "")
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = '/home/lemoi18/StyleTTS2/Modules/espeak-ng/build/src/libespeak-ng/libespeak-ng.so.1.52.0.1'

# Initialize phonemizer and tokenizer
global_phonemizer = EspeakBackend(language='nb', preserve_punctuation=True, with_stress=True)
tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')  # You can change this to any other Norwegian tokenizer if needed

# Load dataset for streaming
root_directory = "./NbAiLab_phoneme"  # set up root directory for multiprocessor processing
num_shards = 46  # Set the number of shards

In [None]:
from huggingface import load_dataset
from phonemize import phonemize_function

def process_shard(shard, root_directory):
    directory = os.path.join(root_directory, f"shard_{shard_index}")
    print(f"Processing {directory}")

    if os.path.exists(directory):
        print(f"Shard {shard_index} already exists!")
        return

    try:
        processed_shard = shard.map(
            lambda t: phonemize_function(t['text']),
            num_proc=96  # Adjust this number to the number of CPU cores you want to use
        )
    except Exception as e:
        print(f"Error in mapping for shard {shard_index}: {e}")
        return

    try:
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory for shard {shard_index}: {directory}")
    except Exception as e:
        print(f"Error creating directory for shard {shard_index}: {e}")
        return

    try:
        processed_shard.save_to_disk(directory)
        print(f"Shard {shard_index} saved to disk at {directory}.")
    except Exception as e:
        print(f"Error saving shard {shard_index} to disk: {e}")

if __name__ == "__main__":

    global_phonemizer = EspeakBackend(language='nb', preserve_punctuation=True, with_stress=True)
    tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')  # You can change this to any other Norwegian tokenizer if needed

    root_directory = "./NbAiLab_phoneme" 
    dataset = load_dataset("NbAiLab/NCC", trust_remote_code=True)
    dataset = dataset.filter(lambda example: example['lang_fasttext'] == 'no')
    for shard_index in range(len(dataset['train'].shards)):
        shard = dataset['train'].shard(num_shards=len(dataset['train'].shards), shard_index=shard_index)
        process_shard(shard)
        print(f"Completed processing of shard {shard_index}")








In [None]:
def process_shard(shard, shard_index, root_directory, shard_type):
    directory = os.path.join(root_directory, f"{shard_type}_{shard_index}")
    print(f"Processing {directory}")

    if os.path.exists(directory):
        print(f"{shard_type} shard {shard_index} already exists!")
        return

    try:
        processed_shard = shard.map(
            lambda t: {
                'text': phonemize_function(t['text'], t['lang_fasttext'])
            },
            num_proc=64  # Adjust this number to the number of CPU cores you want to use
        )
    except Exception as e:
        print(f"Error in mapping for {shard_type} shard {shard_index}: {e}")
        return

    try:
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory for {shard_type} shard {shard_index}: {directory}")
    except Exception as e:
        print(f"Error creating directory for {shard_type} shard {shard_index}: {e}")
        return

    try:
        processed_shard.save_to_disk(directory)
        print(f"{shard_type} shard {shard_index} saved to disk at {directory}.")
    except Exception as e:
        print(f"Error saving {shard_type} shard {shard_index} to disk: {e}")

if __name__ == "__main__":
    root_directory = "./NbAiLab_phoneme"
    dataset_root_path = '/home/lemoi18/.cache/huggingface/datasets/NbAiLab___ncc/NCC/0.0.0/7c1a935f39eef300b876e380a7f1936a28519bfdf7fc3e0be4f3db8eb006aa57'
    num_train_shards = 92  # total number of training shards
    num_val_shards = 3    # total number of validation shards

    # Process training shards
    for shard_index in range(num_train_shards):
        shard_path = f"{dataset_root_path}/ncc-train-{shard_index:05d}-of-00092.arrow"
        shard_dataset = load_dataset('arrow', data_files=shard_path, split='train')
        process_shard(shard_dataset, shard_index, root_directory, "train")
        print(f"Completed processing of train shard {shard_index}")

    # Process validation shards
    for shard_index in range(num_val_shards):
        shard_path = f"{dataset_root_path}/ncc-validation-{shard_index:05d}-of-00003.arrow"
        shard_dataset = load_dataset('arrow', data_files=shard_path, split='train')  # split might need to be adjusted based on how the data is structured
        process_shard(shard_dataset, shard_index, root_directory, "validation")
        print(f"Completed processing of validation shard {shard_index}")

Generating train split: 0 examples [00:00, ? examples/s]

Processing ./NbAiLab_phoneme/train_0


Map (num_proc=64):   0%|          | 0/224000 [00:00<?, ? examples/s]

In [9]:
from phonemize import phonemize_function

def process_shard(shard_index, shard, root_directory):
    directory = os.path.join(root_directory, f"shard_{shard_index}")
    print(f"Processing {directory}")

    if os.path.exists(directory):
        print(f"Shard {shard_index} already exists!")
        return

    try:
        processed_shard = shard.map(
            lambda t: phonemize_function(t['text']),
            num_proc=96  # Adjust this number to the number of CPU cores you want to use
        )
    except Exception as e:
        print(f"Error in mapping for shard {shard_index}: {e}")
        return

    try:
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory for shard {shard_index}: {directory}")
    except Exception as e:
        print(f"Error creating directory for shard {shard_index}: {e}")
        return

    try:
        processed_shard.save_to_disk(directory)
        print(f"Shard {shard_index} saved to disk at {directory}.")
    except Exception as e:
        print(f"Error saving shard {shard_index} to disk: {e}")

if __name__ == "__main__":
    dataset = dataset.filter(lambda example: example['lang_fasttext'] == 'no')

    # Split the dataset into shards and process each shard
    num_shards = 46  # Adjust the number of shards as needed
    for shard_index in range(num_shards):
        shard = dataset.shard(num_shards=num_shards, index=shard_index)
        process_shard(shard_index, shard, root_directory)

Processing ./NbAiLab_phoneme/shard_0
Shard 0 already exists!
Processing ./NbAiLab_phoneme/shard_1


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 1: ./NbAiLab_phoneme/shard_1


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 1 saved to disk at ./NbAiLab_phoneme/shard_1.
Processing ./NbAiLab_phoneme/shard_2


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 2: ./NbAiLab_phoneme/shard_2


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 2 saved to disk at ./NbAiLab_phoneme/shard_2.
Processing ./NbAiLab_phoneme/shard_3


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 3: ./NbAiLab_phoneme/shard_3


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 3 saved to disk at ./NbAiLab_phoneme/shard_3.
Processing ./NbAiLab_phoneme/shard_4


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 4: ./NbAiLab_phoneme/shard_4


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 4 saved to disk at ./NbAiLab_phoneme/shard_4.
Processing ./NbAiLab_phoneme/shard_5


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 5: ./NbAiLab_phoneme/shard_5


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 5 saved to disk at ./NbAiLab_phoneme/shard_5.
Processing ./NbAiLab_phoneme/shard_6


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 6: ./NbAiLab_phoneme/shard_6


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 6 saved to disk at ./NbAiLab_phoneme/shard_6.
Processing ./NbAiLab_phoneme/shard_7


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 7: ./NbAiLab_phoneme/shard_7


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 7 saved to disk at ./NbAiLab_phoneme/shard_7.
Processing ./NbAiLab_phoneme/shard_8


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 8: ./NbAiLab_phoneme/shard_8


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 8 saved to disk at ./NbAiLab_phoneme/shard_8.
Processing ./NbAiLab_phoneme/shard_9


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 9: ./NbAiLab_phoneme/shard_9


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 9 saved to disk at ./NbAiLab_phoneme/shard_9.
Processing ./NbAiLab_phoneme/shard_10


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 10: ./NbAiLab_phoneme/shard_10


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 10 saved to disk at ./NbAiLab_phoneme/shard_10.
Processing ./NbAiLab_phoneme/shard_11


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 11: ./NbAiLab_phoneme/shard_11


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 11 saved to disk at ./NbAiLab_phoneme/shard_11.
Processing ./NbAiLab_phoneme/shard_12


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 12: ./NbAiLab_phoneme/shard_12


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 12 saved to disk at ./NbAiLab_phoneme/shard_12.
Processing ./NbAiLab_phoneme/shard_13


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 13: ./NbAiLab_phoneme/shard_13


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 13 saved to disk at ./NbAiLab_phoneme/shard_13.
Processing ./NbAiLab_phoneme/shard_14


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 14: ./NbAiLab_phoneme/shard_14


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 14 saved to disk at ./NbAiLab_phoneme/shard_14.
Processing ./NbAiLab_phoneme/shard_15


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 15: ./NbAiLab_phoneme/shard_15


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 15 saved to disk at ./NbAiLab_phoneme/shard_15.
Processing ./NbAiLab_phoneme/shard_16


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 16: ./NbAiLab_phoneme/shard_16


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 16 saved to disk at ./NbAiLab_phoneme/shard_16.
Processing ./NbAiLab_phoneme/shard_17


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 17: ./NbAiLab_phoneme/shard_17


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 17 saved to disk at ./NbAiLab_phoneme/shard_17.
Processing ./NbAiLab_phoneme/shard_18


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 18: ./NbAiLab_phoneme/shard_18


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 18 saved to disk at ./NbAiLab_phoneme/shard_18.
Processing ./NbAiLab_phoneme/shard_19


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 19: ./NbAiLab_phoneme/shard_19


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 19 saved to disk at ./NbAiLab_phoneme/shard_19.
Processing ./NbAiLab_phoneme/shard_20


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 20: ./NbAiLab_phoneme/shard_20


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 20 saved to disk at ./NbAiLab_phoneme/shard_20.
Processing ./NbAiLab_phoneme/shard_21


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 21: ./NbAiLab_phoneme/shard_21


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 21 saved to disk at ./NbAiLab_phoneme/shard_21.
Processing ./NbAiLab_phoneme/shard_22


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 22: ./NbAiLab_phoneme/shard_22


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 22 saved to disk at ./NbAiLab_phoneme/shard_22.
Processing ./NbAiLab_phoneme/shard_23


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 23: ./NbAiLab_phoneme/shard_23


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 23 saved to disk at ./NbAiLab_phoneme/shard_23.
Processing ./NbAiLab_phoneme/shard_24


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 24: ./NbAiLab_phoneme/shard_24


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 24 saved to disk at ./NbAiLab_phoneme/shard_24.
Processing ./NbAiLab_phoneme/shard_25


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 25: ./NbAiLab_phoneme/shard_25


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 25 saved to disk at ./NbAiLab_phoneme/shard_25.
Processing ./NbAiLab_phoneme/shard_26


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 26: ./NbAiLab_phoneme/shard_26


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 26 saved to disk at ./NbAiLab_phoneme/shard_26.
Processing ./NbAiLab_phoneme/shard_27


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 27: ./NbAiLab_phoneme/shard_27


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 27 saved to disk at ./NbAiLab_phoneme/shard_27.
Processing ./NbAiLab_phoneme/shard_28


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 28: ./NbAiLab_phoneme/shard_28


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 28 saved to disk at ./NbAiLab_phoneme/shard_28.
Processing ./NbAiLab_phoneme/shard_29


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

Created directory for shard 29: ./NbAiLab_phoneme/shard_29


Saving the dataset (0/5 shards):   0%|          | 0/373418 [00:00<?, ? examples/s]

Shard 29 saved to disk at ./NbAiLab_phoneme/shard_29.
Processing ./NbAiLab_phoneme/shard_30


Map (num_proc=96):   0%|          | 0/373418 [00:00<?, ? examples/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Created directory for shard 43: ./NbAiLab_phoneme/shard_43


Saving the dataset (0/5 shards):   0%|          | 0/373417 [00:00<?, ? examples/s]

Shard 43 saved to disk at ./NbAiLab_phoneme/shard_43.
Processing ./NbAiLab_phoneme/shard_44


Map (num_proc=96):   0%|          | 0/373417 [00:00<?, ? examples/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [9]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError
from concurrent.futures import ProcessPoolExecutor


#### Note: You will need to run the following cell multiple times to process all shards because some will fail. Depending on how fast you process each shard, you will need to change the timeout to a longer value to make more shards processed before being killed.


In [None]:
max_workers = 1 # change this to the number of CPU cores your machine has 

with ProcessPoolExecutor(max_workers=max_workers) as executor:
    future_to_shard = {executor.submit(process_shard, i): i for i in range(num_shards)}

    for future in future_to_shard:
        shard_index = future_to_shard[future]
        try:
            future.result(timeout=500)
        except Exception as exc:
            print(f"Shard {shard_index} generated an exception: {exc}")

./NbAiLab_phoneme/shard_0
./NbAiLab_phoneme/shard_1
./NbAiLab_phoneme/shard_2
./NbAiLab_phoneme/shard_3
./NbAiLab_phoneme/shard_4

./NbAiLab_phoneme/shard_5./NbAiLab_phoneme/shard_6
./NbAiLab_phoneme/shard_7
./NbAiLab_phoneme/shard_8

./NbAiLab_phoneme/shard_9./NbAiLab_phoneme/shard_10
./NbAiLab_phoneme/shard_11
./NbAiLab_phoneme/shard_12
./NbAiLab_phoneme/shard_13
./NbAiLab_phoneme/shard_14
./NbAiLab_phoneme/shard_15

./NbAiLab_phoneme/shard_16./NbAiLab_phoneme/shard_17
./NbAiLab_phoneme/shard_18

./NbAiLab_phoneme/shard_19./NbAiLab_phoneme/shard_20
./NbAiLab_phoneme/shard_21
./NbAiLab_phoneme/shard_22
./NbAiLab_phoneme/shard_23
./NbAiLab_phoneme/shard_24
./NbAiLab_phoneme/shard_25
./NbAiLab_phoneme/shard_26
./NbAiLab_phoneme/shard_27
./NbAiLab_phoneme/shard_28
./NbAiLab_phoneme/shard_29
./NbAiLab_phoneme/shard_30

./NbAiLab_phoneme/shard_31./NbAiLab_phoneme/shard_32
./NbAiLab_phoneme/shard_33
./NbAiLab_phoneme/shard_34
./NbAiLab_phoneme/shard_35
./NbAiLab_phoneme/shard_36
./NbAiLab_p

### Collect all shards to form the processed dataset

In [10]:
from datasets import load_from_disk, concatenate_datasets

output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue

train_28 loaded
train_20 loaded
train_44 loaded
train_22 loaded
train_71 loaded
train_27 loaded
train_42 loaded
train_91 loaded
train_51 loaded
train_6 loaded
train_38 loaded
train_29 loaded
train_88 loaded
train_63 loaded
train_86 loaded
train_5 loaded
train_47 loaded
train_46 loaded
train_30 loaded
train_54 loaded
train_90 loaded
train_58 loaded
train_9 loaded
train_84 loaded
train_37 loaded
train_73 loaded
train_57 loaded
train_67 loaded
train_11 loaded
train_8 loaded
train_78 loaded
train_50 loaded
train_14 loaded
train_69 loaded
train_77 loaded
train_41 loaded
train_25 loaded
train_15 loaded
train_26 loaded
train_31 loaded
train_65 loaded
train_70 loaded
train_60 loaded
train_49 loaded
train_79 loaded
train_59 loaded
train_87 loaded
train_21 loaded
train_16 loaded
train_83 loaded
train_4 loaded
train_61 loaded
train_33 loaded
train_48 loaded
train_56 loaded
train_81 loaded
train_39 loaded
train_85 loaded
train_80 loaded
train_72 loaded
train_43 loaded
train_3 loaded
train_35 loade

In [11]:
dataset = concatenate_datasets(datasets)
dataset.save_to_disk(config['data_folder'])
print('Dataset saved to %s' % config['data_folder'])

Saving the dataset (0/214 shards):   0%|          | 0/17177223 [00:00<?, ? examples/s]

Dataset saved to nb


In [None]:
# check the dataset size
dataset

### Remove unneccessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains a lot of tokens that are not used in our dataset, so we need to remove these tokens. We also want to predict the word in lower cases because cases do not matter that much for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch. 

In [20]:
from simple_loader import FilePathDataset, build_dataloader
from transformers import AutoTokenizer
from datasets import load_from_disk
tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')  # You can change this to any other Norwegian tokenizer if needed
dataset = load_from_disk(config['data_folder'])

file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [21]:
special_token = config['dataset_params']['word_separator']

In [22]:
# get all unique tokens in the entire dataset
# get all unique tokens in the entire dataset

from tqdm import tqdm

unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

  0%|          | 0/134197 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

100%|█████████▉| 134188/134197 [28:54<00:00, 36.37it/s]  

	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 134197/134197 [28:59<00:00, 77.13it/s]


In [24]:
lower_tokens = set()
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word_lower = word.lower()
    lower_token_id = tokenizer.encode(word_lower, add_special_tokens=False)[0]
    lower_tokens.add(lower_token_id)
lower_tokens = list(lower_tokens)

  0%|          | 0/41528 [00:00<?, ?it/s]2024-06-13 22:05:42.862235: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 41528/41528 [00:09<00:00, 4552.69it/s] 


In [25]:
lower_tokens = (list(set(lower_tokens)))


In [27]:
token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word_lower = word.lower()
    new_t = tokenizer.encode(word_lower, add_special_tokens=False)[0]
    token_maps[t] = {'word': word_lower, 'token': lower_tokens.index(new_t)}

100%|██████████| 41528/41528 [00:06<00:00, 6154.19it/s]


In [31]:
# Convert the token_maps dictionary into a DataFramend
import pandas as pd
token_maps_df = pd.DataFrame.from_dict(token_maps, orient='index')

print("Total unique lowercase tokens:", len(lower_tokens))
token_maps_df

Total unique lowercase tokens: 26981


Unnamed: 0,word,token
100,[unk],22
106,!,1
107,"""",2
108,#,3
110,%,4
...,...,...
110829,funda,26978
110837,aufmerksam,26979
110843,fargo,6247
110844,manon,8180


In [32]:
import pickle
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


In [None]:
from simple_loader import FilePathDataset, build_dataloader
from transformers import AutoTokenizer, AutoModel
from datasets import load_from_disk
from sklearn.cluster import MeanShift
import numpy as np
import pickle
from tqdm import tqdm

# Load the tokenizer and dataset
tokenizer = AutoTokenizer.from_pretrained('NbAiLab/nb-bert-base')  # You can change this to any other Norwegian tokenizer if needed
model = AutoModel.from_pretrained('NbAiLab/nb-bert-base')
dataset = load_from_disk(config['data_folder'])

# Prepare the data loader
file_data = FilePathDataset(dataset)
loader = build_dataloader(file_data, num_workers=32, batch_size=128)
special_token = config['dataset_params']['word_separator']

# Get all unique tokens in the entire dataset
unique_index = [special_token]
for _, batch in enumerate(tqdm(loader)):
    unique_index.extend(batch)
    unique_index = list(set(unique_index))

# Get each token's lower case
lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    if word.lower() != word:
        t = tokenizer.encode(word.lower())[0]
        lower_tokens.append(t)
    else:
        lower_tokens.append(t)
lower_tokens = list(set(lower_tokens))

# Initialize lower_tokens and token_maps
lower_tokens = []
token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    word_lower = word.lower()
    new_t = tokenizer.encode(word_lower)[0]
    if new_t not in lower_tokens:
        lower_tokens.append(new_t)
    token_maps[t] = {'word': word_lower, 'token': lower_tokens.index(new_t)}

In [6]:
import torch
# Extract token embeddings
def get_token_embedding(token_id):
    input_ids = torch.tensor([[token_id]])
    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_states = outputs.last_hidden_state
        return last_hidden_states[0, 0, :].numpy()

embeddings = np.array([get_token_embedding(t) for t in lower_tokens])

# Perform mean shift clustering
mean_shift = MeanShift()
mean_shift.fit(embeddings)
labels = mean_shift.labels_

# Prune the tokenizer vocabulary
cluster_centers = mean_shift.cluster_centers_
pruned_tokens = []
for center in cluster_centers:
    distances = np.linalg.norm(embeddings - center, axis=1)
    closest_token = lower_tokens[np.argmin(distances)]
    pruned_tokens.append(closest_token)

# Create a new token map with pruned tokens
new_token_maps = {}
for t in unique_index:
    word = tokenizer.decode([t])
    word_lower = word.lower()
    new_t = tokenizer.encode(word_lower)[0]
    if new_t in pruned_tokens:
        new_token_maps[t] = {'word': word_lower, 'token': pruned_tokens.index(new_t)}

# Save the new token maps
with open(config['dataset_params']['token_maps'], 'wb') as handle:
    pickle.dump(new_token_maps, handle)
print('Token mapper saved to %s' % config['dataset_params']['token_maps'])

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [33]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [34]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

tensor([[ 29,  29,  29,  ...,  60,  43,  16],
        [ 64,  16, 156,  ...,  62,  16, 156],
        [ 67, 158,  16,  ...,  61,  60, 134],
        ...,
        [ 48, 156,  76,  ...,   0,   0,   0],
        [157,  86,  56,  ...,   0,   0,   0],
        [ 29,  29,  29,  ...,   0,   0,   0]])