In [None]:
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as tf_text

sys.path.append(os.path.join("..", "training"))
from tokenizer import CharTokenizer

In [None]:
# Constants and utilities

# Path to dataset files.
DATASET_PATH: str = os.path.join("..", "data", "processed-text")

# Size of dataset batches.
BATCH_SIZE: int = 256

# Length to which to pad sequences.
PAD_LENGTH: int = 128

# Vocabulary size (the data is expected to contain ASCII characters).
# NOTE(review): ASCII proper spans 128 code points, and the Unicode character
# tokenizer below can emit code points above 255 for non-ASCII input -- confirm
# the processed text never exceeds this range.
VOCAB_SIZE: int = 256

# Character-level tokenizer shared by the preprocessing functions.
tok = tf_text.UnicodeCharTokenizer()

# Functions to prepare input for model
def drop_newlines(strings):
    """Return `strings` with every line-feed character removed.

    Only LF is matched; carriage returns and other whitespace are left as-is.

    Args:
        strings: String tensor to clean.

    Returns:
        A string tensor in which each line feed has been replaced by the
        empty string.
    """
    return tf.strings.regex_replace(
        input=strings,
        pattern=r"\n",
        rewrite="",
    )

def prepare_input(strings, labels):
    """Turn a batch of raw strings into a dense, fixed-width token tensor.

    Newlines are stripped, each string is split into per-character tokens by
    the module-level tokenizer `tok`, and the resulting ragged batch is
    converted to a dense tensor that is `PAD_LENGTH` columns wide: shorter
    sequences are right-padded with 0 and longer ones are truncated.

    Args:
        strings: Batch of strings, as yielded by the text dataset.
        labels: Labels belonging to `strings`; passed through unchanged.

    Returns:
        A `(tokens, labels)` tuple where `tokens` has shape
        `(batch, PAD_LENGTH)` and `batch` equals the number of input strings.
    """
    # Remove newline characters from strings
    strings_clean = drop_newlines(strings)

    # Tokenize
    tokens = tok.tokenize(strings_clean)

    # Pad/truncate only the sequence axis. The batch axis is left unconstrained
    # (None): forcing it to BATCH_SIZE would zero-pad the final, smaller batch
    # of a dataset with extra rows that have no corresponding labels.
    padded = tokens.to_tensor(default_value=0, shape=(None, PAD_LENGTH))
    return padded, labels

In [None]:
# Training and validation datasets
def _load_split(split_name):
    """Load one split directory under DATASET_PATH and preprocess its batches."""
    raw_batches = keras.preprocessing.text_dataset_from_directory(
        os.path.join(DATASET_PATH, split_name),
        batch_size=BATCH_SIZE,
    )
    # Preprocess data: tokenize and pad each batch, prefetching two batches.
    return raw_batches.map(prepare_input).prefetch(2)


train_dataset = _load_split("train")
# The validation data is read from the "test" directory.
valid_dataset = _load_split("test")