In [55]:
import sys

assert sys.version_info >= (3, 7)
from packaging import version
import tensorflow as tf

assert version.parse(tf.__version__) >= version.parse("2.8.0")

import numpy as np 

In [56]:
with open("cleaned_combined_Elytis.txt") as f:
    data = f.read() 

def clean_text(text):
    import re 
    """
    Removes all non-Greek characters and numbers from the given text.

    Args:
    text (str): The input text to process.

    Returns:
    str: The cleaned text containing only Greek characters.
    """
    # Define a regular expression pattern that matches Greek characters (uppercase and lowercase)
    # and spaces (optional if you want to preserve spaces)
    pattern = r'[^\u0370-\u03FF\u1F00-\u1FFF\s]'
    
    # Use re.sub to replace all characters that don't match the pattern with an empty string
    cleaned_text = re.sub(pattern, '', text)
    
    return cleaned_text
data = clean_text(data) 

In [57]:
print(set(data)) 
n_distinct_characters = len(set(data)) 
print(f"We should set the output layer to have {n_distinct_characters} distinct characters.")

{'ξ', 'ϋ', 'χ', 'ψ', 'ύ', 'ή', 'έ', 'ρ', 'κ', 'β', 'ε', 'ά', 'ώ', 'ΐ', 'ι', 'θ', 'ο', 'δ', 'ς', 'λ', 'ζ', 'η', 'σ', 'μ', 'φ', 'ω', 'τ', 'ν', 'ί', 'π', 'ΰ', ' ', 'υ', 'ό', 'α', 'ϊ', 'γ'}
We should set the output layer to have 37 distinct characters.


Next, we’ll use a tf.keras.layers.TextVectorization layer to encode this text. We set split="character" to get character-
level encoding rather than the default word-level encoding, and we use
standardize="lower" to convert the text to lowercase (which will simplify the
task):

In [58]:
text_vec_layer = tf.keras.layers.TextVectorization(split="character",
standardize="lower")
text_vec_layer.adapt([data])
encoded = text_vec_layer([data])[0] 

Each character is now mapped to an integer, starting at 2. The
TextVectorization layer reserved the value 0 for padding tokens, and it
reserved 1 for unknown characters. We won’t need either of these tokens fornow, so let’s subtract 2 from the character IDs and compute the number of
distinct characters and the total number of characters:

In [59]:
encoded -= 2  # drop tokens 0 (pad) and 1 (unknown), which we will not use
n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct chars = 37
dataset_size = len(encoded) 
dataset_size

489572

Next we can turn this very long sequence into
a dataset of windows that we can then use to train a sequence-to-sequence
RNN. The targets will be similar to the inputs, but shifted by one time step
into the “future”.
Let’s write a small utility function to convert a long sequence of character
IDs into a dataset of input/target window pairs:

In [60]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """
    Creates a TensorFlow dataset from a sequence of tokens.
    
    Args:
    sequence (list or np.array): The sequence of tokens.
    length (int): Length of each sequence window.
    shuffle (bool): Whether to shuffle the dataset.
    seed (int): Random seed for shuffling.
    batch_size (int): Batch size for training.
    
    Returns:
    tf.data.Dataset: The TensorFlow dataset ready for training.
    """
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    
    if shuffle:
        ds = ds.shuffle(buffer_size=100_000, seed=seed)
    
    ds = ds.map(lambda window: (window[:-1], window[1:]))  # Create (inputs, targets) pairs
    ds = ds.batch(batch_size)  # Batch the data
    ds = ds.prefetch(tf.data.AUTOTUNE)  # Prefetch for performance
    
    return ds


In [61]:
length = 100
tf.random.set_seed(42)

tf.random.set_seed(42)
train_set = to_dataset(encoded[:int(.9*len(encoded))], length=length, shuffle=True,
seed=42)
valid_set = to_dataset(encoded[int(.9*len(encoded)):int(.95*len(encoded))], length=length)
test_set = to_dataset(encoded[int(.95*len(encoded)):], length=length)


In [62]:
def save_datasets():
    # Save the train, validation, and test sets to separate directories
    train_set.save("train_set_directory")
    valid_set.save("valid_set_directory")
    test_set.save("test_set_directory")

    print("Datasets exported successfully.")
# save_datasets() 

In [63]:
def train_model(train_set, valid_set, n_tokens=37, embedding_dim=16, rnn_units=128, epochs=10):
    """
    Function to train a text generation model using an RNN with early stopping.

    Args:
    train_set (tf.data.Dataset): The training dataset.
    valid_set (tf.data.Dataset): The validation dataset.
    n_tokens (int): Number of distinct characters in the dataset.
    embedding_dim (int): Dimension of the embedding layer.
    rnn_units (int): Number of units in the RNN layer.
    epochs (int): Number of epochs to train the model.

    Returns:
    model (tf.keras.Model): The trained Keras model.
    history (History object): The training history object.
    """
    # Create the model architecture
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=embedding_dim),
        tf.keras.layers.GRU(rnn_units, return_sequences=True),
        tf.keras.layers.Dense(n_tokens, activation="softmax")
    ])

    # Compile the model
    model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

    # Define the checkpoint callback to save the best model based on validation accuracy
    model_ckpt = tf.keras.callbacks.ModelCheckpoint("my_ELytis_model", monitor="val_accuracy", save_best_only=True)

    # Define the early stopping callback
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",  # Monitor validation loss
        patience=3,          # Stop if no improvement for 3 epochs
        restore_best_weights=True  # Restore model weights from the epoch with the best validation loss
    )

    # Train the model with both callbacks
    history = model.fit(train_set, validation_data=valid_set, epochs=epochs, callbacks=[model_ckpt, early_stopping])

    return model, history

# Call the function with the updated output layer size
model, history = train_model(train_set, valid_set)


Epoch 1/10


KeyboardInterrupt: 

In [40]:
Elytis_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2), # no <PAD> or <UNK> tokens
    model
])

In [82]:
# Load the model from the saved directory
Elytis_model = tf.keras.models.load_model('my_ELytis_model')

In [79]:
# Example input text
input_text = "πρέπ"  # Example Greek seed text

# Preprocess the input text using the text vectorization layer
# Ensure the input is a batch (list) of strings
input_data = text_vec_layer([input_text])

# Predict the next character's probability distribution
y_proba = Elytis_model.predict(input_data)[0, -1]  # Get probabilities for the last character

# Choose the most probable character ID
y_pred = tf.argmax(y_proba).numpy()  # Convert to a NumPy integer

# Map the predicted character ID back to the actual character
char = text_vec_layer.get_vocabulary()[y_pred + 2]  # Adjust index offset if necessary

print("Predicted next character:", char)

Predicted next character: σ


In [45]:
log_probas = tf.math.log([[0.5, 0.4, 0.1]]) # probas = 50%, 40%, and 10%
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 2, 1, 0, 0, 1]])>

In [68]:
def next_char(text):
    """
    Predicts the next character for the given text using the trained model.

    Args:
    text (str): The input text to predict the next character for.

    Returns:
    str: The next predicted character.
    """

    input_data = tf.constant([text])  # Shape (1,)

    # Vectorize the input using the text vectorization layer
    vectorized_input = text_vec_layer(input_data)  # Shape (1, sequence_length)

    # Predict the next character's probability distribution
    y_proba = Elytis_model.predict(vectorized_input)[0, -1]  # Get probabilities for the last character

    # Choose the most probable character ID
    y_pred = tf.argmax(y_proba).numpy()  # Get the ID of the most probable character

    # Convert character ID back to character
    return text_vec_layer.get_vocabulary()[y_pred + 2]  # Offset for any special tokens


In [70]:
def next_char(text, temperature):
    """
    Generates the next character in the sequence based on the model's prediction and the given temperature.

    Args:
    text (str): The input text used as the seed for generating the next character.
    temperature (float): The temperature value used to control the randomness of the predictions.

    Returns:
    str: The next predicted character.
    """
    # Preprocess the input text to vector form
    input_data = text_vec_layer([text])  # Assuming text_vec_layer is the preprocessing layer

    # Predict the next character's probability distribution
    y_proba = Elytis_model.predict(input_data)[0, -1, :]  # Get probabilities for the last character

    # Apply temperature scaling to logits
    rescaled_logits = tf.math.log(y_proba) / temperature

    # Sample the next character using tf.random.categorical
    char_id = tf.random.categorical(rescaled_logits[None, :], num_samples=1)[0, 0].numpy()

    # Convert character ID back to the corresponding character
    return text_vec_layer.get_vocabulary()[char_id + 2]


In [74]:
def extend_text(text, temperature=0.1, n_chars=500):
    """
    Extends the input text by generating new characters using the trained model.

    Args:
    text (str): The initial text to extend.
    n_chars (int): The number of characters to generate.

    Returns:
    str: The extended text after generating new characters.
    """
    for _ in range(n_chars):
        next_character = next_char(text,temperature)
        text += next_character  # Append the generated character to the text
    return text


In [83]:
print(extend_text("Κάπου εδώ πρέπει"))

Κάπου εδώ πρέπειςαεεεαιςααι    ςο ι  ςεαοςαοαη  ςο ι ςοκ    ςεαοκ    ςο ςο   ςειςαεοσεοωοσεααοααο  ςεαο ςοςιςο  ςοςιςαο   ςιςοοο   ςοοααο   ςι ςοο  ςιι ςοο  ςεαο  ςο  ςο   ςο   ςο   ςοαειςαο   ςιίααινι ύονι χ υ     ςο  ςειςααειςαι ςήνησααεααο    ςο   ςο  ςεαο ςαο  ςο  ςοςοά ςοαήνθοααι  ςο ι ςοσααειςαοο   ςο  ςεαοκ     ςο ςοστοαζάμά ςοσαηνο    ςεαο ςαη  ςο  ςο ςοσααοκ   ςεαοκ     ςο ςο   ςειιςαο υ υ    ςεαο  ςοςιςαεαο  ςηααητηνοσεαη    ςο  ςο ςοσααοο  ςειςοά ςηο ςοσαεαοςαο  ςοααι μτ ι ςοαοκ ννοαειςαο   ςι ςο   ς
