In [32]:
import re
import string

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Characters deleted by custom_standardization: ASCII punctuation plus "¿",
# with "[" and "]" left out of the set so those two are never stripped.
strip_chars = "".join(
    char for char in string.punctuation + "¿" if char not in ("[", "]")
)

def custom_standardization(input_string):
    """Lowercase the input and delete every character listed in ``strip_chars``.

    Args:
        input_string: The string value(s) to clean; passed straight to
            ``tf.strings.lower``.

    Returns:
        The result of ``tf.strings.regex_replace`` on the lowercased input,
        with each ``strip_chars`` character replaced by the empty string.
    """
    # Regex character class over strip_chars; re.escape keeps characters such
    # as "\" and "^" literal inside the brackets.
    removal_pattern = "[" + re.escape(strip_chars) + "]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, removal_pattern, "")

# Toy parallel corpus: three aligned sentence pairs.
eng_texts = ["Hello world.", "How are you?", "Goodbye!"]
spa_texts = ["Hola mundo.", "kemon acos??", "thak"]

# Vocabulary cap and output length shared by both vectorizers.
vocab_size, sequence_length = 15, 4

# Source side: no `standardize` argument, so the layer's default is used.
source_vectorization = layers.TextVectorization(
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)
# Target side: one position longer than the source and cleaned with
# custom_standardization instead of the layer default.
target_vectorization = layers.TextVectorization(
    standardize=custom_standardization,
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length + 1,
)

# Learn each vocabulary from its own corpus and show the resulting tokens.
source_vectorization.adapt(eng_texts)
print(source_vectorization.get_vocabulary())
target_vectorization.adapt(spa_texts)
print(target_vectorization.get_vocabulary())

['', '[UNK]', 'you', 'world', 'how', 'hello', 'goodbye', 'are']
['', '[UNK]', 'thak', 'mundo', 'kemon', 'hola', 'acos']


In [26]:
import tensorflow as tf

# Toy sentence pairs and the batch size used for this demonstration.
batch_size = 2
eng_texts = ["Hello world.", "How are you?", "Goodbye!"]
spa_texts = ["Hola mundo.", "kemon acos??", "thak"]

# Pair the sentences element-wise, then group the pairs into batches.
dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts)).batch(batch_size)

def format_dataset(eng_batch, spa_batch):
    """Placeholder map function: join each English/Spanish pair into one string.

    This stands in for real processing logic and exists only for illustration.

    Args:
        eng_batch: English strings for the current batch.
        spa_batch: Spanish strings for the current batch.

    Returns:
        The result of ``tf.strings.join`` over the two inputs, using
        " ||| " as the separator.
    """
    sentence_pair = [eng_batch, spa_batch]
    return tf.strings.join(sentence_pair, separator=" ||| ")

# Run format_dataset over every batch, with num_parallel_calls set to 4.
dataset = dataset.map(format_dataset, num_parallel_calls=4)

# take(-1) keeps the whole dataset; print each mapped batch under a header.
for batch in dataset.take(-1):
    print("Processed Batch:", batch.numpy(), sep="\n")


Processed Batch:
[b'Hello world. ||| Hola mundo.' b'How are you? ||| kemon acos??']
Processed Batch:
[b'Goodbye! ||| thak']


In [33]:

# Rebuild the dataset from the raw sentence pairs, one pair per batch.
batch_size = 1
dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts)).batch(batch_size)

def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs and split the target into input/label.

    Args:
        eng: Batch of English strings, fed to ``source_vectorization``.
        spa: Batch of Spanish strings, fed to ``target_vectorization``.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a dict with
        ``"english"`` (the vectorized English batch) and ``"spanish"`` (the
        vectorized Spanish batch without its last column). ``labels`` is the
        vectorized Spanish batch without its first column, i.e. shifted one
        position relative to ``features["spanish"]``.
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(spa)

    # Both slices index axis 1, so the inputs must be batched.
    decoder_inputs = target_ids[:, :-1]
    decoder_labels = target_ids[:, 1:]

    features = {"english": source_ids, "spanish": decoder_inputs}
    return (features, decoder_labels)

# Vectorize every batch through format_dataset, with num_parallel_calls set to 4.
dataset = dataset.map(format_dataset, num_parallel_calls=4)

# Inspect the first two batches: the raw (features, labels) tuple first,
# then each tensor converted to a NumPy array.
for batch in dataset.take(2):
    print("Processed Batch:", batch, sep="\n")
    print("English:", batch[0]["english"].numpy())
    print("Spanish Input:", batch[0]["spanish"].numpy())
    print("Spanish Output:", batch[1].numpy())
    print("\n>>>>>>>>")

Processed Batch:
({'english': <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[5, 3, 0, 0]], dtype=int64)>, 'spanish': <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[5, 3, 0, 0]], dtype=int64)>}, <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[3, 0, 0, 0]], dtype=int64)>)
English: [[5 3 0 0]]
Spanish Input: [[5 3 0 0]]
Spanish Output: [[3 0 0 0]]

>>>>>>>>
Processed Batch:
({'english': <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[4, 7, 2, 0]], dtype=int64)>, 'spanish': <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[4, 6, 0, 0]], dtype=int64)>}, <tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[6, 0, 0, 0]], dtype=int64)>)
English: [[4 7 2 0]]
Spanish Input: [[4 6 0 0]]
Spanish Output: [[6 0 0 0]]

>>>>>>>>


In [19]:
# Two hand-written sentence pairs used to probe the vectorizers directly.
train_pairs = [
    ("Hello world.", "Hola mundo."),
    ("How are you?", "¿Cómo estás?"),
]

# Vectorize each sentence on its own (unbatched) and print the ids together
# with how many ids were produced.
for eng, spa in train_pairs:
    eng_processed = source_vectorization(eng)
    print("Original English:", eng)
    print("Processed English:", eng_processed.numpy(), len(eng_processed.numpy()))

    spa_processed = target_vectorization(spa)
    print("Original Spanish:", spa)
    print("Processed Spanish:", spa_processed.numpy(), len(spa_processed.numpy()))
    print("\n")


Original English: Hello world.
Processed English: [5 3 0 0] 4
Original Spanish: Hola mundo.
Processed Spanish: [5 3 0 0 0] 5


Original English: How are you?
Processed English: [4 7 2 0] 4
Original Spanish: ¿Cómo estás?
Processed Spanish: [1 1 0 0 0] 5




In [None]:
# Example sentences
eng_sentence1 = "Hello world."
spa_sentence1 = "Hola mundo."

eng_sentence2 = "How are you?"
spa_sentence2 = "¿Cómo estás?"

# format_dataset slices its vectorized target along axis 1 (spa[:, :-1] and
# spa[:, 1:]), so it needs batched inputs. A bare Python string vectorizes to
# a rank-1 id tensor, on which that two-axis slice fails, so each sentence is
# wrapped into a batch of one before the call.
example1 = format_dataset(tf.constant([eng_sentence1]), tf.constant([spa_sentence1]))
example2 = format_dataset(tf.constant([eng_sentence2]), tf.constant([spa_sentence2]))

# Display the processed examples; each printed array keeps its leading batch
# axis of size 1.
print("Example 1:")
print("Input (English):", example1[0]["english"].numpy())
print("Output (Spanish, shifted):", example1[1].numpy())
print("\n")

print("Example 2:")
print("Input (English):", example2[0]["english"].numpy())
print("Output (Spanish, shifted):", example2[1].numpy())


In [34]:
import numpy as np
from tensorflow.keras.losses import categorical_crossentropy, sparse_categorical_crossentropy
from tensorflow.keras.utils import to_categorical

# Problem size for the toy example: two samples over three classes.
num_classes = 3
num_samples = 2

# Ground truth as integer class indices, and the same labels one-hot encoded.
true_labels = np.array([1, 2])
one_hot_labels = to_categorical(true_labels, num_classes=num_classes)

# Example per-class probabilities, one row per sample.
predictions = np.array([
    [0.2, 0.7, 0.1],
    [0.6, 0.2, 0.2],
])

# categorical_crossentropy is given the one-hot targets;
# sparse_categorical_crossentropy is given the integer indices.
loss_categorical = categorical_crossentropy(one_hot_labels, predictions)
loss_sparse_categorical = sparse_categorical_crossentropy(true_labels, predictions)

print("Categorical Crossentropy Loss (one-hot encoded):", loss_categorical.numpy())
print("Sparse Categorical Crossentropy Loss (integers):", loss_sparse_categorical.numpy())


Categorical Crossentropy Loss (one-hot encoded): [0.35667494 1.60943791]
Sparse Categorical Crossentropy Loss (integers): [0.35667494 1.60943791]


In [35]:
# Display the one-hot label matrix produced by to_categorical in the previous cell.
one_hot_labels

array([[0., 1., 0.],
       [0., 0., 1.]], dtype=float32)