In [29]:
import os
import warnings

warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [30]:
import pickle
import sys

import evaluate
import keras_nlp
import numpy as np
import pandas as pd
import tensorflow as tf
import utils_preproc
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import GRU, Dense, Embedding, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

print(tf.__version__)

2.12.0


In [31]:
# Notebook-wide configuration.
LOAD_CHECKPOINT = False
MODEL_PATH = "translate_models/baseline"

# Source archive of the Spanish-English sentence pairs.
DATA_URL = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

# Seed shared by TensorFlow and the train/validation split.
SEED = 0

In [32]:
tf.random.set_seed(SEED)

In [33]:
# Download the corpus archive (or reuse the cached copy) and extract it.
path_to_zip = tf.keras.utils.get_file(
    fname="spa-eng.zip",
    origin=DATA_URL,
    extract=True,
)

# spa.txt is looked up under spa-eng/ in the directory holding the archive.
datasets_dir = os.path.dirname(path_to_zip)
path_to_file = os.path.join(datasets_dir, "spa-eng/spa.txt")
print("Translation data stored at:", path_to_file)

Translation data stored at: /home/jupyter/.keras/datasets/spa-eng/spa.txt


In [34]:
VOCAB_SIZE = 4096  # Limits parameters in model
MIN_TRAINING_SEQ_LEN = 46
BATCH_SIZE = 64

# Fetch and unpack the simplebooks archive into the Keras cache.
keras.utils.get_file(
    origin="https://storage.googleapis.com/asl-public/text/data/simplebooks.zip",
    extract=True,
)
data_dir = os.path.expanduser("~/.keras/datasets/simplebooks/")

# simplebooks-92 training split: keep only lines longer than
# MIN_TRAINING_SEQ_LEN, batch them, then shuffle the batches.
raw_train_ds = tf.data.TextLineDataset(
    data_dir + "simplebooks-92-raw/train.txt"
)
raw_train_ds = raw_train_ds.filter(
    lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN
)
raw_train_ds = raw_train_ds.batch(BATCH_SIZE).shuffle(buffer_size=256)

# simplebooks-92 validation split: same length filter, batched, not shuffled.
raw_val_ds = tf.data.TextLineDataset(
    data_dir + "simplebooks-92-raw/valid.txt"
)
raw_val_ds = raw_val_ds.filter(
    lambda x: tf.strings.length(x) > MIN_TRAINING_SEQ_LEN
)
raw_val_ds = raw_val_ds.batch(BATCH_SIZE)

In [35]:
# # Train tokenizer vocabulary
# print("Training the word piece tokenizer. This will take 5-10 mins...")
# vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
#     raw_train_ds,
#     vocabulary_size=VOCAB_SIZE,
#     lowercase=True,
#     reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
# )
# print("Training is complete!!")

In [36]:
# with open("vocab.txt", "w", encoding="utf-8") as f:
#     for token in vocab:
#         f.write(token + "\n")

In [37]:
# Read the saved vocabulary: one token per line, surrounding whitespace removed.
with open("vocab.txt", encoding="utf-8") as f:
    vocab = [token_line.strip() for token_line in f]

In [38]:
SEQ_LEN = 128
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)

In [39]:
data = pd.read_csv(
    path_to_file, sep="\t", header=None, names=["english", "spanish"]
)

In [40]:
def load_and_preprocess(path, num_examples):
    """Read a tab-separated sentence file and preprocess every sentence.

    Args:
        path: Path of a text file with one example per line, the sentences of
            an example separated by tab characters.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        A ``zip`` iterator yielding one tuple per tab-separated column, each
        tuple holding that column's preprocessed sentences in file order.
    """
    # Read from the `path` argument rather than the notebook-level
    # `path_to_file`, so the function honours what the caller passes in.
    # The encoding is explicit so non-ASCII text does not depend on the locale.
    with open(path, encoding="utf-8") as fp:
        lines = fp.read().strip().split("\n")

    sentence_pairs = [
        [utils_preproc.preprocess_sentence(sent) for sent in line.split("\t")]
        for line in lines[:num_examples]
    ]

    # Transpose [[col0, col1], ...] into (col0 sentences, col1 sentences).
    return zip(*sentence_pairs)

In [41]:
def load_and_integerize(path, num_examples=None):
    """Load sentence pairs from ``path`` and convert both sides to token ids.

    Args:
        path: Path of the tab-separated sentence file.
        num_examples: Number of leading lines to use; ``None`` uses all lines.

    Returns:
        A ``(input_tensor, target_tensor)`` pair produced by the notebook-level
        ``tokenizer``.
    """
    # The file's first column is treated as the target language and the
    # second column as the input language.
    target_sentences, input_sentences = load_and_preprocess(path, num_examples)

    input_ids = tokenizer(input_sentences)
    target_ids = tokenizer(target_sentences)
    return input_ids, target_ids

In [42]:
TEST_PROP = 0.2
NUM_EXAMPLES = None

In [43]:
input_tensor, target_tensor = load_and_integerize(path_to_file, NUM_EXAMPLES)

In [44]:
print(input_tensor.shape)
print(target_tensor.shape)

(118964, 128)
(118964, 128)


In [45]:
max_length_targ = target_tensor.shape[1]
print(max_length_targ)

128


In [46]:
# Hold out TEST_PROP of the examples for validation; seeded for repeatability.
splits = train_test_split(
    input_tensor.numpy(),
    target_tensor.numpy(),
    test_size=TEST_PROP,
    random_state=SEED,
)

# train_test_split returns [inputs_train, inputs_val, targets_train, targets_val].
(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = splits

In [47]:
(
    len(input_tensor_train),
    len(target_tensor_train),
    len(input_tensor_val),
    len(target_tensor_val),
)

(95171, 95171, 23793, 23793)

In [48]:
def create_dataset(_, decoder_input):
    """Build a dataset of ``(sequence, next-token target)`` pairs.

    Args:
        _: Ignored.
        decoder_input: 2-D integer token ids, one row per example.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(row, shifted_row)``, where
        ``shifted_row`` is the row moved one position to the left with a 0 in
        its last position.
    """
    token_ids = tf.convert_to_tensor(decoder_input)

    # Drop the first token of every row, then append a 0 column so the
    # target keeps the same width as the input.
    tail = token_ids[:, 1:]
    last_column = tf.zeros([token_ids.shape[0], 1], dtype=tf.int32)
    target = tf.concat((tail, last_column), axis=-1)

    return tf.data.Dataset.from_tensor_slices((decoder_input, target))

In [49]:
BUFFER_SIZE = len(target_tensor_train)
BATCH_SIZE = 16

In [50]:
# Training stream: shuffled over the whole training set, repeated
# indefinitely, and batched with any incomplete batch dropped.
train_pairs = create_dataset(input_tensor_train, target_tensor_train)
train_dataset = train_pairs.shuffle(BUFFER_SIZE).repeat()
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)

# Validation stream: a single ordered pass, batched the same way.
eval_pairs = create_dataset(input_tensor_val, target_tensor_val)
eval_dataset = eval_pairs.batch(BATCH_SIZE, drop_remainder=True)

In [51]:
# for d in train_dataset:
#     print(d[0])
#     print(d[1])
#     break

In [52]:
# tokenizer train

In [53]:
# this should output "Num GPUs Available: 1" if you have one GPU attached
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

Num GPUs Available:  1


In [54]:
# Data

# MIN_TRAINING_SEQ_LEN = 450

# # Model
# EMBED_DIM = 256
# FEED_FORWARD_DIM = 256
# NUM_HEADS = 3
# NUM_LAYERS = 2

In [55]:
# model build

In [56]:
"""here it is"""

import datetime
import os

import keras_nlp
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    Input,
    Layer,
    LayerNormalization,
    MultiHeadAttention,
)


class TimestampedModelCheckpoint(tf.keras.callbacks.Callback):
    """Callback that periodically saves the model under a timestamped name.

    Only the most recent ``max_to_keep`` checkpoints written by this callback
    instance are kept on disk; older ones are deleted.

    Args:
        save_dir: Directory that receives the checkpoints (created if missing).
        save_every: Save when the zero-based epoch index is a multiple of this
            value (epochs 0, ``save_every``, ``2 * save_every``, ...).
        max_to_keep: Number of most recent checkpoints to retain.

    Raises:
        ValueError: If ``save_every`` or ``max_to_keep`` is smaller than 1.
    """

    def __init__(self, save_dir, save_every=12, max_to_keep=2):
        super().__init__()
        if save_every < 1:
            raise ValueError("save_every must be >= 1")
        if max_to_keep < 1:
            raise ValueError("max_to_keep must be >= 1")
        self.save_dir = save_dir
        self.save_every = save_every
        self.max_to_keep = max_to_keep
        os.makedirs(save_dir, exist_ok=True)
        # Paths of checkpoints written so far, oldest first.
        self.saved_models = []

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint on every ``save_every``-th epoch and prune old ones.

        Args:
            epoch: Zero-based index of the epoch that just finished.
            logs: Metric results for the epoch; unused.
        """
        # Modulo, not floor division: `epoch // 12 == 0` only matched epochs
        # 0-11, so nothing was ever saved after the twelfth epoch.
        if epoch % self.save_every != 0:
            return

        timestamp = datetime.datetime.now().isoformat(timespec="seconds")
        # ":" is not valid in file names on every platform.
        safe_timestamp = timestamp.replace(":", "_")
        filename = f"model_{safe_timestamp}_epoch{epoch}"
        filepath = os.path.join(self.save_dir, filename)
        self.model.save(filepath)
        print(f">>> Saved model to {filepath}")
        self.saved_models.append(filepath)

        while len(self.saved_models) > self.max_to_keep:
            to_delete = self.saved_models.pop(0)
            print(f">>> Deleting old model: {to_delete}")
            tf.io.gfile.rmtree(to_delete)


class TransformerBlock(Layer):
    """Self-attention block with a position-wise feed-forward network.

    Both sub-layers (attention, feed-forward) are followed by dropout, a
    residual connection and layer normalization.

    Args:
        embed_dim: Width of the feature axis; used as the attention layer's
            ``key_dim`` and as the output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        use_causal_mask: When True (default) each position may attend only to
            itself and earlier positions. This is required for next-token
            training; without it a position can read the very token it is
            asked to predict. Pass False for bidirectional attention.
    """

    def __init__(
        self, embed_dim, num_heads, ff_dim, rate=0.1, use_causal_mask=True
    ):
        super().__init__()
        self.use_causal_mask = use_causal_mask
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                Dense(ff_dim, activation="relu"),
                Dense(embed_dim),
            ]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        """Apply self-attention and the feed-forward network to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.

        Returns:
            A tensor produced by the second layer normalization.
        """
        attn_output = self.att(
            inputs, inputs, use_causal_mask=self.use_causal_mask
        )
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Fixed sequence length the layer pads or truncates to; also the
            number of rows in the position embedding table.
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.maxlen = maxlen
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed ``x`` after forcing its last axis to length ``maxlen``.

        Args:
            x: 2-D batch of token ids.

        Returns:
            Token embeddings plus position embeddings.
        """
        missing = self.maxlen - tf.shape(x)[-1]

        def _pad_right():
            # Too short: extend the last axis with zeros up to maxlen.
            return tf.pad(
                x, paddings=[[0, 0], [0, missing]], constant_values=0
            )

        def _truncate():
            # Long enough: keep the first maxlen positions.
            return x[:, : self.maxlen]

        fixed_length_ids = tf.cond(missing > 0, _pad_right, _truncate)

        position_ids = tf.range(start=0, limit=self.maxlen, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(fixed_length_ids)
        return token_vectors + position_vectors


class Transformer:
    """Transformer language model bundled with tokenizer and sampling helpers.

    The constructor builds and compiles a Keras model made of a
    ``TokenAndPositionEmbedding`` layer, ``loop_n`` ``TransformerBlock`` layers
    and a softmax projection onto the vocabulary.

    Args:
        embed_dim: Embedding size for each token.
        num_heads: Number of attention heads.
        ff_dim: Hidden layer size of the feed-forward networks.
        maxlen: Fixed sequence length of the model input.
        loop_n: Number of stacked transformer blocks.
        vocab_size: Size of the output distribution / token embedding table.
        tokenizer: Optional tokenizer used by ``generate``. It can also be
            created later with ``train_tokenizer``.
    """

    # Token id that ends generation. NOTE(review): the original comment calls
    # this the EOS token, but with reserved tokens ["[PAD]", "[UNK]", "[BOS]"]
    # id 2 is "[BOS]" -- confirm the intended stop token.
    _STOP_TOKEN_ID = 2
    # Added before taking the log of probabilities so zeros stay finite.
    _LOG_EPSILON = 1e-9

    def __init__(
        self,
        embed_dim: int = 32,  # Embedding size for each token
        num_heads: int = 2,  # Number of attention heads
        ff_dim: int = 32,  # Hidden layer size in feed forward network inside transformer
        maxlen: int = 2048,
        loop_n: int = 12,
        vocab_size: int = 32000,
        tokenizer=None,
    ):
        self.history = None
        self.maxlen = maxlen
        inputs = Input(shape=(maxlen,))
        self.embedding_layer = TokenAndPositionEmbedding(
            maxlen, vocab_size, embed_dim
        )
        x = self.embedding_layer(inputs)
        for _ in range(loop_n):
            transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
            x = transformer_block(x)

        x = Dropout(0.1)(x)
        x = Dense(ff_dim, activation="relu")(x)
        x = Dropout(0.1)(x)
        outputs = Dense(vocab_size, activation="softmax")(x)

        self.model = keras.Model(inputs=inputs, outputs=outputs)
        self.model.compile(
            optimizer="adam",
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"],
        )
        self.tokenizer = tokenizer
        # Always define the attribute so `generate` can report a clear error
        # instead of failing on a missing attribute.
        self.start_packer = None
        if self.tokenizer is not None:
            self.start_packer = self._build_start_packer(self.tokenizer)

    def _build_start_packer(self, tokenizer):
        """Return a packer that prepends "[BOS]" and fixes length to maxlen."""
        return keras_nlp.layers.StartEndPacker(
            sequence_length=self.maxlen,
            start_value=tokenizer.token_to_id("[BOS]"),
        )

    def train_tokenizer(self, data, vocab_size=4096):
        """Train a WordPiece vocabulary on ``data`` and install the tokenizer.

        Args:
            data: Text dataset passed to
                ``keras_nlp.tokenizers.compute_word_piece_vocabulary``.
            vocab_size: Target vocabulary size.
        """
        vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
            data,
            vocabulary_size=vocab_size,
            lowercase=True,
            reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
        )
        tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
            vocabulary=vocab,
            sequence_length=self.maxlen,
            lowercase=True,
        )
        # Store on the wrapper (not on the Keras model): `generate` and
        # `_generate_step` read `self.tokenizer`.
        self.tokenizer = tokenizer
        self.start_packer = self._build_start_packer(tokenizer)

    def train(self, *, train_dataset, validation_data, steps_per_epoch, epochs):
        """Fit the model and keep the Keras history in ``self.history``.

        Args:
            train_dataset: Training data passed to ``model.fit``.
            validation_data: Validation data passed to ``model.fit``.
            steps_per_epoch: Number of batches per epoch.
            epochs: Number of epochs.
        """
        self.history = self.model.fit(
            train_dataset,
            steps_per_epoch=steps_per_epoch,
            validation_data=validation_data,
            epochs=epochs,
            callbacks=[TimestampedModelCheckpoint(save_dir="./variables")],
        )

    def generate(self, text: str, p: float = 0.2):
        """Continue ``text`` with top-p sampling, printing words as they come.

        Args:
            text: Prompt to continue.
            p: Cumulative-probability threshold passed to ``top_p_sample``.

        Returns:
            The prompt and all generated pieces joined by single spaces.

        Raises:
            ValueError: If no tokenizer has been provided or trained.
        """
        if self.tokenizer is None or self.start_packer is None:
            raise ValueError(
                "generate() needs a tokenizer: pass tokenizer= to the "
                "constructor or call train_tokenizer() first."
            )

        input_tokens = self.tokenizer([text])
        packed_tokens = self.start_packer(input_tokens)
        # Column index of the last non-zero token id in the packed prompt.
        token_length = tf.where(packed_tokens != 0)[-1, 1]
        initial_sequence_length = token_length + 1
        gen_ittr = self._generate_step(
            tokens=packed_tokens,
            p=p,
            start_index=int(initial_sequence_length.numpy()),
        )
        generated_text_parts = [text]
        for word in gen_ittr:
            generated_text_parts.append(word)
            print(word, end=" ")

        return " ".join(generated_text_parts)

    def _generate_step(self, tokens, p=0.2, start_index=1):
        """Yield detokenized tokens one at a time, filling ``tokens`` in place.

        Args:
            tokens: Packed prompt tensor; positions from ``start_index`` on
                are overwritten with sampled ids.
            p: Cumulative-probability threshold passed to ``top_p_sample``.
            start_index: First position to generate.

        Yields:
            The text of each sampled token.
        """
        # Work on a private copy so the writes below cannot alias the tensor.
        tokens = tokens.numpy().copy()
        vocab_len = len(self.tokenizer.vocabulary)
        for i in range(start_index, self.maxlen):
            # One forward pass per position. Keeping only ids the tokenizer
            # knows replaces the former resample-until-valid loop, which
            # re-ran the model and could spin forever.
            probs = self.model.predict([tokens], verbose=0)[
                0, i - 1, :vocab_len
            ]
            # The model ends in a softmax, so convert probabilities to
            # log-space before handing them to top_p_sample, which applies a
            # softmax itself.
            logits = tf.math.log(tf.constant(probs) + self._LOG_EPSILON)
            sampled_token = int(top_p_sample(logits, p).numpy())

            tokens[0][i] = sampled_token
            next_word = (
                self.tokenizer.detokenize([sampled_token]).numpy().decode()
            )
            yield next_word
            if sampled_token == self._STOP_TOKEN_ID:
                # `return` ends a generator; `raise StopIteration` inside one
                # is turned into RuntimeError (PEP 479).
                return


def _build_token_dataset(
    data_dir="./data/", batch_size=64, min_training_seq_len=512
):
    """Build the simplebooks-92 text datasets used to train a tokenizer.

    Example::

        train_ds, val_ds = _build_token_dataset()
        transformer.train_tokenizer(train_ds)

    Args:
        data_dir: Directory containing ``simplebooks-92-raw/``.
            NOTE(review): the archive is fetched with ``keras.utils.get_file``
            using its default cache location, while the files are read from
            ``data_dir`` -- confirm the two agree in your environment.
        batch_size: Number of lines per batch.
        min_training_seq_len: Lines whose length is not greater than this are
            dropped.

    Returns:
        A ``(raw_train_ds, raw_val_ds)`` tuple of batched ``tf.data`` datasets.
    """
    keras.utils.get_file(
        origin="https://storage.googleapis.com/asl-public/text/data/simplebooks.zip",
        extract=True,
    )
    data_dir = os.path.expanduser(data_dir)

    def _load_split(filename):
        # One text line per element, filtered by length, then batched.
        split_path = os.path.join(data_dir, "simplebooks-92-raw", filename)
        return (
            tf.data.TextLineDataset(split_path)
            .filter(lambda x: tf.strings.length(x) > min_training_seq_len)
            .batch(batch_size)
        )

    # Only the training split is shuffled (at batch granularity).
    raw_train_ds = _load_split("train.txt").shuffle(buffer_size=256)
    raw_val_ds = _load_split("valid.txt")
    return raw_train_ds, raw_val_ds


def top_p_sample(logits, p=0.2):
    """Sample a token id with top-p (nucleus) sampling.

    The candidate set is the smallest group of most probable tokens whose
    cumulative probability exceeds ``p``, including the token that crosses the
    threshold. At least one token and at most all tokens are kept.

    Args:
        logits: 1-D float tensor of unnormalized log-probabilities.
        p: Cumulative-probability threshold.

    Returns:
        A scalar integer tensor holding the sampled index into ``logits``.
    """
    probs = tf.nn.softmax(logits)
    # Sort once and reuse the permutation for the probabilities.
    sorted_indices = tf.argsort(probs, direction="DESCENDING")
    sorted_probs = tf.gather(probs, sorted_indices)
    cumulative_probs = tf.cumsum(sorted_probs)

    # Tokens whose running total has not yet exceeded p, plus the one that
    # crosses it. Slicing at the crossing index itself dropped that token.
    # Counting also behaves for p >= 1, where no entry exceeds the threshold.
    not_exceeding = tf.math.count_nonzero(
        cumulative_probs <= p, dtype=tf.int32
    )
    keep = tf.minimum(not_exceeding + 1, tf.size(sorted_indices))

    top_p_indices = sorted_indices[:keep]
    top_p_logits = tf.gather(logits, top_p_indices)
    # Sample a position within the candidate set, then map it back to an id.
    sampled_relative = tf.random.categorical([top_p_logits], num_samples=1)[
        0, 0
    ]
    return top_p_indices[sampled_relative]

In [57]:
EMBEDDING_DIM = 512

model = Transformer(
    embed_dim=EMBEDDING_DIM,
    num_heads=16,
    ff_dim=EMBEDDING_DIM * 2,
    maxlen=max_length_targ,
    loop_n=8,
    vocab_size=VOCAB_SIZE,
    tokenizer=tokenizer,
)

In [None]:
model.train(
    train_dataset=train_dataset,
    validation_data=eval_dataset,
    steps_per_epoch=128,
    epochs=128,
)

Epoch 1/128


In [None]:
model.generate("hello, Tom. Today")