<a href="https://colab.research.google.com/github/joash-alonso/joash-alonso/blob/main/LatestKeras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q keras-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.1/590.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m950.8/950.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.5/6.5 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os

# Select the Keras backend through the KERAS_BACKEND environment variable.
# The assignment is deliberately placed ahead of the keras_nlp / keras_core
# imports below so the setting is already in place when those packages load;
# do not move the imports above this line.
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras_nlp
import numpy as np
import tensorflow as tf
import keras_core as keras
from typing import Tuple, Dict

# Ask TensorFlow which GPU devices it can see. The bare name on the last line
# makes the notebook display the list; an empty list means no GPU was found.
physical_devices = tf.config.list_physical_devices("GPU")
physical_devices

Using TensorFlow backend


[]

In [None]:
# Download pretraining data.
# `get_file` returns the local path of the downloaded archive. The dataset
# directories are derived from that path instead of a hard-coded
# "~/.keras/datasets", so they stay correct when the Keras cache lives
# somewhere else (custom KERAS_HOME, or a fallback location when the home
# directory is not writable).
# NOTE(review): this assumes `extract=True` unpacks the archive next to the
# downloaded file, which is what the original hard-coded paths relied on too.
wiki_archive_path = keras.utils.get_file(
    origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
    extract=True,
)
# The trailing "" keeps a trailing path separator, so code that builds file
# names with `wiki_dir + "..."` continues to work.
wiki_dir = os.path.join(os.path.dirname(wiki_archive_path), "wikitext-103-raw", "")

# Download finetuning data.
sst_archive_path = keras.utils.get_file(
    origin="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
    extract=True,
)
sst_dir = os.path.join(os.path.dirname(sst_archive_path), "SST-2", "")

# Download vocabulary data. This is a plain text file, so the returned path is
# used directly.
vocab_file = keras.utils.get_file(
    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
)

Downloading data from https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
[1m191984949/191984949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 0us/step
Downloading data from https://dl.fbaipublicfiles.com/glue/data/SST-2.zip
[1m7439277/7439277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt
[1m231508/231508[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1us/step


In [None]:
# Preprocessing params.

# Batch sizes: the wikitext datasets are batched with the pretraining size,
# the SST-2 datasets with the finetuning size.
FINETUNING_BATCH_SIZE = 32
PRETRAINING_BATCH_SIZE = 128

# Passed to the tokenizer as its `sequence_length`.
SEQ_LENGTH = 128

# Passed to the mask generator as its `mask_selection_rate`.
MASK_RATE = 0.25

# NOTE(review): not referenced by the nearby cells (the mask generator is built
# with `mask_selection_length=1`); confirm whether it is still needed.
PREDICTIONS_PER_SEQ = 32

In [None]:
# Wikitext lines must be strictly longer than this (as measured by
# `tf.strings.length`) to be kept.
MIN_WIKI_LINE_LENGTH = 100


def load_sst_split(filename):
    """Load one SST-2 split as a batched dataset.

    Args:
        filename: Name of a tab-separated file inside `sst_dir`
            (e.g. "train.tsv"). The first row is treated as a header.

    Returns:
        A `tf.data.Dataset` of (string, int32) column pairs, batched with
        `FINETUNING_BATCH_SIZE`.
    """
    return tf.data.experimental.CsvDataset(
        os.path.join(sst_dir, filename),
        [tf.string, tf.int32],
        header=True,
        field_delim="\t",
    ).batch(FINETUNING_BATCH_SIZE)


def load_wiki_split(filename, min_length=MIN_WIKI_LINE_LENGTH):
    """Load one wikitext-103 split as a batched dataset of text lines.

    Args:
        filename: Name of a raw text file inside `wiki_dir`
            (e.g. "wiki.train.raw").
        min_length: Lines whose `tf.strings.length` is not strictly greater
            than this value are dropped.

    Returns:
        A `tf.data.Dataset` of string batches of size `PRETRAINING_BATCH_SIZE`.
    """
    return (
        tf.data.TextLineDataset(os.path.join(wiki_dir, filename))
        .filter(lambda line: tf.strings.length(line) > min_length)
        .batch(PRETRAINING_BATCH_SIZE)
    )


# Load SST-2.
sst_train_ds = load_sst_split("train.tsv")
sst_val_ds = load_sst_split("dev.tsv")

# Load wikitext-103 and filter out short lines.
wiki_train_ds = load_wiki_split("wiki.train.raw")
wiki_val_ds = load_wiki_split("wiki.valid.raw")

# Take a peek at the sst-2 dataset.
print(sst_train_ds.unbatch().batch(4).take(1).get_single_element())

(<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'hide new secretions from the parental units ',
       b'contains no wit , only labored gags ',
       b'that loves its characters and communicates something rather beautiful about human nature ',
       b'remains utterly satisfied to remain the same throughout '],
      dtype=object)>, <tf.Tensor: shape=(4,), dtype=int32, numpy=array([0, 0, 1, 0], dtype=int32)>)


In [None]:
# WordPiece tokenizer built from the downloaded vocabulary file. Text is
# lowercased and accent-stripped, and `sequence_length` is set to SEQ_LENGTH.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_file,
    lowercase=True,
    strip_accents=True,
    sequence_length=SEQ_LENGTH,
)

# Mask generator for masked-language-model inputs. It uses the tokenizer's
# "[MASK]" id and vocabulary size, and selects tokens at MASK_RATE.
# `mask_selection_length` is fixed at 1 here; the `preprocess` function in this
# notebook reads only the first column of the generated `mask_ids`.
masker = keras_nlp.layers.MaskedLMMaskGenerator(
    mask_token_id=tokenizer.token_to_id("[MASK]"),
    vocabulary_size=tokenizer.vocabulary_size(),
    mask_selection_length=1,
    mask_selection_rate=MASK_RATE,
)

In [None]:
def preprocess(input):
    """Mask a batch of token ids and return an (inputs, labels) pair.

    Args:
        input: A batch of token ids, as produced by `tokenizer.tokenize`.
            (The parameter name shadows the `input` builtin; it is kept so
            existing keyword callers are not broken.)

    Returns:
        A tuple `(token_ids, first_mask_ids)` where `token_ids` is the
        "token_ids" entry returned by the module-level `masker`, and
        `first_mask_ids` is column 0 of its "mask_ids" entry, i.e. one id
        per row of the batch.
    """
    masked = masker(input)
    # Only the masked token ids and the ids of the masked-out tokens are used;
    # the "mask_positions" and "mask_weights" entries are not needed here.
    return masked["token_ids"], masked["mask_ids"][:, 0]


# Tokenize every batch of wikitext lines.
X_train = wiki_train_ds.map(tokenizer.tokenize)

# Despite the name, this is a dataset of (token_ids, first_mask_ids) tuples.
masked_dict = X_train.map(preprocess)

In [None]:
# Display the first element of the masked pretraining dataset: a tuple of
# (token_ids, first_mask_ids) for one batch.
(
    masked_dict
    .take(1)
    .get_single_element()
)

(<tf.Tensor: shape=(128, 128), dtype=int32, numpy=
 array([[12411,  5558,  2053, ...,  1037, 18476,  2510],
        [ 1996,  2208,  2211, ...,     0,     0,     0],
        [ 2009,  2777,  2007, ...,     0,     0,     0],
        ...,
        [ 1996,  3103, 27184, ...,  2000,  1996,  3692],
        [ 1996,  4592,  3681, ...,     0,     0,     0],
        [  103,  2045,  1005, ...,  2062,  2004, 16781]], dtype=int32)>,
 <tf.Tensor: shape=(128,), dtype=int32, numpy=
 array([ 2890,  1012,  1024,  3523,  2169,  1012,  2025,  1012,  2004,
         2147,  2208,  3517,  2000,  1010,  3523,  3893,  3674,  1005,
         2007,  2000,  2011,  3540,  1996,  1997,  2508,  2600,  2031,
         3099,  2033,  2010,  3167,  1998, 25970,  2013,  1010,  4255,
         2079,  9922,  2020,  1996,  8055,  1010, 18031,  1010,  9433,
         2035, 12872,  2044,  2011,  2141,  5684,  1012,  2734,  4585,
        13267,  2381,  2212,  2147,  1037, 16607,  2006,  2020,  1010,
         1012,  1030,  1049,  2005