<a href="https://colab.research.google.com/github/ko-i/study/blob/main/tensorflow2/code06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U tensorflow-text

In [None]:
import collections
import pathlib
import re
import string

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as tf_text

In [None]:
# Archive with the Stack Overflow questions used in the first part of the notebook.
data_url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'

# Fetch the archive (cached under the 'stack_overflow' directory) and untar it.
dataset = tf.keras.utils.get_file(
    fname='stack_overflow_16k.tar.gz',
    origin=data_url,
    untar=True,
    cache_dir='stack_overflow',
    cache_subdir='',
)

# Everything below works relative to the parent of the returned path.
dataset_dir = pathlib.Path(dataset).parent

In [None]:
dataset_dir

In [None]:
list(dataset_dir.iterdir())

In [None]:
train_dir = dataset_dir/'train'

In [None]:
list(train_dir.iterdir())

In [None]:
sample_file = train_dir/'python/1755.txt'

# Print one raw example. The encoding is given explicitly so the result does
# not depend on the platform's default text encoding.
with open(sample_file, encoding='utf-8') as f:
    print(f.read())

In [None]:
batch_size = 32
seed = 42

In [None]:
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size = batch_size, validation_split = 0.2, subset = 'training', seed = seed)

In [None]:
raw_train_ds

In [None]:
# Show up to ten questions of one training batch together with their labels.
for text_batch, label_batch in raw_train_ds.take(1):
    # Convert each tensor once instead of once per printed row.
    questions = text_batch.numpy()
    labels = label_batch.numpy()
    # Slicing keeps the loop safe if a batch holds fewer than ten examples.
    for question, label in zip(questions[:10], labels[:10]):
        print("Question: ", question)
        print("Label: ", label)

In [None]:
# List which integer label stands for which class name.
for i, label in enumerate(raw_train_ds.class_names):
    print("Label ", i, " corresponds to ", label)

In [None]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(train_dir, batch_size = batch_size, validation_split = 0.2, subset = 'validation', seed = seed)

In [None]:
test_dir = dataset_dir/'test'

raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(test_dir, batch_size = batch_size)

In [None]:
VOCAB_SIZE = 10000

binary_vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens = VOCAB_SIZE, output_mode = 'binary')

In [None]:
MAX_SEQUENCE_LENGTH= 250

int_vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(max_tokens = VOCAB_SIZE, output_mode = 'int', output_sequence_length = MAX_SEQUENCE_LENGTH)

In [None]:
train_text = raw_train_ds.map(lambda text, labels: text)

train_text

In [None]:
for item in train_text.take(1):
    print(item)

In [None]:
binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [None]:
def binary_vectorize_text(text, label):
    """Run ``text`` through ``binary_vectorize_layer`` and pass ``label`` through.

    A trailing axis is appended to ``text`` before the layer is called.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    expanded = tf.expand_dims(text, axis=-1)
    vectorized = binary_vectorize_layer(expanded)
    return vectorized, label

In [None]:
def int_vectorize_text(text, label):
    """Run ``text`` through ``int_vectorize_layer`` and pass ``label`` through.

    A trailing axis is appended to ``text`` before the layer is called.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    expanded = tf.expand_dims(text, axis=-1)
    vectorized = int_vectorize_layer(expanded)
    return vectorized, label

In [None]:
text_batch, label_batch = next(iter(raw_train_ds))

first_question, first_label = text_batch[0], label_batch[0]

In [None]:
print(first_label)
print(first_question)

In [None]:
print("'binary' vectorized question:", binary_vectorize_text(first_question, first_label)[0])

In [None]:
print("'int' vectorized question:", int_vectorize_text(first_question, first_label)[0])

In [None]:
# Fetch the vocabulary once and reuse it for every lookup below.
int_vocabulary = int_vectorize_layer.get_vocabulary()
# Index of the final entry, so the printed index always matches the lookup.
last_index = len(int_vocabulary) - 1

print("1289 ----> ", int_vocabulary[1289])
print("313 ----> ", int_vocabulary[313])
print("{} ----> ".format(last_index), int_vocabulary[last_index])
print("Vocabulary size: {}".format(len(int_vocabulary)))

In [None]:
# Apply the 'binary' vectorizer to the train, validation and test splits.
binary_train_ds, binary_val_ds, binary_test_ds = (
    raw_split.map(binary_vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

# Apply the 'int' vectorizer to the same three splits.
int_train_ds, int_val_ds, int_test_ds = (
    raw_split.map(int_vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
    """Return ``dataset`` with ``cache()`` then ``prefetch(AUTOTUNE)`` applied."""
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Cache and prefetch every vectorized split.
binary_train_ds, binary_val_ds, binary_test_ds = map(
    configure_dataset,
    (binary_train_ds, binary_val_ds, binary_test_ds),
)

int_train_ds, int_val_ds, int_test_ds = map(
    configure_dataset,
    (int_train_ds, int_val_ds, int_test_ds),
)

In [None]:
# A single 4-unit Dense layer trained on the 'binary' vectorized text.
binary_model = tf.keras.Sequential([tf.keras.layers.Dense(4)])

# The Dense layer has no activation, hence from_logits=True.
binary_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

history = binary_model.fit(
    binary_train_ds,
    validation_data=binary_val_ds,
    epochs=10,
)

In [None]:
def create_model(vocab_size, num_labels):
    """Build an Embedding -> Conv1D -> GlobalMaxPooling1D -> Dense model.

    Args:
        vocab_size: Input dimension of the Embedding layer.
        num_labels: Number of units of the final Dense layer.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True),
        layers.Conv1D(
            filters=64,
            kernel_size=5,
            strides=2,
            padding='valid',
            activation='relu',
        ),
        layers.GlobalMaxPooling1D(),
        layers.Dense(units=num_labels),
    ])

In [None]:
# Four output labels; the embedding input dimension is VOCAB_SIZE + 1.
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=4)

int_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

history = int_model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=5,
)

In [None]:
print(binary_model.summary())

In [None]:
print(int_model.summary())

In [None]:
# Evaluate each model on its own vectorized test split.
binary_loss, binary_acc = binary_model.evaluate(binary_test_ds)
int_loss, int_acc = int_model.evaluate(int_test_ds)

# Report both accuracies as percentages.
binary_report = "Binary model acc: {:2.2%}".format(binary_acc)
int_report = "Int model acc: {:2.2%}".format(int_acc)
print(binary_report)
print(int_report)

In [None]:
# Wrap the trained classifier so it accepts raw strings: vectorization first,
# then the model, then a sigmoid over the model outputs.
export_model = tf.keras.Sequential([
    binary_vectorize_layer,
    binary_model,
    tf.keras.layers.Activation('sigmoid')
])

# The final sigmoid means the outputs are no longer logits, so the loss must
# be built with from_logits=False; otherwise the reported loss is wrong.
export_model.compile(optimizer = 'adam', loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False), metrics = ['accuracy'])

# Evaluate directly on the raw (string) test split.
loss, acc = export_model.evaluate(raw_test_ds)
print("Acc: {:2.2%}".format(acc))

In [None]:
def get_string_labels(predicted_score_batch):
    """Translate a batch of per-class scores into class-name strings.

    For every row the index of the highest score (axis 1) is looked up in
    ``raw_train_ds.class_names``.
    """
    best_class_indices = tf.argmax(predicted_score_batch, axis=1)
    return tf.gather(raw_train_ds.class_names, best_class_indices)

In [None]:
inputs = [
    "how do I extract keys from a dict into a list?",  # python
    "debug public static void main(string[] args) {...}",  # java
]

In [None]:
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)

for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Label: ", label.numpy())

In [None]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin = DIRECTORY_URL + name)

In [None]:
parent_dir = pathlib.Path(text_dir).parent

list(parent_dir.iterdir())

In [None]:
def labeler(example, index):
    """Pair ``example`` with ``index`` cast to ``tf.int64``."""
    label = tf.cast(index, dtype=tf.int64)
    return example, label

In [None]:
# One labeled line dataset per file; the label is the file's position in FILE_NAMES.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    file_path = str(parent_dir / file_name)
    labeled_dataset = tf.data.TextLineDataset(file_path).map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [None]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000

In [None]:
labeled_data_sets

In [None]:
all_labeled_data = labeled_data_sets[0]

for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

In [None]:
all_labeled_data = all_labeled_data.shuffle(BUFFER_SIZE, reshuffle_each_iteration = False)

In [None]:
# Print the first ten (sentence, label) pairs of the shuffled data.
for sentence, sentence_label in all_labeled_data.take(10):
    print("Sentence: ", sentence.numpy())
    print("Label: ", sentence_label.numpy())

In [None]:
tokenizer = tf_text.UnicodeScriptTokenizer()

In [None]:
def tokenize(text, unused_label):
    lower_case = tf_text.case_fold_utf8(text)

    return tokenizer.tokenize(lower_case)

In [None]:
tokenized_ds = all_labeled_data.map(tokenize)

In [None]:
for text_batch in tokenized_ds.take(5):
  print("Tokens: ", text_batch.numpy())

In [None]:
tokenized_ds = configure_dataset(tokenized_ds)

In [None]:
vocab_dict = collections.defaultdict(lambda: 0)

for toks in tokenized_ds.as_numpy_iterator():
    for tok in toks:
        vocab_dict[tok] += 1

In [None]:
vocab = sorted(vocab_dict.items(), key = lambda x: x[1], reverse = True)

In [None]:
vocab = [token for token, count in vocab]

In [None]:
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab);
print("vocab size: ", vocab_size)
print("first five vocab: ", vocab[:100])

In [None]:
# Map every vocabulary token to an integer id, starting at 2.
keys = vocab
values = range(2, len(vocab) + 2)

init = tf.lookup.KeyValueTensorInitializer(
    keys=keys,
    values=values,
    key_dtype=tf.string,
    value_dtype=tf.int64,
)
num_oov_buckets = 1

vocab_table = tf.lookup.StaticVocabularyTable(
    initializer=init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
def preprocess_text(text, label):
    """Case-fold, tokenize and look up ``text`` in ``vocab_table``.

    Returns:
        A ``(vectorized_text, label)`` tuple; ``label`` is passed through
        unchanged.
    """
    tokens = tokenizer.tokenize(tf_text.case_fold_utf8(text))
    return vocab_table.lookup(tokens), label

In [None]:
# Run a single example through preprocess_text to inspect the result.
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text)
vectorized_text, example_label = preprocess_text(example_text, example_label)
# The label describes the vectorized sentence (was misspelled "Vectorizer").
print("Vectorized sentence: ", vectorized_text.numpy())

In [None]:
all_encoded_data = all_labeled_data.map(preprocess_text)

In [None]:
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)

In [None]:
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)

In [None]:
# Inspect the first validation batch and its first example.
first_validation_batch = next(iter(validation_data))
sample_text, sample_label = first_validation_batch

print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_label.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_label[0])

In [None]:
# Derive the size from the vocabulary itself so that re-running this cell
# cannot keep inflating it. The extra 2 matches the lookup-table ids, which
# start at 2 rather than 0.
vocab_size = len(vocab) + 2

In [None]:
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

In [None]:
model = create_model(vocab_size, num_labels=3)
model.compile(optimizer = 'adam', loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics = ['accuracy'])
history = model.fit(train_data, validation_data = validation_data, epochs = 3)

In [None]:
loss, acc = model.evaluate(validation_data)

print("Loss: ", loss)
print("Acc: ", acc)

In [None]:
preprocess_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=vocab_size, standardize = tf_text.case_fold_utf8, split=tokenizer.tokenize, output_mode = 'int', output_sequence_length=MAX_SEQUENCE_LENGTH)

In [None]:
preprocess_layer.set_vocabulary(vocab)

In [None]:
# Wrap the trained model so it accepts raw strings: preprocessing first,
# then the model, then a sigmoid over the model outputs.
export_model = tf.keras.Sequential([
    preprocess_layer,
    model,
    tf.keras.layers.Activation('sigmoid')
])

# The final sigmoid means the outputs are no longer logits, so the loss must
# be built with from_logits=False; otherwise the reported loss is wrong.
export_model.compile(optimizer = 'adam', loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), metrics=['accuracy'])

In [None]:
# Evaluate the exported model on the first VALIDATION_SIZE raw examples.
raw_test_examples = all_labeled_data.take(VALIDATION_SIZE)
test_ds = configure_dataset(raw_test_examples.batch(BATCH_SIZE))
loss, acc = export_model.evaluate(test_ds)

print("Loss: ", loss)
print("Acc: ", acc)

In [None]:
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]

In [None]:
predicted_scores = export_model.predict(inputs)
predicted_labels = tf.argmax(predicted_scores, axis=1)

for input, label in zip(inputs, predicted_labels):
    print("Q: ", input)
    print("L: ", label)

In [None]:
# Split the IMDB 'train' split 80/20 so that validation examples are never
# seen during training. Loading 'train' for both would make every validation
# metric a measurement on the training data.
train_ds = tfds.load('imdb_reviews', split = 'train[:80%]', batch_size = BATCH_SIZE, shuffle_files = True, as_supervised = True)
valid_ds = tfds.load('imdb_reviews', split = 'train[80%:]', batch_size = BATCH_SIZE, shuffle_files = True, as_supervised = True)

In [None]:
# Show the first five reviews of one validation batch with their labels.
for review_batch, label_batch in valid_ds.take(1):
    for i in range(5):
        review = review_batch[i]
        review_label = label_batch[i]
        print("Review: ", review)
        print("Label: ", review_label)

In [None]:
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens = VOCAB_SIZE,
    output_mode = 'int',
    output_sequence_length = MAX_SEQUENCE_LENGTH,
)

In [None]:
train_text = train_ds.map(lambda text, label: text)

In [None]:
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
    """Run ``text`` through ``vectorize_layer`` and pass ``label`` through.

    A trailing axis is appended to ``text`` before the layer is called.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    expanded = tf.expand_dims(text, axis=-1)
    vectorized = vectorize_layer(expanded)
    return vectorized, label

In [None]:
train_ds = train_ds.map(vectorize_text)
valid_ds = valid_ds.map(vectorize_text)

In [None]:
train_ds = configure_dataset(train_ds)
valid_ds = configure_dataset(valid_ds)

In [None]:
model = create_model(VOCAB_SIZE + 1, num_labels = 1)
model.summary()

In [None]:
model.compile(optimizer = 'adam', loss = tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics = ['accuracy'])

In [None]:
history = model.fit(train_ds, validation_data=valid_ds, epochs=3)

In [None]:
loss, acc = model.evaluate(valid_ds)

print('Loss: ', loss)
print('Acc: ', acc)

In [None]:
# Wrap the trained model so it accepts raw strings: vectorization first,
# then the model, then a sigmoid over the model output.
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    tf.keras.layers.Activation('sigmoid')
])

# The final sigmoid means the outputs are probabilities rather than logits,
# so the loss must be built with from_logits=False.
export_model.compile(optimizer = 'adam', loss = tf.keras.losses.BinaryCrossentropy(from_logits = False), metrics = ['accuracy'])

In [None]:
inputs = [
    "This is a fantastic movie.",
    "This is a bad movie.",
    "This movie was so bad that it was good.",
    "I will never say yes to watching this movie.",
]

In [None]:
predicted_scores = export_model.predict(inputs)

In [None]:
predicted_scores

In [None]:
# For each prediction show the row, its first element, and that element
# rounded and truncated to an integer.
for x in predicted_scores:
    first_score = x[0]
    print(x)
    print(first_score)
    print(round(first_score))
    print(int(first_score))

In [None]:
predicted_labels = [round(x[0]) for x in predicted_scores]

In [None]:
for input, label in zip(inputs, predicted_labels):
    print("Input: ", input)
    print("Label: ", label)