In [54]:
import numpy as np
import os
import tensorflow as tf
import re
print(tf.__version__)
assert(tf.__version__.startswith("2."))

from tensorflow import keras
from tensorflow.keras import layers, backend as K
from tensorflow.keras.models import Model
from tensorflow.keras import backend
assert(tf.__version__.startswith("2."))
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras import Input
from tensorflow.keras import Model, layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

# Tensorboard
from tensorflow.python.keras.callbacks import TensorBoard
import datetime

# Helper libraries
# from w266_common import utils, vocabulary, tf_embed_viz

# From sklearn 
from sklearn.model_selection import train_test_split

import time

2.3.1


# Loading in Open Subtitles Data (informal dataset)
The dataset was split into 500MB parts. Only the smallest part (~7MB) is loaded here.

In [11]:
# Directory that holds the split Open Subtitles part files
path = 'opensub_en_split/'
# Name of the part file to load from that directory
file = 'xat.txt' # the smallest file atm

In [87]:
# A token is either a run of word characters / apostrophes, or a single
# punctuation mark out of . , ! ? ; -
token_pattern = re.compile(r"[\w']+|[.,!?;-]")

# One token list per line of the input file
sentences = []
# `with` guarantees the file handle is closed even if reading or tokenizing raises
with open(os.path.join(path, file), "r") as f:
    for line in f:
        # Drop the trailing newline before tokenizing
        sentences.append(token_pattern.findall(line.rstrip("\n")))

In [95]:
# Peek at the first five tokenized lines
sentences[:5]

[['t', 'was', 'that', '?'],
 ['Come', 'on', '.'],
 ['What', 'is', 'it', '?'],
 ['I', 'think', "we'd", 'better', 'get', 'out', 'of', 'here', '.'],
 ['I', 'heard', 'something', '.']]

# Standardize words -- lower case characters, convert numbers to a standard code, etc
Also insert a `<s>` token at the beginning and end of every sentence.

In [83]:
# taken from utils.py from w266 common

def canonicalize_digits(word):
    """Replace every digit of a purely non-alphabetic token with the code "DG".

    Args:
        word: the token to canonicalize (a string).

    Returns:
        `word` unchanged if it contains at least one alphabetic character.
        Otherwise the token with each digit replaced by "DG"; if the result
        starts with "DG", commas (thousands separators) are removed as well.
    """
    # Tokens that contain letters are regular words, not numbers: leave them alone
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a plain string literal is an invalid escape sequence
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word, wordset=None, digits=True, unk_token="<unk>"):
    """Lower-case a token, optionally canonicalize digits, and map unknown words to a placeholder.

    Args:
        word: the token to canonicalize (a string).
        wordset: optional container of known words. When given, any token that
            is not in it (after canonicalization) is replaced by `unk_token`.
        digits: when True, tokens that are not already in `wordset` are passed
            through `canonicalize_digits`.
        unk_token: placeholder returned for out-of-vocabulary tokens. The
            original code read this value from `constants.UNK_TOKEN`, but no
            `constants` module is imported in this notebook, so that branch
            raised NameError.
            NOTE(review): "<unk>" is assumed to match the original constant - confirm.

    Returns:
        The canonicalized token, or `unk_token` if `wordset` is given and the
        canonicalized token is not a member of it.
    """
    word = word.lower()
    if digits:
        # Known words are returned before digit canonicalization can alter them
        if (wordset is not None) and (word in wordset): return word
        word = canonicalize_digits(word) # try to canonicalize numbers
    # `is None` instead of `== None`: identity is the intended check
    if (wordset is None) or (word in wordset):
        return word
    else:
        return unk_token

def canonicalize_words(words, **kw):
    """Apply `canonicalize_word` to every token of `words`.

    Args:
        words: iterable of tokens.
        **kw: keyword arguments forwarded unchanged to `canonicalize_word`.

    Returns:
        A new list with one canonicalized token per input token, in order.
    """
    canonical = []
    for token in words:
        canonical.append(canonicalize_word(token, **kw))
    return canonical

In [89]:
# Canonicalize every token and wrap each sentence in <s> boundary markers.
# The result is kept as a plain list of lists: the sentences have different
# lengths, and building an np.array from such ragged input is deprecated
# (recent NumPy versions raise ValueError unless dtype=object is requested).
# The cells below only need len() and iteration, which a list provides.
canonsentences = [['<s>'] + canonicalize_words(sentence) + ['<s>'] for sentence in sentences]
print('An example of pre-standardized sentence:\n  {}'.format(sentences[0]))
print('\n\nand after standardization:\n  {}'.format(canonsentences[0]))

An example of pre-standardized sentence:
  ['t', 'was', 'that', '?']


and after standardization:
  ['<s>', 't', 'was', 'that', '?', '<s>']


# Keras PreProcessing: StringLookup

In [90]:
# Size of corpus (the file name comes from `file`, so the message follows whichever part is loaded)
print('Length of {} corpus is {} sentences'.format(file, len(canonsentences)))

# Flatten the sentences into one long sequence of words
canonwords = [ word for sentence in canonsentences for word in sentence]
# The message used to say "brown corpus", a leftover from another notebook
print('Length of words in {} corpus is {}'.format(file, len(canonwords)))

Length of xat.txt corpus is 232356 sentences
Length of words in brown corpus is 2138978


In [92]:
# Word -> id lookup layer, capped at 10000 vocabulary entries
words_to_ids = StringLookup(max_tokens=10000)

# Build the vocabulary from the flattened corpus
words_to_ids.adapt(canonwords)

# Report how many entries the fitted vocabulary holds
V = len(words_to_ids.get_vocabulary())
print('Extracted vocabulary length is {}'.format(V))

# Inverse layer (id -> word) built on exactly the same vocabulary
ids_to_words = StringLookup(invert=True, vocabulary=words_to_ids.get_vocabulary())

Extracted vocabulary length is 10000


# Keras Dataset input utility

In [93]:
# Sequence / batching settings
max_time = 25   # number of words per sequence
buffer_size = 100
batch_size = 100

# Map every word of the corpus to its integer id
corpus_ids = words_to_ids(canonwords).numpy()

# Unshuffled split, in corpus order: train (80%), dev (10%), test (10%)
train_ids, dev_test_ids = train_test_split(
    corpus_ids, train_size=0.8, random_state=42, shuffle=False)
dev_ids, test_ids = train_test_split(
    dev_test_ids, train_size=0.5, random_state=42, shuffle=False)

# Training pairs: the label of each word is the word that follows it
x_ids_train, y_ids_train = train_ids[:-1], train_ids[1:]
ids_labels_dataset = tf.data.Dataset.from_tensor_slices((x_ids_train, y_ids_train))

# Group words into sequences of max_time, shuffle the sequences, then batch them
sequences_train = (
    ids_labels_dataset
    .batch(max_time, drop_remainder=True)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

# Same construction on the dev split, used for validation during fit
x_dev, y_dev = dev_ids[:-1], dev_ids[1:]
ids_labels_validation = tf.data.Dataset.from_tensor_slices((x_dev, y_dev))
sequences_val = (
    ids_labels_validation
    .batch(max_time, drop_remainder=True)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)



# Create the model. Set up TensorBoard. Train the model.


In [96]:
# The LSTM layer provides two arguments:
#   return_state (which makes the layer return lstm_state, lstm_last_time_state, cell_state)
#   return_sequences (which ensures that the returned 'lstm_state' object is the vector output
#   for all time positions in the sequence.)
#
#   Note that for the case (return_sequences = False, return_state = True) lstm_state and lstm_last_time_state
#   are the same tensor.
#
# Here is a good article illustrating the two options:
#    https://machinelearningmastery.com/return-sequences-and-return-states-for-lstms-in-keras/


# Let's build a model class to instantiate our model 
# ...and more closely control training / inference behavior.
class MyModel(keras.Model):
    """Stacked-LSTM language model.

    Pipeline: embedding -> n_layers x (LSTM -> dropout -> optional batch norm) -> dense.
    The dense layer has no activation, so the model outputs logits.

    Args:
        vocab_size: size of the embedding input and of the dense output layer.
        embedding_dim: dimension of the word embeddings.
        n_layers: number of stacked LSTM layers.
        rnn_units: number of units in every LSTM layer.
        hidden_activation: activation passed to every LSTM layer.
        dropout_rate: rate of the dropout layer that follows every LSTM layer.
        hidden_initializer: kernel initializer passed to every LSTM layer.
        batchnorm: when True, batch normalization is applied after each dropout.
    """

    def __init__(self, vocab_size, embedding_dim, n_layers, rnn_units, hidden_activation, dropout_rate,
                 hidden_initializer, batchnorm = True):
        # No arguments: `self` is already bound, passing it again hands the
        # base class a stray positional argument.
        super().__init__()
        self.embedding = keras.layers.Embedding(vocab_size, embedding_dim)
        self.n_layers = n_layers
        self.rnn = []
        self.norm = []
        self.dropout = []
        for _ in range(n_layers):
            # return_sequences=True is required when stacking LSTM layers so that
            # the next LSTM layer receives a three-dimensional sequence input.
            self.rnn.append(tf.keras.layers.LSTM(rnn_units, return_sequences=True, return_state=True,
                                                 activation = hidden_activation,
                                                 kernel_initializer = hidden_initializer))
            self.norm.append(tf.keras.layers.BatchNormalization())
            self.dropout.append(tf.keras.layers.Dropout(dropout_rate))

        self.do_batchnorm = batchnorm
        self.dense = keras.layers.Dense(vocab_size)

    def call(self, inputs, passin_states=None, return_state=False, training=False):
        """Run the model on a batch of id sequences.

        Args:
            inputs: batch of word-id sequences fed to the embedding layer.
            passin_states: optional list with one initial state per LSTM layer
                (as returned by a previous call with return_state=True). When
                None, every layer starts from its own initial state.
            return_state: when True, also return the final states of all layers.
            training: training-mode flag forwarded to every sub-layer.

        Returns:
            The logits, or `(logits, states)` when `return_state` is True, where
            `states` is a list of `(state_h, state_c)` tuples, one per LSTM layer.
        """
        x = self.embedding(inputs, training=training)

        # For every LSTM layer the return values are:
        #         x = the sequence of outputs from the layer,
        #   state_h = the final state vector (at the last time-step)
        #   state_c = the cell memory at the final step
        states = []
        for i in range(self.n_layers):
            if passin_states is None:
                layer_state = self.rnn[i].get_initial_state(x)
            else:
                layer_state = passin_states[i]
            x, state_h, state_c = self.rnn[i](x, initial_state=layer_state, training=training)
            # Pass the training flag explicitly, like every other layer call here,
            # instead of relying on Keras to infer it from the call context.
            x = self.dropout[i](x, training=training)
            if self.do_batchnorm:
                x = self.norm[i](x, training=training)
            states.append((state_h, state_c))

        # Output layer outputs logits rather than softmax as we didn't specify any activation
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        else:
            return x

In [97]:
# Size of the word vocabulary. Taken from the fitted StringLookup layer rather
# than hard-coded a second time: the output layer of the model and the
# prediction mask used at generation time must both match the lookup vocabulary.
vocab_size = len(words_to_ids.get_vocabulary())

# The embedding dimension
embedding_dim = 50

# Number of hidden layers
n_layers = 2

# Number of RNN units
rnn_units = 100

# Activation and kernel initializer of the LSTM layers
hidden_activation = 'relu'

hidden_initializer = 'he_uniform'

# Dropout
dropout_rate = 0.1

# Create model instance
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    n_layers = n_layers,
    rnn_units=rnn_units,
    hidden_activation = hidden_activation,
    hidden_initializer = hidden_initializer,
    dropout_rate = dropout_rate,
    batchnorm = True)

In [98]:
# Run the (still untrained) model on a single training batch to check tensor shapes.
# The loop variables stay defined afterwards; the loss sanity-check cell reuses them.
for input_example_batch, target_example_batch in sequences_train.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")
    print(input_example_batch.shape, target_example_batch.shape)

# Print out a model summary (possible now that the model has been called once on real input)
model.summary()

(100, 25, 10000) # (batch_size, sequence_length, vocab_size)
(100, 25) (100, 25)
Model: "my_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  500000    
_________________________________________________________________
lstm (LSTM)                  multiple                  60400     
_________________________________________________________________
lstm_1 (LSTM)                multiple                  80400     
_________________________________________________________________
batch_normalization (BatchNo multiple                  400       
_________________________________________________________________
batch_normalization_1 (Batch multiple                  400       
_________________________________________________________________
dropout (Dropout)            multiple                  0         
___________________________________________

In [99]:
# Inspect the loss function and how it averages over a batch.
# from_logits=True because the model outputs logits of shape [batch, seq_len, V], not softmax probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compute an example loss with eager execution; the loss object already reduces
# to a single scalar, so the printed shape is ().
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print('Shape of example batch loss: {}'.format(example_batch_loss.numpy().shape))
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

# Check which logarithm base the cross-entropy uses: for logits [0, 20, 0] and
# label 0 the loss is log(2 + e^20) ~= 20 in natural-log units, so a printed
# value of about 20 confirms base e.
checkbase = loss(np.array([[[0]]]), np.array([[[0., 20, 0.]]]))
print(checkbase.numpy())
# Sanity check: exp(mean loss) should be close to V for an untrained model.
# With a good initialization every predicted probability is q ~ 1/V, so the
# cross-entropy is -ln(1/V) = ln(V), and exp(ln(V)) = V.
print(tf.exp(mean_loss).numpy())

Shape of example batch loss: ()
Prediction shape:  (100, 25, 10000)  # (batch_size, sequence_length, vocab_size)
Mean loss:         9.210382
20.0
10000.421


In [100]:
# Compile with the logits-based cross-entropy defined above, the Adam optimizer
# and per-word accuracy as the reported metric
model.compile(
    loss=loss,
    optimizer='adam',
    metrics=['sparse_categorical_accuracy'],
)


# Checkpoints
When training a model over the course of several hours or days, it is important (vital!) to set up periodic checkpoints for your model, so that if something bad happens (a power outage, a timeout, loss of Colab resources, etc.) you will not need to restart training from scratch. This is the purpose of checkpoints.

The following cell shows an example of how to set this up and use it when fitting your model. In this case we are checkpointing every epoch, but you can also specify the frequency in a couple of other ways detailed in the documentation.

In [101]:
# Checkpoints go into this directory ...
checkpoint_dir = './training_checkpoints'
# ... one file prefix per epoch ({epoch} is filled in by the callback)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights, not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [102]:
# Number of passes over the training sequences
EPOCHS = 20

# Train, validating on the dev sequences and checkpointing through the callback
history = model.fit(
    sequences_train,
    epochs=EPOCHS,
    validation_data=sequences_val,
    callbacks=[checkpoint_callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [103]:
class OneStep(tf.keras.Model):
  """Wraps a language model so that text can be sampled one word at a time.

  Args:
    model: model called as `model(inputs=..., passin_states=..., return_state=True)`
      and returning `(logits, states)`.
    ids_to_words: lookup layer mapping token ids to words.
    words_to_ids: lookup layer mapping words to token ids.
    temperature: divisor applied to the logits before sampling.
  """

  def __init__(self, model, ids_to_words, words_to_ids, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.ids_to_words = ids_to_words
    self.words_to_ids = words_to_ids

    # Ids of the tokens that must never be generated: "" and "[UNK]".
    banned_ids = self.words_to_ids(['', '[UNK]'])[:, None]
    # One entry per vocabulary item: -inf at every banned id, 0 elsewhere.
    vocabulary_length = len(words_to_ids.get_vocabulary())
    banned_values = [-float('inf')] * len(banned_ids)
    sparse_mask = tf.SparseTensor(
        indices=banned_ids,
        values=banned_values,
        dense_shape=[vocabulary_length])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  # @tf.function
  def generate_one_step(self, input_words, passin_states=None):
    """Sample the next word for every string in `input_words`.

    Args:
      input_words: batch of strings; each one is split on whitespace into words.
      passin_states: model states returned by the previous step, or None.

    Returns:
      `(predicted_words, states)`: the sampled next word per batch entry and the
      states returned by the model.
    """
    # Split the strings into words and look up their token ids.
    tokens = tf.strings.split(input_words)
    token_ids = self.words_to_ids(tokens.to_tensor())

    # logits has shape [batch, word, next_word_logits]
    logits, states = self.model(
        inputs=token_ids, passin_states=passin_states, return_state=True)

    # Keep only the final time-position and scale by the temperature.
    last_step_logits = logits[:, -1, :] / self.temperature
    # Adding the mask puts -inf on "" and "[UNK]" so they cannot be sampled.
    masked_logits = last_step_logits + self.prediction_mask

    # Draw one token id per batch entry from the masked logits.
    sampled_ids = tf.squeeze(
        tf.random.categorical(masked_logits, num_samples=1), axis=-1)

    # Convert the sampled ids back to words and hand the states to the caller.
    return self.ids_to_words(sampled_ids), states

In [104]:
one_step_model = OneStep(model, ids_to_words, words_to_ids)

# Tokenize the prompt the same way the training corpus was tokenized (same
# regex, same canonicalization). The generator only splits on whitespace, so an
# un-tokenized prompt would turn "hello," into one token that cannot be in the
# vocabulary and is looked up as [UNK].
prompt = 'hello, my name is'
prompt_tokens = canonicalize_words(re.findall(r"[\w']+|[.,!?;-]", prompt))

start = time.time()
states = None
next_word = np.array([' '.join(prompt_tokens)])
result = [next_word]

# Feed each sampled word back in, carrying the LSTM states between steps
for n in range(100):
    next_word, states = one_step_model.generate_one_step(next_word, passin_states=states)
    result.append(next_word)

result = tf.strings.join(result, separator=' ')
end = time.time()

print('Generated language:')
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

print(f"\nRun time: {end - start}")

Generated language:
hello, my name is malik from the tan city sun watching us . <s> <s> sold , they ask me . <s> <s> stop there . <s> <s> two seconds ! <s> <s> you tough , jim ? <s> <s> he did not do with me soon for a observe it later why you got the interest for a strong quarters . <s> <s> now , right ? <s> <s> it was stupid - work at another . <s> <s> i can't touch him is a nuts lot ? <s> <s> he was trash each other loves corn acts of you in your world 

________________________________________________________________________________

Run time: 2.2687811851501465
