<a href="https://colab.research.google.com/github/kyle-gao/ML_ipynb/blob/kyle-gao-patch-1/TF_Transformer_Calculus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Copyright 2020 Yi Lin(Kyle) Gao


##### Copyright 2019 The TensorFlow Authors.

In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time

# Training a transformer for symbolic mathematics (differentiation) following the Transfomer tutorial https://www.tensorflow.org/tutorials/text/transformer
The dataset used is https://github.com/deepmind/mathematics_dataset.

# Data pipeline with tfds

In [3]:
train, val = tfds.load(
    'math_dataset/calculus__differentiate',
    split=['train[:200000]', 'test[:5000]'],
    as_supervised=True)

[1mDownloading and preparing dataset math_dataset/calculus__differentiate/1.0.0 (download: 2.17 GiB, generated: Unknown size, total: 2.17 GiB) to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…











HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0.incomplete9JCSWZ/math_dataset-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1999998.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0.incomplete9JCSWZ/math_dataset-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

[1mDataset math_dataset downloaded and prepared to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0. Subsequent calls will reuse this data.[0m


In [4]:
#since we are dealing with mathemetics, we expect a low vocabulary size
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (question.numpy() for question, answer in train), target_vocab_size=2**12)

In [5]:
#Taking a look at the dataset
test = list(train.take(1))
test

[(<tf.Tensor: shape=(), dtype=string, numpy=b'Find the first derivative of -4*a**4*v - 84*a**4 - v - 226 wrt a.'>,
  <tf.Tensor: shape=(), dtype=string, numpy=b'-16*a**3*v - 336*a**3'>)]

In [6]:
def encode(question,answer):
  #Adds start token (tokenizer.vocab_size) and end token (tokenizer.vocab_size + 1) to (question,answer)

  question = [tokenizer.vocab_size] + tokenizer.encode(question.numpy()) + [tokenizer.vocab_size + 1] 
  answer = [tokenizer.vocab_size] + tokenizer.encode(answer.numpy()) + [tokenizer.vocab_size + 1] 

  return question,answer

def tf_encode(question, answer):
  #We have to wrap encode in a tf.py_function() since the dataset elements do not have 
  question, answer = tf.py_function(encode, [question, answer], [tf.int64, tf.int64])
  question.set_shape([None])
  answer.set_shape([None])

  return question, answer

def tf_interleave_encode(question, answer):
  #We have to wrap encode in a tf.py_function() since the dataset elements do not have 
  question, answer = tf.py_function(encode, [question, answer], [tf.int64, tf.int64])
  question.set_shape([None])
  answer.set_shape([None])

  return tf.data.Dataset.from_tensors((question, answer))

In [24]:
train_dataset = preprocess(train, 64).take(1000)

In [8]:
max_length_question = 35
max_length_answer = 25
def filter_max_length(x, y, max_length_question = max_length_question, max_length_answer = max_length_answer):
  return tf.logical_and(tf.size(x) <= max_length_question,
                        tf.size(y) <= max_length_answer)

In [9]:
def preprocess(dataset,batch_size, pad_len_question = max_length_question, pad_length_answer = max_length_answer):
  dataset = dataset.cache()
  #dataset = dataset.map(tf_encode)
  dataset = dataset.interleave(tf_interleave_encode, num_parallel_calls = tf.data.experimental.AUTOTUNE)
  dataset = dataset.filter(filter_max_length)
  dataset = dataset.shuffle(10000)
  #dataset = dataset.padded_batch(batch_size)
  pad = tf.cast(0,tf.int64)
  dataset = dataset.padded_batch(batch_size, drop_remainder = True, padded_shapes = ([pad_len_question],[pad_length_answer]), padding_values = pad)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset

# Positional Encoding and Masks


In [10]:
def positional_encoding(pos, d_model):

  pos_enc = np.zeros((1, pos , d_model))

  for p in range(pos):
    for i in range(d_model//2):
      angles = p / np.power(10000, (2 * i) / np.float32(d_model))
      pos_enc[:,p,2*i] = np.sin(angles)
      pos_enc[:,p,2*i+1] = np.cos(angles)
    if d_model % 2 == 1:
      # if d_model is odd loop doesn't hit last even index
      angles = p / np.power(10000, (2 * d_model) / np.float32(d_model))
      pos_enc[:,p,d_model-1] = np.sin(angles)
  return tf.cast(pos_enc, tf.float32)

In [11]:
def padding_mask(seq):
  #0's where the sequence is padded, 1 where it is not

  mask = 1-tf.cast(tf.math.equal(seq,0),tf.float32)
  return mask[:,:,tf.newaxis,tf.newaxis] #(batch, seq_len, 1, 1)

In [12]:
def forward_mask(seq):
  seq_len = tf.shape(seq)[1]
  """Returns a combined look_ahead_mask (lower triangular 1s)
    and padding mask"""
  look_ahead_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) 
  look_ahead_mask = look_ahead_mask [tf.newaxis,:,:,tf.newaxis]

  padded_mask = padding_mask(seq)

  return padded_mask*look_ahead_mask  #(batch, seq_len, seq_len, 1)

# The Transformer

In [13]:
#We will use tf.einsum to save ourselfs 3 tf.transpose operations during the calculation of the attention
#This has the additional advantange of facilitating implementation of differente types of attention kernel.

q = tf.random.uniform((5000,50,8, 100))
k = tf.random.uniform((5000,45,8, 100))


tstart = time.time()
qt = tf.transpose(q, perm=[0, 2, 1, 3])
kt = tf.transpose(k, perm=[0, 2, 1, 3])
qk = tf.matmul(qt, kt, transpose_b=True)
qk = tf.transpose(qk, perm=[0, 2, 1, 3])
tend = time.time()

print("With matmul :", tend-tstart)

tstart = time.time()
qkeinsum = tf.einsum("mlhd,mjhd->mljh",q, k)
tend = time.time()
print("With einsum :", tend-tstart)



With matmul : 1.544126033782959
With einsum : 0.007376670837402344


In [14]:
#the encoder qk has shape (batch, seq_len_question, seq_len_question, num_heads) enc_padding_mask goes here
#the 1st decoder qk has shape (batch, seq_len_answer, seq_len_answer, num_heads) dec_forward mask goes here
#the 2nd decoder qk has shape (batch, seq_len_answer, seq_len_question, num_heads) dec_padding_mask goes here

seq_question = tf.random.uniform((5000,45))
seq_answer = tf.random.uniform((5000,50))
pad_mask = padding_mask(seq_answer)
print(tf.shape(pad_mask))
for_mask = forward_mask(seq_answer)
print(tf.shape(for_mask))
print(tf.shape(qkeinsum*pad_mask))

tf.Tensor([5000   50    1    1], shape=(4,), dtype=int32)
tf.Tensor([5000   50   50    1], shape=(4,), dtype=int32)
tf.Tensor([5000   50   45    8], shape=(4,), dtype=int32)


In [15]:
class MultiHeadAttention(tf.keras.layers.Layer):

  def __init__(self,d_model,num_heads):

    super().__init__()

    self.d_model = d_model
    self.num_heads = num_heads
    assert d_model%self.num_heads == 0

    self.depth=d_model//self.num_heads
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self,x, batch_size):

    """Split the last dimension into (num_heads,depth)

    Arguments:
    x -- A tokenized sequence (batch_size, seq_len, d_model)
    
    Returns:
    A tokenized sequence with dimensions (batch_size, seq_len, num_heads, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))

    return x 

  def call(self,q,k,v,mask=None):

    batch_size = tf.shape(q)[0]
    q = self.wq(q) #(batch_size,len_q, dim_q) 
    k = self.wk(k) #(batch_size,len_v, dim_q) 
    v = self.wv(v) #(batch_size,len_v, dim_v) 
    
    q = self.split_heads(q, batch_size)  # (batch_size, len_q, num_heads, depth_q) (m,l,h,d)
    k = self.split_heads(k, batch_size)  # (batch_size, len_v, num_heads, depth_q) (m,j,h,d)
    v = self.split_heads(v, batch_size)  # (batch_size, len_v, num_heads, depth_v) (m,j,h,e)

    qk = tf.einsum("mlhd,mjhd->mljh",q,k) #(batch_size, len_q, len_v, num_heads) (m,l,j,h)
    dk = tf.cast(tf.shape(k)[-1], tf.float32) 
    qk = qk/tf.math.sqrt(dk)

    if mask is not None: 
       qk = qk * mask # We are using a multiplicative mask

    qk = tf.nn.softmax(qk, axis = -2) #(batch_size,len_q,len_v, num_heads) (m,l,j,h)
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    qk = qk/tf.math.sqrt(dk)

    output = tf.einsum("mljh, mjhe -> mlhe",qk,v) #(batch_size,len_q, heads, depth_v)
    output = tf.reshape(output,(batch_size, -1, self.num_heads*self.depth)) #(batch_size,len_q, d_model)

    return self.dense(output)


In [16]:
class EncoderLayer(tf.keras.layers.Layer):
  
  """The EncoderLayer consisters of one MultiHeadAttention layer connected to a FeedForward layer,
  each of these 2 layers have a residual connection."""

  def __init__(self, num_heads, d_model, dense_dim, dropout = 0.1):
    super().__init__()

    self.attention = MultiHeadAttention(d_model,num_heads)
    self.dense = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim,activation='relu'),
                                         tf.keras.layers.Dense(d_model)])
    
    self.norm1 = tf.keras.layers.LayerNormalization()
    self.norm2 = tf.keras.layers.LayerNormalization()

    self.dropout1 = tf.keras.layers.Dropout(dropout)
    self.dropout2 = tf.keras.layers.Dropout(dropout)

  def call(self, x, training, mask):

    out_attention = self.attention(x, x, x, mask) #(batch_size,seq_len,d_model)
    out_attention = self.dropout1(out_attention, training=training)
    out1 = self.norm1(x + out_attention) #residual connection (batch_size,seq_len,d_model)

    out_dense = self.dense(out1) #(batch_size,seq_len,d_model)
    out2 = self.norm2(out1 + out_dense) #residual conenction (batch_size,seq_len,d_model)
    return out2

    
class Encoder(tf.keras.layers.Layer):

  def __init__(self, num_layers, num_heads, d_model, dense_dim,
               vocab_size, max_encoding_position, dropout  = 0.1):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    self.num_layers = num_layers 
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    self.positional_encoding = positional_encoding(max_encoding_position, d_model)
    self.encoding_layers = [EncoderLayer(num_heads, d_model, dense_dim, dropout) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout)

  def call(self, x, training, mask = None):

    seq_len = tf.shape(x)[1]
    x = self.embedding(x) #(batch_size,input_len,d_model)
    x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.positional_encoding[:, :seq_len, :] 
    x = self.dropout(x, training = training)    
    for i in range(self.num_layers):
      x = self.encoding_layers[i](x, training, mask)  # (batch_size, input_seq_len, d_model)
    
    return x 

In [17]:
class DecoderLayer(tf.keras.layers.Layer):


  def __init__(self, num_heads, d_model, dense_dim, dropout = 0.1):
    super().__init__()

    self.attention1 = MultiHeadAttention(d_model,num_heads)
    self.attention2 = MultiHeadAttention(d_model,num_heads)

    self.dense = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim,activation='relu'),
                                        tf.keras.layers.Dense(d_model)])

    
    self.norm1 = tf.keras.layers.LayerNormalization()
    self.norm2 = tf.keras.layers.LayerNormalization()
    self.norm3 = tf.keras.layers.LayerNormalization()

    self.dropout1 = tf.keras.layers.Dropout(dropout)
    self.dropout2 = tf.keras.layers.Dropout(dropout)
    self.dropout3 = tf.keras.layers.Dropout(dropout)

  def call(self, encoder_out, x, training, forward_mask, padding_mask):

    #We will not use forward masking since it makes little sense in this context

    out_attention1 = self.attention1(x, x, x, forward_mask) #(batch_size, seq_len_answer, d_model) -> The return seq_len is the same as that of the first argument of the call.
    out_attention1 = self.dropout1(out_attention1, training = training)
    out1 = self.norm1(x + out_attention1) #residual connection (batch_size, seq_len_answer, d_model)

    out_attention2 = self.attention2(out1, encoder_out, encoder_out, padding_mask) #(batch_size, seq_len_answer, d_model)
    out_attention2 = self.dropout2(out_attention2, training = training)
    out2 = self.norm2(out1 + out_attention2)

    out_dense = self.dense(out2)
    out_dense = self.dropout3(out_dense + out2)

    return out_dense

class Decoder(tf.keras.layers.Layer):


  def __init__(self, num_layers, num_heads, d_model, dense_dim,
               vocab_size, max_encoding_position, dropout  = 0.1):
    super().__init__()
    
    self.num_heads = num_heads
    self.d_model = d_model
    self.num_layers = num_layers 
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    self.positional_encoding = positional_encoding(max_encoding_position, d_model)
    self.decoder_layers = [DecoderLayer(num_heads, d_model, dense_dim, dropout) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout)


  def call(self, encoder_out, x, training, forward_mask = None, padding_mask = None):

    seq_len = tf.shape(x)[1]
    x = self.embedding(x) #(batch_size,input_len,d_model)
    x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32)) 
    x = x + self.positional_encoding[:, :seq_len, :] 
    x = self.dropout(x, training = training)    
    for i in range(self.num_layers):
      x = self.decoder_layers[i](encoder_out, x, training, forward_mask, padding_mask)  # (batch_size, input_seq_len, d_model)
    return x

In [18]:
class Transformer(tf.keras.Model):

  
    def __init__(self, num_layers, num_heads, d_model,  dense_dim, vocab_size,
                 input_max_position, target_max_position, rate=0.1):
      super().__init__()

      self.encoder = Encoder(num_layers, num_heads, d_model, dense_dim,
               vocab_size, max_encoding_position = input_max_position, dropout  = 0.1)
      
      self.decoder = Decoder(num_layers, num_heads, d_model, dense_dim,
               vocab_size, max_encoding_position = target_max_position, dropout  = 0.1)
      
      self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, input, target, training = False, enc_mask = None , dec_forward_mask= None, dec_padding_mask = None):

      out_encoder = self.encoder(input, training = training, mask = enc_mask)

      out_decoder = self.decoder(out_encoder, target, training = training, forward_mask = dec_forward_mask, padding_mask = dec_padding_mask)

      out = self.dense(out_decoder)

      return out

#Training

In [19]:
num_layers = 3
d_model = 64
dense_dim = 256
num_heads = 4

vocab_size = tokenizer.vocab_size + 2

dropout_rate = 0.1
transformer = Transformer( num_layers = num_layers, num_heads = num_heads, d_model = d_model,  dense_dim = dense_dim, vocab_size = vocab_size,
                 input_max_position = max_length_question, target_max_position = max_length_answer, rate=0.1)

In [20]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [21]:
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def masked_loss_fn(answer, prediction):
  mask = tf.math.logical_not(tf.math.equal(answer,0)) #0 at zeroes, 1 at non-zeroes since seq is padded
  mask = tf.cast(mask, tf.int32)
  loss_value = loss_fn(answer,prediction,sample_weight=mask) #set the zeros to zero weight, other values have weight of 1. 

  return loss_value

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [26]:
EPOCHS = 10
signature = [tf.TensorSpec(shape=(None, max_length_question), dtype=tf.int64), tf.TensorSpec(shape=(None, max_length_answer), dtype=tf.int64),] #quite a bit faster if we specify the signature

@tf.function(input_signature=signature)
def train_step(question, answer):
  answer_in = answer[:, :-1]
  answer_tar = answer[:, 1:]
  
  enc_padding_mask = padding_mask(question)
  dec_padding_mask = padding_mask(answer_in)
  dec_forward_mask = forward_mask(answer_in)
  
  with tf.GradientTape() as tape:
    predictions = transformer(question, answer_in, training = True, enc_mask = enc_padding_mask , dec_forward_mask = dec_forward_mask, dec_padding_mask = dec_padding_mask)
    loss = masked_loss_fn(answer_tar, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
  train_loss(loss)
  train_accuracy(answer_tar, predictions)


for epoch in range(EPOCHS):
  start = time.time()
  
  train_loss.reset_states()
  train_accuracy.reset_states()
  
  for (batch, (question, answer)) in enumerate(train_dataset):
    train_step(question, answer)

  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Loss 0.6750 Accuracy 0.2320
Time taken for 1 epoch: 95.12350106239319 secs

Epoch 2 Loss 0.5439 Accuracy 0.2563
Time taken for 1 epoch: 91.63019752502441 secs

Epoch 3 Loss 0.4196 Accuracy 0.2707
Time taken for 1 epoch: 91.36850237846375 secs

Epoch 4 Loss 0.3170 Accuracy 0.2831
Time taken for 1 epoch: 91.55873107910156 secs

Epoch 5 Loss 0.2479 Accuracy 0.2929
Time taken for 1 epoch: 91.14557266235352 secs

Epoch 6 Loss 0.2011 Accuracy 0.3005
Time taken for 1 epoch: 91.39584112167358 secs

Epoch 7 Loss 0.1743 Accuracy 0.3056
Time taken for 1 epoch: 90.67380905151367 secs

Epoch 8 Loss 0.1514 Accuracy 0.3098
Time taken for 1 epoch: 92.1460874080658 secs

Epoch 9 Loss 0.1382 Accuracy 0.3123
Time taken for 1 epoch: 94.22340703010559 secs

Epoch 10 Loss 0.1273 Accuracy 0.3142
Time taken for 1 epoch: 92.45964097976685 secs



# EVALUATION


In [67]:
def evaluate(question):
    start_token = [tokenizer.vocab_size]
    end_token = [tokenizer.vocab_size + 1]
    question = start_token + tokenizer.encode(question) + end_token
    question = tf.expand_dims(question, 0)
    answer_in = [tokenizer.vocab_size]
    answer_in = tf.expand_dims(answer_in, 0)

    for i in range(max_length_answer):
        enc_padding_mask = padding_mask(question)
        dec_padding_mask = padding_mask(answer_in)
        dec_forward_mask = forward_mask(answer_in)

        predictions = transformer(question, answer_in, training=False, enc_mask=enc_padding_mask,
                                  dec_forward_mask=dec_forward_mask, dec_padding_mask=dec_padding_mask)
        prediction = predictions[:, -1:, :]  # select the last word to add to the outputs

        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)

        if predicted_id == tokenizer.vocab_size:
          return tf.squeeze(answer, axis=0)
        answer_in = tf.concat([answer_in, predicted_id], axis=-1)

    return tf.squeeze(answer_in, axis=0)

def translate(sentence):
    result = np.array(evaluate(sentence))

    predicted_sentence = tokenizer.decode([i for i in result
                                              if tokenizer.vocab_size > i > 0])
    print('Input: {}'.format(sentence))
    print('Predicted answer: {}'.format(predicted_sentence))

In [33]:
def find_answer(sentence):
  result = np.array(evaluate(sentence))
  
  predicted_sentence = tokenizer.decode([i for i in result 
                                            if i < tokenizer.vocab_size and i > 0])
  print('Input: {}'.format(sentence))
  print('Predicted answer: {}'.format(predicted_sentence))

In [76]:
question = b'Find the first derivative of x*2 with respect to x.'
find_answer(question)

Input: b'Find the first derivative of x*2 with respect to x.'
Predicted answer: ********************x**x*


It seems that we have complete garbage results. This tokenizer represents each number as an unique token, which doesn't seem suitable for mathematics. 
Things to try.

- Test the transformer on a translation task
- Find an input format that understands the basic structure of a mathematical expression (trees?)