<a href="https://colab.research.google.com/github/kyle-gao/ML_ipynb/blob/unfinished/TF_Transformer_Calculus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Copyright 2020 Yi Lin(Kyle) Gao

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 .

In [228]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time

# Training a transformer for symbolic mathematics (differentiation) following the Transformer tutorial https://www.tensorflow.org/tutorials/text/transformer
The dataset used is https://github.com/deepmind/mathematics_dataset.

# Data pipeline with tfds

In [129]:
# Load the first 50,000 training and first 5,000 test (question, answer) string
# pairs of the differentiation task.
train, val = tfds.load(
    name='math_dataset/calculus__differentiate',
    split=['train[:50000]', 'test[:5000]'],
    as_supervised=True,
)

[1mDownloading and preparing dataset math_dataset/calculus__differentiate/1.0.0 (download: 2.17 GiB, generated: Unknown size, total: 2.17 GiB) to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0.incompleteKNKW5T/math_dataset-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1999998.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0.incompleteKNKW5T/math_dataset-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))

[1mDataset math_dataset downloaded and prepared to /root/tensorflow_datasets/math_dataset/calculus__differentiate/1.0.0. Subsequent calls will reuse this data.[0m


In [131]:
# Since we are dealing with mathematics, we expect a small vocabulary.
# The subword vocabulary is built from the question strings only.
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (q.numpy() for q, _ in train),
    target_vocab_size=512,
)

In [187]:
# Peek at a single raw (question, answer) pair from the training split.
test = [example for example in train.take(1)]
test

[(<tf.Tensor: shape=(), dtype=string, numpy=b'Find the first derivative of -4*a**4*v - 84*a**4 - v - 226 wrt a.'>,
  <tf.Tensor: shape=(), dtype=string, numpy=b'-16*a**3*v - 336*a**3'>)]

In [210]:
def encode(question,answer):
  """Tokenise a (question, answer) pair and wrap each in start/end tokens.

  The start token id is `tokenizer.vocab_size` and the end token id is
  `tokenizer.vocab_size + 1`, i.e. the two ids just past the subword vocabulary.

  Arguments:
  question -- eager string tensor (must support `.numpy()`)
  answer -- eager string tensor (must support `.numpy()`)

  Returns:
  A pair of Python lists of token ids (question_ids, answer_ids).
  """
  start_token = tokenizer.vocab_size
  end_token = start_token + 1

  def wrap(text):
    return [start_token] + tokenizer.encode(text.numpy()) + [end_token]

  return wrap(question), wrap(answer)

def tf_encode(question, answer):
  """Dataset-friendly wrapper around `encode`.

  `encode` calls `.numpy()` on its inputs, so it is run through
  `tf.py_function`, which executes it eagerly when used inside `Dataset.map`.

  Returns:
  A pair of int64 tensors (question_ids, answer_ids), each of shape [None].
  """
  encoded = tf.py_function(func=encode, inp=[question, answer],
                           Tout=[tf.int64, tf.int64])
  # py_function loses static shape information; declare both outputs rank-1.
  for ids in encoded:
    ids.set_shape([None])

  return encoded[0], encoded[1]

In [204]:
# Tokenise the sample pair directly (no start/end tokens) to inspect the raw ids.
question, answer = (tokenizer.encode(text.numpy()) for text in test[0])

In [216]:
# Encode a ten-example sample and display the first tokenised pair.
train_dataset = train.take(10)
train_dataset = train_dataset.map(tf_encode)
list(train_dataset.take(1))

[(<tf.Tensor: shape=(29,), dtype=int64, numpy=
  array([515,  12,   4,  16,   5,  11,   7, 311, 301, 356,   1, 311, 301,
         377,   2,  99, 301, 356,   1, 311,   2, 377,   2,  20, 130,   6,
         356, 305, 516])>, <tf.Tensor: shape=(17,), dtype=int64, numpy=
  array([515, 304, 111, 301, 356,   1, 310, 301, 377,   2,  33, 313, 301,
         356,   1, 310, 516])>)]

In [223]:
def preprocess(dataset, batch_size, shuffle_buffer=10000):
  """Turn a dataset of raw (question, answer) strings into padded token batches.

  Arguments:
  dataset -- tf.data.Dataset yielding (question, answer) string pairs
  batch_size -- number of examples per padded batch
  shuffle_buffer -- size of the shuffle buffer (default 10000)

  Returns:
  A tf.data.Dataset of (question_ids, answer_ids) batches, each padded to the
  longest sequence in its batch, with prefetching enabled.
  """
  dataset = dataset.map(tf_encode)
  # Cache *after* tokenisation: tf_encode goes through tf.py_function, so
  # caching the raw strings would redo the Python tokenisation on every epoch.
  dataset = dataset.cache()
  dataset = dataset.shuffle(shuffle_buffer).padded_batch(batch_size)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  return dataset

In [166]:
def positional_encoding(pos, d_model):
  """Sinusoidal positional encoding.

  For position p and channel pair index i:
    enc[p, 2i]   = sin(p / 10000^(2i / d_model))
    enc[p, 2i+1] = cos(p / 10000^(2i / d_model))
  When d_model is odd, the final (even-indexed) channel gets the sine term for
  its own index, i.e. exponent (d_model - 1) / d_model.

  Arguments:
  pos -- number of positions to encode (positions 0 .. pos-1)
  d_model -- encoding width

  Returns:
  A float32 tensor of shape (1, pos, d_model).
  """
  positions = np.arange(pos, dtype=np.float64)[:, np.newaxis]             # (pos, 1)
  even_dims = np.arange(0, d_model, 2, dtype=np.float64)[np.newaxis, :]   # (1, ceil(d_model/2))
  angles = positions / np.power(10000.0, even_dims / float(d_model))

  pos_enc = np.zeros((1, pos, d_model))
  pos_enc[0, :, 0::2] = np.sin(angles)
  # An odd d_model has one fewer cosine channel than sine channels.
  pos_enc[0, :, 1::2] = np.cos(angles[:, :d_model // 2])
  return tf.cast(pos_enc, tf.float32)

In [169]:
def padding_mask(seq):
  """Build a keep-mask for a batch of token ids.

  Returns a float32 tensor that is 0 where `seq` equals 0 (padding) and 1
  elsewhere, with two singleton axes inserted after the batch axis:
  (batch_size, 1, 1, seq_len) for a (batch_size, seq_len) input.
  """
  is_token = tf.math.not_equal(seq, 0)
  keep = tf.cast(is_token, tf.float32)
  return keep[:, tf.newaxis, tf.newaxis, :]

# The Transformer

In [372]:
# We will use tf.einsum to save ourselves 3 tf.transpose operations during the calculation of the attention.
# This has the additional advantage of making it easier to implement different types of attention kernel.
# Note: the two results below are laid out differently -- the matmul path ends as
# (batch, len_q, heads, len_k) while the einsum output "mljh" is (batch, len_q, len_k, heads).

q = tf.random.uniform((5000,50,8, 100))
k = tf.random.uniform((5000,45,8, 100))

# Each variant is timed with a single wall-clock measurement (no warm-up, no repeats),
# so the printed numbers are only a rough indication.
tstart = time.time()
qt = tf.transpose(q, perm=[0, 2, 1, 3])
kt = tf.transpose(k, perm=[0, 2, 1, 3])
qk = tf.matmul(qt, kt, transpose_b=True)
qk = tf.transpose(qk, perm=[0, 2, 1, 3])
tend = time.time()

print("With matmul :", tend-tstart)

tstart = time.time()
qkeinsum = tf.einsum("mlhd,mjhd->mljh",q, k)
tend = time.time()
print("With einsum :", tend-tstart)



With matmul : 0.002737760543823242
With einsum : 0.0009129047393798828


In [391]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention.

  Queries, keys and values are linearly projected to d_model, split into
  num_heads heads of size depth = d_model // num_heads, attended per head,
  then concatenated and passed through a final dense projection.
  """

  def __init__(self,d_model,num_heads):

    super().__init__()

    self.d_model = d_model
    self.num_heads = num_heads
    assert d_model%self.num_heads == 0, "d_model must be divisible by num_heads"

    self.depth=d_model//self.num_heads
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self,x, batch_size):

    """Split the last dimension into (num_heads,depth)

    Arguments:
    x -- A projected sequence (batch_size,seq_len,d_model)

    Returns:
    A tensor with dimensions (batch_size, seq_len, num_heads, depth)
    """
    return tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))

  def call(self,q,k,v,mask=None):
    """Attend from q over (k, v).

    Arguments:
    q -- queries (batch_size, len_q, features)
    k -- keys (batch_size, len_k, features)
    v -- values (batch_size, len_k, features)
    mask -- optional keep-mask broadcastable to (batch_size, num_heads, len_q, len_k);
            1 marks key positions that may be attended to, 0 marks positions to ignore

    Returns:
    A tensor of shape (batch_size, len_q, d_model).
    """

    batch_size = tf.shape(q)[0]
    q = self.wq(q) #(batch_size, len_q, d_model)
    k = self.wk(k) #(batch_size, len_k, d_model)
    v = self.wv(v) #(batch_size, len_k, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, len_q, num_heads, depth) (m,l,h,d)
    k = self.split_heads(k, batch_size)  # (batch_size, len_k, num_heads, depth) (m,j,h,d)
    v = self.split_heads(v, batch_size)  # (batch_size, len_k, num_heads, depth) (m,j,h,e)

    # Key axis j is placed last so the softmax below normalises over keys, not heads.
    logits = tf.einsum("mlhd,mjhd->mhlj",q,k) #(batch_size, num_heads, len_q, len_k)
    logits = logits/tf.math.sqrt(tf.cast(self.depth, tf.float32))

    if mask is not None:
      # Push masked-out logits towards -inf *before* the softmax so they receive ~0 weight.
      logits += (1.0 - tf.cast(mask, logits.dtype)) * -1e9

    weights = tf.nn.softmax(logits, axis = -1) #(batch_size, num_heads, len_q, len_k)

    output = tf.einsum("mhlj,mjhe->mlhe",weights,v) #(batch_size, len_q, num_heads, depth)
    output = tf.reshape(output,(batch_size,-1,self.num_heads*self.depth)) #(batch_size, len_q, d_model)

    return self.dense(output)


In [392]:
class EncoderLayer(tf.keras.layers.Layer):

  """The EncoderLayer consists of one MultiHeadAttention sub-layer followed by a
  feed-forward sub-layer. Each sub-layer's output goes through dropout, is added
  to its input (residual connection) and is then layer-normalised."""

  def __init__(self, num_heads, d_model, dense_dim, dropout = 0.1):
    super().__init__()

    self.attention = MultiHeadAttention(d_model,num_heads)
    self.dense = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim,activation='relu'),
                                         tf.keras.layers.Dense(d_model)])

    self.norm1 = tf.keras.layers.LayerNormalization()
    self.norm2 = tf.keras.layers.LayerNormalization()

    self.dropout1 = tf.keras.layers.Dropout(dropout)
    self.dropout2 = tf.keras.layers.Dropout(dropout)

  def call(self, x, training, mask):
    """Apply self-attention and the feed-forward block to x.

    Arguments:
    x -- input sequence (batch_size, seq_len, d_model)
    training -- forwarded to the dropout layers
    mask -- forwarded to the self-attention layer

    Returns:
    A tensor of shape (batch_size, seq_len, d_model).
    """

    out_attention = self.attention(x, x, x, mask) #(batch_size,seq_len,d_model)
    out_attention = self.dropout1(out_attention, training=training)
    out1 = self.norm1(x + out_attention) #residual connection (batch_size,seq_len,d_model)

    out_dense = self.dense(out1) #(batch_size,seq_len,d_model)
    # dropout2 was created in __init__ but never applied to the feed-forward output.
    out_dense = self.dropout2(out_dense, training=training)
    out2 = self.norm2(out1 + out_dense) #residual connection (batch_size,seq_len,d_model)
    return out2

    
class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of EncoderLayers."""

  def __init__(self, num_layers, num_heads, d_model, dense_dim,
               vocab_size, max_encoding_position, dropout  = 0.1):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    # Precomputed once for positions 0 .. max_encoding_position-1.
    self.positional_encoding = positional_encoding(max_encoding_position, d_model)
    self.encoding_layers = [EncoderLayer(num_heads, d_model, dense_dim, dropout)
                            for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout)

  def call(self, x, training, mask = None):
    """Encode a batch of token ids.

    Arguments:
    x -- token ids (batch_size, input_len)
    training -- forwarded to dropout and to every encoder layer
    mask -- forwarded to every encoder layer

    Returns:
    A tensor of shape (batch_size, input_len, d_model).
    """
    num_positions = tf.shape(x)[1]
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))

    hidden = self.embedding(x) * scale  # (batch_size, input_len, d_model)
    hidden = hidden + self.positional_encoding[:, :num_positions, :]
    hidden = self.dropout(hidden, training = training)

    for layer in self.encoding_layers:
      hidden = layer(hidden, training, mask)  # (batch_size, input_len, d_model)

    return hidden

In [406]:
class DecoderLayer(tf.keras.layers.Layer):

  """The DecoderLayer consists of a self-attention sub-layer over the target, a
  cross-attention sub-layer over the encoder output, and a feed-forward sub-layer.
  Each sub-layer's output goes through dropout, is added to its input (residual
  connection) and is then layer-normalised."""

  def __init__(self, num_heads, d_model, dense_dim, dropout = 0.1):
    super().__init__()

    self.attention1 = MultiHeadAttention(d_model,num_heads)
    self.attention2 = MultiHeadAttention(d_model,num_heads)

    self.dense = tf.keras.Sequential([tf.keras.layers.Dense(dense_dim,activation='relu'),
                                        tf.keras.layers.Dense(d_model)])

    self.norm1 = tf.keras.layers.LayerNormalization()
    self.norm2 = tf.keras.layers.LayerNormalization()
    self.norm3 = tf.keras.layers.LayerNormalization()

    self.dropout1 = tf.keras.layers.Dropout(dropout)
    self.dropout2 = tf.keras.layers.Dropout(dropout)
    self.dropout3 = tf.keras.layers.Dropout(dropout)

  def call(self, encoder_out, x, training, mask, enc_mask = None):
    """Run one decoder layer.

    Arguments:
    encoder_out -- encoder output (batch_size, input_len, d_model)
    x -- target-side sequence (batch_size, target_len, d_model)
    training -- forwarded to the dropout layers
    mask -- forwarded to the self-attention over x
    enc_mask -- forwarded to the cross-attention over encoder_out (default None).
                The target-side `mask` is not reused here because its key axis
                has target_len entries, not input_len.

    Returns:
    A tensor of shape (batch_size, target_len, d_model).
    """

    # No look-ahead mask is built here (the original author chose not to use one).
    # NOTE(review): without one, teacher-forced training lets each target position
    # attend to later target tokens -- confirm this is intended.

    out_attention1 = self.attention1(x, x, x, mask) #(batch_size,target_len,d_model)
    out_attention1 = self.dropout1(out_attention1, training = training)
    out1 = self.norm1(x + out_attention1) #residual connection (batch_size,target_len,d_model)

    out_attention2 = self.attention2(out1, encoder_out, encoder_out, enc_mask) #(batch_size,target_len,d_model)
    out_attention2 = self.dropout2(out_attention2, training = training)
    out2 = self.norm2(out1 + out_attention2)

    out_dense = self.dense(out2)
    # Dropout applies to the sub-layer output only (and honours `training`);
    # the residual sum is then normalised with norm3, which was previously unused.
    out_dense = self.dropout3(out_dense, training = training)
    out3 = self.norm3(out2 + out_dense)

    return out3

class Decoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of DecoderLayers."""

  def __init__(self, num_layers, num_heads, d_model, dense_dim,
               vocab_size, max_encoding_position, dropout  = 0.1):
    super().__init__()

    self.num_heads = num_heads
    self.d_model = d_model
    self.num_layers = num_layers
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
    self.positional_encoding = positional_encoding(max_encoding_position, d_model)
    self.decoder_layers = [DecoderLayer(num_heads, d_model, dense_dim, dropout) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout)

  def call(self, encoder_out, x, training, mask = None, enc_mask = None):
    """Decode a batch of target token ids against the encoder output.

    Arguments:
    encoder_out -- encoder output (batch_size, input_len, d_model)
    x -- target token ids (batch_size, target_len)
    training -- forwarded to dropout and to every decoder layer
    mask -- forwarded to every layer's self-attention
    enc_mask -- forwarded to every layer's cross-attention (default None);
                callers must pass it explicitly to mask encoder padding

    Returns:
    A tensor of shape (batch_size, target_len, d_model).
    """
    seq_len = tf.shape(x)[1]
    x = self.embedding(x) #(batch_size,target_len,d_model)
    x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.positional_encoding[:, :seq_len, :]
    x = self.dropout(x, training = training)
    for layer in self.decoder_layers:
      x = layer(encoder_out, x, training, mask, enc_mask = enc_mask)  # (batch_size, target_len, d_model)
    return x

In [421]:
class Transformer(tf.keras.Model):
  """Encoder-decoder Transformer with a final dense projection to vocabulary logits."""

  def __init__(self, num_layers, num_heads, d_model,  dense_dim, vocab_size,
               input_max_position, target_max_position, rate=0.1):
    """
    Arguments:
    num_layers -- number of encoder layers and of decoder layers
    num_heads -- attention heads per layer
    d_model -- model width
    dense_dim -- hidden width of the feed-forward blocks
    vocab_size -- size of the (shared) embedding table and of the output logits
    input_max_position -- number of positions precomputed for the encoder
    target_max_position -- number of positions precomputed for the decoder
    rate -- dropout rate used throughout the encoder and decoder
    """
    super().__init__()

    # `rate` is now actually forwarded; it used to be ignored in favour of a hard-coded 0.1.
    self.encoder = Encoder(num_layers, num_heads, d_model, dense_dim,
                           vocab_size, max_encoding_position = input_max_position, dropout = rate)

    self.decoder = Decoder(num_layers, num_heads, d_model, dense_dim,
                           vocab_size, max_encoding_position = target_max_position, dropout = rate)

    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, input, target, training, enc_mask = None , dec_mask = None):
    """Compute output logits.

    Arguments:
    input -- source token ids (batch_size, input_len)
    target -- target token ids (batch_size, target_len)
    training -- forwarded to the encoder and decoder
    enc_mask -- mask passed to the encoder
    dec_mask -- mask passed to the decoder

    Returns:
    Logits of shape (batch_size, target_len, vocab_size).
    """

    out_encoder = self.encoder(input, training = training, mask = enc_mask)

    # NOTE(review): enc_mask is only used by the encoder; the decoder is given dec_mask alone.
    out_decoder = self.decoder(out_encoder, target, training = training, mask = dec_mask)

    out = self.dense(out_decoder)

    return out

In [423]:
# Smoke test: push random token ids through an untrained model and check the logits' shape.
transformer = Transformer(
    num_layers=2, num_heads=8, d_model=512, dense_dim=512, vocab_size=1000,
    input_max_position=50, target_max_position=60, rate=0.1)

temp_input = tf.random.uniform(shape=(64, 38), minval=0, maxval=200, dtype=tf.int64)
temp_target = tf.random.uniform(shape=(64, 36), minval=0, maxval=200, dtype=tf.int64)

out = transformer(temp_input, temp_target)
print(out.shape)

(64, 36, 1000)
