In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow_probability as tfp
import time

## Load the dataset

In [2]:
# Read the English-French sentence pairs from the CSV at this relative input path.
dataset = pd.read_csv('../input/language-translation-englishfrench/eng_-french.csv')
# Show the first rows as the cell output.
dataset.head()

In [3]:
## Size of the dataset as a (rows, columns) tuple
dataset.shape

In [4]:
# Visualizing the length of sequences
# Word counts come from splitting each sentence on single spaces. The pandas
# string accessor does this for a whole column at once, replacing the previous
# per-row Python loop over .iloc.
eng = dataset.iloc[:,0].str.split(' ').str.len().tolist()
fra = dataset.iloc[:,1].str.split(' ').str.len().tolist()
lengths = pd.DataFrame({'English':eng,'French':fra})
# One histogram per language, 20 bins each.
lengths.hist(bins=20)
plt.show()

## Preprocess dataset

In [5]:
def convert_lower(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    return text.lower()

In [6]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # A translation table whose third argument lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    cleaned = text.translate(deletion_table)
    return cleaned


In [7]:
def preprocess(text):
    """Normalize one sentence: lowercase it, then strip punctuation."""
    lowered = convert_lower(text)
    cleaned = remove_punctuation(lowered)
    return cleaned

In [8]:
def tokenize_text_word_wise(text):
    """Create a Keras ``Tokenizer`` with default settings and fit it on ``text``.

    Args:
        text: The texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted tokenizer.
    """
    word_tokenizer = tf.keras.preprocessing.text.Tokenizer()
    word_tokenizer.fit_on_texts(text)
    return word_tokenizer

In [9]:
# Encoding and padding sequences
def encode(tokenizer,length,text):
    """Convert texts to integer id sequences of a fixed length.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        length: Value passed to ``pad_sequences`` as ``maxlen``.
        text: The texts to encode.

    Returns:
        The array returned by ``pad_sequences``. Shorter sequences are padded
        at the end (``padding='post'``); longer ones are truncated using the
        ``pad_sequences`` default for ``truncating``.
    """
    token_ids = tokenizer.texts_to_sequences(text)
    return tf.keras.preprocessing.sequence.pad_sequences(
        token_ids,maxlen=length,padding='post')

In [10]:
def positional_encoding(position,d):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d: Width of the encoding (columns).

    Returns:
        A float32 tensor of shape ``(1, position, d)`` with sines in the even
        columns and cosines in the odd columns.
    """
    column_ids = np.arange(d)[np.newaxis,:]
    row_ids = np.arange(position)[:,np.newaxis]
    # Columns 2k and 2k+1 share the exponent 2k/d, hence the floor division.
    exponent = (2*(column_ids//2))/np.float32(d)
    table = row_ids*(1/np.power(10000,exponent))
    table[:,0::2] = np.sin(table[:,0::2])
    table[:,1::2] = np.cos(table[:,1::2])
    # Add a leading axis of size 1 so the table broadcasts over a batch.
    return tf.cast(table[np.newaxis,...],dtype=tf.float32)
    

In [11]:
## Example of positional encoding: build the table for 512 positions of width 32
pos = positional_encoding(512,32)
# Print the shape of the resulting tensor.
print(pos.shape)

In [12]:
## Preprocess the English text (column 0) in place, fit a word tokenizer on it,
## and preview the cleaned sentences
dataset.iloc[:,0] = dataset.iloc[:,0].apply(preprocess)
eng_tokenizer = tokenize_text_word_wise(dataset.iloc[:,0])
# One more than the number of entries in word_index
# (presumably to leave room for id 0, which encode() uses as padding).
eng_vocab_length  = len(eng_tokenizer.word_index)+1
dataset.iloc[:,0].head()

In [13]:
# Report the English vocabulary size.
print(f'English vocab length {eng_vocab_length}')

In [14]:
## Preprocess the French text (column 1) in place, fit a word tokenizer on it,
## and preview the cleaned sentences
dataset.iloc[:,1] = dataset.iloc[:,1].apply(preprocess)
fra_tokenizer = tokenize_text_word_wise(dataset.iloc[:,1])
# One more than the number of entries in word_index
# (presumably to leave room for id 0, which encode() uses as padding).
fra_vocab_length  = len(fra_tokenizer.word_index)+1
dataset.iloc[:,1].head()

In [15]:
# Report the French vocabulary size.
print(f'French vocab length {fra_vocab_length}')

In [16]:
# Encode both languages as id sequences of fixed length 10 (zero-padded at the end).
# X holds the English (source) ids, Y the French (target) ids.
X = encode(eng_tokenizer,10,dataset.iloc[:,0])
Y = encode(fra_tokenizer,10,dataset.iloc[:,1])


In [17]:
# Some examples: the first ten encoded English sequences
print(X[:10])

## Building model

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the sentence pairs for evaluation. A fixed random_state makes
# the split identical on every Restart & Run All, so results are comparable
# between runs.
trainX,testX,trainY,testY = train_test_split(X,Y,test_size=0.2,random_state=42)

In [19]:
# Shuffle buffer size passed to Dataset.shuffle in make_batches.
BUFFER_SIZE= 200000
# Number of examples per batch passed to Dataset.batch in make_batches.
BATCH_SIZE = 128

In [20]:
def make_batches(ds):
    """Turn a ``tf.data.Dataset`` into a cached, shuffled, batched, prefetched pipeline.

    The shuffle buffer and batch size come from the module-level
    ``BUFFER_SIZE`` and ``BATCH_SIZE``.
    """
    cached = ds.cache()
    shuffled = cached.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.prefetch(tf.data.AUTOTUNE)
# Training pipeline over (source, target) pairs.
train_batches = make_batches(tf.data.Dataset.from_tensor_slices((trainX,trainY)))
# Held-out pipeline; it goes through the same helper, so it is shuffled as well.
val_batches = make_batches(tf.data.Dataset.from_tensor_slices((testX,testY)))


In [21]:
def create_padding_mask(seq):
    """Flag the padding positions (id 0) of a batch of id sequences.

    Returns:
        A float32 tensor with two singleton axes inserted before the last
        axis. Entries are 1.0 where ``seq`` equals 0 and 0.0 elsewhere.
    """
    is_pad = tf.math.equal(seq,0)
    flags = tf.cast(is_pad,tf.float32)
    return flags[:,tf.newaxis,tf.newaxis,:]

In [22]:
def create_frwrd_mask(ln):
    """Build the look-ahead mask for a sequence of length ``ln``.

    Args:
        ln: Sequence length, either a Python int or a scalar integer tensor.

    Returns:
        An int32 tensor of shape ``(ln, ln)`` with ones on and below the main
        diagonal and zeros above it, so row ``i`` has ones in columns ``0..i``.
    """
    # band_part(-1, 0) keeps the whole lower triangle and zeroes the upper one.
    # Unlike int(ln*(ln+1)/2), this needs no float division or Python int()
    # conversion, so it also works when ln is a symbolic tensor.
    lower = tf.linalg.band_part(tf.ones((ln,ln),dtype=tf.float32),-1,0)
    return tf.cast(lower,tf.int32)

In [23]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward network.

    Each sub-block is followed by dropout, a residual connection and layer
    normalization.

    NOTE(review): the forward pass overrides ``__call__`` rather than the
    conventional Keras ``call`` hook - confirm this is intended.
    """
    def __init__(self,d,num_heads,dff,rate=0.5):
        """
        Args:
            d: Model width; also passed to the attention layer as ``key_dim``.
            num_heads: Number of attention heads.
            dff: Hidden width of the feed-forward network.
            rate: Dropout rate.
        """
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads,key_dim=d)
        hidden_dense = tf.keras.layers.Dense(dff,activation="relu")
        output_dense = tf.keras.layers.Dense(d)
        self.ffn = tf.keras.Sequential([hidden_dense,output_dense])
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        # A single Dropout layer is shared by both sub-blocks.
        self.dropout = tf.keras.layers.Dropout(rate)
    def __call__(self,x,training,mask):
        """Apply the block to ``x`` and return a tensor of the same width."""
        # Self-attention sub-block; the attention scores are requested but unused.
        attended,_ = self.mha(x,x,attention_mask=mask,return_attention_scores=True)
        attended = self.dropout(attended,training=training)
        normed = self.layernorm_1(x+attended)
        # Feed-forward sub-block.
        projected = self.dropout(self.ffn(normed),training=training)
        return self.layernorm_2(normed+projected)

In [24]:
class Encoder(tf.keras.layers.Layer):
    """Token embedding plus positional encoding, followed by a stack of ``EncoderLayer``s.

    NOTE(review): the forward pass overrides ``__call__`` rather than the
    conventional Keras ``call`` hook - confirm this is intended.
    """
    def __init__(self,num_layers,d,num_heads,dff,source_vocab,target_vocab,tokens=128,rate=0.5):
        """
        Args:
            num_layers: Number of stacked encoder layers.
            d: Embedding / model width.
            num_heads: Attention heads per layer.
            dff: Hidden width of each feed-forward network.
            source_vocab: Size of the embedding table.
            target_vocab: Accepted but not used by the encoder.
            tokens: Number of positions in the positional-encoding table.
            rate: Dropout rate.
        """
        super().__init__()
        self.TOKENS=tokens
        self.d = d
        self.num_layers = num_layers
        self.embedding= tf.keras.layers.Embedding(source_vocab,self.d)
        self.pos_encod = positional_encoding(self.TOKENS,self.d)
        self.enc_layers = []
        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d=d,num_heads=num_heads,dff=dff,rate=rate))
        self.dropout = tf.keras.layers.Dropout(rate)
    def __call__(self,x,training,mask):
        """Encode a batch of source ids ``x`` using attention mask ``mask``."""
        sq_len = tf.shape(x)[1]
        # Only the first sq_len rows of the positional table are needed.
        embedded = self.embedding(x)+self.pos_encod[:,:sq_len,:]
        hidden = self.dropout(embedded,training=training)
        for layer in self.enc_layers:
            hidden = layer(hidden,training,mask)
        return hidden

In [25]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-blocks is followed by dropout, a residual connection
    and layer normalization.

    NOTE(review): the forward pass overrides ``__call__`` rather than the
    conventional Keras ``call`` hook - confirm this is intended.
    """
    def __init__(self,d,num_heads,dff,rate=0.5):
        """
        Args:
            d: Model width; also passed to both attention layers as ``key_dim``.
            num_heads: Number of attention heads.
            dff: Hidden width of the feed-forward network.
            rate: Dropout rate used by all three dropout layers.
        """
        super(DecoderLayer,self).__init__()
        self.mha_1 = tf.keras.layers.MultiHeadAttention(num_heads,key_dim=d)
        self.mha_2 = tf.keras.layers.MultiHeadAttention(num_heads,key_dim=d)
        self.ffn = tf.keras.Sequential([tf.keras.layers.Dense(dff,activation="relu"),tf.keras.layers.Dense(d)])
        self.layernorm_1 = tf.keras.layers.LayerNormalization()
        self.layernorm_2 = tf.keras.layers.LayerNormalization()
        self.layernorm_3 = tf.keras.layers.LayerNormalization()
        self.dropout_1 = tf.keras.layers.Dropout(rate)
        self.dropout_2 =tf.keras.layers.Dropout(rate)
        self.dropout_3 = tf.keras.layers.Dropout(rate)
    def __call__(self,x,enc_out,training,frwrd_mask,padding_mask):
        """Run the block.

        Args:
            x: Decoder input activations.
            enc_out: Encoder output attended to by the second attention layer.
            training: Whether dropout is active.
            frwrd_mask: Attention mask for the self-attention layer.
            padding_mask: Attention mask for the cross-attention layer.

        Returns:
            ``(out, attn1_weights, attn2_weights)``: the block output and the
            attention scores of the self- and cross-attention layers.
        """
        # Self-attention over the decoder input (query, value, key, mask).
        attn1,attn1_weights = self.mha_1(x,x,x,frwrd_mask,return_attention_scores=True)
        attn1 =self.dropout_1(attn1,training=training)
        out1 = self.layernorm_1(attn1+x)
        # Cross-attention: queries from the decoder, keys/values from the encoder.
        attn2,attn2_weights = self.mha_2(out1,enc_out,enc_out,padding_mask,return_attention_scores=True)
        # dropout_2 was created but never applied; regularize this sub-block
        # like the other two.
        attn2 = self.dropout_2(attn2,training=training)
        out2 = self.layernorm_2(attn2+out1)
        # Feed-forward sub-block.
        ffn_out = self.ffn(out2)
        ffn_out =  self.dropout_3(ffn_out,training=training)
        out = self.layernorm_3(ffn_out+out2)
        return out,attn1_weights,attn2_weights

In [26]:
class Decoder(tf.keras.layers.Layer):
    """Token embedding plus positional encoding, followed by a stack of ``DecoderLayer``s.

    NOTE(review): the forward pass overrides ``__call__`` rather than the
    conventional Keras ``call`` hook - confirm this is intended.
    """
    def __init__(self,num_layers,d,num_heads,dff,source_vocab,target_vocab,tokens=128,rate=0.5):
        """
        Args:
            num_layers: Number of stacked decoder layers.
            d: Embedding / model width.
            num_heads: Attention heads per layer.
            dff: Hidden width of each feed-forward network.
            source_vocab: Accepted but not used by the decoder.
            target_vocab: Size of the embedding table.
            tokens: Number of positions in the positional-encoding table.
            rate: Dropout rate.
        """
        super().__init__()
        self.d= d
        self.num_layers = num_layers
        self.TOKENS=  tokens
        self.embedding = tf.keras.layers.Embedding(target_vocab,d)
        self.pos_encod = positional_encoding(self.TOKENS,d)
        self.dec_layers = []
        for _ in range(num_layers):
            self.dec_layers.append(DecoderLayer(d,num_heads,dff,rate))
        self.dropout =  tf.keras.layers.Dropout(rate)
    def __call__(self,x,enc_out,training,frwrd_mask,padding_mask):
        """Decode target ids ``x`` against ``enc_out``.

        Returns:
            ``(hidden, attn)`` where ``attn`` maps
            ``'Decoder_layer<k>_block1'`` / ``'Decoder_layer<k>_block2'`` to the
            self- and cross-attention scores of layer ``k`` (1-based).
        """
        sq_len = tf.shape(x)[1]
        attn = dict()
        # Only the first sq_len rows of the positional table are needed.
        hidden = self.embedding(x)+self.pos_encod[:,:sq_len,:]
        hidden = self.dropout(hidden,training=training)
        for i,layer in enumerate(self.dec_layers):
            hidden,self_scores,cross_scores = layer(hidden,enc_out,training,frwrd_mask,padding_mask)
            attn[f'Decoder_layer{i+1}_block1'] = self_scores
            attn[f'Decoder_layer{i+1}_block2'] = cross_scores
        return hidden,attn

In [27]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer mapping source ids to target-vocabulary logits.

    NOTE(review): the forward pass overrides ``__call__`` rather than the
    conventional Keras ``call`` hook - confirm this is intended.
    """
    def __init__(self,num_layers,d,num_heads,dff,source_vocab,target_vocab,tokens=128,rate=0.5):
        """
        Args:
            num_layers: Number of encoder layers and of decoder layers.
            d: Model width.
            num_heads: Attention heads per layer.
            dff: Hidden width of the feed-forward networks.
            source_vocab: Source vocabulary size (encoder embedding).
            target_vocab: Target vocabulary size (decoder embedding and output layer).
            tokens: Number of positions in the positional-encoding tables.
            rate: Dropout rate.
        """
        super().__init__()
        self.encoder = Encoder(num_layers,d,num_heads,dff,source_vocab,target_vocab,tokens=tokens,rate=rate)
        self.decoder = Decoder(num_layers,d,num_heads,dff,source_vocab,target_vocab,tokens=tokens,rate=rate)
        # Projects decoder activations onto the target vocabulary (logits).
        self.dense = tf.keras.layers.Dense(target_vocab)
    def __call__(self,x,training):
        """Run the model.

        Args:
            x: Pair ``(inp, tar)`` of source ids and decoder-input ids.
            training: Whether dropout is active.

        Returns:
            ``(out, attn)``: the logits from the final dense layer and the
            decoder's dictionary of attention scores.
        """
        inp,tar = x 
        frwrd_mask,padding_mask = self.create_mask(inp,tar)
        enc_out = self.encoder(inp,training,padding_mask)
        dec_out,attn = self.decoder(tar,enc_out,training,frwrd_mask,padding_mask)
        out  = self.dense(dec_out)
        return out,attn
    def create_mask(self,inp,tar):
        """Build the attention masks for one batch.

        Keras ``MultiHeadAttention`` interprets ``attention_mask`` as
        1 = "may attend", 0 = "blocked". ``create_padding_mask`` flags padding
        with 1, so its result is inverted here before use.

        Returns:
            ``(frwrd_mask, padding_mask)``:
            ``frwrd_mask`` allows a target position to attend only to
            non-padding target positions at or before it;
            ``padding_mask`` allows attention to non-padding source positions.
        """
        # 1.0 at real source tokens, 0.0 at padding.
        padding_mask = 1.0 - create_padding_mask(inp)
        # Lower-triangular ones: position i may look at positions 0..i.
        causal = tf.cast(create_frwrd_mask(tf.shape(tar)[1]),dtype=tf.float32)
        # 1.0 at real target tokens, 0.0 at padding.
        tar_keep = 1.0 - create_padding_mask(tar)
        # A key is visible only if it is both causal-allowed and not padding;
        # with 1 = keep that is an element-wise minimum (broadcast to B,1,T,T).
        frwrd_mask = tf.minimum(tar_keep,causal)
        return frwrd_mask,padding_mask

In [28]:
class LearningSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up learning-rate schedule.

    The rate is ``rsqrt(d) * min(rsqrt(step), step * warmup_steps**-1.5)``: it
    grows linearly for ``warmup_steps`` steps and then decays with the inverse
    square root of the step.
    """
    def __init__(self,d,warmup_steps=5000):
        """
        Args:
            d: Model width used to scale the rate.
            warmup_steps: Number of steps of linear warm-up.
        """
        super(LearningSchedule,self).__init__()
        self.d = tf.cast(d,tf.float32)
        self.warmup_steps= warmup_steps
    def __call__(self,step):
        """Return the learning rate for optimizer step ``step``."""
        # The optimizer may pass an integer step counter; rsqrt and the
        # float multiplication below need a floating-point value.
        step = tf.cast(step,tf.float32)
        out1 = tf.math.rsqrt(step)
        out2 = step*(self.warmup_steps**(-1.5))
        return tf.math.rsqrt(self.d)*tf.math.minimum(out1,out2)

In [29]:
# Hyperparams passed to Transformer / LearningSchedule below
# Number of encoder layers and of decoder layers
num_layers = 4
# Model (embedding) width
d= 128
# Hidden width of the feed-forward networks
dff= 1024
# Attention heads per attention layer
num_heads =8
# Dropout rate
dropout= 0.1

In [30]:
# Warm-up schedule scaled by the model width, with the default warmup_steps.
lr = LearningSchedule(d)
# Adam driven by that schedule; all other Adam settings are left at their defaults.
optimizer = tf.keras.optimizers.Adam(lr)

In [31]:
def loss_fn(y,y_hat):
    """Sparse categorical cross-entropy averaged over non-padding targets.

    Args:
        y: Integer target ids; id 0 marks padding and is excluded.
        y_hat: Logits for each target position.

    Returns:
        Sum of the per-position losses at non-padding positions divided by
        the number of such positions.
    """
    criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')
    per_token = criterion(y,y_hat)
    # 1 at real tokens, 0 at padding, in the dtype of the loss.
    keep = tf.cast(tf.math.not_equal(y,0),dtype=per_token.dtype)
    return tf.reduce_sum(per_token*keep)/tf.reduce_sum(keep)

In [32]:
def accuracy_fn(y,y_hat):
    """Fraction of non-padding target positions whose arg-max prediction is correct.

    Args:
        y: Integer target ids; id 0 marks padding and is excluded.
        y_hat: Per-position scores; the prediction is the arg-max over axis 2.
    """
    predicted = tf.cast(tf.argmax(y_hat,axis=2),tf.int64)
    matches = tf.equal(tf.cast(y,tf.int64),predicted)
    not_pad = tf.math.not_equal(y,0)
    # Count a hit only where the target is a real token.
    hits = tf.math.logical_and(not_pad,matches)
    n_hits = tf.reduce_sum(tf.cast(hits,tf.float32))
    n_tokens = tf.reduce_sum(tf.cast(not_pad,tf.float32))
    return n_hits/n_tokens

In [33]:
# Running means of the per-batch loss and accuracy; updated in train_step and
# reset at the start of every epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')


In [34]:
# Build the model: English vocabulary as source, French as target; the
# positional-table size keeps its default.
model = Transformer(num_layers,d,num_heads,dff,eng_vocab_length,fra_vocab_length,rate=dropout)

In [35]:
def train_step(inp,tar):
    """Run one optimization step on a batch and update the running metrics.

    The decoder is fed the target without its last position and is trained to
    predict the target without its first position.

    Args:
        inp: Batch of source id sequences.
        tar: Batch of target id sequences.
    """
    decoder_input = tar[:,:-1]
    expected = tar[:,1:]
    with tf.GradientTape() as tape:
        logits,_ = model([inp,decoder_input],training=True)
        batch_loss = loss_fn(expected,logits)
    variables = model.trainable_variables
    grads = tape.gradient(batch_loss,variables)
    optimizer.apply_gradients(zip(grads,variables))
    # Feed the running means reported by the training loop.
    train_loss(batch_loss)
    train_accuracy(accuracy_fn(expected,logits))


In [36]:
# Train for 20 epochs, requesting placement on the first GPU.
with tf.device('/gpu:0'):
    for epoch in range(20):
        start =  time.time()
        # Start every epoch with fresh running averages.
        train_loss.reset_states()
        train_accuracy.reset_states()
        for (batch,(inp,tar)) in enumerate(train_batches):
            train_step(inp,tar)
            # Log the running metrics every 50 batches.
            if batch % 50 == 0:
                  print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

        # End-of-epoch summary and wall-clock time.
        print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')
        print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

    