<h3> Summary of notebook: </h3>

- Now we move from word-based embeddings to BPE-based embeddings

In [1]:
#! pip install tokenizers===0.9.3

In [2]:
import pandas as pd
import numpy as np
import string

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, Concatenate, LSTM, Embedding, Dense, MultiHeadAttention, LayerNormalization, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant

from sklearn.model_selection import train_test_split

import re
import os
import io
import time

In [3]:
from tokenizers import Tokenizer, ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

In [3]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd gdrive/MyDrive/ColabNotebooks/colab_upload

#df_en_de = pd.read_csv('/content/gdrive/MyDrive/transformer_nmt_dataset/df_complete_30.csv')

In [4]:
from model_components import preprocess_sentence, get_angles, positional_encoding, \
                            create_padding_mask, create_look_ahead_mask, \
                            FullyConnected, EncoderLayer, Encoder, DecoderLayer, Decoder, Transformer, CustomSchedule, \
                                create_train_tokenizer, load_tokenizer

In [5]:
from training_helper_functions import loss_function, accuracy_function, compute_test_metrics

In [6]:
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [7]:
df_en_de['german'] = df_en_de['german'].apply(preprocess_sentence)
df_en_de['english'] = df_en_de['english'].apply(preprocess_sentence)

In [8]:
# pre-process sentences using helper function
pairs = df_en_de
pairs = pairs.sample(frac = 0.01)

In [9]:
pairs.head()

Unnamed: 0,english,german
115225,Tom has done so much for me.,Tom hat so viel fuer mich getan.
146067,He told us an interesting story.,Er erzaehlte uns eine interessante Geschichte.
73892,I ' m going to rent a car.,Ich werde ein Auto mieten.
25764,She ' s fashionable.,Sie ist modebewusst.
30385,How can I help you?,Womit kann ich Ihnen behilflich sein?


In [10]:
len(pairs)

2517

In [11]:
# load pre-trained tokenizers for de and en
en_tokenizer, en_word_index = load_tokenizer('tokenizer_en_corpus.json')
de_tokenizer, de_word_index = load_tokenizer('tokenizer_de_corpus.json')

In [12]:
vocab_len_source = len(de_word_index.keys())
vocab_len_target = len(en_word_index.keys())

vocab_len_source, vocab_len_target

(29999, 29999)

In [13]:
num_tokens_source = vocab_len_source + 1
num_tokens_target = vocab_len_target + 1

- use the loaded (pre-trained) tokenizers to tokenize the text

In [14]:
source_de = []
target_en = []

# iterrate over every row of dataframe "pairs"
for idx, row in pairs.iterrows():
    string_de = row['german']
    string_en = row['english']

    if type(string_de) == str and type(string_en) == str:
        # encode
        encoding_de = de_tokenizer.encode(string_de)
        encoding_en = en_tokenizer.encode(string_en)
        # retrieve ids (integers) and append to list
        source_de.append(encoding_de.ids)
        target_en.append(encoding_en.ids)

- run time for 100% of df_complete_30: 4m 10s

In [15]:
# convert to tensors with zero-padding
source_tensor = tf.keras.preprocessing.sequence.pad_sequences(source_de, padding = 'post')
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_en, padding = 'post')

In [16]:
source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor = train_test_split(
                                                                source_tensor, target_tensor, test_size=0.2
                                                                )

In [17]:
de_tokenizer.decode(source_train_tensor[10])

'Ich weiss, dass Tom euer Freund ist.'

In [18]:
# save numpy array as csv file:
np.savetxt('source_train_tensor.csv', source_train_tensor, delimiter = ',')
np.savetxt('source_test_tensor.csv', source_test_tensor, delimiter = ',')
np.savetxt('target_train_tensor.csv', target_train_tensor, delimiter = ',')
np.savetxt('target_test_tensor.csv', target_test_tensor, delimiter = ',')


- run time for 100% of df_complete_30: 3m

In [19]:
max_source_length= max(len(t) for t in source_tensor)
max_target_length= max(len(t) for t in  target_tensor)

In [20]:
max_source_length, max_target_length

(43, 35)

In [21]:
BATCH_SIZE = 32
#Create training dataset and shuffle
dataset_train = tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(BATCH_SIZE)
# divide into batches
dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)

#Create test dataset
dataset_test = tf.data.Dataset.from_tensor_slices((source_test_tensor, target_test_tensor)).shuffle(BATCH_SIZE)
dataset_test = dataset_test.batch(BATCH_SIZE, drop_remainder=True)


In [22]:
source_batch_train, target_batch_train =next(iter(dataset_train))
print(source_batch_train.shape, target_batch_train.shape)



(32, 43) (32, 35)


<h3> Define arguments for transformer </h3>

In [23]:
# Transformer arguments: 
# num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, 
# target_vocab_size, max_positional_encoding_input,
# max_positional_encoding_target, dropout_rate=0.1, layernorm_eps=1e-6

num_layers = 4
embedding_dim = 64
num_heads = 5
fully_connected_dim = 128
input_vocab_size = num_tokens_source
target_vocab_size = num_tokens_target
max_positional_encoding_input = max_source_length
max_positional_encoding_target = max_target_length

<h3> Create transformer </h3>

In [35]:
transformer = Transformer(
    num_layers=num_layers,
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    fully_connected_dim=fully_connected_dim,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    max_positional_encoding_input = max_positional_encoding_input,
    max_positional_encoding_target = max_positional_encoding_target
    )

(1, 43, 64)
(1, 35, 64)


- Create optimizer
- Use customised learning rate as defined in 'Attention Is All You Need' paper
- The learning rate increases linearly until training_step reaches "warmup_steps", then decays asymptotically
- Inputs: d_model, warmup_steps (default = 4000)

In [36]:
learning_rate = CustomSchedule(embedding_dim, warmup_steps = 4000)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [37]:
# define loss object
# from_logits = False, because we apply softmax to final Dense layer of Transformer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')


In [38]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name = 'test_loss')
test_accuracy = tf.keras.metrics.Mean(name = 'test_accuracy')

In [39]:
@tf.function
def train_step(inp, tar):
                            # inp = (m, Tx)
                            # tar = (m, Ty)


  tar_inp = tar[:, :-1]     # "start_" to last word
  tar_real = tar[:, 1:]     # first word to "_end"

  with tf.GradientTape() as tape:
    predictions, _ = transformer(inputs = (inp, tar_inp),
                                 training = True)
    loss = loss_function(tar_real, predictions, loss_object)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  acc = accuracy_function(tar_real, predictions)

  # store cumulative loss and acc in train_loss and train_accuracy
  train_loss(loss)
  train_accuracy(acc)

In [40]:
checkpoint_path = './checkpoints'

ckpt = tf.train.Checkpoint(optimizer=optimizer,
                                 transformer=transformer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 3)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!')
else:
    print('Initialising from scratch')

Initialising from scratch


In [41]:
print(ckpt_manager.latest_checkpoint)

None


In [42]:
epoch_batch_list = []
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

In [45]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  # reset tf Mean objects
  train_loss.reset_states()
  train_accuracy.reset_states()
  test_loss.reset_states()
  train_accuracy.reset_states()

  # iterate over every batch (= (inp, tar) tuple) in training dataset
  for (batch, (inp, tar)) in enumerate(dataset_train):
    train_step(inp, tar)

    if batch % 50 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} -- Train_Loss: {train_loss.result():.4f} Train_Accuracy: {train_accuracy.result():.4f}')


    # if batch % 5000 == 0:
    #   ckpt_save_path = ckpt_manager.save()
    #   print(f'Saving checkpoint after epoch {epoch +1} batch {batch} at {ckpt_save_path}')

    if batch % 50 == 0:
      epoch_batch_list.append(f'epoch_{epoch+1}_batch_{batch}')
      train_loss_list.append (train_loss.result().numpy())
      train_acc_list.append(train_accuracy.result().numpy())

      test_loss_list.append(test_loss.result().numpy())
      test_acc_list.append(test_accuracy.result().numpy())


  if (epoch+1) % 1 == 0:
    ckpt_save_path = ckpt_manager.save()
    print(f'Saving checkpoint after epoch {epoch + 1} at {ckpt_save_path}')
  
  # after one epoch of training, compute test loss and test acc
  for (batch, (inp, tar)) in enumerate(dataset_test):
    test_loss_batch, test_accuracy_batch = compute_test_metrics(inp, tar, transformer, loss_object)
    # Update tf Mean objects
    test_loss(test_loss_batch)
    test_accuracy(test_accuracy_batch)
  

  print(f'Summary -- Epoch {epoch + 1} Train_Loss: {train_loss.result():.4f} Train_Accuracy: {train_accuracy.result():.4f} \
    Test_Loss: {test_loss.result():.4f} Test_Accuracy: {test_accuracy.result():.4f}')

  epoch_batch_list.append(f'end of epoch {epoch+1}')
  train_loss_list.append (train_loss.result().numpy())
  train_acc_list.append(train_accuracy.result().numpy())

  test_loss_list.append(test_loss.result().numpy())
  test_acc_list.append(test_accuracy.result().numpy())
  


  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 -- Train_Loss: 1.0076 Train_Accuracy: 0.7908
Epoch 1 Batch 50 -- Train_Loss: 0.9632 Train_Accuracy: 0.7867
Saving checkpoint after epoch 1 at ./checkpoints/ckpt-41
Summary -- Epoch 1 Train_Loss: 0.9756 Train_Accuracy: 0.7831     Test_Loss: 4.7419 Test_Accuracy: 0.3352
Time taken for 1 epoch: 20.66 secs

Epoch 2 Batch 0 -- Train_Loss: 0.8015 Train_Accuracy: 0.8389
Epoch 2 Batch 50 -- Train_Loss: 0.8693 Train_Accuracy: 0.8084
Saving checkpoint after epoch 2 at ./checkpoints/ckpt-42
Summary -- Epoch 2 Train_Loss: 0.8785 Train_Accuracy: 0.8082     Test_Loss: 4.9453 Test_Accuracy: 0.3381
Time taken for 1 epoch: 19.76 secs

Epoch 3 Batch 0 -- Train_Loss: 0.8001 Train_Accuracy: 0.8333
Epoch 3 Batch 50 -- Train_Loss: 0.7697 Train_Accuracy: 0.8332
Saving checkpoint after epoch 3 at ./checkpoints/ckpt-43
Summary -- Epoch 3 Train_Loss: 0.7787 Train_Accuracy: 0.8314     Test_Loss: 5.0634 Test_Accuracy: 0.3409
Time taken for 1 epoch: 20.18 secs

Epoch 4 Batch 0 -- Train_Loss: 0.6568

In [46]:
all_metrics = zip(epoch_batch_list, train_loss_list, train_acc_list, test_loss_list, test_acc_list)
df_metrics = pd.DataFrame(all_metrics, columns = ['epoch_batch', 'train_loss', 'train_acc', 'test_loss', 'test_acc'])
#df_metrics['epoch'] = df_metrics['epoch'].apply(lambda x: x+1)
#df_metrics = df_metrics.apply(lambda x: round(x, 3))
df_metrics[['train_loss', 'train_acc', 'test_loss', 'test_acc']] = df_metrics[['train_loss', 'train_acc', 'test_loss', 'test_acc']].apply(lambda x: round(x,3))
df_metrics

Unnamed: 0,epoch_batch,train_loss,train_acc,test_loss,test_acc
0,epoch_1_batch_0,10.306,0.000,0.000,0.000
1,epoch_1_batch_50,10.271,0.013,0.000,0.000
2,end of epoch 1,10.259,0.029,10.173,0.110
3,epoch_2_batch_0,10.184,0.109,0.000,0.110
4,epoch_2_batch_50,10.081,0.108,0.000,0.110
...,...,...,...,...,...
145,epoch_9_batch_50,0.433,0.897,0.000,0.353
146,end of epoch 9,0.443,0.894,5.477,0.355
147,epoch_10_batch_0,0.419,0.898,0.000,0.355
148,epoch_10_batch_50,0.403,0.904,0.000,0.355


In [105]:
df_metrics.to_csv('df_metrics.csv', index = False)

In [59]:
# with open("params.txt", "a") as text_file:
#     text_file.write(params_3)
#     #text_file.write('params_3 -- time taken for 1 epoch: 54 secs')

In [47]:
file_path = 'saved_models/model'
transformer.save_weights(file_path,save_format='tf')

# # Recreate the exact same model purely from the file
# new_model = keras.models.load_model('path_to_my_model')