- run following cell when training on colab

In [1]:
#! pip install tokenizers===0.9.3
# from google.colab import drive
# drive.mount('/content/gdrive')
# %cd gdrive/MyDrive/Colab Notebooks/colab_upload

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import time
import os

# My custom modules
from tokenizer_helpers import load_tokenizer
from model_components import Transformer
from training_helper_functions import loss_function, accuracy_function, compute_test_metrics, CustomSchedule

- run following cell when training on colab

In [None]:
# df_en_de = pd.read_csv('/content/gdrive/MyDrive/transformer_nmt_dataset/df_complete_30.csv')

In [2]:
df_en_de = pd.read_csv('./data/df_complete_30.csv')

In [3]:
pairs = df_en_de

## Select small fraction for testing purposes
#pairs = pairs.sample(frac = 0.002)

In [4]:
pairs.head()

Unnamed: 0,german,english,german_length,english_length
1233401,Strenge Haushaltsdisziplin ist kein Synonym fu...,Rigour is not synonymous with saving.,7,6
1484698,"Wir wissen, was am letzten Wochenende passiert...",We know what happened last weekend.,8,6
194333,"Wir brauchen Wettbewerb, und ich hoffe, meine ...","We need competition, and I hope that my Britis...",16,16
775447,"Es ist wichtig, all diese Aktionen zu koordini...",It is important that these operations are coor...,8,8
1440351,Ich sah ihn beim Ueberqueren der Strasse.,I saw him crossing the street.,7,6


In [5]:
len(pairs)

3124

In [6]:
# load pre-trained tokenizers for de and en
en_tokenizer, en_word_index = load_tokenizer('tokenizers/tokenizer_en_corpus.json')
de_tokenizer, de_word_index = load_tokenizer('tokenizers/tokenizer_de_corpus.json')

In [7]:
vocab_len_source = len(de_word_index.keys())
vocab_len_target = len(en_word_index.keys())

print (vocab_len_source, vocab_len_target)

29999 29999


In [8]:
# add 1 for zero-padding in embedding layer
num_tokens_source = vocab_len_source + 1
num_tokens_target = vocab_len_target + 1

- Tokenize the data using the pre-trained tokenizers

In [9]:
source_de = []
target_en = []

# iterrate over every row of dataframe "pairs"
for idx, row in pairs.iterrows():
    string_de = row['german']
    string_en = row['english']

    if type(string_de) == str and type(string_en) == str:
        # encode
        encoding_de = de_tokenizer.encode(string_de)
        encoding_en = en_tokenizer.encode(string_en)
        # retrieve ids (integers) and append to list
        source_de.append(encoding_de.ids)
        target_en.append(encoding_en.ids)

- run time for 100% of df_complete_30: 4m 10s

In [10]:
# convert to tensors with zero-padding
source_tensor = tf.keras.preprocessing.sequence.pad_sequences(source_de, padding = 'post')
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_en, padding = 'post')

In [11]:
# split into training and test sets
source_train_tensor, source_test_tensor, target_train_tensor, target_test_tensor = train_test_split(
                                                                source_tensor, target_tensor, test_size=0.05
                                                                )

In [12]:
de_tokenizer.decode(source_train_tensor[10])

'Ich hoffe, dass die Entschliessung ueber Belarus einer von vielen Schritten ist, die wir noch setzen werden.'

In [13]:
# For first run-through:
# save numpy array as csv file:

os.mkdir('tensors')

np.savetxt('tensors/source_train_tensor.csv', source_train_tensor, delimiter = ',')
np.savetxt('tensors/source_test_tensor.csv', source_test_tensor, delimiter = ',')
np.savetxt('tensors/target_train_tensor.csv', target_train_tensor, delimiter = ',')
np.savetxt('tensors/target_test_tensor.csv', target_test_tensor, delimiter = ',')


- run time for 100% of df_complete_30: 3m

In [None]:
mask_source = np.random.choice([False, True], len(source_train_tensor), p=[0.999, 0.001])

source_train_sample = source_train_tensor[mask_source]
target_train_sample = target_train_tensor[mask_source]

np.savetxt('tensors/source_train_sample.csv', source_train_sample, delimiter = ',')
np.savetxt('tensors/target_train_sample.csv', target_train_sample, delimiter = ',')

In [None]:
# When picking up training again after an interruption, you need to work with the same training and test sets.
# Load saved numpy arrays


# source_train_tensor = np.loadtxt('tensors/source_train_tensor.csv', delimiter = ',', dtype = 'int32')
# source_test_tensor = np.loadtxt('tensors/source_test_tensor.csv', delimiter = ',', dtype = 'int32')
# target_train_tensor = np.loadtxt('tensors/target_train_tensor.csv', delimiter = ',', dtype = 'int32')
# target_test_tensor = np.loadtxt('tensors/target_test_tensor.csv', delimiter = ',', dtype = 'int32')

In [14]:
max_source_length= max(len(t) for t in np.concatenate((source_train_tensor, source_test_tensor), axis=0))
max_target_length= max(len(t) for t in np.concatenate((target_train_tensor, target_test_tensor), axis=0))

print(max_source_length, max_target_length)

49 47


In [15]:
BATCH_SIZE = 32
#Create training dataset and shuffle
dataset_train = tf.data.Dataset.from_tensor_slices((source_train_tensor, target_train_tensor)).shuffle(BATCH_SIZE)
# divide into batches
dataset_train = dataset_train.batch(BATCH_SIZE, drop_remainder=True)

#Create test dataset
dataset_test = tf.data.Dataset.from_tensor_slices((source_test_tensor, target_test_tensor)).shuffle(BATCH_SIZE)
dataset_test = dataset_test.batch(BATCH_SIZE, drop_remainder=True)


In [16]:
source_batch_train, target_batch_train =next(iter(dataset_train))
print(source_batch_train.shape, target_batch_train.shape)



(32, 49) (32, 47)


<h3> Define arguments for Transformer object </h3>

In [17]:
# Transformer arguments: 
# num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, 
# target_vocab_size, max_positional_encoding_input,
# max_positional_encoding_target, dropout_rate=0.1, layernorm_eps=1e-6

num_layers = 4
embedding_dim = 64
num_heads = 5
fully_connected_dim = 128
input_vocab_size = num_tokens_source
target_vocab_size = num_tokens_target
max_positional_encoding_input = max_source_length
max_positional_encoding_target = max_target_length

<h3> Create Transformer object </h3>

In [18]:
transformer = Transformer(
    num_layers=num_layers,
    embedding_dim=embedding_dim,
    num_heads=num_heads,
    fully_connected_dim=fully_connected_dim,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    max_positional_encoding_input = max_positional_encoding_input,
    max_positional_encoding_target = max_positional_encoding_target
    )

- Create optimizer

In [19]:
# Use customised learning rate as defined in 'Attention Is All You Need' paper
# The learning rate increases linearly until training_step reaches "warmup_steps", then decays asymptotically
# Inputs: d_model, warmup_steps (default = 4000)

learning_rate = CustomSchedule(embedding_dim, warmup_steps = 4000)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [20]:
# define loss object
# from_logits = False, because we apply softmax to final Dense layer of Transformer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction='none')


In [21]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name = 'test_loss')
test_accuracy = tf.keras.metrics.Mean(name = 'test_accuracy')

In [22]:
@tf.function
def train_step(inp, tar):
                            # inp = (m, Tx)
                            # tar = (m, Ty)


  tar_inp = tar[:, :-1]     # "start_" to last word
  tar_real = tar[:, 1:]     # first word to "_end"

  with tf.GradientTape() as tape:
    predictions, _ = transformer(inputs = (inp, tar_inp),
                                 training = True)
    loss = loss_function(tar_real, predictions, loss_object)

  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  acc = accuracy_function(tar_real, predictions)

  # store cumulative loss and acc in train_loss and train_accuracy
  train_loss(loss)
  train_accuracy(acc)

In [23]:
checkpoint_path = './checkpoints'

ckpt = tf.train.Checkpoint(optimizer=optimizer,
                                 transformer=transformer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 3)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!')
else:
    print('Initialising from scratch')

Latest checkpoint restored!


In [24]:
print(ckpt_manager.latest_checkpoint)

./checkpoints/ckpt-40


In [25]:
epoch_batch_list = []
train_loss_list = []
train_acc_list = []
test_loss_list = []
test_acc_list = []

In [28]:
EPOCHS = 5

for epoch in range(EPOCHS):
  start = time.time()

  # reset tf Mean objects
  train_loss.reset_states()
  train_accuracy.reset_states()
  test_loss.reset_states()
  train_accuracy.reset_states()

  # iterate over every batch (= (inp, tar) tuple) in training dataset
  for (batch, (inp, tar)) in enumerate(dataset_train):
    train_step(inp, tar)

    if batch % 50 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} -- Train_Loss: {train_loss.result():.4f} Train_Accuracy: {train_accuracy.result():.4f}')


    # if batch % 5000 == 0:
    #   ckpt_save_path = ckpt_manager.save()
    #   print(f'Saving checkpoint after epoch {epoch +1} batch {batch} at {ckpt_save_path}')

    if batch % 50 == 0:
      epoch_batch_list.append(f'epoch_{epoch+1}_batch_{batch}')
      train_loss_list.append (train_loss.result().numpy())
      train_acc_list.append(train_accuracy.result().numpy())

      test_loss_list.append(test_loss.result().numpy())
      test_acc_list.append(test_accuracy.result().numpy())


  if (epoch+1) % 1 == 0:
    ckpt_save_path = ckpt_manager.save()
    print(f'Saving checkpoint after epoch {epoch + 1} at {ckpt_save_path}')
  
  # after one epoch of training, compute test loss and test acc
  for (batch, (inp, tar)) in enumerate(dataset_test):
    test_loss_batch, test_accuracy_batch = compute_test_metrics(inp, tar, transformer, loss_object)
    # Update tf Mean objects
    test_loss(test_loss_batch)
    test_accuracy(test_accuracy_batch)
  

  print(f'Summary -- Epoch {epoch + 1} Train_Loss: {train_loss.result():.4f} Train_Accuracy: {train_accuracy.result():.4f} \
    Test_Loss: {test_loss.result():.4f} Test_Accuracy: {test_accuracy.result():.4f}')

  epoch_batch_list.append(f'end of epoch {epoch+1}')
  train_loss_list.append (train_loss.result().numpy())
  train_acc_list.append(train_accuracy.result().numpy())

  test_loss_list.append(test_loss.result().numpy())
  test_acc_list.append(test_accuracy.result().numpy())
  


  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 -- Train_Loss: 2.8347 Train_Accuracy: 0.4646
Epoch 1 Batch 50 -- Train_Loss: 2.5686 Train_Accuracy: 0.4939
Saving checkpoint after epoch 1 at ./checkpoints/ckpt-46
Summary -- Epoch 1 Train_Loss: 2.5815 Train_Accuracy: 0.4910     Test_Loss: 5.7602 Test_Accuracy: 0.3099
Time taken for 1 epoch: 32.98 secs

Epoch 2 Batch 0 -- Train_Loss: 2.2754 Train_Accuracy: 0.5318
Epoch 2 Batch 50 -- Train_Loss: 2.2276 Train_Accuracy: 0.5424
Saving checkpoint after epoch 2 at ./checkpoints/ckpt-47
Summary -- Epoch 2 Train_Loss: 2.2438 Train_Accuracy: 0.5390     Test_Loss: 5.9072 Test_Accuracy: 0.3090
Time taken for 1 epoch: 32.92 secs

Epoch 3 Batch 0 -- Train_Loss: 2.0502 Train_Accuracy: 0.5761
Epoch 3 Batch 50 -- Train_Loss: 1.9334 Train_Accuracy: 0.5874
Saving checkpoint after epoch 3 at ./checkpoints/ckpt-48
Summary -- Epoch 3 Train_Loss: 1.9512 Train_Accuracy: 0.5825     Test_Loss: 6.1031 Test_Accuracy: 0.3076
Time taken for 1 epoch: 32.84 secs

Epoch 4 Batch 0 -- Train_Loss: 1.8628

In [29]:
all_metrics = zip(epoch_batch_list, train_loss_list, train_acc_list, test_loss_list, test_acc_list)
df_metrics = pd.DataFrame(all_metrics, columns = ['epoch_batch', 'train_loss', 'train_acc', 'test_loss', 'test_acc'])
df_metrics[['train_loss', 'train_acc', 'test_loss', 'test_acc']] = df_metrics[['train_loss', 'train_acc', 'test_loss', 'test_acc']].apply(lambda x: round(x,3))
df_metrics

Unnamed: 0,epoch_batch,train_loss,train_acc,test_loss,test_acc
0,epoch_1_batch_0,7.058,0.272,0.0,0.0
1,epoch_1_batch_50,5.789,0.258,0.0,0.0
2,end of epoch 1,5.586,0.267,4.954,0.314
3,epoch_2_batch_0,4.45,0.339,0.0,0.314
4,epoch_2_batch_50,4.417,0.327,0.0,0.314
5,end of epoch 2,4.427,0.327,4.971,0.317
6,epoch_3_batch_0,4.005,0.368,0.0,0.317
7,epoch_3_batch_50,3.882,0.367,0.0,0.317
8,end of epoch 3,3.889,0.366,5.152,0.315
9,epoch_4_batch_0,3.584,0.387,0.0,0.315


In [30]:
df_metrics.to_csv('metrics/df_metrics.csv', index = False)

In [31]:
file_path = 'saved_models/model'
transformer.save_weights(file_path,save_format='tf')

# # Recreate the exact same model purely from the file
# new_model = keras.models.load_model('path_to_my_model')