<a href="https://colab.research.google.com/github/karumugamio/NLAProjEnglishtoSimpleEnglishMT/blob/master/Transformer_Based_MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import numpy as np
import unicodedata
import re
import time

In [18]:
# Mount Google Drive inside the Colab runtime and make the project folder the
# working directory for the cells below.
import os
from google.colab import drive

DRIVE_MOUNT_POINT = '/gdrive'
PROJECT_DIR = '/gdrive/My Drive/NLAProjectWS'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)
# Trailing expression: the notebook displays the contents of the project folder.
os.listdir()

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


['Data',
 'workspace',
 'Readme.md',
 'merged_dataset.txt',
 'X_train.pkl',
 'X_test.pkl',
 'training_checkpoints',
 'EDA',
 '.ipynb_checkpoints',
 'merged_PD_v1Data.csv']

In [0]:
import pandas as pd

# Available corpora as (complex-English file, simple-English file) pairs.
# Previously all three pairs were assigned to the same two variables in turn,
# so only the last pair ("tuning") ever took effect; the choice is now explicit.
CORPUS_FILES = {
    'wiki': ('/gdrive/My Drive/NLAProjectWS/Data/v1_wiki.unsimplified',
             '/gdrive/My Drive/NLAProjectWS/Data/v1_wiki.simple'),
    'training': ('/gdrive/My Drive/NLAProjectWS/Data/v1_normal.training.txt',
                 '/gdrive/My Drive/NLAProjectWS/Data/v1_simple.training.txt'),
    'tuning': ('/gdrive/My Drive/NLAProjectWS/Data/v1_normal.tuning.txt',
               '/gdrive/My Drive/NLAProjectWS/Data/v1_simple.tuning.txt'),
}
CORPUS_NAME = 'tuning'  # same files the notebook effectively used before

input_en, input_simple = CORPUS_FILES[CORPUS_NAME]

# One sentence per line; each file becomes a single-column DataFrame.
en_dataset = pd.read_csv(input_en, delimiter="\n", header=None, names=['enSrc'])
simple_dataset = pd.read_csv(input_simple, delimiter="\n", header=None, names=['simpleSrc'])

# Later cells pair row i of one frame with row i of the other, so a length
# mismatch would silently misalign sources and targets (or fail much later
# when the tf.data pipeline is built). Fail early with a clear message.
if len(en_dataset) != len(simple_dataset):
    raise ValueError(
        'Source/target row counts differ: {} vs {}'.format(
            len(en_dataset), len(simple_dataset)))

In [0]:
def unicode_to_ascii(s):
    """Return ``s`` with combining marks removed.

    The string is decomposed with NFD normalization and every code point
    whose Unicode category is ``'Mn'`` is dropped; all other characters are
    kept in their original order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def normalize_string(s):
    """Clean a raw sentence before tokenization.

    Steps, applied in order:
      1. strip combining marks via ``unicode_to_ascii``;
      2. insert a space in front of every ``!``, ``.`` and ``?``;
      3. replace every run of characters other than ASCII letters and
         ``. ! ?`` with a single space (digits and other punctuation vanish);
      4. collapse runs of whitespace into one space.

    The result is not stripped, so it may start or end with a space.
    """
    text = unicode_to_ascii(s)
    substitutions = (
        (r'([!.?])', r' \1'),
        (r'[^a-zA-Z.!?]+', r' '),
        (r'\s+', r' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [0]:
# Raw simple-English sentences are kept untouched: they are shown later as the
# reference target next to the model prediction.
raw_data_simple = simple_dataset['simpleSrc'].tolist()

# Source side: normalized complex-English sentences.
raw_data_en = [normalize_string(sentence) for sentence in en_dataset['enSrc'].tolist()]

# Target side: normalize each sentence once, then derive both variants from it.
# (`.tolist()` already returns a list, so no extra `list(...)` copy is needed.)
_normalized_simple = [normalize_string(sentence) for sentence in raw_data_simple]
# Decoder input starts with the <start> marker ...
raw_data_si_in = ['<start> ' + sentence for sentence in _normalized_simple]
# ... and the expected decoder output ends with the <end> marker.
raw_data_si_out = [sentence + ' <end>' for sentence in _normalized_simple]

Tokenize all source and target sentences

In [0]:
def _encode_padded(tokenizer, texts):
    """Convert ``texts`` to integer id sequences and post-pad them to a common length."""
    sequences = tokenizer.texts_to_sequences(texts)
    return tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')


# Source (complex English) vocabulary and padded id matrix.
# filters='' keeps punctuation tokens instead of stripping them.
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
en_tokenizer.fit_on_texts(raw_data_en)
data_en = _encode_padded(en_tokenizer, raw_data_en)

# Target (simple English) vocabulary: fitted on both the decoder-input and the
# decoder-output variants so that <start> and <end> each get an id.
simple_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
for corpus in (raw_data_si_in, raw_data_si_out):
    simple_tokenizer.fit_on_texts(corpus)

data_si_in = _encode_padded(simple_tokenizer, raw_data_si_in)
data_si_out = _encode_padded(simple_tokenizer, raw_data_si_out)

Create tf.data.Dataset object

In [0]:
BATCH_SIZE = 100
# A shuffle buffer smaller than the batch size (it used to be 20) only swaps
# neighbouring examples, so every batch stayed in near-file order. Using a
# buffer that covers the whole in-memory dataset gives a uniform shuffle.
# max(..., 1) keeps the buffer size valid even for an empty dataset.
SHUFFLE_BUFFER_SIZE = max(len(data_en), 1)

# Each element is a (source ids, decoder input ids, decoder target ids) triple.
dataset = tf.data.Dataset.from_tensor_slices(
    (data_en, data_si_in, data_si_out))
dataset = dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)

Create the Positional Embedding

In [0]:
def positional_embedding(pos, model_size):
    """Build the sinusoidal position vector for a single position.

    Args:
        pos: position index in the sequence.
        model_size: length of the returned vector.

    Returns:
        A NumPy array of shape ``(1, model_size)``. Even columns hold
        ``sin(pos / 10000 ** (i / model_size))``; each odd column holds the
        cosine of the same angle as the even column just before it.
    """
    row = np.zeros((1, model_size))
    for col in range(model_size):
        is_odd = col % 2
        # Odd columns reuse the exponent of the preceding even column.
        exponent = (col - is_odd) / model_size
        angle = pos / 10000 ** exponent
        row[:, col] = np.cos(angle) if is_odd else np.sin(angle)
    return row

# Longest padded sequence on either side; the table below has one row per position.
max_length = max(len(data_en[0]), len(data_si_in[0]))
MODEL_SIZE = 128

# Stack the per-position vectors into a (max_length, MODEL_SIZE) float32 tensor
# that the Encoder and Decoder index by position.
_pe_rows = [positional_embedding(position, MODEL_SIZE) for position in range(max_length)]
pes = tf.constant(np.concatenate(_pe_rows, axis=0), dtype=tf.float32)

Create the Multihead Attention layer

In [0]:
class MultiHeadAttention(tf.keras.Model):
    """Scaled dot-product attention with ``h`` independently projected heads.

    Every head has its own query/key/value Dense projection of width
    ``model_size // h``; the head outputs are concatenated and projected back
    to ``model_size`` by a final Dense layer. No mask is applied.
    """

    def __init__(self, model_size, h):
        super().__init__()
        per_head = model_size // h
        self.query_size = per_head
        self.key_size = per_head
        self.value_size = per_head
        self.h = h
        # One projection per head for queries, keys and values.
        self.wq = [tf.keras.layers.Dense(self.query_size) for _ in range(h)]
        self.wk = [tf.keras.layers.Dense(self.key_size) for _ in range(h)]
        self.wv = [tf.keras.layers.Dense(self.value_size) for _ in range(h)]
        # Output projection applied to the concatenated heads.
        self.wo = tf.keras.layers.Dense(model_size)

    def call(self, decoder_output, encoder_output):
        """Attend from ``decoder_output`` (queries) over ``encoder_output`` (keys/values).

        Args:
            decoder_output: tensor of shape (batch, decoder_len, model_size).
            encoder_output: tensor of shape (batch, encoder_len, model_size).

        Returns:
            Tensor of shape (batch, decoder_len, model_size).
        """
        scale = tf.math.sqrt(tf.dtypes.cast(self.key_size, tf.float32))
        head_outputs = []
        for query_proj, key_proj, value_proj in zip(self.wq, self.wk, self.wv):
            queries = query_proj(decoder_output)
            keys = key_proj(encoder_output)
            # (batch, decoder_len, encoder_len)
            score = tf.matmul(queries, keys, transpose_b=True) / scale
            # Normalize over the encoder positions.
            alignment = tf.nn.softmax(score, axis=2)
            # (batch, decoder_len, value_size)
            head_outputs.append(tf.matmul(alignment, value_proj(encoder_output)))
        merged = tf.concat(head_outputs, axis=2)
        # (batch, decoder_len, model_size)
        return self.wo(merged)

## Create the Encoder

In [0]:
class Encoder(tf.keras.Model):
    """Transformer-style encoder: embedding + positional table, then
    ``num_layers`` blocks of self-attention and a position-wise feed-forward
    network, each followed by a residual connection and normalization.

    Args:
        vocab_size: number of rows in the embedding table.
        model_size: embedding width and width of every block output.
        num_layers: number of stacked encoder blocks.
        h: number of attention heads per block.

    Notes:
        * Reads the module-level ``pes`` positional table.
        * No padding mask is applied, so padded positions take part in
          attention.
        * NOTE(review): normalization uses BatchNormalization; the original
          Transformer uses layer normalization - confirm this is intended.
    """

    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Encoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)
        self.attention = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]

        self.attention_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.dense_1 = [tf.keras.layers.Dense(512, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.ffn_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

    def call(self, sequence):
        """Encode a batch of token-id sequences.

        Args:
            sequence: integer tensor of shape (batch, seq_len); ``seq_len``
                must be statically known and not exceed the number of rows
                in ``pes``.

        Returns:
            Tensor of shape (batch, seq_len, model_size).
        """
        seq_len = sequence.shape[1]

        # Embed the whole sequence at once and add the positional rows; this
        # equals embedding each position separately and concatenating.
        sub_in = self.embedding(sequence) + pes[:seq_len, :]

        for i in range(self.num_layers):
            # Self-attention over the full sequence in one call. Attention
            # rows are independent, so this matches the former per-position
            # loop while computing the key/value projections only once.
            sub_out = self.attention[i](sub_in, sub_in)
            sub_out = self.attention_norm[i](sub_in + sub_out)

            # Position-wise feed-forward network with residual connection.
            ffn_out = self.dense_2[i](self.dense_1[i](sub_out))
            sub_in = self.ffn_norm[i](sub_out + ffn_out)

        # With num_layers == 0 this is simply the position-augmented embedding.
        return sub_in

## Create the Decoder


In [0]:
class Decoder(tf.keras.Model):
    """Transformer-style decoder producing vocabulary logits.

    Each of the ``num_layers`` blocks applies, in order: causal self-attention
    over the target prefix ("bottom"), attention over the encoder output
    ("middle"), and a position-wise feed-forward network. Every sub-layer has
    a residual connection followed by normalization. A final Dense layer maps
    to ``vocab_size`` logits.

    Args:
        vocab_size: size of the target vocabulary (embedding rows and logits).
        model_size: embedding width and width of every block output.
        num_layers: number of stacked decoder blocks.
        h: number of attention heads per attention sub-layer.

    Notes:
        * Reads the module-level ``pes`` positional table.
        * NOTE(review): normalization uses BatchNormalization; the original
          Transformer uses layer normalization - confirm this is intended.
    """

    def __init__(self, vocab_size, model_size, num_layers, h):
        super(Decoder, self).__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)
        self.attention_bot = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.attention_bot_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]
        self.attention_mid = [MultiHeadAttention(model_size, h) for _ in range(num_layers)]
        self.attention_mid_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.dense_1 = [tf.keras.layers.Dense(512, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.ffn_norm = [tf.keras.layers.BatchNormalization() for _ in range(num_layers)]

        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, sequence, encoder_output):
        """Compute next-token logits for every position of ``sequence``.

        Args:
            sequence: integer tensor of shape (batch, target_len) holding the
                decoder input ids; ``target_len`` must be statically known and
                not exceed the number of rows in ``pes``.
            encoder_output: tensor of shape (batch, source_len, model_size).

        Returns:
            Logits tensor of shape (batch, target_len, vocab_size).
        """
        seq_len = sequence.shape[1]

        # EMBEDDING AND POSITIONAL EMBEDDING (whole sequence at once)
        bot_sub_in = self.embedding(sequence) + pes[:seq_len, :]

        for i in range(self.num_layers):
            # BOTTOM MULTIHEAD SUB LAYER (causal self-attention)
            bot_sub_out = []
            for j in range(seq_len):
                # Position j may attend to itself and to everything on its
                # left. The slice used to be `:j`, which left the current
                # token out and made position 0 attend to an empty sequence.
                values = bot_sub_in[:, :j + 1, :]
                attention = self.attention_bot[i](
                    tf.expand_dims(bot_sub_in[:, j, :], axis=1), values)
                bot_sub_out.append(attention)
            bot_sub_out = tf.concat(bot_sub_out, axis=1)
            bot_sub_out = self.attention_bot_norm[i](bot_sub_in + bot_sub_out)

            # MIDDLE MULTIHEAD SUB LAYER (attention over the encoder output)
            # Every target position sees the full encoder output, so a single
            # call over all positions matches the former per-position loop.
            mid_sub_out = self.attention_mid[i](bot_sub_out, encoder_output)
            mid_sub_out = self.attention_mid_norm[i](mid_sub_out + bot_sub_out)

            # FFN
            ffn_out = self.dense_2[i](self.dense_1[i](mid_sub_out))
            bot_sub_in = self.ffn_norm[i](ffn_out + mid_sub_out)

        # With num_layers == 0 the logits come straight from the embeddings.
        logits = self.dense(bot_sub_in)

        return logits

In [28]:
H = 2
NUM_LAYERS = 2

# --- Encoder: build it and push a dummy batch through to create the weights ---
en_vocab_size = len(en_tokenizer.word_index) + 1
encoder = Encoder(en_vocab_size, MODEL_SIZE, NUM_LAYERS, H)

# Two identical dummy sentences of length 10 (trailing zeros act as padding).
en_sequence_in = tf.constant([[1, 2, 3, 4, 6, 7, 8, 0, 0, 0]] * 2)
encoder_output = encoder(en_sequence_in)

for label, value in (('Input vocabulary size', en_vocab_size),
                     ('Encoder input shape', en_sequence_in.shape),
                     ('Encoder output shape', encoder_output.shape)):
    print(label, value)

# --- Decoder: same smoke test on the target side ---
si_vocab_size = len(simple_tokenizer.word_index) + 1
max_len_simple = data_si_in.shape[1]
decoder = Decoder(si_vocab_size, MODEL_SIZE, NUM_LAYERS, H)

# Two identical dummy target sentences of length 14.
si_sequence_in = tf.constant([[1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0]] * 2)
decoder_output = decoder(si_sequence_in, encoder_output)

for label, value in (('Target vocabulary size', si_vocab_size),
                     ('Decoder input shape', si_sequence_in.shape),
                     ('Decoder output shape', decoder_output.shape)):
    print(label, value)

Input vocabulary size 28560
Encoder input shape (2, 10)
Encoder output shape (2, 10, 128)
Target vocabulary size 25726
Decoder input shape (2, 14)
Decoder output shape (2, 14, 25726)


In [0]:
# The decoder emits raw logits, hence from_logits=True.
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


def loss_func(targets, logits):
    """Sparse categorical cross-entropy that ignores positions whose target id is 0.

    Args:
        targets: integer tensor of expected token ids.
        logits: decoder output with one score per vocabulary entry.

    Returns:
        The loss computed by ``crossentropy`` with a 0/1 sample weight per
        position (0 where ``targets == 0``, presumably padding; 1 elsewhere).
    """
    keep = tf.math.not_equal(targets, 0)
    weights = tf.cast(keep, dtype=tf.int64)
    return crossentropy(targets, logits, sample_weight=weights)


optimizer = tf.keras.optimizers.Adam()

In [0]:
def predict(test_source_text=None, max_len=14):
    """Greedy-decode a simplified sentence for one source sentence and print it.

    Prints the source text, its token ids, and finally the decoded words.

    Args:
        test_source_text: normalized source sentence. When ``None`` a random
            entry of ``raw_data_en`` is used.
        max_len: maximum number of tokens to generate before stopping if
            ``<end>`` has not been produced (default 14, the former
            hard-coded limit). The decoder input grows to ``max_len + 1``
            tokens, which must fit in the ``pes`` positional table.
    """
    if test_source_text is None:
        test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
    print(test_source_text)
    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])
    print(test_source_seq)

    en_output = encoder(tf.constant(test_source_seq))

    # Decoding starts from the <start> marker only.
    de_input = tf.constant([[simple_tokenizer.word_index['<start>']]], dtype=tf.int64)

    out_words = []

    while True:
        de_output = decoder(de_input, en_output)
        # Greedy choice: highest-scoring token at the last position.
        new_word = tf.expand_dims(tf.argmax(de_output, -1)[:, -1], axis=1)
        token_id = int(new_word.numpy()[0][0])
        # Id 0 is the padding id and has no entry in index_word; a direct
        # lookup raised KeyError whenever the model predicted it.
        out_words.append(simple_tokenizer.index_word.get(token_id, '<pad>'))

        # Feed the prediction back in as the next decoder input token.
        de_input = tf.concat((de_input, new_word), axis=-1)

        if out_words[-1] == '<end>' or len(out_words) >= max_len:
            break

    print(' '.join(out_words))

In [0]:
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out):
    """Run one optimization step on a single batch and return its loss.

    Args:
        source_seq: batch of source token ids fed to the encoder.
        target_seq_in: batch of decoder input ids.
        target_seq_out: batch of expected decoder output ids.

    Returns:
        The value returned by ``loss_func`` for this batch.
    """
    with tf.GradientTape() as tape:
        logits = decoder(target_seq_in, encoder(source_seq))
        loss = loss_func(target_seq_out, logits)

    # Encoder and decoder are updated jointly by the shared optimizer.
    trainable = encoder.trainable_variables + decoder.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

    return loss

In [32]:
NUM_EPOCHS = 2
PREDICT_EVERY = 10  # print timing and a sample prediction every N epochs

batch_counter = 0
start_time = time.time()
for epoch in range(1, NUM_EPOCHS + 1):
    print("Epoch Started: {}".format(epoch))
    batch_counter = 0
    loss = None
    # Iterating the dataset directly visits every batch, as `.take(-1)` did.
    for batch_counter, (source_seq, target_seq_in, target_seq_out) in enumerate(dataset, start=1):
        # The counter is an integer, so print it as one (it used to go
        # through '{:.4f}' and showed up as e.g. "1.0000").
        print('Epoch {} Batch Count {}'.format(epoch, batch_counter))
        loss = train_step(source_seq, target_seq_in, target_seq_out)

    if loss is None:
        # Empty dataset: there is no loss to report for this epoch.
        print('Epoch {} processed no batches'.format(epoch))
        continue

    # Loss of the last batch of the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch, loss.numpy()))

    if epoch % PREDICT_EVERY == 0:
        end_time = time.time()
        print('Average elapsed time: {:.2f}s'.format((end_time - start_time) / epoch))
        try:
            predict()
        except Exception as err:
            # Best effort: a failed sample prediction must not abort training.
            # A distinct name keeps the handler from clobbering the epoch variable.
            print(err)

Epoch Started: 1
Epoch 1 Batch Count 1.0000
Epoch 1 Batch Count 2.0000
Epoch 1 Batch Count 3.0000
Epoch 1 Batch Count 4.0000
Epoch 1 Batch Count 5.0000
Epoch 1 Batch Count 6.0000
Epoch 1 Batch Count 7.0000
Epoch 1 Batch Count 8.0000
Epoch 1 Batch Count 9.0000
Epoch 1 Batch Count 10.0000
Epoch 1 Batch Count 11.0000
Epoch 1 Batch Count 12.0000
Epoch 1 Batch Count 13.0000
Epoch 1 Batch Count 14.0000
Epoch 1 Batch Count 15.0000
Epoch 1 Batch Count 16.0000
Epoch 1 Batch Count 17.0000
Epoch 1 Batch Count 18.0000
Epoch 1 Batch Count 19.0000
Epoch 1 Batch Count 20.0000
Epoch 1 Batch Count 21.0000
Epoch 1 Batch Count 22.0000
Epoch 1 Batch Count 23.0000
Epoch 1 Batch Count 24.0000
Epoch 1 Batch Count 25.0000
Epoch 1 Batch Count 26.0000
Epoch 1 Batch Count 27.0000
Epoch 1 Batch Count 28.0000
Epoch 1 Batch Count 29.0000
Epoch 1 Batch Count 30.0000
Epoch 1 Batch Count 31.0000
Epoch 1 Batch Count 32.0000
Epoch 1 Batch Count 33.0000
Epoch 1 Batch Count 34.0000
Epoch 1 Batch Count 35.0000
Epoch 1 Batc

This is the end of this model.

TODO: Add a step to save the model and reload it.

In [33]:
# Scratch cell: prints a fixed string and touches no model or data state.
print('hello world')

hello world


In [0]:
def predict_for_report(test_source_text=None, max_len=14):
    """Greedy-decode a simplified sentence and return it as a string.

    Prints the source text and its token ids before decoding.

    Args:
        test_source_text: normalized source sentence. When ``None`` a random
            entry of ``raw_data_en`` is used.
        max_len: maximum number of tokens to generate before stopping if
            ``<end>`` has not been produced (default 14, the former
            hard-coded limit). The decoder input grows to ``max_len + 1``
            tokens, which must fit in the ``pes`` positional table.

    Returns:
        The generated words joined by single spaces (including ``<end>`` when
        it was produced).
    """
    if test_source_text is None:
        test_source_text = raw_data_en[np.random.choice(len(raw_data_en))]
    print(test_source_text)
    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])
    print(test_source_seq)

    en_output = encoder(tf.constant(test_source_seq))

    # Decoding starts from the <start> marker only.
    de_input = tf.constant([[simple_tokenizer.word_index['<start>']]], dtype=tf.int64)

    out_words = []

    while True:
        de_output = decoder(de_input, en_output)
        # Greedy choice: highest-scoring token at the last position.
        new_word = tf.expand_dims(tf.argmax(de_output, -1)[:, -1], axis=1)
        token_id = int(new_word.numpy()[0][0])
        # Id 0 is the padding id and has no entry in index_word; a direct
        # lookup raised KeyError whenever the model predicted it.
        out_words.append(simple_tokenizer.index_word.get(token_id, '<pad>'))

        # Feed the prediction back in as the next decoder input token.
        de_input = tf.concat((de_input, new_word), axis=-1)

        if out_words[-1] == '<end>' or len(out_words) >= max_len:
            break

    predicted_value = ' '.join(out_words)

    return predicted_value

In [35]:
# Called without an argument, so a random sentence from raw_data_en is decoded;
# the returned string is shown as the cell result.
predict_for_report()

When his class arrives they discover that their substitute teacher has not shown up .
[[57, 22, 367, 6822, 47, 6375, 17, 43, 10111, 1728, 27, 39, 1353, 100, 2]]


'in is is the is the is and and and and and and and'

In [0]:
def predict_for_report2(test_source_text=None, max_len=14):
    """Greedy-decode one sentence and return it together with source and reference.

    Args:
        test_source_text: normalized source sentence. When ``None`` a random
            index is drawn and both the source (``raw_data_en``) and the raw
            reference (``raw_data_simple``) are taken from it.
        max_len: maximum number of tokens to generate before stopping if
            ``<end>`` has not been produced (default 14, the former
            hard-coded limit). The decoder input grows to ``max_len + 1``
            tokens, which must fit in the ``pes`` positional table.

    Returns:
        Tuple ``(source_text, actual_target, predicted_value)``.
        ``actual_target`` is ``None`` when the caller supplied the source
        text, because no reference sentence is known in that case.
    """
    # Default for caller-supplied text; previously the name stayed unbound and
    # the return statement raised UnboundLocalError.
    actual_target = None
    if test_source_text is None:
        indexV = np.random.choice(len(raw_data_en))
        test_source_text = raw_data_en[indexV]
        actual_target = raw_data_simple[indexV]

    test_source_seq = en_tokenizer.texts_to_sequences([test_source_text])

    en_output = encoder(tf.constant(test_source_seq))

    # Decoding starts from the <start> marker only.
    de_input = tf.constant([[simple_tokenizer.word_index['<start>']]], dtype=tf.int64)

    out_words = []

    while True:
        de_output = decoder(de_input, en_output)
        # Greedy choice: highest-scoring token at the last position.
        new_word = tf.expand_dims(tf.argmax(de_output, -1)[:, -1], axis=1)
        token_id = int(new_word.numpy()[0][0])
        # Id 0 is the padding id and has no entry in index_word; a direct
        # lookup raised KeyError whenever the model predicted it.
        out_words.append(simple_tokenizer.index_word.get(token_id, '<pad>'))

        # Feed the prediction back in as the next decoder input token.
        de_input = tf.concat((de_input, new_word), axis=-1)

        if out_words[-1] == '<end>' or len(out_words) >= max_len:
            break

    predicted_value = ' '.join(out_words)

    return test_source_text, actual_target, predicted_value

In [39]:
# Print 30 randomly drawn (source, reference, prediction) triples for a quick look.
_REPORT_TEMPLATES = ("Source Sentence {}", "Target Sentence {}", "Predicted Sentence {}")
for _ in range(30):
    src, target, predicted = predict_for_report2()
    for template, sentence in zip(_REPORT_TEMPLATES, (src, target, predicted)):
        print(template.format(sentence))

Source Sentence The Dust Bowl or the Dirty Thirties was a period of severe dust storms causing major ecological and agricultural damage to American and Canadian prairie lands from to .
Target Sentence The Dust Bowl was a period of severe dust storms causing major ecological and agricultural damage to American and Canadian and also the Chinese area prairie lands from 1930 to 1936 and also in some parts through the 1940s .
Predicted Sentence in is is the first is the first is the is the is .
Source Sentence Unguja comprises three administrative regions Zanzibar Central South Zanzibar North and Zanzibar Urban West .
Target Sentence The capital of Zanzibar , located on the island of Unguja , is Zanzibar City .
Predicted Sentence <end>
Source Sentence The University of Adelaide The Flinders University of South Australia and The University of South Australia are the public universities .
Target Sentence South Australia is one of the six states of Australia . Its Capital is Adelaide .
Predict

In [0]:
import pandas as pd

NUM_REPORT_SAMPLES = 3000

# Collect all (source, target, predicted) triples first and build the frame in
# one go; assigning df.loc[i] row by row re-allocates the frame on every
# insert. Each call draws its sentence at random, so repeats are possible.
report_rows = [predict_for_report2() for _ in range(NUM_REPORT_SAMPLES)]
df = pd.DataFrame(report_rows, columns=['Source', 'Target', 'Predicted'])

In [0]:
# Persist the report frame twice under the same base name: as a pickle and as
# a CSV. Relative paths, so both files land in the current working directory.
df.to_pickle('Transformer Batch 2 3000 Sentence pd.pkl')
# NOTE(review): no index argument is passed, so the row index is presumably
# written as an extra first column - confirm that is wanted.
df.to_csv('Transformer Batch 2 3000 Sentence pd.csv')