In [1]:
import gc
import pickle
import time
from datetime import datetime

import numpy as np
import renom as rm
import renom.graph as rmg
from IPython.display import clear_output
from tensorboardX import SummaryWriter
from tensorflow.keras.utils import to_categorical
# Warn early when ReNom reports that CUDA is not available.
if not rm.has_cuda():
    print("NO CUDA!!!!!")
# Request CUDA exactly when it is available ...
rm.set_cuda_active(rm.has_cuda())
# NOTE(review): ... but this call passes False unconditionally right after the
# line above, so the last value handed to set_cuda_active is always False.
# Confirm whether forcing CUDA off is intended or a leftover debugging switch.
rm.set_cuda_active(False)
# Placeholders for the pickled artifacts; later cells load a pickle only while
# the corresponding name is still falsy. Re-running this cell resets them.
tokenized_data, tokenizer_en, tokenizer_pt = None, None, None

In [2]:
class MultiHeadAttention(rmg.core.GraphFactory):
    """Multi-head attention over 2-D inputs, with one Dense projection per head.

    Shape comments use B = batch size, SQ / SK = query / key sequence length,
    DEPTH = d_model // num_heads. Batch and sequence axes arrive flattened into
    the first dimension, so the per-sequence matrix products are computed block
    by block in ``custom_matmul``.
    """

    def prepare(self, d_model, num_heads):
        """Create the per-head Q/K/V projections and the output projection.

        Args:
            d_model: Width of the final projection; must be divisible by
                ``num_heads``.
            num_heads: Number of attention heads.
        """
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        depth = d_model // num_heads
        # Remembered for the 1/sqrt(d_k) scaling in scaled_dot_product_attention.
        self.depth = depth
        self.dense_q, self.dense_k, self.dense_v = list(), list(), list()

        # One Dense(depth) per head for each of Q, K and V (created in that order).
        for layer in [self.dense_q, self.dense_k, self.dense_v]:
            for _ in range(num_heads):
                layer.append(rmg.Dense(depth))
        self.dense_final = rmg.Dense(d_model)

    def custom_matmul(self, a, b, max_seq_len = 40, transpose_b = True):
        """Multiply ``a`` and ``b`` one ``max_seq_len``-row block at a time.

        When ``a`` has exactly ``max_seq_len`` rows a single product is
        returned. Otherwise ``a`` and ``b`` are both sliced with the same row
        ranges ``[i*max_seq_len, (i+1)*max_seq_len)``, each pair is multiplied,
        and the per-block products are concatenated with ``rmg.concatenate``.

        Args:
            a: Left operand; its first dimension is read via ``int(a.shape[0])``.
            b: Right operand, sliced with the same row ranges as ``a``.
            max_seq_len: Number of rows per block.
            transpose_b: Multiply by the transpose of each ``b`` block when True.

        Returns:
            The single product, or the concatenation of the block products.

        Raises:
            ValueError: If the row count of ``a`` is not a positive multiple of
                ``max_seq_len`` (the block loop would otherwise silently drop
                the trailing rows).
        """
        num_rows = int(a.shape[0])
        if num_rows == max_seq_len:
            return a @ b.T if transpose_b else a @ b
        if num_rows == 0 or num_rows % max_seq_len != 0:
            raise ValueError(
                "custom_matmul expects a row count that is a positive multiple "
                "of max_seq_len=%d, got %d" % (max_seq_len, num_rows))

        block_products = list()
        for i in range(num_rows // max_seq_len):
            begin = i * max_seq_len
            end = begin + max_seq_len
            a_i = a[begin:end]
            b_i = b[begin:end]
            if transpose_b:
                b_i = b_i.T
            block_products.append(a_i @ b_i)
        return rmg.concatenate(block_products)

    def scaled_dot_product_attention(self,
                                     q, # (B*SQ, DEPTH)
                                     k, # (B*SK, DEPTH)
                                     v, # (B*SK, DEPTH)
                                     mask):
        """Compute softmax(q k^T / sqrt(depth) + mask) v for one head.

        Args:
            q, k, v: Per-head projections (see the shape comments above).
            mask: Added to the scaled logits before the softmax; the masks
                built elsewhere in this notebook hold 0 or -1e9.

        Returns:
            The attention-weighted values, (B*SQ, DEPTH).
        """
        matmul_qk = self.custom_matmul(q, k) #(B*SQ, SK)
        # Scale by sqrt(d_k), the per-head key width, as in scaled dot-product
        # attention. The previous hard-coded np.sqrt(40) was the sequence
        # length, which only matches when depth happens to equal 40.
        scaled_attention_logits = matmul_qk / np.sqrt(self.depth)
        scaled_attention_logits += mask
        scaled_attention = rmg.softmax(scaled_attention_logits)
        output = self.custom_matmul(scaled_attention, v, transpose_b = False) #(B*SQ, DEPTH)
        return output

    def connect(self,
                q, # (B*SQ, D_MODEL)
                k, # (B*SK, D_MODEL)
                v, # (B*SK, D_MODEL)
                mask):
        """Run every head, join the head outputs and apply the final projection.

        Returns:
            The output of ``dense_final``, (B*SQ, D_MODEL).
        """
        head_summaries = list()

        for head_id in range(self.num_heads):
            out_q = self.dense_q[head_id](q) # (B*SQ, DEPTH)
            out_k = self.dense_k[head_id](k) # (B*SK, DEPTH)
            out_v = self.dense_v[head_id](v) # (B*SK, DEPTH)
            head_output = self.scaled_dot_product_attention(out_q, out_k, out_v, mask)
            head_summaries.append(head_output)

        # A single head is passed through as-is instead of being concatenated.
        if self.num_heads == 1:
            concat_attention = head_summaries[0]
        else:
            concat_attention = rmg.concatenate(head_summaries, axis=1) # (B*SQ, D_MODEL)
        output = self.dense_final(concat_attention) # (B*SQ, D_MODEL)
        return output

In [3]:
# Deterministic fixture for a manual check of MultiHeadAttention:
# d_model = 32 split over 4 heads, inputs of shape (80, 32), mask of shape (80, 40).
mha = MultiHeadAttention(32, 4)
q = np.arange(2 * 40 * 32).reshape(80, 32)
k = q + 10  # same ramp as q, shifted by 10
v = q + 20  # same ramp as q, shifted by 20
mask = np.arange(80 * 40).reshape(80, 40)

In [7]:
# Overwrite every projection's parameters with arange ramps so the forward
# pass below is reproducible: (32, 8) weights and (1, 8) biases per head.
for projections in (mha.dense_q, mha.dense_k, mha.dense_v):
    for head_dense in projections:
        head_dense.params["w"] = rmg.DynamicVariable(np.arange(32 * 8).reshape(32, 8))
        head_dense.params["b"] = rmg.DynamicVariable(np.arange(8).reshape(1, 8))

# The final projection gets a (32, 32) weight and a (1, 32) bias.
weight = rmg.DynamicVariable(np.arange(32 * 32).reshape(32, 32))
bias = rmg.DynamicVariable(np.arange(32).reshape(1, 32))
mha.dense_final.params["w"] = weight
mha.dense_final.params["b"] = bias

In [8]:
# Build the attention graph on the fixture arrays and print the value exposed
# through the result's `numpy` attribute.
result = mha(q, k, v, mask)
print(result.numpy)

[[8.3683779e+10 8.3852050e+10 8.4020322e+10 ... 8.8563679e+10
  8.8731943e+10 8.8900215e+10]
 [8.3683779e+10 8.3852050e+10 8.4020322e+10 ... 8.8563679e+10
  8.8731943e+10 8.8900215e+10]
 [8.3683779e+10 8.3852050e+10 8.4020322e+10 ... 8.8563679e+10
  8.8731943e+10 8.8900215e+10]
 ...
 [1.6679390e+11 1.6712928e+11 1.6746468e+11 ... 1.7652020e+11
  1.7685558e+11 1.7719096e+11]
 [1.6679390e+11 1.6712928e+11 1.6746468e+11 ... 1.7652020e+11
  1.7685558e+11 1.7719096e+11]
 [1.6679390e+11 1.6712928e+11 1.6746468e+11 ... 1.7652020e+11
  1.7685558e+11 1.7719096e+11]]


In [4]:
class MHA_Cell(rmg.core.GraphFactory):
    """Attention sub-layer: multi-head attention, dropout, residual add, layer norm."""

    def prepare(self, d_model, num_heads, dropout_rate):
        """Create the attention block, the dropout and the layer normalisation."""
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.dropout = rmg.Dropout(dropout_rate)
        self.layernorm = rmg.LayerNormalize()

    def connect(self, q, k, v, mask, inference):
        """Return layernorm(q + dropout(mha(q, k, v, mask))), (B*SQ, D_MODEL).

        ``inference`` is forwarded to the dropout layer's ``set_inference``.
        """
        self.dropout.set_inference(inference)
        attended = self.dropout(self.mha(q, k, v, mask))
        # The residual connection uses the query input.
        return self.layernorm(q + attended)

    
class FFN_Cell(rmg.core.GraphFactory):
    """Feed-forward sub-layer: Dense(relu) -> Dense, dropout, residual add, layer norm."""

    def prepare(self, d_model, dff, dropout_rate):
        """Create the two-layer feed-forward stack, the dropout and the layer norm."""
        expand = rmg.Dense(dff, activation='relu')  # (B*SQ, DFF)
        project = rmg.Dense(d_model)  # (B*SQ, D_MODEL)
        self.sequential = rmg.Sequential([expand, project])
        self.dropout = rmg.Dropout(dropout_rate)
        self.layernorm = rmg.LayerNormalize()

    def connect(self, x, inference):
        """Return layernorm(x + dropout(sequential(x))).

        ``inference`` is forwarded to the dropout layer's ``set_inference``.
        """
        self.dropout.set_inference(inference)
        transformed = self.dropout(self.sequential(x))
        return self.layernorm(x + transformed)

In [5]:
class EncoderLayer(rmg.core.GraphFactory):
    """One encoder block: a self-attention cell followed by a feed-forward cell."""

    def prepare(self, d_model, num_heads, dff, dropout_rate):
        """Create the attention cell and the feed-forward cell."""
        self.mha_cell = MHA_Cell(d_model, num_heads, dropout_rate)
        self.ffn_cell = FFN_Cell(d_model, dff, dropout_rate)

    def connect(self, x, mask, inference):
        """Attend over ``x`` (used as query, key and value), then apply the FFN cell."""
        print("    ENCODER MHA")
        attended = self.mha_cell(x, x, x, mask, inference)
        print("    ENCODER FFN")
        return self.ffn_cell(attended, inference)

class DecoderLayer(rmg.core.GraphFactory):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward."""

    def prepare(self, d_model, num_heads, dff, dropout_rate):
        """Create the two attention cells and the feed-forward cell."""
        self.mha_cell_1 = MHA_Cell(d_model, num_heads, dropout_rate)
        self.mha_cell_2 = MHA_Cell(d_model, num_heads, dropout_rate)
        self.ffn_cell = FFN_Cell(d_model, dff, dropout_rate)

    def connect(self, x, encoder_output, input_padding_mask, look_ahead_mask, inference):
        """Run the three cells in sequence and return the last cell's output.

        The first cell attends over ``x`` itself with ``look_ahead_mask``; the
        second uses the first cell's output as query and ``encoder_output`` as
        key and value, with ``input_padding_mask``.
        """
        print("    DECODER MHA 1")
        self_attended = self.mha_cell_1(x, x, x, look_ahead_mask, inference)
        print("    DECODER MHA 2")
        cross_attended = self.mha_cell_2(
            self_attended, encoder_output, encoder_output, input_padding_mask, inference)
        print("    DECODER FFN")
        return self.ffn_cell(cross_attended, inference)

In [6]:
class Encoder(rmg.core.GraphFactory):
    """Input dropout followed by a stack of ``num_layers`` EncoderLayer blocks."""

    def prepare(self, num_layers, d_model, num_heads, dff, dropout_rate):
        """Create the dropout and the list of encoder layers."""
        self.dropout = rmg.Dropout(dropout_rate)
        layers = []
        for _ in range(num_layers):
            layers.append(EncoderLayer(d_model, num_heads, dff, dropout_rate))
        self.encoder_layers = layers

    def connect(self, x, mask, inference):
        """Apply dropout to ``x`` and feed it through every layer in order."""
        self.dropout.set_inference(inference)
        print("  ENCODER: ")
        hidden = self.dropout(x)
        for layer in self.encoder_layers:
            hidden = layer(hidden, mask, inference)
        print("  ENCODER FINISHED!")
        return hidden  # (B*S, D_MODEL)
    
class Decoder(rmg.core.GraphFactory):
    """Input dropout followed by a stack of ``num_layers`` DecoderLayer blocks."""

    def prepare(self, num_layers, d_model, num_heads, dff, dropout_rate):
        """Create the dropout and the list of decoder layers."""
        self.dropout = rmg.Dropout(dropout_rate)
        layers = []
        for _ in range(num_layers):
            layers.append(DecoderLayer(d_model, num_heads, dff, dropout_rate))
        self.decoder_layers = layers

    def connect(self, x, encoder_output, input_padding_mask, look_ahead_mask, inference):
        """Apply dropout to ``x`` and feed it through every layer in order.

        Each layer receives the same ``encoder_output`` and masks.
        """
        self.dropout.set_inference(inference)
        print("  DECODER: ")
        hidden = self.dropout(x)
        for layer in self.decoder_layers:
            hidden = layer(hidden, encoder_output, input_padding_mask, look_ahead_mask, inference)
        print("  DECODER FINISHED!")
        return hidden  # (B*S, D_MODEL)

In [7]:
class Transformer(rmg.core.GraphFactory):
    """Encoder-decoder Transformer assembled from this notebook's factories.

    Token ids arrive as column vectors with batch and sequence flattened into
    the first axis, (B*S, 1). The output is the un-normalised projection onto
    the target vocabulary, (B*S, target_vocab_size).
    """

    def prepare(self,
                num_layers, d_model, num_heads, dff,
                input_vocab_size, target_vocab_size,
                pos_enc_size, dropout_rate=0.1):
        """Create embeddings, encoder, decoder and the output projection.

        Args:
            num_layers: Number of layers in both the encoder and the decoder.
            d_model: Embedding / model width.
            num_heads: Number of attention heads per attention cell.
            dff: Hidden width of the feed-forward cells.
            input_vocab_size: Size of the input embedding table.
            target_vocab_size: Size of the target embedding table and of the
                output projection.
            pos_enc_size: Number of positions in the positional-encoding table.
            dropout_rate: Dropout rate used throughout.
        """
        self.d_model = d_model
        self.sqrt_d_model = np.sqrt(d_model)
        self.pos_enc_size = pos_enc_size
        self.input_embedding = rmg.Embedding(input_vocab_size, d_model)
        self.target_embedding = rmg.Embedding(target_vocab_size, d_model)
        # Positional encodings are built lazily in connect(); the tiled copy
        # depends on the batch size, which is remembered alongside it.
        self.pos_enc, self.pos_enc_tiled = None, None
        self.pos_enc_batch_size = None
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, dropout_rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, dropout_rate)
        self.linear = rmg.Dense(target_vocab_size)
        self.softmax = rmg.Softmax()

    def connect(self,
                inputs, #(B*S, 1)
                targets, #(B*S, 1)
                batch_size=1,
                inference = False):
        """Build the forward graph and return the output projection.

        Args:
            inputs: Source token ids, (B*S, 1).
            targets: Decoder-input token ids, (B*S, 1).
            batch_size: Number of sequences flattened into the first axis.
            inference: Forwarded to every dropout layer's ``set_inference``.

        Returns:
            Output of the final Dense layer, (B*S, target_vocab_size). The
            softmax layer created in ``prepare`` is not applied here.
        """
        print("TRANSFORMER FORWARD: ")
        self.batch_size = batch_size
        input_padding_mask, look_ahead_mask = self.create_masks(inputs, targets)

        inputs  = self.input_embedding(inputs)   * self.sqrt_d_model #(B*S, D_MODEL)
        targets = self.target_embedding(targets) * self.sqrt_d_model #(B*S, D_MODEL)

        # Rebuild the encodings when the batch size changes as well as on the
        # first call: the tiled copy is sized for one specific batch size, and
        # a stale copy would no longer match the embedding shape.
        if self.pos_enc is None or self.pos_enc_batch_size != batch_size:
            self.pos_enc, self.pos_enc_tiled = self.get_position_encoding_matrix(
                self.pos_enc_size, self.d_model)
            self.pos_enc_batch_size = batch_size
        inputs  = self.add_position_encoding(inputs)
        targets = self.add_position_encoding(targets)

        encoder_output = self.encoder(inputs, input_padding_mask, inference)  # (B*S, D_MODEL)
        decoder_output = self.decoder(targets, encoder_output, input_padding_mask,
                                      look_ahead_mask, inference)  # (B*S, D_MODEL)

        output = self.linear(decoder_output)  # (B*S, target_vocab_size)
        print("TRANSFORMER FINISHED!\n")
        return output

    def add_position_encoding(self, x):
        """Add the positional encoding whose shape matches ``x``.

        Raises:
            ValueError: If ``x`` matches neither the tiled nor the single-
                sequence encoding. (An explicit raise, unlike ``assert False``,
                is not stripped when Python runs with ``-O``.)
        """
        if x.shape == self.pos_enc_tiled.shape:
            return x + self.pos_enc_tiled
        if x.shape == self.pos_enc.shape:
            return x + self.pos_enc
        raise ValueError(
            "input shape %s matches neither the tiled positional encoding %s "
            "nor the single-sequence encoding %s"
            % (x.shape, self.pos_enc_tiled.shape, self.pos_enc.shape))

    def get_position_encoding_matrix(self, num_position, d_model, min_rate = 1/10000):
        """Build sinusoidal positional encodings.

        ``d_model // 2`` angle rates are spaced geometrically from 1 down to
        ``min_rate``; for every position the sine and cosine of each angle are
        interleaved along the feature axis.

        Returns:
            A pair of DynamicVariables: the table for one sequence,
            (num_position, 2 * (d_model // 2)), and the same table tiled
            ``self.batch_size`` times along the first axis.
        """
        angle_rates = min_rate**(np.linspace(0, 1, d_model//2))
        positions = np.arange(num_position)
        angle_rads = (positions[:, np.newaxis]) * angle_rates[np.newaxis, :]
        sines, cosines = np.sin(angle_rads), np.cos(angle_rads)
        # Stacking on a new last axis and flattening it interleaves sin/cos.
        pos_encoding = np.stack([sines, cosines], axis=2).reshape(sines.shape[0], -1)
        pos_encoding_tiled = np.tile(pos_encoding, (self.batch_size, 1))

        pos_encoding = rmg.DynamicVariable(pos_encoding)
        pos_encoding_tiled = rmg.DynamicVariable(pos_encoding_tiled)
        return pos_encoding, pos_encoding_tiled

    def create_masks(self, inputs, targets):
        """Build the additive attention masks for the current inputs.

        Both masks have shape (B*S, S) and hold 0 where attention is allowed
        and -1e9 where it is blocked. Padding is detected as token id 0.

        NOTE(review): the masks are computed from ``inputs.numpy`` /
        ``targets.numpy`` at the time ``connect`` runs and wrapped as
        DynamicVariables; whether they follow later batches supplied by an
        executor is not visible from this code - verify.

        Returns:
            (input_padding_mask, look_ahead_mask); the look-ahead mask also
            blocks target padding positions.
        """
        def create_padding_mask(seq):
            # Vectorised comparison: 1.0 where the token id is 0, else 0.0.
            return (np.asarray(seq.numpy) == 0).astype(np.float32)

        def create_look_ahead_mask(size):
            # Strict upper triangle: position i may not attend to j > i.
            ones = np.ones((size, size), dtype=np.float32)
            return np.triu(ones, k=1)

        def expand_per_query(padding_mask):
            # (B*S, 1) -> (B, S), then repeat each sequence's row S times so
            # that every query position of a sequence sees that sequence's
            # key-padding pattern: (B*S, S).
            per_sequence = padding_mask.reshape(self.batch_size, MAX_SEQ_LEN)
            return np.repeat(per_sequence, MAX_SEQ_LEN, axis=0)

        # The sequence length is derived from targets and also applied to
        # inputs, so both must have the same flattened length.
        MAX_SEQ_LEN = targets.shape[0].value//self.batch_size

        input_padding_mask = expand_per_query(create_padding_mask(inputs))
        target_padding_mask = expand_per_query(create_padding_mask(targets))

        look_ahead_mask = create_look_ahead_mask(MAX_SEQ_LEN)
        look_ahead_mask = np.tile(look_ahead_mask, (self.batch_size, 1))
        assert look_ahead_mask.shape == target_padding_mask.shape
        look_ahead_mask = np.maximum(look_ahead_mask, target_padding_mask)
        # Turn the 0/1 indicators into additive logits offsets.
        input_padding_mask *= -1e9
        look_ahead_mask *= -1e9
        input_padding_mask = rmg.DynamicVariable(input_padding_mask)
        look_ahead_mask = rmg.DynamicVariable(look_ahead_mask)

        return input_padding_mask, look_ahead_mask

In [8]:
# Load each pickled tokenizer only while its placeholder is still falsy, so
# re-running this cell does not re-read the files.
# NOTE: pickle.load can execute arbitrary code; only open trusted pickle files.
if not tokenizer_en:
    with open('pickle/tokenizer_en.pickle', 'rb') as tokenizer_en_file:
        tokenizer_en = pickle.load(tokenizer_en_file)
if not tokenizer_pt:
    with open('pickle/tokenizer_pt.pickle', 'rb') as tokenizer_pt_file:
        tokenizer_pt = pickle.load(tokenizer_pt_file)

In [9]:
class TransformerModel:
    """Training and persistence wrapper around the notebook's Transformer graph."""

    def __init__(self):
        """Create the Transformer, the loss and the optimizer with fixed hyper-parameters."""
        self.MAX_SEQ_LEN = 40
        self.BATCH_SIZE = 1

        # Each vocabulary is extended by two ids beyond the tokenizer's own size.
        hyper_params = dict(
            num_layers=2,
            d_model=128,
            num_heads=4,
            dff=256,
            input_vocab_size=tokenizer_pt.vocab_size + 2,
            target_vocab_size=tokenizer_en.vocab_size + 2,
            pos_enc_size=self.MAX_SEQ_LEN,
            dropout_rate=0.1,
        )
        self.model = Transformer(**hyper_params)
        self.loss = rmg.SoftmaxCrossEntropy()
        self.opt = rmg.Adam(beta1=0.9, beta2=0.98, epsilon=1e-8)

    def fit(self, x, y_inp, y_tar):
        """Build the training graph and its executor (no training happens here).

        Args:
            x: Source token rows.
            y_inp: Decoder-input token rows.
            y_tar: Target token ids; one-hot encoded with ``to_categorical``.
        """
        one_hot_targets = to_categorical(y_tar)
        # One generator batch holds BATCH_SIZE sequences of MAX_SEQ_LEN rows.
        rows_per_batch = self.BATCH_SIZE * self.MAX_SEQ_LEN
        generator = rmg.SimpleGenerator(x, y_inp, one_hot_targets).shuffle().batch(rows_per_batch)
        x_in, y_in, target = generator.get_output_graphs(num_gpus=1)
        prediction = self.model(inputs=x_in, targets=y_in, batch_size=self.BATCH_SIZE)
        self.graph = self.loss(prediction, target)
        self.graph.backward()
        self.exe = self.graph.get_executor(mode='training', optimizer=self.opt)

    def execute(self, epochs = 1):
        """Run the executor created by ``fit`` for ``epochs`` epochs."""
        self.exe.execute(epochs=epochs)

    def get_model(self):
        """Return the wrapped Transformer instance."""
        return self.model

    def save(self, name=None):
        """Save the model under ``saved_transformer/transformer_2_<name>.hdf5``.

        When ``name`` is None, the current date and time is used, with every
        separator replaced by an underscore.
        """
        if name is None:
            stamp = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
            for separator in ("/", ":", " "):
                stamp = stamp.replace(separator, "_")
            name = stamp
        target_path = "saved_transformer/transformer_2_" + name + ".hdf5"
        return self.model.save(target_path)

    def load(self, name):
        """Load model state from ``saved_transformer/<name>``."""
        source_path = "saved_transformer/" + name
        return self.model.load(source_path)

In [10]:
# Load the tokenized corpus only while the placeholder is still falsy.
# NOTE: pickle.load can execute arbitrary code; only open trusted pickle files.
if not tokenized_data:
    with open('pickle2/tokenized_data.pickle', 'rb') as tokenized_data_file:
        tokenized_data = pickle.load(tokenized_data_file)

# Select the split by name; the three arrays share the same split suffix.
data = "train"
train_X = tokenized_data["pt." + data]
train_y_inp = tokenized_data["en." + data + ".inp"]
train_y_tar = tokenized_data["en." + data + ".tar"]

# Fresh model wrapper, TensorBoard writer and per-epoch loss history.
transformer = TransformerModel()
writer = SummaryWriter(logdir='tensorboardX_data/transformer')
loss_list = []

In [11]:
# np.set_printoptions(threshold=100000)
# transformer.fit(train_X, train_y_inp, train_y_tar, epochs=1)

In [None]:
# Build the training graph once, then run it one epoch at a time so the model
# can be checkpointed and the loss logged after every epoch.
num_epochs = 30
transformer.fit(train_X, train_y_inp, train_y_tar)

for epoch in range(num_epochs):
    print("EPOCH : ", epoch)
    # Requires `import gc` in the notebook's import cell; without it this line
    # raises NameError on a fresh kernel.
    gc.collect()
    transformer.execute()
    # save() without a name writes a new timestamped file every epoch.
    transformer.save()
    loss_val = transformer.graph.output.as_ndarray()[0]
    print("LOSS: ", loss_val)
    loss_list.append(loss_val)
    writer.add_scalar('Transformer loss w/ more layers', loss_val, epoch)

In [None]:
# transformer.fit(train_X, train_y_inp, train_y_tar, epochs)

In [None]:
# transformer2 = TransformerModel()
# transformer2.load()

In [None]:
# !tensorboard --logdir tensorboardX_data/transformer2 --port=6006

In [None]:
# Start / end markers use the two ids just past each tokenizer's vocabulary:
# vocab_size for start and vocab_size + 1 for end. Each is kept as a
# one-element list so it can be used directly as one row of a (S, 1) id column.
start_token_en, end_token_en = [tokenizer_en.vocab_size], [tokenizer_en.vocab_size + 1]
start_token_pt, end_token_pt = [tokenizer_pt.vocab_size], [tokenizer_pt.vocab_size + 1]

# Fixed sequence length that translate() pads every sequence to.
MAX_SEQ_LEN = 40

def translate(model, sentence):
    """Greedily decode a translation of ``sentence`` with ``model``.

    The sentence is encoded with ``tokenizer_pt``, wrapped in the start / end
    markers and zero-padded to ``MAX_SEQ_LEN`` rows. The decoder input starts
    with the target start marker; at every step the whole model is run and the
    arg-max id at the last filled position is appended, until the end marker
    is predicted or ``MAX_SEQ_LEN`` steps have been taken.

    Args:
        model: Callable taking ``inputs``, ``targets`` and ``inference``
            keyword arguments and returning per-position scores.
        sentence: Source text to translate.

    Returns:
        The text decoded by ``tokenizer_en`` from the generated ids, with ids
        outside the tokenizer's vocabulary (the markers) removed.

    Raises:
        ValueError: If the encoded sentence plus markers exceeds
            ``MAX_SEQ_LEN`` rows and therefore cannot be padded to the fixed
            length.
    """
    encoder_input = [start_token_pt] + [[i] for i in tokenizer_pt.encode(sentence)] + [end_token_pt]
    if len(encoder_input) > MAX_SEQ_LEN:
        raise ValueError(
            "encoded sentence has %d tokens including markers; the maximum is %d"
            % (len(encoder_input), MAX_SEQ_LEN))
    encoder_input = encoder_input + [[0]] * (MAX_SEQ_LEN - len(encoder_input))
    encoder_input = rmg.DynamicVariable(np.asarray(encoder_input))
    decoder_input = [start_token_en]
    # end_token_en is a one-element list; compare against the id it holds.
    end_id = end_token_en[0]

    for _ in range(MAX_SEQ_LEN):
        # Zero-pad the ids generated so far to the fixed sequence length.
        padded_targets = decoder_input + [[0]] * (MAX_SEQ_LEN - len(decoder_input))
        padded_targets = rmg.DynamicVariable(np.asarray(padded_targets))
        prediction = model(inputs=encoder_input, targets=padded_targets, inference=True)
        # The first row of this slice is the last filled decoder position.
        last_word = prediction[len(decoder_input)-1:]
        # Convert to a plain int so the end-marker test is a scalar comparison
        # rather than a scalar-vs-list comparison that relies on broadcasting.
        predicted_id = int(rmg.argmax(last_word, axis=1).numpy[0])
        if predicted_id == end_id:
            break
        decoder_input.append([predicted_id])

    print(decoder_input)
    token_list = [int(token[0]) for token in decoder_input if token[0] < tokenizer_en.vocab_size]
    translated_sentence = tokenizer_en.decode(token_list)
    return translated_sentence

In [None]:
# Translate one sample sentence with the model held by the training wrapper;
# the cell's output is the string returned by translate().
seq = "este é um problema que temos que resolver."
translate(transformer.get_model(), seq)