In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from tensorflow.keras.utils import to_categorical
import pickle
import time

from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/My Drive/Colab Notebooks/"

with open(path + 'tokenizer_en.pickle', 'rb') as f:
    tokenizer_en = pickle.load(f)
with open(path + 'tokenizer_pt.pickle', 'rb') as f:
    tokenizer_pt = pickle.load(f)
with open(path + 'tokenized_data.pickle', 'rb') as f:
    tokenized_data = pickle.load(f)

data = "train"
train_X = tokenized_data["pt." + data]
train_y_inp = tokenized_data["en." + data + ".inp"]
train_y_tar = tokenized_data["en." + data + ".tar"]

train_X = train_X.reshape(train_X.shape[0]//40, 40)
train_y_inp = train_y_inp.reshape(train_y_inp.shape[0]//40, 40)
train_y_tar = train_y_tar.reshape(train_y_tar.shape[0]//40, 40)

train_X = torch.from_numpy(train_X)
train_y_inp = torch.from_numpy(train_y_inp)
train_y_tar = torch.from_numpy(train_y_tar)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [0]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        depth = d_model // num_heads
        self.dense_q, self.dense_k, self.dense_v = list(), list(), list()
        
        for layer in [self.dense_q, self.dense_k, self.dense_v]:
            for head_id in range(num_heads):
                layer.append(nn.Linear(d_model, depth).to(device))
        self.dense_final = nn.Linear(d_model, d_model).to(device)
    
    def scaled_dot_product_attention(self, q, k, v, mask):
        matmul_qk = torch.matmul(q, k.permute(0,2,1)) #(B*SQ, SK)
        sqrt_dk = torch.Tensor([np.sqrt(40)]).to(device) #Should be sequence length
        scaled_attention_logits = matmul_qk / sqrt_dk
        # print(scaled_attention_logits.shape, padding_mask.shape)
        scaled_attention_logits += mask 
        scaled_attention = nn.Softmax(2)(scaled_attention_logits)
        output =  torch.matmul(scaled_attention, v) #(B*SQ, DEPTH)
        return output
    
    def forward(self, q, k, v, mask):
        # assert k.shape[0].value == v.shape[0].value
        head_summaries = list()
        for head_id in range(self.num_heads):
            out_q = self.dense_q[head_id](q) # (B*SQ, DEPTH)
            out_k = self.dense_k[head_id](k) # (B*SK, DEPTH)
            out_v = self.dense_v[head_id](v) # (B*SK, DEPTH)     
            head_attention_weights = self.scaled_dot_product_attention(out_q, out_k, out_v, mask)
            head_summaries.append(head_attention_weights) 
        
        concat_attention = torch.cat(head_summaries, axis=2).to(device) # (B*SQ, D_MODEL)
        output = self.dense_final(concat_attention) # (B*SQ, D_MODEL)
        return output

# CELL

In [0]:
class MHA_Cell(nn.Module):
    def __init__(self, d_model, num_heads, dropout_rate):
        super(MHA_Cell, self).__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.layernorm = nn.LayerNorm(d_model).to(device)
        
    def forward(self, q, k, v, mask):
        output = self.mha(q, k, v, mask)  # (B*SQ, D_MODEL)
        output = self.dropout(output)
        output = self.layernorm(q + output)
        return output

    
class FFN_Cell(nn.Module):
    def __init__(self, d_model, dff, dropout_rate):
        super(FFN_Cell, self).__init__()
        self.sequential = nn.Sequential(
            nn.Linear(d_model, dff).to(device), # (B*SQ, DFF)
            nn.ReLU().to(device),
            nn.Linear(dff, d_model).to(device)  # (B*SQ, D_MODEL)
        ).to(device)
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.layernorm = nn.LayerNorm(d_model).to(device)
        
    def forward(self, x):
        output = self.sequential(x)
        output = self.dropout(output)
        output = self.layernorm(x + output)
        return output

# LAYER

In [0]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.mha_cell = MHA_Cell(d_model, num_heads, dropout_rate)
        self.ffn_cell = FFN_Cell(d_model, dff, dropout_rate)
    
    def forward(self, x, mask):
        # print("    ENCODER MHA")
        output = self.mha_cell(x, x, x, mask)
        # print("    ENCODER FFN")
        output = self.ffn_cell(output)
        return output

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.mha_cell_1 = MHA_Cell(d_model, num_heads, dropout_rate)
        self.mha_cell_2 = MHA_Cell(d_model, num_heads, dropout_rate)
        self.ffn_cell = FFN_Cell(d_model, dff, dropout_rate)
        
    def forward(self, x, encoder_output, input_padding_mask, look_ahead_mask):
        # print("    DECODER MHA 1")
        output = self.mha_cell_1(x, x, x, look_ahead_mask)
        # print("    DECODER MHA 2")
        output = self.mha_cell_2(output, encoder_output, encoder_output, input_padding_mask)      
        # print("    DECODER FFN")
        output = self.ffn_cell(output)   
        return output

In [0]:
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.encoder_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        
    def forward(self, x, mask):
        # print("  ENCODER: ")
        x = self.dropout(x)
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, mask)
        # print("  ENCODER FINISHED!")
        return x  # (B*S, D_MODEL)
    
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(dropout_rate).to(device)
        self.decoder_layers = [DecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)]
        
    def forward(self, x, encoder_output, input_padding_mask, look_ahead_mask):
        # print("  DECODER: ")
        x = self.dropout(x)
        for decoder_layer in self.decoder_layers:
            x = decoder_layer(x, encoder_output, input_padding_mask, look_ahead_mask)
        # print("  DECODER FINISHED!")
        return x # (B*S, D_MODEL)

# TRANSFORMER

In [0]:
class Transformer(nn.Module):
    def __init__(self, 
                num_layers, d_model, num_heads, dff, 
                input_vocab_size, target_vocab_size, 
                pos_enc_size, dropout_rate=0.1):
        super(Transformer, self).__init__()
        self.d_model = d_model
        self.input_embedding = nn.Embedding(input_vocab_size, d_model)
        self.target_embedding = nn.Embedding(target_vocab_size, d_model)
        
        self.pos_enc = self.get_position_encoding_matrix(pos_enc_size, d_model) #(B, D_MODEL)
        self.pos_enc_tiled = self.pos_enc 
        
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, dropout_rate)  
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, dropout_rate)
        
        self.linear = nn.Linear(d_model, target_vocab_size).to(device)
        self.softmax = nn.Softmax(2).to(device)
    
    def forward(self, 
                inputs, #(B*S, 1)
                targets, #(B*S, 1)
                batch_size=1):
        # print("TRANSFORMER FORWARD: ")
        input_padding_mask, look_ahead_mask = self.create_masks(inputs, targets)
        sqrt_d_model = torch.Tensor([np.sqrt(self.d_model)]).to(device)
        inputs  = self.input_embedding(inputs) *  sqrt_d_model #(B*S, D_MODEL)
        targets = self.target_embedding(targets) * sqrt_d_model #(B*S, D_MODEL)
        inputs  += self.pos_enc
        targets += self.pos_enc
        encoder_output = self.encoder(inputs, input_padding_mask)  # (B*S, D_MODEL)
        decoder_output = self.decoder(targets, encoder_output, input_padding_mask, look_ahead_mask) 
        # (B*S, D_MODEL)

        output = self.linear(decoder_output)  # (B*S, target_vocab_size)
        # output = self.softmax(output) # (B*S, target_vocab_size)
        # print("TRANSFORMER FINISHED!\n")
        return output
    
    def get_position_encoding_matrix(self, num_position, d_model, min_rate = 1/10000):
        angle_rates = min_rate**(np.linspace(0, 1, d_model//2))
        positions = np.arange(num_position) 
        angle_rads = (positions[:, np.newaxis]) * angle_rates[np.newaxis, :]
        sines, cosines = np.sin(angle_rads), np.cos(angle_rads)
        pos_encoding = np.stack([sines, cosines], axis=2).reshape(sines.shape[0], -1)
        pos_encoding = np.squeeze(pos_encoding)
        pos_encoding = torch.from_numpy(pos_encoding).to(device)
        return pos_encoding

    def create_masks(self, inputs, targets):
        def create_padding_mask(seq):
            mapping_func = np.vectorize(lambda x: x == 0, otypes=[np.float32])
            mask = mapping_func(np.copy(seq.cpu().numpy()))
            mask = mask[:, np.newaxis, :]
            return mask

        def create_look_ahead_mask(size):
            ones = np.ones((size, size), dtype=np.float32)
            look_ahead_mask = np.triu(ones, k=1)
            return look_ahead_mask[np.newaxis, :, :]
        
        input_padding_mask = create_padding_mask(inputs)
        target_padding_mask = create_padding_mask(targets)
        look_ahead_mask = create_look_ahead_mask(targets.shape[1])
        look_ahead_mask = np.maximum(look_ahead_mask, target_padding_mask)
    
        input_padding_mask *= -1e9
        look_ahead_mask *= -1e9
        
        input_padding_mask = torch.from_numpy(input_padding_mask).to(device)
        look_ahead_mask = torch.from_numpy(look_ahead_mask).to(device)

        return input_padding_mask, look_ahead_mask         

In [0]:
class TransformerModel():
    def __init__(self):
        self.MAX_SEQ_LEN = 40
        self.BATCH_SIZE = 64
        self.model = Transformer(num_layers = 4,
                                 d_model = 128,
                                 num_heads = 8,
                                 dff = 256, 
                                 input_vocab_size = tokenizer_pt.vocab_size + 2,
                                 target_vocab_size = tokenizer_en.vocab_size + 2,
                                 pos_enc_size = self.MAX_SEQ_LEN,
                                 dropout_rate = 0.1)
        self.model = self.model.to(device)
        self.loss = nn.CrossEntropyLoss().to(device)
        self.opt = torch.optim.SGD(self.model.parameters(), lr=10)
        # self.opt = optim.Adam(self.model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-06)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.opt, 1.0, gamma=0.95)
        
    def fit(self, x, y_inp, y_tar, epochs = 1):
        num_batch = int(x.shape[0]//self.BATCH_SIZE)
        self.model.train()
        for epoch in range(epochs):
          print("EPOCH: ", epoch)
          accu, count = 0, 0
          for i in range(num_batch):
            begin = i*self.BATCH_SIZE
            end = begin + self.BATCH_SIZE
            x_i = x[begin:end].to(device)
            y_i = y_inp[begin:end].to(device)
            t_i = y_tar[begin:end].to(device)

            self.opt.zero_grad()
            prediction = self.model(x_i, y_i)
            prediction = prediction.view(prediction.shape[0]*prediction.shape[1], prediction.shape[2])
            t_i = t_i.flatten()
      
            l = self.loss(prediction, t_i)
            l.backward()
            accu += l.item()
            count +=1
            if i%100==0:
              print("    Loss: ", accu/count)
              accu, count = 0, 0
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1)
            self.opt.step()
        
    def get_model(self):
        return self.model

In [0]:
count = 14

In [0]:
# transformer = TransformerModel()
for i in range(30):
  epochs = 20
  transformer.fit(train_X, train_y_inp, train_y_tar, epochs)
  model_save_name = F"transformer_{count}.pt"
  path = F"/content/drive/My Drive/Save_model/{model_save_name}" 
  with open(path, 'wb') as f:
      pickle.dump(transformer.model, f, pickle.HIGHEST_PROTOCOL)
  count += 1

In [0]:
model_save_name = 'transformer_0.pt'
path = F"/content/drive/My Drive/Save_model/{model_save_name}" 
torch.save(transformer.model.state_dict(), path)

In [0]:
start_token_en = [tokenizer_en.vocab_size]
end_token_en = [tokenizer_en.vocab_size + 1]
start_token_pt = [tokenizer_pt.vocab_size]
end_token_pt = [tokenizer_pt.vocab_size + 1]
pred = list()
MAX_SEQ_LEN = 40

def translate(model, sentence):
    encoder_input = [start_token_pt] + [[i] for i in tokenizer_pt.encode(sentence)] + [end_token_pt]
    encoder_input = encoder_input + [[0]] * (MAX_SEQ_LEN - len(encoder_input))
    encoder_input = torch.from_numpy(np.asarray(encoder_input))
    encoder_input = torch.squeeze(encoder_input)
    encoder_input = torch.unsqueeze(encoder_input, 0).to(device)
    decoder_input = [start_token_en]
    model.eval()
    for i in range(MAX_SEQ_LEN):
      with torch.no_grad():
          output = decoder_input + [[0]] * (MAX_SEQ_LEN - len(decoder_input))
          output = torch.squeeze(torch.from_numpy(np.asarray(output))).to(device)
          output = torch.unsqueeze(output, 0)
          # print("TRANSLATE INPUT SHAPE ", encoder_input.shape, output.shape)
          prediction = model(inputs=encoder_input, targets=output)
          prediction = nn.Softmax(2)(prediction)
          prediction = prediction.view(prediction.shape[0]*prediction.shape[1], prediction.shape[2])
          # print("PREDICTION: ", prediction.shape)
          last_word = prediction[len(decoder_input)-1,:].cpu().numpy()
          pred.append(prediction)
          predicted_id = np.argmax(last_word)
          if predicted_id == end_token_en:
              break
          decoder_input.append([predicted_id])

    print(decoder_input)
    token_list = [int(token[0]) for token in decoder_input if token[0] < tokenizer_en.vocab_size]
    translated_sentence = tokenizer_en.decode(token_list)
    return translated_sentence    

In [0]:
seq = "este é um problema que temos que resolver." 
translate(transformer.get_model(), seq)
#this is a problem we have to solve

[[8087], [18], [14], [24], [7], [328], [6], [385], [1], [14], [24], [5], [966], [385], [2]]


'so we have a problem of problem , we have to solve problem .'

In [0]:
seq = "os meus vizinhos ouviram sobre esta ideia." 
translate(transformer.get_model(), seq)
#and my neighboring homes heard about this idea .

[[8087], [12], [198], [126], [147], [10], [12], [98], [7941], [7870], [26], [101], [3], [1137], [10], [12], [1329], [5093], [28], [32], [626], [2]]


"i started two things that i did n't know the word that i totally obsessed with my idea ."

In [0]:
seq = "este é o primeiro livro que eu fiz." 
translate(transformer.get_model(), seq)
#this is the first book i've ever done.

[[8087], [12], [1228], [16], [774], [13], [124], [774], [13], [3], [124], [6], [16], [774], [1401], [60], [3], [124], [774], [13], [657], [2]]


'i wrote this book is first book is the first of this book written by the first book is wrong .'