### Setting up a basic training loop - not using attention

In [1]:
# Switch into the BERT directory unless the current working directory path
# already contains 'BERT' (a plain substring test on the full path).
import os

if not ('BERT' in os.getcwd()):
    os.chdir('BERT')
print("Current working dir is {}".format(os.getcwd()))

Current working dir is /juice/scr/scr110/scr/nlp/mtl_bert/unidirectional-NMT/BERT


In [2]:
import pyaml
import onmt
import torch
from dataset import TextDataset
from encoder import Encoder 
#from decoder import AttnDecoderRNN
from lib.huggingface.transformers import RobertaTokenizer, CamembertTokenizer
from torch.utils.data import DataLoader

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [3]:
%load_ext autoreload
%autoreload 2

In [38]:
# Read the experiment configuration from config/config.yml, which lives in the
# parent of the current working directory.
# NOTE(review): pyaml.yaml.Loader is the full YAML loader and can construct
# arbitrary Python objects -- only point it at configuration files you trust.
config_path = os.path.join(os.path.dirname(os.getcwd()), "config", "config.yml")
with open(config_path, "r") as fd:
    config = pyaml.yaml.load(fd, Loader=pyaml.yaml.Loader)

In [5]:
# Build the dataset from the files under data/data-30k-default/.
# is_train=True presumably selects the training split (the recorded output
# lists data.train.*.pt files) -- TODO confirm against TextDataset.
text_dataset = TextDataset("data/data-30k-default/", is_train=True)

Loading data from file: data.train.2.pt
Loading data from file: data.train.1.pt
Loading data from file: data.train.0.pt
removed 11332 examples - not long enough


In [6]:
# Pretrained tokenizers: RoBERTa ('roberta-base') for the English side and
# CamemBERT ('camembert-base') for the French side.
tokenizer_en = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_fr = CamembertTokenizer.from_pretrained('camembert-base')

In [58]:
def collate(data): 
    '''Collating function to be passed into the dataloader.

    Encodes every sentence once with the module-level tokenizers
    (`tokenizer_en` for inputs, `tokenizer_fr` for outputs) and right-pads all
    rows, on both sides, to the longest encoded sequence in the batch.

    Args:
        data: iterable of (input_sentence, output_sentence) pairs.

    Returns:
        ((input_idx_tensor, input_lengths), (output_idx_tensor, output_lengths))
        where the index tensors are `torch.long` of shape
        (batch_size, max_length) and the length tensors hold the number of
        encoded ids (special tokens included) of each row before padding.

    Raises:
        ValueError: if `data` is empty.
    '''
    input_sentences, output_sentences = zip(*data)

    # Encode first and measure afterwards: the sequence length is whatever the
    # tokenizer actually produced, not an assumed `len(sentence) + 2`.
    input_ids = [list(tokenizer_en.encode(sentence)) for sentence in input_sentences]
    output_ids = [list(tokenizer_fr.encode(sentence)) for sentence in output_sentences]

    input_lengths = [len(ids) for ids in input_ids]
    output_lengths = [len(ids) for ids in output_ids]

    # Both sides are padded to one shared width, as before.
    max_length = max(max(input_lengths), max(output_lengths))

    def _pad_batch(batch_ids, tokenizer):
        # Right-pad every row to max_length with the tokenizer's padding id.
        pad_idx = getattr(tokenizer, "pad_token_id", None)
        if pad_idx is None:
            pad_idx = 1  # previous hard-coded padding index
        padded = [ids + [pad_idx] * (max_length - len(ids)) for ids in batch_ids]
        return torch.tensor(padded, dtype=torch.long)

    input_idx_tensor = _pad_batch(input_ids, tokenizer_en)
    output_idx_tensor = _pad_batch(output_ids, tokenizer_fr)

    return ((input_idx_tensor, torch.tensor(input_lengths)), (output_idx_tensor, torch.tensor(output_lengths)))

In [59]:
# Batch the dataset with the custom collate function; every other DataLoader
# keyword argument is taken from the "data_loader" section of the config.
dataloader = DataLoader(text_dataset, **config["data_loader"], collate_fn=collate)

#### Specifying the encoding and decoding models

In [9]:
# Drop encoders left over from a previous run of this cell before building new
# ones (presumably so the old models can be freed first). Each name is handled
# on its own, and only the "not defined yet" case is ignored -- any other error
# (including an interrupt) still propagates.
try:
    del encoder_en
except NameError:
    pass
try:
    del encoder_fr
except NameError:
    pass
encoder_en = Encoder("english")
encoder_fr = Encoder("french")

In [10]:
def _embedding_dims(encoder):
    """Return (padding_idx, vocab_size, embedding_dim) read from an encoder's embedding layer."""
    embedding_layer = encoder._modules['model'].embeddings
    lookup_table = embedding_layer.word_embeddings
    return embedding_layer.padding_idx, lookup_table.num_embeddings, lookup_table.embedding_dim


word_padding_idx_en, word_vocab_size_en, word_vec_size_en = _embedding_dims(encoder_en)
word_padding_idx_fr, word_vocab_size_fr, word_vec_size_fr = _embedding_dims(encoder_fr)

In [11]:
def _positional_embeddings(vec_size, vocab_size, padding_idx):
    """Build an onmt Embeddings module with position encoding switched on."""
    return onmt.modules.embeddings.Embeddings(
        vec_size, vocab_size, padding_idx, position_encoding=True
    )


# One embedding module per language, sized from the matching encoder's values.
embeddings_en = _positional_embeddings(word_vec_size_en, word_vocab_size_en, word_padding_idx_en)
embeddings_fr = _positional_embeddings(word_vec_size_fr, word_vocab_size_fr, word_padding_idx_fr)

In [39]:
# Both decoders take their keyword arguments from the "small_transformer"
# section of the config and differ only in the embeddings they are given.
small_transformer_kwargs = config["small_transformer"]
decoder_en = onmt.decoders.TransformerDecoder(embeddings=embeddings_en, **small_transformer_kwargs)
decoder_fr = onmt.decoders.TransformerDecoder(embeddings=embeddings_fr, **small_transformer_kwargs)

Beginning the training loop

In [13]:
from torch import optim

In [None]:
def loss_fn(
    encoder_output_english,
    encoder_output_french,
    decoder_output_french,
    decoder_output_english,
):
    """Adversarial loss (placeholder).

    Not implemented yet: all four arguments are ignored and None is returned.
    """
    return None

In [67]:
def sequence_mask(lengths, max_len=None):
    """
    Build a boolean mask from a tensor of sequence lengths.

    Row i is True at positions 0 .. lengths[i]-1 and False afterwards. The
    mask has `max_len` columns; when `max_len` is missing (or falsy) the
    largest value in `lengths` is used instead.
    """
    if not max_len:
        max_len = lengths.max()
    positions = torch.arange(0, max_len, device=lengths.device).type_as(lengths)
    # One copy of the position row per sequence, compared against that
    # sequence's own length.
    position_grid = positions.repeat(lengths.numel(), 1)
    return position_grid < lengths.unsqueeze(1)

In [87]:
def train(encoder_model, decoder_model, data_iter): 
    """Work-in-progress training loop for an encoder/decoder pair.

    Currently a smoke test: it runs the encoder and decoder forward on the
    first batch, prints the shape of the decoder output and returns. The loss
    and optimizer step are not wired in yet.

    Args:
        encoder_model: module called as `encoder_model(token_ids)` and expected
            to return `(encoder_outputs, language_prediction_logits)`.
        decoder_model: decoder exposing `init_state(src, memory_bank,
            enc_hidden)` and callable as
            `decoder_model(tgt, memory_bank, memory_lengths=...)`.
        data_iter: iterable of
            `((source_ids, source_lengths), (target_ids, target_lengths))`
            batches, i.e. the structure returned by `collate`.

    Returns:
        None.
    """
    # Optimise the decoder together with the encoder; previously only the
    # encoder's parameters were registered, so the decoder would never have
    # been updated. Parameters shared by both models are registered once.
    unique_params = {}
    for param in list(encoder_model.parameters()) + list(decoder_model.parameters()):
        unique_params.setdefault(id(param), param)
    optimizer = optim.SGD(list(unique_params.values()), lr=0.01, momentum=0.9)
    
    for batch in data_iter:
        optimizer.zero_grad()
        
        (english_sentences, english_sentence_lengths), (french_sentences, french_sentence_lengths) = batch
    
        encoder_outputs, language_prediction_logits = encoder_model(english_sentences)
        
        # (batch, seq) ids -> (seq, batch, 1) for the decoder.
        # NOTE(review): memory bank and encoder state are passed as None here
        # -- confirm this is sufficient for the decoder in use.
        decoder_model.init_state(english_sentences.unsqueeze(2).transpose(0,1), None, None)
        
        # The first two axes of encoder_outputs[0] are swapped before it is
        # used as memory -- assumes it is (batch, seq, dim); TODO confirm.
        decoder_french = decoder_model(french_sentences.unsqueeze(2).transpose(0,1), encoder_outputs[0].transpose(0,1), memory_lengths=english_sentence_lengths)
        print(decoder_french[0].shape)
        
        
        # TODO: compute the loss and take an optimisation step.
        #loss = loss_fn(output, target)
        #loss.backward()
        #optimizer.step()

        # Debugging aid: stop after the first batch.
        return

In [82]:
# Display the French decoder's module structure.
decoder_fr

TransformerDecoder(
  (embeddings): Embeddings(
    (make_embedding): Sequential(
      (emb_luts): Elementwise(
        (0): Embedding(32005, 768, padding_idx=1)
      )
      (pe): PositionalEncoding(
        (dropout): Dropout(p=0, inplace=False)
      )
    )
  )
  (transformer_layers): ModuleList(
    (0): TransformerDecoderLayer(
      (self_attn): MultiHeadedAttention(
        (linear_keys): Linear(in_features=768, out_features=768, bias=True)
        (linear_values): Linear(in_features=768, out_features=768, bias=True)
        (linear_query): Linear(in_features=768, out_features=768, bias=True)
        (softmax): Softmax(dim=-1)
        (dropout): Dropout(p=0.1, inplace=False)
        (final_linear): Linear(in_features=768, out_features=768, bias=True)
      )
      (context_attn): MultiHeadedAttention(
        (linear_keys): Linear(in_features=768, out_features=768, bias=True)
        (linear_values): Linear(in_features=768, out_features=768, bias=True)
        (linear_query):

In [88]:
# Run the training loop with the English encoder and the French decoder.
train(encoder_en, decoder_fr, dataloader)

torch.Size([52, 64, 768])


AttributeError: 'tuple' object has no attribute '__dict__'

In [52]:
# Display the French encoder's word-embedding layer.
encoder_fr._modules['model'].embeddings.word_embeddings

Embedding(32005, 768, padding_idx=1)