### Setting up a basic training loop - not using attention

In [2]:
# Move into the BERT project directory unless the current path already mentions it.
# NOTE(review): this is a substring test on the whole path, so any ancestor
# directory whose name contains 'BERT' also counts as "already there".
import os

already_in_bert = 'BERT' in os.getcwd()
if not already_in_bert:
    os.chdir('BERT')
print("Current working dir is {}".format(os.getcwd()))

Current working dir is /juice/scr/scr110/scr/nlp/mtl_bert/unidirectional-NMT/BERT


In [3]:
import pyaml
import onmt
import torch
from dataset import TextDataset
from encoder import Encoder 
from lib.huggingface.transformers import RobertaTokenizer, CamembertTokenizer
from torch.utils.data import DataLoader

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
# Read the YAML configuration stored in <parent of cwd>/config/config.yml.
config_path = os.path.join(os.path.dirname(os.getcwd()), "config", "config.yml")
with open(config_path, "r") as config_file:
    # NOTE(review): the full yaml.Loader can construct arbitrary Python objects;
    # that is acceptable only because this config file is trusted project data.
    config = pyaml.yaml.load(config_file, Loader=pyaml.yaml.Loader)

In [6]:
# Build the training dataset (is_train=True) from the files under
# data/data-30k-default/ (path is relative to the current working directory).
text_dataset = TextDataset("data/data-30k-default/", is_train=True)

Loading data from file: data.train.2.pt
Loading data from file: data.train.1.pt
Loading data from file: data.train.0.pt
removed 11332 examples - not long enough


In [7]:
# Pretrained tokenizers: RoBERTa ('roberta-base') for the English side and
# CamemBERT ('camembert-base') for the French side. Both are used by `collate`.
tokenizer_en = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer_fr = CamembertTokenizer.from_pretrained('camembert-base')

In [8]:
def _encode_and_pad(tokenizer, sentences, default_pad_id=1):
    '''Encode every sentence with `tokenizer` and right-pad them into one LongTensor.

    The padding width is taken from the *encoded* sequences, so it stays correct
    even when the tokenizer emits a different number of ids than
    len(sentence) + 2 (e.g. sub-word splitting or no special tokens).

    Args:
        tokenizer: object exposing `encode(sentence) -> list[int]` and, optionally,
            a `pad_token_id` attribute.
        sentences: sequence of sentences in whatever form `tokenizer.encode` accepts.
        default_pad_id: pad id used when the tokenizer does not define one
            (1 is the value this notebook previously hard-coded).

    Returns:
        torch.LongTensor of shape (len(sentences), longest encoded length).
    '''
    encoded = [tokenizer.encode(sentence) for sentence in sentences]

    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = default_pad_id

    max_len = max(len(ids) for ids in encoded)
    # Start from an all-padding tensor and copy each encoded sequence into its prefix.
    batch = torch.full((len(encoded), max_len), pad_id, dtype=torch.long)
    for row, ids in enumerate(encoded):
        batch[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    return batch


def collate(data):
    '''Collating function to be passed into the dataloader.

    Args:
        data: list of (input_sentence, output_sentence) pairs drawn from the dataset.

    Returns:
        Tuple (input_idx_tensor, output_idx_tensor) of LongTensors. Inputs are
        encoded with `tokenizer_en`, outputs with `tokenizer_fr`; each tensor is
        right-padded with the tokenizer's pad id to the longest sequence in the batch.
    '''
    input_sentences, output_sentences = zip(*data)

    input_idx_tensor = _encode_and_pad(tokenizer_en, input_sentences)
    output_idx_tensor = _encode_and_pad(tokenizer_fr, output_sentences)

    return (input_idx_tensor, output_idx_tensor)

In [9]:
# Batch the dataset with the custom `collate` function; the remaining DataLoader
# keyword arguments come from the "data_loader" section of the config.
dataloader = DataLoader(text_dataset, **config["data_loader"], collate_fn=collate)

#### Specifying the encoding and decoding models

In [11]:
# Drop encoders left over from a previous run of this cell before building new
# ones (presumably so the old models can be garbage-collected first).
# Each name is removed independently and only a missing name is tolerated; the
# previous bare `except: pass` hid every error and skipped `encoder_fr` whenever
# `encoder_en` was undefined.
for _stale_name in ("encoder_en", "encoder_fr"):
    globals().pop(_stale_name, None)

encoder_en = Encoder("english")
encoder_fr = Encoder("french")

In [12]:
# Read padding index, vocabulary size and embedding width from the embedding
# layer of each encoder's wrapped model.
embedding_layer_en = encoder_en._modules['model'].embeddings
embedding_layer_fr = encoder_fr._modules['model'].embeddings

word_padding_idx_en = embedding_layer_en.padding_idx
word_vocab_size_en = embedding_layer_en.word_embeddings.num_embeddings
word_vec_size_en = embedding_layer_en.word_embeddings.embedding_dim

word_padding_idx_fr = embedding_layer_fr.padding_idx
word_vocab_size_fr = embedding_layer_fr.word_embeddings.num_embeddings
word_vec_size_fr = embedding_layer_fr.word_embeddings.embedding_dim

In [13]:
def _positional_embeddings(vec_size, vocab_size, padding_idx):
    '''Create an OpenNMT Embeddings module with position encoding switched on.'''
    return onmt.modules.embeddings.Embeddings(
        vec_size,
        vocab_size,
        padding_idx,
        position_encoding=True
    )


# One embedding table per language, sized from the matching encoder's metadata.
embeddings_en = _positional_embeddings(word_vec_size_en, word_vocab_size_en, word_padding_idx_en)
embeddings_fr = _positional_embeddings(word_vec_size_fr, word_vocab_size_fr, word_padding_idx_fr)

In [14]:
# Both decoders share the "small_transformer" hyper-parameters from the config
# and differ only in the embedding module they are given.
small_transformer_kwargs = config["small_transformer"]
decoder_en = onmt.decoders.TransformerDecoder(embeddings=embeddings_en, **small_transformer_kwargs)
decoder_fr = onmt.decoders.TransformerDecoder(embeddings=embeddings_fr, **small_transformer_kwargs)

#### Beginning the training loop

In [15]:
from torch import optim

In [None]:
def loss_fn(encoder_output_english, encoder_output_french, decoder_output_french, decoder_output_english):
    '''Adversarial loss (placeholder).

    Not implemented yet: all four arguments are accepted but ignored, and the
    function always returns None.
    '''
    return

In [1]:
def train(encoder_model, decoder_model, data_iter):
    '''Smoke-test the encoder on the first batch of `data_iter`.

    The training step itself is not implemented yet: the function runs one
    forward pass of `encoder_model` on the English half of the first batch,
    prints the shapes of the pooled output and of the language-prediction
    logits, and returns. No loss is computed and no parameters are updated.

    Args:
        encoder_model: module called as `encoder_model(batch)` and expected to
            return `((_, pooled_output), language_prediction_logits)`.
        decoder_model: currently unused.
        data_iter: iterable yielding `(english_sentences, french_sentences)` batches.

    Returns:
        None.
    '''
    optimizer = optim.SGD(encoder_model.parameters(), lr=0.01, momentum=0.9)

    for batch in data_iter:
        optimizer.zero_grad()
        english_sentences, french_sentences = batch

        # using the pooled output
        (_, encoder_output_english), language_prediction_logits = encoder_model(english_sentences)
        print(encoder_output_english.shape)
        print(language_prediction_logits.shape)

        # TODO: encode `french_sentences`, compute the loss with `loss_fn`, then
        # call loss.backward() and optimizer.step().

        # Stop after the first batch until the steps above exist. A plain return
        # replaces the former debugging `exit()`, which raised SystemExit (or
        # asked the notebook kernel to exit) instead of handing control back.
        return

In [17]:
# Run `train` with the English encoder and the batched dataset; no decoder is passed (None).
train(encoder_en, None, dataloader)

torch.Size([64, 768])
torch.Size([64, 1])
