In [1]:
# Smoke test for preprocessing.py (data_processing.preprocessing): read the
# parallel corpus, build both vocabularies and fetch the dataset splits.
from data_processing.preprocessing import TextDataProcessor
from data_processing.Dataloader import DataReader

# The French file is passed first and the English file second.
data_reader = DataReader('data/fr-en_fr.txt', 'data/fr-en_en.txt')
text_data_processor = TextDataProcessor.from_DataReader(data_reader)
# NOTE(review): min_freq presumably drops tokens seen fewer than 5 times and
# random_state presumably seeds the data split — confirm in build_vocab.
vocab_src, vocab_tgt =  text_data_processor.build_vocab(src_lang="fr", tgt_lang="en", min_freq=5, random_state=42)
# get_dataset() yields three datasets, unpacked here as train / val / test.
train_dataset, val_dataset, test_dataset = text_data_processor.get_dataset()
# Report the size of every split as a quick sanity check.
print(f"train_dataset: {len(train_dataset)} val_dataset: {len(val_dataset)} test_dataset: {len(test_dataset)}")

Building Source language Vocabulary ...
Building Target language Vocabulary ...
Finished.
Vocabulary sizes:
Source:  28
Target:  19
train_dataset: 16 val_dataset: 2 test_dataset: 2


In [2]:
import torch

In [3]:
# test Dataloader.py
from data_processing.Dataloader import create_dataloaders

# Reuses the `text_data_processor` built in the first cell, so that cell must
# have been executed before this one.
# Fall back to the CPU when no CUDA device is present so the cell still runs
# on machines without a GPU instead of raising at loader creation time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader, val_loader = create_dataloaders(text_data_processor, device=device, src_lang="fr", tgt_lang="en")
# Only the first batch is inspected: print its source/target shapes and stop.
for src, tgt in train_loader:
    print(src.shape, tgt.shape)
    break

torch.Size([16, 128]) torch.Size([16, 128])


In [4]:
def subsequent_mask(size):
    """Build the causal attention mask for a sequence of length ``size``.

    Returns a boolean tensor of shape ``(1, size, size)``. Entry ``[0, i, j]``
    is True when ``j <= i`` (position ``i`` may look at position ``j``) and
    False when ``j > i`` (a later, "subsequent" position).
    """
    # Ones strictly above the main diagonal flag the later positions.
    future = torch.ones((1, size, size)).triu(diagonal=1).to(torch.uint8)
    # Everything that is not flagged as a later position is allowed.
    return future.eq(0)

class Batch:
    '''
    Object for holding a batch of data with mask during training.
    params:
        src: torch.tensor, source data from a batch iterated from DataLoader
        tgt: torch.tensor or None, target data from a batch iterated from DataLoader
        pad: int, index of the padding token; the same index is used for both
             the source and the target masks (default 2 = <blank>)
    attributes:
        src:      the source tensor, stored as given
        src_mask: True where src != pad, with an extra axis inserted before the last one
        tgt:      target with its last position dropped, or None without a target
        tgt_y:    target with its first position dropped, or None without a target
        tgt_mask: padding mask combined with the subsequent-position mask,
                  or None without a target
        ntokens:  number of non-pad entries in tgt_y (0-dim tensor),
                  or None without a target
    '''

    def __init__(self, src, tgt=None, pad=2):  # 2 = <blank>
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)   # example [16, 128]  -> [16, 1, 128]
        # Always define the target-side attributes so a source-only batch
        # exposes None instead of raising AttributeError on access.
        self.tgt = None
        self.tgt_y = None
        self.tgt_mask = None
        self.ntokens = None
        if tgt is not None:
            # Shift by one: the model is fed tgt[:, :-1] and is scored against tgt[:, 1:].
            self.tgt = tgt[:, :-1]
            self.tgt_y = tgt[:, 1:]
            self.tgt_mask = self.make_std_mask(self.tgt, pad)
            # Count of non-pad target positions.
            self.ntokens = (self.tgt_y != pad).sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        '''
        Create a mask to hide padding and future words.
        params:
            tgt: torch.tensor, target token ids
            pad: int, index of the padding token
        returns:
            boolean tensor, True where a position may be attended to
        '''
        pad_mask = (tgt != pad).unsqueeze(-2)  # example pad_mask.shape = [16, 1, 127]  [B, 1, max_padding-1]
        # type_as aligns the freshly built causal mask with pad_mask's tensor type
        # before the two are combined.
        causal_mask = subsequent_mask(tgt.size(-1)).type_as(pad_mask)
        # Broadcasting [B, 1, L] & [1, L, L] gives [B, L, L].
        return pad_mask & causal_mask  # example [16, 127, 127]

In [5]:
# Index of the "<blank>" token in the target vocabulary; handed to Batch below as its `pad` value.
# NOTE(review): Batch uses this same index for the source mask too — confirm
# that vocab_src maps "<blank>" to the same index.
pad_idx = vocab_tgt["<blank>"]

In [7]:
# Lazily wrap every (source, target) pair produced by the loader in a Batch,
# which attaches the attention masks; nothing is computed until iteration.
data_iter = (Batch(src_ids, tgt_ids, pad_idx) for src_ids, tgt_ids in train_loader)

# Sanity check on the first batch only: the target is one position shorter than
# the source because Batch drops the last target position.
for batch in data_iter:
    print(batch.src.shape, batch.tgt.shape)
    break

torch.Size([16, 128]) torch.Size([16, 127])
