In [53]:
import torch
import shutil
import os
from torch.utils.data import Dataset, DataLoader
import unittest
from d2l import torch as d2l

### Dataset
###  Process the english spanish translation
### link: https://www.manythings.org/anki/

In [52]:
class SpanishDataset(Dataset):
    def __init__(self, debug = False, num_steps = 10, batch_size = 32):
        super().__init__()
        self.DATASET_PATH = '../data/translation/spa.txt'
        assert os.path.exists(self.DATASET_PATH), 'English spanish dataset is not found'

        self.source = []
        self.target = []
        self.num_steps = num_steps
        self.batch_size = batch_size

        print('start building the dataset')

        with open(self.DATASET_PATH, 'r') as file:
            for idx, line in enumerate(file.readlines()):
                processed = self._preprocess(line)
                source_tokens, target_tokens = self._tokenize(processed)
                self.source.append(source_tokens)
                self.target.append(target_tokens)

        print(f'done tokenizing source and target, source len = {len(self.source)}, target len = {len(self.target)}')

        (self.source_array, self.target_array, self.valid_len, self.label_target_array), self.source_vocab, self.target_vocab = \
            self._build_arrays(self.source, self.target)
        
        print(f'done building source and target arrays')
        shape2d = lambda a: f'({len(a)},{len(a[0])})'
        print("=" * 10)
        print('STATS')
        print("=" * 10)
        print(f'source array shape', shape2d(self.source_array)) 
        print('source vocab len = ', len(self.source_vocab))
        print('valid_len shape  = ', self.valid_len.shape)
        print('target array shape = ', shape2d(self.target_array))
        print('target vocab len = ', len(self.target_vocab))

        
    def _preprocess(self, text):
        # from D2L processing step in chapter 10
        # Replace non-breaking space with space
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Insert space between words and punctuation marks
        no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
        out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
            for i, char in enumerate(text.lower())]
        return ''.join(out)
    
    def _tokenize(self, text):
        # Tokenization method in D2L processing step in chapter 10
        if len(text.split('\t')[:-1]) == 2:
            part = text.split('\t')[:-1]
            src = [token for token in f'{part[0]} <eos>'.split(' ') if token]
            tgt = [token for token in f'{part[1]} <eos>'.split(' ') if token]
            return src, tgt
        else:
            return '',''
    def _build_arrays(self, source, target):
        '''
        @params:
            source_raw: list[list[string]], source sequence, eg: [['a', 'b', '<eos>'], ...]
            target_rwa: list[list[string]], target sequence
        @return
            (
                source_array: list[list[int]]
                target_array_with_bos: list[list[int]]
                valid_len: list[int]
                target_array_with_eos: list[list[int]]
            ),
            source_vocab: Vocab
            target_vocab: Vocab
        '''
        #pad with <pad> token if sequence len < time step, else truncate
        #NOTE: in the book, they just truncated without adding <eos> at the end, 
        # I don't think that is correct
        pad_or_truncate = lambda sentence, numstep: \
            sentence[:numstep - 1] + ['<eos>'] if len(sentence) > numstep \
                else sentence + ['<pad>'] * (numstep - len(sentence))

        def _build_array(sequence, is_target = False):
            '''
            @params:
                sentence: string
                is_target: boolean, if sentence is target, append <bos> to beginning of sentence
            @return
                array: list[str] 
                vocab: Vocab object
            '''
            new_sequence = [ ]
            for sentence in sequence:
                sentence = pad_or_truncate(sentence, self.num_steps)
                if is_target: 
                    sentence = ['<bos>'] + sentence
                
                new_sequence.append(sentence)

            vocab = d2l.Vocab(new_sequence, min_freq = 2)

            #calculate valid_len for training later
            array = torch.tensor([vocab[sentence] for sentence in new_sequence])
            valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
            return array,vocab,valid_len

        source_array, source_vocab, valid_len = _build_array(source)        
        target_array, target_vocab, _ = _build_array(target, is_target= True)        

        return (source_array, target_array[:,:-1], valid_len, target_array[:,1:]), source_vocab, target_vocab
    
    def __len__(self):
        '''
        @return
            int: length of the english - spanish pairs
        '''
        return len(self.source_array)
    
    def __getitem__(self,idx):
        '''
        @params:
            idx: int, datapoint index
        @return
            source_array, target_array, valid_len, label_target_array
        '''
        return (self.source_array[idx], self.target_array[idx], self.valid_len[idx], self.label_target_array[idx])
    
    def get_dataloader(self):
        return DataLoader(self, batch_size = self.batch_size, shuffle = True)
    
dataset = SpanishDataset()

start building the dataset
done tokenizing source and target, source len = 141370, target len = 141370
done building source and target arrays
STATS
source array shape (141370,10)
source vocab len =  9538
valid_len shape  =  torch.Size([141370])
target array shape =  (141370,10)
target vocab len =  16679


In [23]:
print(dataset.source[1000])

['put', 'it', 'on', '.', '<eos>']


In [24]:
source = dataset.source
target = dataset.target

In [36]:
for s, t in zip(source[:10], target[:10]):
    print(s, '\t', t)

['go', '.', '<eos>'] 	 ['ve', '.', '<eos>']
['go', '.', '<eos>'] 	 ['vete', '.', '<eos>']
['go', '.', '<eos>'] 	 ['vaya', '.', '<eos>']
['go', '.', '<eos>'] 	 ['váyase', '.', '<eos>']
['hi', '.', '<eos>'] 	 ['hola', '.', '<eos>']
['run', '!', '<eos>'] 	 ['¡corre', '!', '<eos>']
['run', '!', '<eos>'] 	 ['¡corran', '!', '<eos>']
['run', '!', '<eos>'] 	 ['¡huye', '!', '<eos>']
['run', '!', '<eos>'] 	 ['¡corra', '!', '<eos>']
['run', '!', '<eos>'] 	 ['¡corred', '!', '<eos>']


In [49]:
print(source_array[0])
print(source_array[0].tolist())
print(source_vocab.to_tokens(source_array[500].tolist()))

tensor([3966,   91,  222,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,  223,
         223,  223,  223,  223])
[3966, 91, 222, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 2

### Spanish Dataset Tests

In [8]:
class SpanishDatasetTest(unittest.TestCase):

    def test_upper(self):
        self.assertEqual('foo'.upper(), 'FOO')

    def test_isupper(self):
        self.assertTrue('FOO'.isupper())
        self.assertFalse('Foo'.isupper())

    def test_split(self):
        s = 'hello world'
        self.assertEqual(s.split(), ['hello', 'world'])
    
    def test4(self):
        self.assertEqual('foo', 'foo1')

unittest.main(argv=[''], exit=False)

F...
FAIL: test4 (__main__.SpanishDatasetTest.test4)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/var/folders/_s/m9kt_szd4qq9brspl9hd44mc0000gn/T/ipykernel_51936/1831953952.py", line 15, in test4
    self.assertEqual('foo', 'foo1')
AssertionError: 'foo' != 'foo1'
- foo
+ foo1
?    +


----------------------------------------------------------------------
Ran 4 tests in 0.003s

FAILED (failures=1)


<unittest.main.TestProgram at 0x15f37ca10>