Youtube series
https://www.youtube.com/watch?v=gHk2IWivt_8&list=PLmZlBIcArwhPHmHzyM_cZJQ8_v5paQJTV

In [3]:
# !pip install d2l==1.0.3
# !pip install matplotlib_inline

In [4]:
import torch
import torch.nn as nn
import shutil
import os
from torch.utils.data import Dataset, DataLoader
import unittest
from d2l import torch as d2l
import logging
import requests
import pickle
from tqdm import tqdm
import numpy as np
import shutil

### Download the Spanish-English dataset

In [5]:
# Google Collab
!mkdir data
!curl https://www.manythings.org/anki/spa-eng.zip -o data/spa-eng.zip
!unzip data/spa-eng.zip -d data/

A subdirectory or file data already exists.


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 26 5286k   26 1413k    0     0  1739k      0  0:00:03 --:--:--  0:00:03 1741k
100 5286k  100 5286k    0     0  3521k      0  0:00:01  0:00:01 --:--:-- 3526k
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [6]:
#Windows
# !mkdir data
# !curl https://www.manythings.org/anki/spa-eng.zip -o data/spa-eng.zip
# !tar -xf data/spa-eng.zip -C data

### Setting up logger

In [7]:
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s - %(message)s')

### Dataset
###  Process the english spanish translation
### link: https://www.manythings.org/anki/

In [8]:
class SpanishDataset(Dataset):

    def setup_logger(self, level = logging.DEBUG):
        self.logger = logging.getLogger()
        self.logger.setLevel(level)

    def log(self, message:str, level: str = 'debug'):
        if level == 'debug':
            self.logger.debug(message)
        if level == 'info':
            self.logger.info(message)
        if level == 'warning':
            self.logger.warning(message)
        if level == 'error':
            self.logger.error(message)


    def __init__(self, debug = False, num_steps = 10):
        super().__init__()

        self.setup_logger()

        self.DATASET_PATH = './data/spa.txt'
        assert os.path.exists(self.DATASET_PATH), 'English spanish dataset is not found'
        self.num_steps = num_steps

        self.source = []
        self.target = []

        self.log('start building the dataset')

        with open(self.DATASET_PATH, 'r') as file:
            for idx, line in enumerate(file.readlines()):
                processed = self._preprocess(line)
                source_tokens, target_tokens = self._tokenize(processed)
                self.source.append(source_tokens)
                self.target.append(target_tokens)

        self.log(f'done tokenizing source and target, source len = {len(self.source)}, target len = {len(self.target)}', 'info')

        (self.source_array, self.target_array, self.valid_len, self.label_target_array), self.source_vocab, self.target_vocab = \
            self._build_arrays(self.source, self.target)

        #add tgt_pad (target pad) for masking in training phase
        self.tgt_pad = self.target_vocab['<pad>']

        shape2d = lambda a: f'({len(a)},{len(a[0])})'
        self.log(f'done building source and target arrays', 'info')
        self.log(f'source array shape {shape2d(self.source_array)}')
        self.log(f'source vocab len = {len(self.source_vocab)}')
        self.log(f'valid_len shape  = {self.valid_len.shape}')
        self.log(f'target array shape = {shape2d(self.target_array)}')
        self.log(f'target vocab len = {len(self.target_vocab)}')

    def _preprocess(self, text):
        # from D2L processing step in chapter 10
        # Replace non-breaking space with space
        text = text.replace('\u202f', ' ').replace('\xa0', ' ')
        # Insert space between words and punctuation marks
        no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
        out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
            for i, char in enumerate(text.lower())]
        return ''.join(out)

    def _tokenize(self, text):
        # Tokenization method in D2L processing step in chapter 10
        if len(text.split('\t')[:-1]) == 2:
            part = text.split('\t')[:-1]
            src = [token for token in f'{part[0]} <eos>'.split(' ') if token]
            tgt = [token for token in f'{part[1]} <eos>'.split(' ') if token]
            return src, tgt
        else:
            return '',''
    def _build_arrays(self, source, target):
        '''
        @params:
            source_raw: list[list[string]], source sequence, eg: [['a', 'b', '<eos>'], ...]
            target_rwa: list[list[string]], target sequence
        @return
            (
                source_array: list[list[int]]
                target_array_with_bos: list[list[int]]
                valid_len: list[int]
                target_array_with_eos: list[list[int]]
            ),
            source_vocab: Vocab
            target_vocab: Vocab
        '''
        #pad with <pad> token if sequence len < time step, else truncate
        #NOTE: in the book, they just truncated without adding <eos> at the end,
        # I don't think that is correct
        pad_or_truncate = lambda sentence, numstep: \
            sentence[:numstep - 1] + ['<eos>'] if len(sentence) > numstep \
                else sentence + ['<pad>'] * (numstep - len(sentence))

        def _build_array(sequence, is_target = False):
            '''
            @params:
                sentence: string
                is_target: boolean, if sentence is target, append <bos> to beginning of sentence
            @return
                array: list[str]
                vocab: Vocab object
            '''
            new_sequence = [ ]
            for sentence in sequence:
                sentence = pad_or_truncate(sentence, self.num_steps)
                if is_target:
                    sentence = ['<bos>'] + sentence

                new_sequence.append(sentence)

            vocab = d2l.Vocab(new_sequence, min_freq = 2)

            #calculate valid_len for training later
            array = torch.tensor([vocab[sentence] for sentence in new_sequence])
            valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)
            return array,vocab,valid_len

        source_array, source_vocab, valid_len = _build_array(source)
        target_array, target_vocab, _ = _build_array(target, is_target= True)

        return (source_array, target_array[:,:-1], valid_len, target_array[:,1:]), source_vocab, target_vocab

    def __len__(self):
        '''
        @return
            int: length of the english - spanish pairs
        '''
        return len(self.source_array)

    def __getitem__(self,idx):
        '''
        @params:
            idx: int, datapoint index
        @return
            source_array, target_array, valid_len, label_target_array
        '''
        return (self.source_array[idx], self.target_array[idx], self.valid_len[idx], self.label_target_array[idx])

    def get_dataloader(self, **kwargs):
        return DataLoader(self, **kwargs)


In [9]:
# dataset = SpanishDataset()
# source = dataset.source
# target = dataset.target# print(dataset.source_array[0])

# print(dataset.source_vocab.to_tokens(dataset.source_array[500].tolist()))

# train_dataloader = dataset.get_dataloader()
# sample = next(iter(train_dataloader))

# for data in sample:
#     print(f'shape = {data.shape},\t data = {data[0]}')

### Encoder and Decoder Architecture

In [10]:
class Encoder(nn.Module):

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout = 0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(input_size = embed_size, num_layers = num_layers, hidden_size = num_hiddens, dropout = dropout)
        #custom initialization

    def forward(self, x, *args):
        #why x.t(), still confused in the book
        emb = self.embedding(x.t())
        output, state = self.rnn(emb)
        return output, state

class Decoder(nn.Module):

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout = 0):
        #note that the vocab size in decoder is target language vocab size, not source language vocab size
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(input_size = embed_size + num_hiddens, num_layers = num_layers, hidden_size = num_hiddens, dropout = dropout)
        self.dense = nn.LazyLinear(vocab_size)
        #custom init module

    def init_state(self, enc_all_outputs, *args):
        '''
        use encoder's output to initialize state
        @params:
            encoder_output
        @return:
            decoder_input
        '''
        return enc_all_outputs

    def forward(self, x, state):
        emb = self.embedding(x.t())

        enc_output, enc_state = state
        #context variable
        context = enc_output[-1]
        context = context.repeat(emb.shape[0], 1, 1)
        emb_and_context = torch.cat((emb, context), -1)
        dec_output, dec_state = self.rnn(emb_and_context, enc_state)

        #pass to dense layer and swap back (batch_size, num_steps)
        y_pred = self.dense(dec_output).swapaxes(0,1)
        # print(x.shape)
        # print('embedding shape = ', emb.shape)
        # print('context shape = ', context.shape)
        # print('emb and context shape = ', emb_and_context.shape)
        # print('decoder output shape = ', output.shape)
        # print('decoder state shape = ', dec_state.shape)
        return y_pred, (dec_output, dec_state)

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x_enc, x_dec, *args):
        enc_outputs, enc_state = self.encoder(x_enc, *args)
        dec_init_state = self.decoder.init_state((enc_outputs, enc_state), *args)
        #only returns decoder output, decode state is not used for final pred
        return self.decoder(x_dec, dec_init_state)[0]


# Test Encoder

In [33]:
# Test with sample
vocab_size, embed_size, num_layers, num_hiddens = 1000, 8, 2, 16
batch_size, num_steps = 32, 10

x = torch.randint(0,vocab_size,(batch_size, num_steps))

encoder = Encoder(vocab_size, embed_size, num_hiddens, num_layers )

output, state = encoder(x)

print('output shape = ',output.shape, ' dtype = ', output.dtype)
print('state shape = ', state.shape, ' dtype = ', state.dtype)

output shape =  torch.Size([10, 32, 16])  dtype =  torch.float32
state shape =  torch.Size([2, 32, 16])  dtype =  torch.float32


# Test Decoder

In [32]:
# Test with sample
enc_vocab_size, embed_size, num_layers, num_hiddens = 1000, 8, 2, 16
dec_vocab_size = 2000
batch_size, num_steps = 32, 10

x = torch.randint(0,enc_vocab_size,(batch_size, num_steps))

encoder = Encoder(enc_vocab_size, embed_size, num_hiddens, num_layers )

enc_output, enc_state = encoder(x)

print('enc output shape = ',enc_output.shape, ' dtype = ', enc_output.dtype)
print('enc state shape = ', enc_state.shape, ' dtype = ', enc_state.dtype)

decoder = Decoder(dec_vocab_size, embed_size, num_hiddens, num_layers)
decoder_state = decoder.init_state((enc_output, enc_state))

y, (dec_output, dec_state) = decoder(x, decoder_state)

print('y_pred shape = ', y.shape, 'dtype = ', dec_output.dtype)
print('dec state shape  = ', dec_state.shape, 'dtype = ', dec_state.dtype)


enc output shape =  torch.Size([10, 32, 16])  dtype =  torch.float32
enc state shape =  torch.Size([2, 32, 16])  dtype =  torch.float32
y_pred shape =  torch.Size([32, 10, 2000]) dtype =  torch.float32
dec state shape  =  torch.Size([2, 32, 16]) dtype =  torch.float32




# Test Encoder Decoder

In [13]:
#Test seq 2 seq
# Test with sample
vocab_size_source, embed_size, num_layers, num_hiddens = 1000, 8, 2, 16
vocab_size_target = 2000
batch_size, num_steps = 32, 10

#this represents the source_array
x_enc = torch.randint(0,vocab_size_source,(batch_size, num_steps))
#this represents the target array with BOS
x_dec = torch.randint(0,vocab_size_target,(batch_size, num_steps))

encoder = Encoder(vocab_size_source, embed_size, num_hiddens, num_layers)
decoder = Decoder(vocab_size_target, embed_size, num_hiddens, num_layers)

seq2seq = Seq2Seq(encoder, decoder)

output = seq2seq(x_enc, x_dec)
print('seq2seq output shape = ', output.shape)


seq2seq output shape =  torch.Size([32, 10, 2000])


### Perplexity

Perplexity is PPL = e^(CrossEntropyLoss(y_pred, y))

### Training procedure

In [14]:

class Trainer():
  def __init__(self, model, dataset):
    self.model = model
    self.dataset = dataset
    self._run()
    if torch.cuda.is_available():
      self.device = 'cuda'
    else:
      self.device = 'cpu'

  def _has_gpu(self):
    return self.device == 'cuda'

  def prepare_batch(self, batch):
    if self._has_gpu():
      batch = [a.to(self.device) for a in batch]
    return batch

  def _run(self):
    pass

In [19]:
#Build the dataset
dataset = SpanishDataset()


DEBUG - start building the dataset
INFO - done tokenizing source and target, source len = 141370, target len = 141370
INFO - done building source and target arrays
DEBUG - source array shape (141370,10)
DEBUG - source vocab len = 9538
DEBUG - valid_len shape  = torch.Size([141370])
DEBUG - target array shape = (141370,10)
DEBUG - target vocab len = 16679


source vocab size = 9538
target vocab size = 16679


In [23]:
#Extract some paramaters
source_vocab_size = len(dataset.source_vocab)
print(f'source vocab size = {source_vocab_size}')
target_vocab_size = len(dataset.target_vocab)
print(f'target vocab size = {target_vocab_size}')
embed_size = 100
num_hiddens = 256
num_layers = 2
dropout = 0.2
batch_size = 256 

class PerplexityLoss(nn.Module):
  def __init__(self):
    super().__init__()
    self.loss_fn = nn.CrossEntropyLoss()

  def forward(self, y_pred, y, tgt_pad):
    y_pred = y_pred.permute(0,2,1)
    l = self.loss_fn(y_pred, y)
    mask = (y.reshape(-1) != tgt_pad).type(torch.float32)
    return torch.exp((l * mask).sum() / mask.sum())

device = 'cuda' if torch.cuda.is_available() else 'cpu'

#Get dataloader for
train_dataloader = dataset.get_dataloader(batch_size = batch_size, shuffle = True)
print('train dataloader len = ',len(train_dataloader))

encoder = Encoder(source_vocab_size, embed_size, num_layers, num_hiddens)
decoder = Decoder(target_vocab_size, embed_size, num_layers, num_hiddens)

seq2seq = Seq2Seq(encoder, decoder)

seq2seq.to(device)

ppl_loss = PerplexityLoss()
optim = torch.optim.Adam(seq2seq.parameters(), lr = 0.01)

history = []

epochs = 10

for e in range(epochs):
  loop = tqdm(train_dataloader)
  running_loss =0

  for data in loop:
    src_array, tgt_array_bos, valid_len, tgt_array_eos = data
    src_array = src_array.to(device)
    tgt_array_bos = tgt_array_bos.to(device)
    tgt_array_eos = tgt_array_eos.to(device)
    valid_len = valid_len.to(device)
    # print('src array shape = ', src_array.shape)
    # print(f'tgt array bos shape {tgt_array_bos.shape}' )
    # print(f'tgt array eos shape {tgt_array_eos.shape}')
    # print(f'shape {valid_len.shape}')

    y_pred = seq2seq(src_array, tgt_array_bos)
    # print('y pred ', y_pred[0])
    # print('y pred GRAD', y_pred.grad)
    # print('target ', tgt_array_eos[0])

    optim.zero_grad()
    loss = ppl_loss(y_pred, tgt_array_eos, dataset.tgt_pad)
    loss.backward()
    step = optim.step()
    running_loss += loss.item()
    loop.set_description(f'batch ppl = {loss.item():.2f} \t')

  print(f'epoch = {e}, ppl = {running_loss}')
  history.append({'loss': running_loss})

source vocab size = 9538
target vocab size = 16679
train dataloader len =  553


batch ppl = 261.39 	: 100%|██████████| 553/553 [01:21<00:00,  6.75it/s] 


epoch = 0, ppl = 676234.8220672607


batch ppl = 182.68 	: 100%|██████████| 553/553 [01:22<00:00,  6.73it/s]


epoch = 1, ppl = 104761.52668762207


batch ppl = 163.32 	: 100%|██████████| 553/553 [01:21<00:00,  6.83it/s]


epoch = 2, ppl = 85973.30590057373


batch ppl = 128.28 	: 100%|██████████| 553/553 [01:20<00:00,  6.87it/s]


epoch = 3, ppl = 81512.13844299316


batch ppl = 139.04 	: 100%|██████████| 553/553 [01:23<00:00,  6.61it/s]


epoch = 4, ppl = 74910.17710113525


batch ppl = 118.73 	: 100%|██████████| 553/553 [01:24<00:00,  6.53it/s]


epoch = 5, ppl = 68752.88645172119


batch ppl = 105.89 	: 100%|██████████| 553/553 [01:23<00:00,  6.66it/s]


epoch = 6, ppl = 63696.14000701904


batch ppl = 140.29 	: 100%|██████████| 553/553 [01:23<00:00,  6.63it/s]


epoch = 7, ppl = 59483.47602081299


batch ppl = 75.56 	: 100%|██████████| 553/553 [01:23<00:00,  6.63it/s] 


epoch = 8, ppl = 56032.75653839111


batch ppl = 101.74 	: 100%|██████████| 553/553 [01:24<00:00,  6.52it/s]

epoch = 9, ppl = 54147.706802368164





In [25]:
if os.path.exists('output') == False:
    os.mkdir('output')
MODEL_PATH = './output/seq2seq.h5'
torch.save(seq2seq, MODEL_PATH)

# Evaluating seq2seq model

In [26]:
MODEL_PATH = './output/seq2seq.h5'
assert os.path.exists(MODEL_PATH), f'saved model not found at {MODEL_PATH}' 
model = torch.load(MODEL_PATH, map_location = torch.device('cpu'))
print('loaded mode')
print(type(model))
dataset = SpanishDataset()

DEBUG - start building the dataset


<class '__main__.Seq2Seq'>


INFO - done tokenizing source and target, source len = 141370, target len = 141370
INFO - done building source and target arrays
DEBUG - source array shape (141370,10)
DEBUG - source vocab len = 9538
DEBUG - valid_len shape  = torch.Size([141370])
DEBUG - target array shape = (141370,10)
DEBUG - target vocab len = 16679


In [69]:
def idx_to_tokens(tokens, vocab):
    return vocab.to_tokens(tokens)

index = []
for i in range(10):
  index.append(np.random.choice(len(dataset)))

batch = []
src = []
tgt = []
valid_len = []
tgt_eos = []

for i in index:
  data = dataset[i]
  src.append(data[0])
  tgt.append(data[1])
  valid_len.append(data[2])
  tgt_eos.append(data[3])

batch = [torch.stack(a) for a in (src,tgt,valid_len,tgt_eos)]
print(batch)

[tensor([[4367, 9177, 8623, 5200, 8244, 9512,  661, 9327, 9512,  201],
        [9512, 6786, 2702, 5113,  974,   84,  201,  202,  202,  202],
        [  68, 9512, 5034, 8623, 3825, 9389, 5293,  205,   70,  201],
        [ 203, 3861, 7771, 5548, 4455, 1898,   85,  203, 9287,  201],
        [4367, 4859, 9287, 4367, 2551,   84,  201,  202,  202,  202],
        [8010, 5836, 8542, 7570, 5805, 8491, 9499, 5046,   84,  201],
        [4680, 2117, 8563, 2720,   84,  201,  202,  202,  202,  202],
        [7834,   79, 1399, 4367, 4093, 8623, 9427, 8653,   84,  201],
        [7483, 9522,  800,   79, 4675, 7480,  204,  201,  202,  202],
        [8490, 2315,   84,  201,  202,  202,  202,  202,  202,  202]]), tensor([[  189, 12738,  1576,  4547, 12630, 13849, 12729,  5259, 14001,    78],
        [  189, 12872, 14678, 15699,  8042,    78,   190,   191,   191,   191],
        [  189,    65,  7799,  8827,  3722,   194,   192,   192,     1,   190],
        [  189,   192, 14641,  5969,  3591,    79,   192,

In [71]:
def predict(seq2seq, batch, device, num_steps):
  batch = [a.to(device) for a in batch]
  src, tgt, valid_len, _ = batch
  print('source = ', src)
  print('target = ', tgt)
  print('valid len  = ', valid_len)

  enc_outputs, enc_state = seq2seq.encoder(src, valid_len)
  print('enc outputs ', enc_output.shape)
  print('enc state  ', enc_state.shape)
  dec_all_outputs = seq2seq.decoder.init_state((enc_outputs, enc_state), valid_len)
  print('dec state = ', dec_state.shape)

  outputs, attention_weights = [tgt[:,(0)].unsqueeze(1), ], []
  print(outputs)
  print(outputs[-1])

  for _ in range(num_steps):
    y_pred, dec_all_outputs = seq2seq.decoder(outputs[-1], dec_all_outputs)
    outputs.append(y_pred.argmax(2))

  return torch.cat(outputs[1:], 1)    

outputs = predict(model, batch, 'cpu', 10)
print(outputs)

source =  tensor([[4367, 9177, 8623, 5200, 8244, 9512,  661, 9327, 9512,  201],
        [9512, 6786, 2702, 5113,  974,   84,  201,  202,  202,  202],
        [  68, 9512, 5034, 8623, 3825, 9389, 5293,  205,   70,  201],
        [ 203, 3861, 7771, 5548, 4455, 1898,   85,  203, 9287,  201],
        [4367, 4859, 9287, 4367, 2551,   84,  201,  202,  202,  202],
        [8010, 5836, 8542, 7570, 5805, 8491, 9499, 5046,   84,  201],
        [4680, 2117, 8563, 2720,   84,  201,  202,  202,  202,  202],
        [7834,   79, 1399, 4367, 4093, 8623, 9427, 8653,   84,  201],
        [7483, 9522,  800,   79, 4675, 7480,  204,  201,  202,  202],
        [8490, 2315,   84,  201,  202,  202,  202,  202,  202,  202]])
target =  tensor([[  189, 12738,  1576,  4547, 12630, 13849, 12729,  5259, 14001,    78],
        [  189, 12872, 14678, 15699,  8042,    78,   190,   191,   191,   191],
        [  189,    65,  7799,  8827,  3722,   194,   192,   192,     1,   190],
        [  189,   192, 14641,  5969,  3

In [52]:
for i, data in enumerate(batch): 
    source = dataset.source_vocab.to_tokens(data[0].tolist())
    target = dataset.target_vocab.to_tokens(data[1].tolist())
    pred = dataset.target_vocab.to_tokens(outputs[i].tolist())
    print(' '.join(source))
    print(' '.join(target))
    print(' '.join(pred))
    print()

tom woke mary up at 6:30 as she had <eos>
captura descifró elegida dolores asfixiado .u abandonado abandonados abandonados abandonados
tom tom no . <pad> <pad> <pad> <pad> <pad> <pad>



IndexError: list index out of range

### Unit Tests

In [None]:
# class SpanishDatasetTest(unittest.TestCase):

#     def test_upper(self):
#         self.assertEqual('foo'.upper(), 'FOO')

#     def test_isupper(self):
#         self.assertTrue('FOO'.isupper())
#         self.assertFalse('Foo'.isupper())

#     def test_split(self):
#         s = 'hello world'
#         self.assertEqual(s.split(), ['hello', 'world'])

#     def test4(self):
#         self.assertEqual('foo', 'foo1')

# unittest.main(argv=[''], exit=False)