<a href="https://colab.research.google.com/github/iamsimha/ML-Implementations/blob/master/copy_paste_task_30k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import torch
import torch.nn as nn
import math
from torch.nn import Transformer


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [10]:
   !python -m spacy download en
   !python -m spacy download de

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/de_core_news_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/de
You can now load the model via spacy.load('de')


In [11]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
from torchtext.utils import download_from_url, extract_archive
import io


import torch
torch.manual_seed(0)

import random
random.seed(0)

import numpy as np
np.random.seed(0)

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

def build_vocab(filepath, tokenizer):
  counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for string_ in f:
      counter.update(tokenizer(string_))
  return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

def data_process(filepaths):
  raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
  data = []
  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de)],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en)],
                            dtype=torch.long)
    data.append((de_tensor_, en_tensor_))
  return data

def copy_task_data_process(vocab_size, seq_length, num_points):
  data = []
  for i in range(num_points):
    src_tensor = torch.randint(4, vocab_size, size=(seq_length,))
    copy_tensor = src_tensor.clone()
    data.append((src_tensor, copy_tensor))
  return data

def subtract_task_data_process(max_vocab_size, seq_length, num_points):
  # Task is to predict a sequence with "1" subtracted from source sequence
  data = []
  for i in range(num_points):
    src_tensor = torch.randint(5, max_vocab_size, size=(seq_length,))
    trg_tensor = src_tensor - 1
    data.append((src_tensor, trg_tensor))
  return data



def extract_last_half_task_data_process_with_padding(max_vocab_size, seq_length,
                                            num_points):
  # Task is to predict a sequence with "1" subtracted from source sequence
  data = []
  for i in range(num_points):
    max_len = torch.randint(seq_length//2, seq_length + 1, size=(1,))
    src_tensor = torch.randint(5, max_vocab_size, size=(seq_length,))
    src_tensor = src_tensor[0:max_len]
    trg_tensor = src_tensor - 1
    data.append((src_tensor, trg_tensor))
  return data


train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)

COPY_VOCAB_SIZE = 11
COPY_SEQ_LENGTH = 10
NUM_POINTS = 8192 * 3
MAX_VOCAB_SIZE = 15

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']



# train_data = extract_last_half_task_data_process_with_padding(MAX_VOCAB_SIZE, COPY_SEQ_LENGTH, NUM_POINTS)
# val_data = extract_last_half_task_data_process_with_padding(MAX_VOCAB_SIZE, COPY_SEQ_LENGTH, NUM_POINTS//2)
# test_data = extract_last_half_task_data_process_with_padding(MAX_VOCAB_SIZE, COPY_SEQ_LENGTH, NUM_POINTS//2)

In [12]:
train_data[9]

(tensor([  93,  284,  339,    8,   17,  527,   12, 1209,    5,    4]),
 tensor([1780,  242,   10, 1062,    8,    9,  182,   14,    9,  313,    6,    5]))

In [13]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
  de_batch, en_batch = [], []
  for (de_item, en_item) in data_batch:
    de_batch.append(torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX])], dim=0))
    en_batch.append(torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0))
  de_batch = pad_sequence(de_batch, padding_value=PAD_IDX)
  en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
  return de_batch.transpose(0, 1), en_batch.transpose(0, 1)

train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE,
                       shuffle=True, collate_fn=generate_batch)

In [14]:
class PositionalEncoding(nn.Module):
    def __init__(self, dmodel, maxlen=5000):
        # Positional encoding should be added to token emebedding
        # We create positional encoding for sequences upto 5000 tokens
        # Output: [1, maxlen, dmodel]
        super(PositionalEncoding, self).__init__()

        den = torch.exp(- torch.arange(0, dmodel, 2) * math.log(10000) / dmodel)
        # den dimension: dmodel
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        # pos: maxlen X 1

        pos_embedding = torch.zeros((maxlen, dmodel))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)


        pos_embedding = pos_embedding.unsqueeze(0)

        self.register_buffer('pos_embedding', pos_embedding)
        # pos_embedding: 1 X maxlen X emb_dim

    def forward(self, x):
        # inputs: Batch_size X seq
        # outputs: Batch_size X seq X emb dim
        return x + self.pos_embedding[:, :x.size(1)]

class Embeddings(nn.Module):
    def __init__(self, vocab_size, dmodel):
        super(Embeddings, self).__init__()
        self.embedding = nn.Embedding(vocab_size, dmodel)
        self.dmodel = dmodel
    def forward(self, inps):
        return self.embedding(inps.long()) * np.sqrt(self.dmodel)


class Generator(nn.Module):
    def __init__(self, dmodel, vocab):
        super(Generator, self).__init__()
        self.dmodel = dmodel
        self.vocab = vocab
        self.linear = nn.Linear(dmodel, vocab)
    def forward(self, inp):
        return self.linear(inp)

class StandardTransformer(nn.Module):
    def __init__(self, num_blocks, dmodel, src_vocab_size, trg_vocab_size, generator):
        super(StandardTransformer, self).__init__()
        self.transformer = Transformer(num_encoder_layers = num_blocks, num_decoder_layers = num_blocks, d_model=dmodel)
        self.generator = generator
        self.src_embedding = Embeddings(src_vocab_size, dmodel)
        self.target_embedding = Embeddings(trg_vocab_size, dmodel)
        # self.positional_encoding = WeightedPositionalEncoding(dmodel)
        self.positional_encoding = PositionalEncoding(dmodel)

    def forward(self, src, trg, src_mask, trg_mask, src_padding_mask,
                trg_padding_mask, memory_key_padding_mask):
        src_emb = self.positional_encoding(self.src_embedding(src))
        trg_emb = self.positional_encoding(self.target_embedding(trg))
        src_emb = src_emb.transpose(0,1)
        trg_emb = trg_emb.transpose(0, 1)
        outs = self.transformer(src_emb, trg_emb, src_mask=src_mask,
                                tgt_mask=trg_mask, src_key_padding_mask=src_padding_mask,
                                tgt_key_padding_mask=trg_padding_mask,
                                memory_key_padding_mask=memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src, src_mask):
        return self.transformer.encoder(self.positional_encoding(self.src_embedding(src)).transpose(0, 1), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.transformer.decoder(self.positional_encoding(self.target_embedding(tgt)).transpose(0, 1), memory, tgt_mask, src_mask)

def generate_square_subsequent_mask(sz):
    r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
        Unmasked positions are filled with float(0.0).
    """
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [15]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

# optimizer = torch.optim.Adam(
#     encoder_decoder.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
# )



In [16]:
# We will run transfer models on synthetic task.
# The synthetic task we are interested in is, Given a sequence of tokens
# Keep tokens at even position
import numpy as np
from torch.nn import Transformer
from tqdm import tqdm
from torch.autograd import Variable



# src_vocab_size = MAX_VOCAB_SIZE + 1
# trg_vocab_size = src_vocab_size - 1

src_vocab_size = len(de_vocab)
trg_vocab_size = len(en_vocab)
dmodel = 512
nhead = 8
batch_size = 30
num_blocks = 3

generator = Generator(dmodel, trg_vocab_size)

encoder_decoder = StandardTransformer(num_blocks, dmodel, src_vocab_size,
                                      trg_vocab_size, generator)
encoder_decoder = encoder_decoder.to(device)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)


for p in encoder_decoder.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform(p)
        # nn.init.constant_(p, 0.001)

model_opt = NoamOpt(dmodel, 1, 2000,
            torch.optim.Adam(encoder_decoder.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

for epoch in range(1):
    losses = []
    val_loss = []
    encoder_decoder.train()
    for idx, (src_input, trg_input) in (
        enumerate(train_iter)
    ):
        trg_out = trg_input[:, 1:]
        trg_input = trg_input[:, :-1]
        # B X 1 X  Seq ( all True)
        src_mask = torch.zeros((src_input.shape[-1], src_input.shape[-1])).type(torch.bool)
        
        trg_mask = generate_square_subsequent_mask(trg_input.shape[-1])


        # B X seq X vocab_size
        src_input, trg_input, trg_out = src_input.to(device), trg_input.to(device), trg_out.to(device)

        src_mask, trg_mask = src_mask.to(device), trg_mask.to(device)
        src_padding_mask = src_input == PAD_IDX

        trg_padding_mask = trg_input == PAD_IDX  



        # print("src")
        # print(src_input[0, :])
        # print("trg")
        # print(trg_input[0, :])
        # print("trg_out")
        # print(trg_out[0, :])
        # print("src_padding_mask")
        # print(src_padding_mask[0, :])
        # print("trg padding mask")
        # print(trg_padding_mask[0, :])
        logits = encoder_decoder(src_input, trg_input, src_mask, trg_mask,
                                 src_padding_mask, trg_padding_mask, src_padding_mask)
        logits = logits.transpose(0, 1)
        # optimizer.zero_grad()
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), trg_out.reshape(-1))
        # make_dot(loss, dict(encoder_decoder.named_parameters())).render("tranformer_torchviz", format="png")
        # loss.backward()
        # optimizer.step()

        loss.backward()
        model_opt.step()
        model_opt.optimizer.zero_grad()


        if idx % 50 == 0:
          print(f"Epoch = {epoch}, step = {idx+1}, loss = {loss.item()}, lr = {model_opt.rate()}")
        losses.append(loss.item())
    encoder_decoder.eval()
    for idx, (src_input, trg_input) in (
        enumerate(valid_iter)
    ):
        # B X 1 X  Seq ( all True)
        trg_out = trg_input[:, 1:]
        trg_input = trg_input[:, :-1]
        src_mask = torch.zeros((src_input.shape[-1], src_input.shape[-1])).type(torch.bool)
        trg_mask = generate_square_subsequent_mask(trg_input.shape[-1])
        # B X seq X vocab_size
        src_input, trg_input, trg_out = src_input.to(device), trg_input.to(device), trg_out.to(device)
        src_mask, trg_mask = src_mask.to(device), trg_mask.to(device)

        src_padding_mask = src_input == PAD_IDX

        trg_padding_mask = trg_input == PAD_IDX  

        logits = encoder_decoder(src_input, trg_input, src_mask, trg_mask,
                                 src_padding_mask, trg_padding_mask, src_padding_mask)
        logits = logits.transpose(0, 1)
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), trg_out.reshape(-1))
        val_loss.append(loss.item())
    print(f"epoch = {epoch}, loss = {np.mean(losses)}, val loss = {np.mean(val_loss)}")




Epoch = 0, step = 1, loss = 9.30978775024414, lr = 4.941058844013093e-07
Epoch = 0, step = 51, loss = 8.015201568603516, lr = 2.5199400104466777e-05


KeyboardInterrupt: ignored

In [47]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(device)
    src_mask = src_mask.to(device)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data).to(device)
    for i in range(max_len-1):
        memory = memory.to(device)
        src_mask = src_mask.to(device)
        memory_mask = torch.zeros(ys.shape[-1], memory.shape[0]).to(device)
        out = model.decode(memory, memory_mask,
                           Variable(ys),
                           Variable(generate_square_subsequent_mask(ys.size(1))
                                    .type_as(src.data)).to(device) == 1)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()
        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
    return ys


model = encoder_decoder
model.eval()
ll = [2] + train_data[4][0].tolist() + [3]
src = Variable(torch.LongTensor([ll]).reshape(1, len(ll)) )
src_mask = Variable(torch.zeros(len(ll), len(ll)) )
print(greedy_decode(model, src, src_mask, max_len=len(ll), start_symbol=BOS_IDX))

tensor([[   2,   20,   37,   18,  376,  135,   21,    9, 1204,    6,    5,    3,
            6]], device='cuda:0')


In [34]:
def get_ln_pair(token_pair, src_vocab):
  src_lng = token_pair
  for tok in src_lng:
    print(src_vocab.itos[tok])



In [46]:
get_ln_pair(train_data[4][1].tolist(), en_vocab)

Two
men
are
at
the
stove
preparing
food
.




In [48]:
get_ln_pair([   2,   20,   37,   18,  376,  135,   21,    9, 1204,    6,    5,    3,
            6], en_vocab)

<bos>
Two
men
are
preparing
food
at
the
stove
.


<eos>
.


In [30]:
BOS_IDX

2

In [31]:
PAD_IDX

1

In [32]:
EOS_IDX

3

In [None]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))