In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

In [7]:
# class Generator(nn.Module):
#     def __init__(self,d_model,vocab):
#         super().__init__()
#         self.proj = nn.Linear(d_model,vocab)
    
#     def forward(self,x):
#         self.log_softmax(self.proj(x),dim=-1)

In [142]:
class LayerNorm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.a = nn.Parameter(torch.ones(d_model))
        self.b = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    
    def forward(self, x):
        mean = x.mean(dim=-1,keepdim=True)
        std = x.std(dim=-1,keepdim=True)
        return self.a*(x-mean)/(std+self.eps)+self.b

    

In [73]:
# sublayer - each encoder layer is the sum of the input and the dropout+application of the layer

In [74]:
# class SublayerConnection(nn.Module):
#     def __init__(self,dropout,features):
#         self.dropout = nn.Dropout(dropout)
#         self.norm = LayerNorm(features)
    
#     def forward(self,x,sublayer):
#         return x + self.dropout(sublayer(self.norm(x)))

In [322]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, model_size):
        super().__init__()
        self.d_model = model_size
        self.embed = nn.Embedding(vocab_size,model_size)
    
    def forward(self,x):
        return self.embed(x)

In [127]:
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=500):
        super().__init__()
        self.d_model = d_model
        pe = torch.zeros(max_seq_len,d_model)
        for pos in range(max_seq_len):
            for i in range(0,d_model,2):
                pe[pos,i] = math.sin(pos/10000**(2*i/d_model))
                pe[pos,i+1] = math.cos(pos/10000**(2*(i+1)/d_model))
        pe = pe.unsqueeze(0)
        self.register_buffer('pe',pe)
        
    def forward(self,x):
        x *= math.sqrt(self.d_model) # ?
        x = x + Variable(self.pe[:,:x.size(1)],requires_grad=False)
#         x = self.dropout(x) # ?
        return x

In [248]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, heads, d_model, dropout=0.1):
        super().__init__()
        # k,v,q
        self.d_model = d_model
        self.h = heads
        self.d_h = d_model//heads
        self.k = nn.Linear(d_model,d_model)
        self.v = nn.Linear(d_model,d_model)
        self.q = nn.Linear(d_model,d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model,d_model)
        
        
    def forward(self,q,k,v,mask=None):
        bs = q.size(0)
        q = self.q(q).view(bs,-1,self.h,self.d_h).transpose(1,2) #?
        k = self.k(k).view(bs,-1,self.h,self.d_h).transpose(1,2) #?
        v = self.v(v).view(bs,-1,self.h,self.d_h).transpose(1,2) #?
        out, scores = attention(q,k,v,self.d_h,mask,self.dropout)
        out = out.transpose(1,2).contiguous().view(bs,-1,self.d_model)
        out = self.out(out)
        return out
        

In [257]:
def attention(q,k,v,d_h,mask=None,dropout=None):
    scores = torch.matmul(q,k.transpose(-2,-1))/math.sqrt(d_h)
#     print(scores.shape)
    if mask is not None:
        mask = mask.unsqueeze(1)
#         print(mask.shape)

        scores = scores.masked_fill(mask==0,-1e9)
    if dropout is not None:
        scores = dropout(scores)
    scores = F.softmax(scores,dim=-1)
    out = torch.matmul(scores,v)
    return out,scores

In [239]:
def attention(query, key, value, d_k,mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        print(scores.shape)
        print(mask.shape)
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [79]:
class FeedForward(nn.Module):
    def __init__(self,d_model,d_ff=2048,dropout=0.1):
        super().__init__()
        self.l1 = nn.Linear(d_model,d_ff)
        self.dropout = nn.Dropout(dropout)
        self.l2 = nn.Linear(d_ff,d_model)
        
    def forward(self,x):
        x = self.l2(self.dropout(F.relu(self.l1(x))))
        return x

In [188]:
class EncoderLayer(nn.Module):
    def __init__(self,d_model,heads,dropout=0.1):
        super().__init__()
        # encode - attention + skip
        self.attention = MultiHeadedAttention(heads,d_model)
        self.ff = FeedForward(d_model)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        
    def forward(self,x,mask):
        x_ = self.norm1(x)
        x = x + self.dropout1(self.attention(x_,x_,x_,mask))
        x_ = self.norm2(x)
        out = x + self.dropout2(self.ff(x_))
        return out

In [254]:
class DecoderLayer(nn.Module):
    def __init__(self,d_model,heads,dropout=0.1):
        super().__init__()
        # encode - attention + skip
        self.attention1 = MultiHeadedAttention(heads,d_model)
        self.attention2 = MultiHeadedAttention(heads,d_model)
        self.ff = FeedForward(d_model)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)
        
    def forward(self,x,e_outputs,src_mask,tgt_mask):
        x_ = self.norm1(x)
        x = x + self.dropout1(self.attention1(x_,x_,x_,tgt_mask))
        x_ = self.norm2(x)
        x = x + self.dropout2(self.attention2(x_,e_outputs,e_outputs,src_mask))
        x_ = self.norm3(x)
        out = x + self.dropout3(self.ff(x_))
        return out

In [82]:
def clones(module,n):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])
    

In [83]:
class Encoder(nn.Module):
    def __init__(self, d_model,vocab_size,heads,N):
        super().__init__()
        self.embed = Embedder(vocab_size,d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = clones(EncoderLayer(d_model,heads),N) 
        self.norm = LayerNorm(d_model)
    
    def forward(self, x, mask):
        x = self.embed(x)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x,mask)
        return self.norm(x)

In [251]:
class Decoder(nn.Module):
    def __init__(self, d_model,vocab_size,heads,N):
        super().__init__()
        self.embed = Embedder(vocab_size,d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = clones(DecoderLayer(d_model,heads),N) 
        self.norm = LayerNorm(d_model)
    
    def forward(self, x, e_outputs,src_mask,tgt_mask):
        x = self.embed(x)
        x = self.pe(x)
        for layer in self.layers:
            x = layer(x,e_outputs,src_mask,tgt_mask)
        return self.norm(x)

In [324]:
class Transformer(nn.Module):
    def __init__(self,d_model,src_vocab,tgt_vocab,heads,N):
        super().__init__()
        self.encoder = Encoder(d_model,src_vocab,heads,N)
        self.decoder = Decoder(d_model,tgt_vocab,heads,N)
        self.generator = nn.Linear(d_model,tgt_vocab)
        
    def forward(self, src, tgt, src_msk, tgt_msk):
        e_outputs=self.encoder(src,src_msk)
        output = self.decoder(tgt, e_outputs,src_msk,tgt_msk)
#         output = self.l(output)
        return output
        

In [16]:
# with open

In [319]:
def make_model(src_vocab=10000,tgt_vocab = 10000,N=6,H=8):
    model = Transformer(d_model,src_vocab,tgt_vocab,H,N)

    for p in model.parameters():
        if p.dim()>1:
            nn.init.xavier_uniform_(p)
    return model
# lr = 0.0001
# opt = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9,0.98),eps=1e-9)

# Training

In [2]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

In [3]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()
    
    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(
            subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
        return tgt_mask

In [4]:
def data_gen(V, batch, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)

In [None]:
model

In [305]:
batch = next(iter(data_gen(V,30,20)))
out = model.forward(batch.src, batch.trg, 
                    batch.src_mask, batch.trg_mask)



In [306]:
out.shape

torch.Size([30, 9, 64])

In [281]:
batch.trg.shape


torch.Size([30, 9])

In [None]:
F

In [310]:
def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, 
                            batch.src_mask, batch.trg_mask)
#         print(out.shape)
#         print(batch.trg_y.shape)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

In [311]:
V = tgt_vocab
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
# model = make_model(V, V, N=2)
# model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
#         torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
model_opt = opt

for epoch in range(10):
    model.train()
    run_epoch(data_gen(V, 30, 20), model,
              SimpleLossCompute(model.l, criterion, model_opt))
    model.eval()
    print(run_epoch(data_gen(V, 30, 5), model, 
                    SimpleLossCompute(model.l, criterion, None)))

Epoch Step: 1 Loss: -0.024823 Tokens per Sec: 896.772705
Epoch Step: 1 Loss: -0.031543 Tokens per Sec: 1170.380615
tensor(-0.0222)
Epoch Step: 1 Loss: -0.016784 Tokens per Sec: 910.359436
Epoch Step: 1 Loss: -0.043053 Tokens per Sec: 1164.172974
tensor(-0.0434)
Epoch Step: 1 Loss: -0.037789 Tokens per Sec: 907.264526
Epoch Step: 1 Loss: -0.065972 Tokens per Sec: 1159.271973
tensor(-0.0640)
Epoch Step: 1 Loss: -0.068766 Tokens per Sec: 905.235352
Epoch Step: 1 Loss: -0.070120 Tokens per Sec: 1160.584229
tensor(-0.0784)
Epoch Step: 1 Loss: -0.088683 Tokens per Sec: 907.309570
Epoch Step: 1 Loss: -0.099105 Tokens per Sec: 1178.394165
tensor(-0.1016)
Epoch Step: 1 Loss: -0.081477 Tokens per Sec: 905.809509
Epoch Step: 1 Loss: -0.119696 Tokens per Sec: 1149.585205
tensor(-0.1238)
Epoch Step: 1 Loss: -0.104629 Tokens per Sec: 899.433655


KeyboardInterrupt: 

In [308]:
class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.zero_grad()
        return loss.data * norm

In [153]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

In [326]:
def run_epoch(data_iter, model, loss_compute):
    "Standard Training and Logging Function"
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, 
                            batch.src_mask, batch.trg_mask)
#         print(out.shape)
#         print(batch.trg_y.shape)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f" %
                    (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

In [314]:

class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [315]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."
    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(size_average=False)
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None
        
    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, Variable(true_dist, requires_grad=False))

In [316]:

class SimpleLossCompute:
    "A simple loss compute and train function."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(x.contiguous().view(-1, x.size(-1)), 
                              y.contiguous().view(-1)) / norm
        loss.backward()
        if self.opt is not None:
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return loss.data * norm

In [327]:

# Train the simple copy task.
V = 11
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
model_opt = NoamOpt(model.encoder.embed.d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

for epoch in range(10):
    model.train()
    run_epoch(data_gen(V, 30, 20), model, 
              SimpleLossCompute(model.generator, criterion, model_opt))
    model.eval()
    print(run_epoch(data_gen(V, 30, 5), model, 
                    SimpleLossCompute(model.generator, criterion, None)))

Epoch Step: 1 Loss: 0.268992 Tokens per Sec: 3215.440430
Epoch Step: 1 Loss: -2.715291 Tokens per Sec: 4280.929199
tensor(-2.7766)
Epoch Step: 1 Loss: -2.799144 Tokens per Sec: 3223.412354
Epoch Step: 1 Loss: -3.610931 Tokens per Sec: 4247.348633
tensor(-3.6410)
Epoch Step: 1 Loss: -3.715994 Tokens per Sec: 3205.256836
Epoch Step: 1 Loss: -4.782799 Tokens per Sec: 4287.460938
tensor(-4.8620)
Epoch Step: 1 Loss: -4.890470 Tokens per Sec: 3179.523682
Epoch Step: 1 Loss: -6.373354 Tokens per Sec: 4195.205078
tensor(-6.3913)
Epoch Step: 1 Loss: -6.482215 Tokens per Sec: 3153.686768
Epoch Step: 1 Loss: -8.150961 Tokens per Sec: 4268.544922
tensor(-8.2342)
Epoch Step: 1 Loss: -8.045689 Tokens per Sec: 3181.538086
Epoch Step: 1 Loss: -10.858765 Tokens per Sec: 4265.747559
tensor(-10.7475)
Epoch Step: 1 Loss: -10.481791 Tokens per Sec: 3180.872314
Epoch Step: 1 Loss: -14.508307 Tokens per Sec: 4234.595215
tensor(-14.4538)
Epoch Step: 1 Loss: -13.743816 Tokens per Sec: 3172.838135
Epoch Step: 1