Encoder Decoder stacks

In [2]:
# Smoke-test cell: writes a greeting to stdout.
print("Hello world")

Hello world


In [14]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
from torch.utils.data import DataLoader
import torchtext.datasets as datasets

In [15]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy so every replica owns its own parameters
        replicas.append(copy.deepcopy(module))
    return replicas

In [16]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learned scale and shift.

    ``a_2`` (initialised to ones) scales and ``b_2`` (initialised to zeros)
    shifts the normalized values; ``eps`` is added to the standard deviation
    before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [17]:
# Instantiate the LayerNorm module defined above with 512 features.
ln = LayerNorm(512)

In [None]:
class SublayerConnection(nn.Module):
    """Residual connection around a sublayer, with layer norm applied first.

    Computes ``x + dropout(sublayer(norm(x)))`` (pre-norm): the input is
    normalized before it is handed to the sublayer, and the un-normalized
    input is added back afterwards.

    Args:
        size: Feature size passed to ``LayerNorm``.
        dropout: Drop probability passed to ``nn.Dropout``.
    """

    def __init__(self, size, dropout):
        # nn.Module must be initialised before any submodule is assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the residual connection to ``sublayer``.

        Args:
            x: Input tensor.
            sublayer: Callable that receives the normalized input; its output
                is added to ``x``, so the two must be addable.

        Returns:
            ``x`` plus the dropped-out sublayer output.
        """
        return x + self.dropout(sublayer(self.norm(x)))  # pre-norm

In [24]:
class EncoderLayer(nn.Module):
    """Encoder block: self-attention followed by a feed-forward sublayer.

    Each of the two stages is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        residual = SublayerConnection(size, dropout)
        self.sublayer = clones(residual, 2)

    def forward(self, x, mask):
        """Run self-attention (with ``mask``) and then the feed-forward stage."""

        def attend(h):
            # query, key and value are all the same tensor
            return self.self_attn(h, h, h, mask)

        stages = (attend, self.feed_forward)
        for idx, stage in enumerate(stages):
            x = self.sublayer[idx](x, stage)
        return x

In [None]:
class DecoderLayer(nn.Module):
    """Decoder block: self-attention, source attention, then feed-forward.

    Each of the three stages is wrapped in its own ``SublayerConnection``.
    """

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        residual = SublayerConnection(size, dropout)
        self.sublayer = clones(residual, 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the three stages in order and return the result.

        ``tgt_mask`` is passed to the self-attention call and ``src_mask`` to
        the attention over ``memory``.
        """

        def attend_self(h):
            return self.self_attn(h, h, h, tgt_mask)

        def attend_memory(h):
            # memory supplies both keys and values
            return self.src_attn(h, memory, memory, src_mask)

        stages = (attend_self, attend_memory, self.feed_forward)
        for idx, stage in enumerate(stages):
            x = self.sublayer[idx](x, stage)
        return x

In [None]:
class Encoder(nn.Module):
    """Core encoder: a stack of N cloned layers followed by a final layer norm.

    Args:
        layer: Prototype layer. It is deep-copied N times, must expose a
            ``size`` attribute, and is called as ``layer(x, mask)``.
        N: Number of copies in the stack.
    """

    def __init__(self, layer, N):
        # nn.Module must be initialised before submodules are assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (with ``mask``) through every layer in turn, then normalize."""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [None]:
class Decoder(nn.Module):
    """Decoder: a stack of N cloned layers followed by a final layer norm.

    Args:
        layer: Prototype layer. It is deep-copied N times, must expose a
            ``size`` attribute, and is called as
            ``layer(x, memory, src_mask, tgt_mask)``.
        N: Number of copies in the stack.
    """

    def __init__(self, layer, N):
        # nn.Module must be initialised before submodules are assigned;
        # without this call the assignments below raise AttributeError.
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer in turn, then normalize.

        ``memory``, ``src_mask`` and ``tgt_mask`` are forwarded unchanged to
        each layer.
        """
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

Attention and Mask

In [1]:
def subsequent_mask(size):
    """Build a mask that hides subsequent (future) positions.

    Args:
        size: Length of the sequence the mask covers.

    Returns:
        Boolean tensor of shape ``(1, size, size)``. Entry ``[0, i, j]`` is
        ``True`` when ``j <= i`` and ``False`` when ``j > i``.
    """
    attn_shape = (1, size, size)
    # Ones strictly above the main diagonal mark the future positions.
    future = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    # Invert: everything that is not a future position is allowed.
    return future == 0