In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [69]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        x = self.src_embed(x)
        x = self.encoder(x)
        x = self.tgt_embed(x)
        x = self.decoder(x)
        return self.generator(x)
    
    def encode(self, src, src_mask):
        _src = self.src_embed(src)
        return self.encoder(_src, src_mask)
    
    def decode(self, memory, tgt, src_mask, tgt_mask):
        _tgt = self.tgt_embed(tgt)
        return self.decoder(memory, _tgt, src_mask, tgt_mask)

In [20]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super().__init__()
        self.linear = nn.Linear(d_model, vocab)
        
    def forward(self, x):
        return F.softmax(self.linear(x))

In [36]:
from copy import deepcopy

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([deepcopy(module) for _ in range(N)])

class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, size, eps=1e-6):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(size))
        self.bias = nn.Parameter(torch.zeros(size))
        self.eps = eps
        
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.scale * (x - mean) / (std + self.eps) + self.bias

In [73]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clone(layer, N)
        size = layer.size
        self.norm = LayerNorm(size)
        
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [74]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, drop_prob):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(drop_prob)
        
    def forward(self, x, layer):
        next_x = layer(self.norm(x))
        return x + self.dropout(next_x)

In [76]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, drop_prob):
        super().__init__()
        self.size = size
        self.self_attn = self.self_attn
        self.feed_forward = feed_forward
        self.norm = LayerNorm(size)
        self.sublayers = clones(SublayerConnection(size, drop_prob), 2)
        
    def feed_forward(self, x, mask):
        x = self.sublayers[0](x, lambda x: self.self_attn(x, x, x, mask))
        x = self.sublayers[1](x, self.feed_forward)
        return self.norm(x)

In [77]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super().__init__()

In [78]:
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, drop_prob):
        self.size = size
        self.self_attn = self.self_attn
        self.src_attn  = src_attn
        self.feed_forward = feed_forward
        self.sublayers = clones(SublayerConnection(size, drop_rate), 3)
    
    def feed_forward(self, tgt, memory, src_mask, tgt_mask):
        m = memory
        x = tgt
        x = self.sublayers[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayers[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        x = self.sublayers[2](x, self.feed_forward)
        return x

In [11]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    pass

In [80]:
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    # qurey.size == (n_batch, input_length, key_dim)
    # key.size == (n_batch, memory_length, key_dim)
    # value.size == (n_batch, memory_length, value_dim)
    q_k = torch.matmul(query, key.transpose(-1, -2))
    key_dim = query.size()[-1]
    scores = q_k / np.sqrt(key_dim)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    # attention.size == (n_batch, input_length, memory_length)
    attention = F.softmax(scores)
    if dropout is not None:
        attention = dropout(attention)
    # output.size == (n_batch, input_length, value_dim)
    output = torch.matmul(attention, value)
    return output, attention

In [81]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, drop_prob=0.1):
        self.h = h
        self.d_model = d_model
        self.h_dim = d_model // h
        self.q_linear = clones(nn.Linear(d_model, h_dim, bias=False), h)
        self.k_linear = clones(nn.Linear(d_model, h_dim, bias=False), h)
        self.v_linear = clones(nn.Linear(d_model, h_dim, bias=False), h)
        self.o_linear = nn.Linear(h * self.h_dim, d_model, bias=False)
        self.dropout = nn.Dropout(drop_prob)
        
    def forward(self, query, key, value, mask):
        outputs = list()
        for i in range(self.h):
            proj_q = self.q_linear(query)
            proj_k = self.k_linear(key)
            proj_v = self.v_linear(value)
            output, attn = attention(proj_q, proj_k, proj_v,
                                     mask=mask, dropout=self.dropout)
            outputs.append(output)
        output = torch.cat(outputs, -1)
        return self.o_linear(output)

In [14]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        pass

In [15]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        pass

In [16]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        pass

In [40]:
layer = nn.Linear(30, 10)

In [67]:
help(torch.nn.coc)

AttributeError: module 'torch' has no attribute 'conat'

In [60]:
x = torch.zeros((3, 4, 5))
y = torch.ones((3, 5, 4))
torch.matmul(y, x).size()

torch.Size([3, 5, 5])

In [55]:
x.transpose(1, 2).size()

torch.Size([3, 5, 4])

In [56]:
x.transpose(2, 1).size()

torch.Size([3, 5, 4])

In [57]:
help(torch.multiply)

AttributeError: module 'torch' has no attribute 'multiply'

In [68]:
help(nn.Linear)

Help on class Linear in module torch.nn.modules.linear:

class Linear(torch.nn.modules.module.Module)
 |  Applies a linear transformation to the incoming data: :math:`y = Ax + b`
 |  
 |  Args:
 |      in_features: size of each input sample
 |      out_features: size of each output sample
 |      bias: If set to False, the layer will not learn an additive bias.
 |          Default: ``True``
 |  
 |  Shape:
 |      - Input: :math:`(N, *, in\_features)` where :math:`*` means any number of
 |        additional dimensions
 |      - Output: :math:`(N, *, out\_features)` where all but the last dimension
 |        are the same shape as the input.
 |  
 |  Attributes:
 |      weight: the learnable weights of the module of shape
 |          `(out_features x in_features)`
 |      bias:   the learnable bias of the module of shape `(out_features)`
 |  
 |  Examples::
 |  
 |      >>> m = nn.Linear(20, 30)
 |      >>> input = torch.randn(128, 20)
 |      >>> output = m(input)
 |      >>> print(outp

In [63]:
mlist  = nn.ModuleList([nn.Linear(10, 10)]*10)

In [66]:
mlist[8]

Linear(in_features=10, out_features=10, bias=True)

In [72]:
x = np.ones((5, 5))
torch.from_numpy(np.triu(x, k=1)) == 0

tensor([[ 1,  0,  0,  0,  0],
        [ 1,  1,  0,  0,  0],
        [ 1,  1,  1,  0,  0],
        [ 1,  1,  1,  1,  0],
        [ 1,  1,  1,  1,  1]], dtype=torch.uint8)