In [47]:
import gc
import os 
import math
import re
import random

In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_

In [11]:
import torchtext

In [24]:
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [20]:
class ResidualSublayer(nn.Module):
    '''
    Residual connection around a sublayer, followed by layer normalization.

    Args:
        d_model: size of the last dimension, passed to nn.LayerNorm
        sublayer: callable wrapped by the residual connection
        dropout: dropout probability applied to the sublayer output
    '''
    def __init__(self, d_model, sublayer, dropout=0.1):
        super(ResidualSublayer, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.sublayer = sublayer

    def forward(self, x, *args, **kwargs):
        '''
        LayerNorm(x + Dropout(Sublayer(x, *args, **kwargs)))

        > We apply dropout to the output of each sub-layer, before it is
        > added to the sub-layer input and normalized.

        `x` is both the first argument of the sublayer and the residual branch.
        Any extra arguments (e.g. key, value and mask for an attention
        sublayer) are passed through to the sublayer unchanged, so a plain
        single-input sublayer can still be called as `forward(x)`.
        '''
        out = self.sublayer(x, *args, **kwargs)
        return self.layer_norm(x + self.dropout(out))

In [13]:
class PositionWiseFeedForward(nn.Module):
    '''
    Two linear layers with a ReLU in between: d_model -> d_ff -> d_model.
    '''
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        '''
        FFN(x) = linear2(relu(linear1(x)))
        '''
        hidden = self.linear1(x)
        # The ReLU is in-place: it overwrites `hidden`, never the input `x`.
        hidden = self.relu(hidden)
        return self.linear2(hidden)

In [14]:
class ScaledDotProductAttention(nn.Module):
    '''
    Attention(Q, K, V) = softmax(QK^T/sqrt(d_k))V

    Args:
        d_k: value whose square root divides the raw attention scores
    '''
    def __init__(self, d_k):
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k

    def forward(self, q, k, v, mask=None):
        '''
        Input shape: q [B*Tq*H], k and v [B*Tk*H]

        mask: optional tensor passed to `masked_fill`; positions where it is
        set are filled with -inf before the softmax, so they receive zero
        attention weight.

        Returns (output [B*Tq*H], attention weights [B*Tq*Tk]).
        '''

        # [B*Tq*H] @ [B*H*Tk] -> [B*Tq*Tk]
        attn = torch.bmm(q, k.transpose(1, 2))

        # / sqrt(d_k) -- d_k is a plain Python number, so use math.sqrt
        # (torch.sqrt only accepts tensors).
        attn = attn / math.sqrt(self.d_k)

        # `if mask:` would raise for any mask with more than one element.
        if mask is not None:
            attn = attn.masked_fill(mask, float('-inf'))

        # softmax over the key dimension
        attn = F.softmax(attn, dim=2)

        # [B*Tq*Tk] @ [B*Tk*H] -> [B*Tq*H]
        return torch.bmm(attn, v), attn

In [21]:
class MultiHeadAttention(nn.Module):
    '''
    MultiHead(Q, K, V) = Concat(head1,...,headn)W^O

    where head = Attention(QW_1, KW_2, VW_3)

    Args:
        h: number of attention heads
        d_model: size of the last dimension of the inputs and the output
    '''
    def __init__(self, h=8, d_model=512):
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_model // h # 64 with the defaults
        self.h = h

        self.q_linear = nn.Linear(d_model, h * self.d_k)
        self.k_linear = nn.Linear(d_model, h * self.d_k)
        self.v_linear = nn.Linear(d_model, h * self.d_k)
        # The concatenated heads are h * d_k wide, which equals d_model only
        # when h divides d_model; size the projection from what it receives.
        self.linear = nn.Linear(h * self.d_k, d_model)

        self.attn = ScaledDotProductAttention(d_k=self.d_k)

    def _split_heads(self, x):
        '''[B*T*(h x d_k)] -> [B*T*h*d_k] -> [(B x h)*T*d_k]'''
        n_batch, n_len, _ = x.size()
        x = x.view(n_batch, n_len, self.h, self.d_k)
        return x.permute(0, 2, 1, 3).contiguous().view(-1, n_len, self.d_k)

    def forward(self, q, k, v, mask=None):
        '''
        Input shape: q [B*Tq*d_model], k and v [B*Tk*d_model]

        Each input is reshaped with its own length, so the query length may
        differ from the key/value length.

        mask: optional 3-d tensor whose first dimension is B or 1. It is
        repeated for every head and handed to the attention module.
        NOTE(review): expected layout is [B*Tq*Tk] (or [B*1*Tk]) -- confirm
        against the code that builds the masks.

        Returns a tensor of shape [B*Tq*d_model].
        '''
        n_batch, q_len, _ = q.size()

        # [B*T*d_model] -> [B*T*(h x d_k)] -> [(B x h)*T*d_k]
        q = self._split_heads(self.q_linear(q))
        k = self._split_heads(self.k_linear(k))
        v = self._split_heads(self.v_linear(v))

        if mask is not None:
            # Heads are folded into the batch dimension as (b * h + head), so
            # each batch entry of the mask is repeated h times in a row.
            # [B*Tq*Tk] -> [B*h*Tq*Tk] -> [(B x h)*Tq*Tk]
            mask = mask.unsqueeze(1).expand(n_batch, self.h, -1, -1)
            mask = mask.contiguous().view(-1, mask.size(2), mask.size(3))

        # [(B x h)*Tq*d_k]; the attention weights are not used here
        x, _ = self.attn(q, k, v, mask)

        # [(B x h)*Tq*d_k] -> [B*h*Tq*d_k]
        x = x.view(n_batch, self.h, q_len, self.d_k)

        # [B*h*Tq*d_k] -> [B*Tq*h*d_k] -> [B*Tq*(h x d_k)]
        x = x.permute(0, 2, 1, 3).contiguous().view(n_batch, q_len, -1)

        return self.linear(x)

In [22]:
class EncoderLayer(nn.Module):
    '''
    One encoder layer: self-attention followed by a position-wise
    feed-forward network, each wrapped in a ResidualSublayer.

    Args:
        d_model: model width shared by both sublayers
        h: number of attention heads
        d_ff: hidden width of the feed-forward network
    '''
    def __init__(self, d_model=512, h=8, d_ff=2048):
        super(EncoderLayer, self).__init__()
        attn = MultiHeadAttention(h, d_model)
        ffn = PositionWiseFeedForward(d_model, d_ff)
        self.sublayer1 = ResidualSublayer(d_model, attn)
        self.sublayer2 = ResidualSublayer(d_model, ffn)

    def forward(self, x, mask=None):
        '''
        x is used as query, key and value of the attention sublayer.

        mask is optional (it is optional in MultiHeadAttention as well) and
        is handed to the attention sublayer as its fourth argument.

        NOTE(review): this call relies on ResidualSublayer.forward passing
        its extra positional arguments through to the wrapped sublayer --
        confirm.
        '''
        x = self.sublayer1(x, x, x, mask)
        return self.sublayer2(x)

In [23]:
class DecoderLayer(nn.Module):
    '''
    One decoder layer: masked self-attention, attention over the encoder
    memory, and a position-wise feed-forward network, each wrapped in a
    ResidualSublayer.

    Args:
        d_model: model width shared by all sublayers
        h: number of attention heads
        d_ff: hidden width of the feed-forward network
    '''
    def __init__(self, d_model=512, h=8, d_ff=2048):
        super(DecoderLayer, self).__init__()
        masked_attn = MultiHeadAttention(h, d_model)
        attn = MultiHeadAttention(h, d_model)
        ffn = PositionWiseFeedForward(d_model, d_ff)

        self.sublayer1 = ResidualSublayer(d_model, masked_attn)
        self.sublayer2 = ResidualSublayer(d_model, attn)
        self.sublayer3 = ResidualSublayer(d_model, ffn)

    def forward(self, x, memory, src_mask=None, trg_mask=None):
        '''
        sublayer1: x attends to itself, with trg_mask.
        sublayer2: x is the query, memory is key and value, with src_mask.
        sublayer3: feed-forward network.

        Both masks are optional, matching MultiHeadAttention.

        NOTE(review): these calls rely on ResidualSublayer.forward passing
        its extra positional arguments through to the wrapped sublayer --
        confirm.
        '''
        x = self.sublayer1(x, x, x, trg_mask)
        x = self.sublayer2(x, memory, memory, src_mask)
        return self.sublayer3(x)

In [25]:
class Embedding(nn.Module):
    '''
    Lookup table of n_vocab vectors of size d_model, with the looked-up
    vectors multiplied by sqrt(d_model).
    '''
    def __init__(self, d_model, n_vocab):
        super(Embedding, self).__init__()
        self.embed = nn.Embedding(n_vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        '''
        x: tensor of indices into the embedding table.

        Returns embed(x) * sqrt(d_model).
        '''
        # d_model is a Python int; torch.sqrt only accepts tensors.
        return self.embed(x) * math.sqrt(self.d_model)

In [71]:
def positional_encoding(max_len, d_model, padding_idx=None):
    '''
    Build a sinusoidal positional-encoding table.

    For position p and frequency index i (exponent 2i / d_model):
        pe[p, 2i]     = sin(p / 10000 ** (2i / d_model))
        pe[p, 2i + 1] = cos(p / 10000 ** (2i / d_model))

    Args:
        max_len: number of positions (rows)
        d_model: encoding width (columns); may be odd
        padding_idx: accepted but currently unused

    Returns:
        float tensor of shape [max_len, d_model]
    '''
    position = torch.arange(0, max_len).type(torch.float32).unsqueeze(1)
    div_term = 1 / (10000 ** (torch.arange(0., d_model, 2) / d_model))

    # [max_len, ceil(d_model / 2)]
    angles = position * div_term

    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns,
    # so the last frequency has no cosine slot.
    pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
    return pe

In [None]:
class Encoder(nn.Module):
    '''
    Stack of n_layers EncoderLayer modules applied in sequence.

    Args:
        n_layers: number of encoder layers
        d_model, h, d_ff: forwarded to every EncoderLayer

    TODO: token embedding and positional encoding are not applied here;
    `forward` expects an already-embedded input.
    '''
    def __init__(self, n_layers, d_model=512, h=8, d_ff=2048):
        # Must run before any submodule is assigned, otherwise nn.Module
        # rejects the assignment of `self.layers`.
        super(Encoder, self).__init__()

        self.layers = nn.ModuleList([
            EncoderLayer(d_model=d_model, h=h, d_ff=d_ff)
            for _ in range(n_layers) ])

    def forward(self, x, mask=None):
        '''
        Pass x through every layer in order, giving each layer the same mask.

        Returns the output of the last layer (x itself when n_layers is 0).
        '''
        for layer in self.layers:
            x = layer(x, mask)
        return x

In [26]:
class Decoder(nn.Module):
    '''
    Stack of n_layers DecoderLayer modules applied in sequence.

    Args:
        n_layers: number of decoder layers
        d_model, h, d_ff: forwarded to every DecoderLayer

    TODO: token embedding, positional encoding and the output projection
    are not applied here; `forward` expects an already-embedded input.
    '''
    def __init__(self, n_layers, d_model=512, h=8, d_ff=2048):
        super(Decoder, self).__init__()

        self.layers = nn.ModuleList([
            DecoderLayer(d_model=d_model, h=h, d_ff=d_ff)
            for _ in range(n_layers) ])

    def forward(self, x, memory=None, src_mask=None, trg_mask=None):
        '''
        Pass x through every layer in order; each layer receives the same
        memory, src_mask and trg_mask.

        memory has a default only to keep the one-argument call signature
        valid; every decoder layer needs it, so None is rejected.

        Raises:
            ValueError: if memory is None
        '''
        if memory is None:
            raise ValueError('Decoder.forward requires `memory`')

        for layer in self.layers:
            x = layer(x, memory, src_mask, trg_mask)
        return x

In [None]:
class Transformer(nn.Module):
    '''
    Placeholder for the full model.

    TODO: no submodules are built yet and `forward` does nothing.
    '''
    def __init__(self):
        # Without this call nn.Module's internal state is never created and
        # the instance cannot be called, inspected or given submodules.
        super(Transformer, self).__init__()

    def forward(self):
        '''Not implemented yet; returns None.'''
        pass