In [29]:
import gc
import os 
import re
import random

In [30]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_

In [31]:
import torchtext

In [32]:
class ResidualSublayer(nn.Module):
    '''
    Residual connection around an arbitrary sub-layer, followed by layer
    normalisation: LayerNorm(x + Dropout(Sublayer(x))).
    '''

    def __init__(self, d_model, dropout=0.1):
        '''
        d_model: size of the last (feature) dimension that gets normalised.
        dropout: drop probability applied to the sub-layer output.
        '''
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.layer_norm = nn.LayerNorm(normalized_shape=d_model)

    def forward(self, x, sublayer):
        '''
        x:        input tensor whose last dimension is d_model.
        sublayer: callable mapping x to a tensor that can be added to x.

        Dropout is applied to the sub-layer output before it is added back
        to the input; the sum is then normalised.
        '''
        branch = sublayer(x)
        branch = self.dropout(branch)
        residual_sum = x + branch
        return self.layer_norm(residual_sum)

In [33]:
class PositionWiseFeedForward(nn.Module):
    '''
    Two-layer feed-forward network applied to the last dimension of its input:
    Linear(d_model -> d_ff), ReLU, Linear(d_ff -> d_model).
    '''

    def __init__(self, d_model, d_ff):
        '''
        d_model: input and output feature size.
        d_ff:    hidden feature size between the two linear layers.
        '''
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        # In-place ReLU overwrites the freshly computed hidden activations,
        # never the caller's input tensor.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        '''
        Expand to d_ff, apply ReLU, project back to d_model.
        '''
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        return self.linear2(hidden)

In [34]:
class ScaledDotProductAttention(nn.Module):
    '''
    Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V
    '''

    def __init__(self, d_k):
        '''
        d_k: per-head key/query feature size, used for the 1/sqrt(d_k) scaling.
        '''
        super(ScaledDotProductAttention, self).__init__()
        self.d_k = d_k

    def forward(self, q, k, v, mask=None):
        '''
        Attention(Q, K, V) = softmax(QK^T/sqrt(d_k))V

        q:    [..., T_q, d_k]
        k:    [..., T_k, d_k]
        v:    [..., T_k, d_v]
        mask: optional boolean tensor broadcastable to [..., T_q, T_k];
              True marks positions that must NOT be attended to.

        Returns a tensor of shape [..., T_q, d_v].

        Note: a query row whose keys are all masked yields NaN, because
        softmax over a row of -inf is undefined.
        '''
        # [..., T_q, d_k] @ [..., d_k, T_k] -> [..., T_q, T_k]
        # matmul equals bmm on 3-D inputs and also accepts extra leading dims.
        scores = torch.matmul(q, k.transpose(-2, -1))

        # Scale by sqrt(d_k). `** 0.5` works for a plain Python number
        # (torch.sqrt does not accept one) as well as for a tensor.
        scores = scores / (self.d_k ** 0.5)

        # `if mask:` would raise on a multi-element tensor, so test for None.
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))

        # Normalise over the key axis.
        weights = F.softmax(scores, dim=-1)

        # [..., T_q, T_k] @ [..., T_k, d_v] -> [..., T_q, d_v]
        return torch.matmul(weights, v)

In [23]:
class MultiHeadAttention(nn.Module):
    '''
    MultiHead(Q, K, V) = Concat(head_1, ..., head_h) W^O
    where head_i = Attention(Q W_i^Q, K W_i^K, V W_i^V)
    '''

    def __init__(self, h=8, d_model=512):
        '''
        h:       number of attention heads.
        d_model: model (input/output) feature size; each head works on
                 d_k = d_model // h features.
        '''
        super(MultiHeadAttention, self).__init__()
        self.d_k = d_model // h # 64
        self.h = h

        self.q_linear = nn.Linear(d_model, h * self.d_k)
        self.k_linear = nn.Linear(d_model, h * self.d_k)
        self.v_linear = nn.Linear(d_model, h * self.d_k)
        # Output projection consumes the concatenated heads (h * d_k features);
        # this equals d_model whenever d_model is divisible by h.
        self.linear = nn.Linear(h * self.d_k, d_model)

        self.attn = ScaledDotProductAttention(d_k=self.d_k)

    def _split_heads(self, x):
        '''
        [B, T, h*d_k] -> [B*h, T, d_k]; row b*h + i holds head i of batch item b.
        '''
        n_batch, n_len, _ = x.size()
        x = x.view(n_batch, n_len, self.h, self.d_k)
        return x.transpose(1, 2).reshape(n_batch * self.h, n_len, self.d_k)

    def forward(self, q, k, v, mask=None):
        '''
        MultiHead(Q, K, V) = Concat(head1,...,headn)W^O

        where head = Attention(QW_1, KW_2, VW_3)

        q:    [B, T_q, d_model]
        k, v: [B, T_k, d_model] (T_k may differ from T_q)
        mask: optional boolean tensor, True = position is hidden. Accepted
              shapes are [T_q, T_k] or anything expandable to [B, T_q, T_k]
              (e.g. [B, 1, T_k] for key padding).

        Returns a tensor of shape [B, T_q, d_model].
        '''
        n_batch, q_len, _ = q.size()
        k_len = k.size(1)

        # Project with the dedicated layer for each role, then fold the head
        # axis into the batch axis so the attention module sees 3-D tensors.
        q = self._split_heads(self.q_linear(q))
        k = self._split_heads(self.k_linear(k))
        v = self._split_heads(self.v_linear(v))

        if mask is not None:
            if mask.dim() == 2:
                mask = mask.unsqueeze(0)
            # Replicate the mask for every head, using the same b*h + i row
            # layout as _split_heads: [B, T_q, T_k] -> [B*h, T_q, T_k].
            mask = mask.unsqueeze(1).expand(n_batch, self.h, q_len, k_len)
            mask = mask.reshape(n_batch * self.h, q_len, k_len)

        # [B*h, T_q, d_k]
        heads = self.attn(q, k, v, mask=mask)

        # Concat heads: [B*h, T_q, d_k] -> [B, T_q, h*d_k]
        heads = heads.reshape(n_batch, self.h, q_len, self.d_k)
        heads = heads.transpose(1, 2).reshape(n_batch, q_len, self.h * self.d_k)

        return self.linear(heads)

In [28]:
# Scratch check: view() reinterprets the same 9 values as a 3x3 matrix,
# filling rows in order, without changing the data.
x = torch.randn(9)
print(x, x.view(3, 3), sep="\n")

tensor([-1.3629, -1.0559, -2.1749, -0.1680,  0.6310,  1.1448, -0.1127,  0.0061,
        -0.8717])
tensor([[-1.3629, -1.0559, -2.1749],
        [-0.1680,  0.6310,  1.1448],
        [-0.1127,  0.0061, -0.8717]])
