In [8]:
import torch
import torch.nn as nn
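from torch.autograd import Variable  # deprecated since PyTorch 0.4: Tensors track gradients themselves and Variable(t) simply returns a Tensor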

class Embeddings(nn.Module):
    '''Token embedding lookup scaled by sqrt(d_model), followed by dropout.

    Args:
        vocab_size: number of rows in the embedding table
        d_model: width of each embedding vector
        dropout: dropout probability applied to the scaled embeddings
    '''
    def __init__(self, vocab_size, d_model, dropout=0.1):
        # Initialise nn.Module so that sub-modules and buffers get registered.
        super(Embeddings, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Registered as a buffer so the scale follows the module through
        # .to() / .cuda() / .double(); a plain tensor attribute would stay on
        # the CPU in float32. persistent=False keeps the state_dict layout
        # identical to a version that stored the scale as a plain attribute.
        self.register_buffer(
            'scale', torch.sqrt(torch.FloatTensor([d_model])), persistent=False
        )

    def forward(self, x):
        '''Look up the embeddings for the indices in ``x``, scale them, apply dropout.'''
        x = self.embedding(x) * self.scale
        return self.dropout(x)
    
class PositionalEncoding(nn.Module):
    '''Sinusoidal positional encoding for a transformer.

    A table of shape (1, max_len, d_model) is precomputed once: even feature
    columns hold sin(position * freq), odd columns hold cos(position * freq).
    forward() adds the first ``x.size(1)`` rows to the input and applies dropout.

    Args:
        d_model: the dimension of embedding
        dropout: dropout rate
        max_len: the max length of input
    '''
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)
        # Position encoding matrix, built in a local variable first:
        # register_buffer refuses a name that already exists as an attribute.
        pe = torch.zeros(max_len, d_model)
        # size (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # size (ceil(d_model / 2),): one frequency per even column index i,
        # equal to exp(-i * log(10000) / d_model)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * (-torch.log(torch.FloatTensor([10000.0])) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # A buffer moves with the module across devices/dtypes; it is kept
        # non-persistent so the state_dict gains no new key.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        '''Add the encodings of the first ``x.size(1)`` positions to ``x``, then apply dropout.'''
        # Buffers never require grad, so no Variable(..., requires_grad=False) wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


class Encoder(nn.Module):
    '''Stack of ``n_layers`` EncoderLayer modules applied one after another.

    Args:
        d_model: feature width passed to every layer
        d_ff: feed-forward width passed to every layer
        n_heads: number of attention heads passed to every layer
        n_layers: how many EncoderLayer instances to stack
        dropout: dropout rate passed to every layer
    '''
    def __init__(self, d_model, d_ff, n_heads, n_layers, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.n_heads = n_heads
        self.n_layers = n_layers
        # Stored on the module; forward() itself does not call it.
        self.dropout = nn.Dropout(dropout)
        stack = [
            EncoderLayer(d_model, d_ff, n_heads, dropout)
            for _ in range(n_layers)
        ]
        self.layer = nn.ModuleList(stack)

    def forward(self, x):
        '''Feed ``x`` through every stacked layer in order and return the result.'''
        hidden = x
        # The ModuleList holds exactly n_layers entries, so iterate it directly.
        for encoder_layer in self.layer:
            hidden = encoder_layer(hidden)
        return hidden

class EncoderLayer(nn.Module):
    '''One encoder block: self-attention then feed-forward, each wrapped in
    dropout, a residual connection and a LayerNorm applied after the sum.

    Args:
        d_model: feature width of the input and output
        d_ff: hidden width handed to FeedForward
        n_heads: number of heads handed to MultiHeadAttention
        dropout: dropout rate for the sub-layers and the residual branches
    '''
    def __init__(self, d_model, d_ff, n_heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.multi_head_attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

    def forward(self, x):
        '''Apply the attention sub-layer and then the feed-forward sub-layer to ``x``.'''
        # Self-attention: the same tensor serves as query, key and value.
        attended = self.multi_head_attention(x, x, x)
        x = self.layer_norm1(x + self.dropout(attended))
        transformed = self.feed_forward(x)
        return self.layer_norm2(x + self.dropout(transformed))
    
class MultiHeadAttention(nn.Module):
    '''Multi-head scaled dot-product attention (no masking).

    Inputs are batch-first tensors of shape (batch, length, d_model). The key
    and value must share a length; the query length may differ from it.

    Args:
        d_model: feature width of the inputs and of the output
        n_heads: number of attention heads; must divide d_model
        dropout: dropout rate applied to the attention weights

    Raises:
        ValueError: if d_model is not divisible by n_heads
    '''
    def __init__(self, d_model, n_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        if d_model % n_heads != 0:
            # Otherwise the per-head reshape in forward() cannot succeed.
            raise ValueError(
                'd_model (%d) must be divisible by n_heads (%d)' % (d_model, n_heads)
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout)
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear = nn.Linear(d_model, d_model)

    def _split_heads(self, t):
        '''Reshape (batch, length, d_model) into (batch, n_heads, length, d_head).'''
        d_head = self.d_model // self.n_heads
        return t.view(-1, t.size(1), self.n_heads, d_head).transpose(1, 2)

    def forward(self, q, k, v):
        '''Attend from ``q`` over ``k``/``v``; returns (batch, q_length, d_model).'''
        d_head = self.d_model // self.n_heads
        q = self._split_heads(self.linear_q(q))
        k = self._split_heads(self.linear_k(k))
        v = self._split_heads(self.linear_v(v))
        # A Python float scale works on any device/dtype, unlike a CPU tensor.
        q = q / (d_head ** 0.5)
        # torch.bmm only accepts 3-D tensors; matmul batches over (batch, heads).
        attention = torch.softmax(torch.matmul(q, k.transpose(2, 3)), dim=-1)
        attention = self.dropout(attention)
        x = torch.matmul(attention, v)  # (batch, n_heads, q_length, d_head)
        # Take the query length BEFORE transposing: dim 1 is still the head axis here.
        q_length = x.size(2)
        x = x.transpose(1, 2).contiguous().view(-1, q_length, self.d_model)
        x = self.linear(x)
        return x
class FeedForward(nn.Module):
    '''Two-layer MLP applied to the last dimension:
    linear (d_model -> d_ff), ReLU, dropout, linear (d_ff -> d_model).

    Args:
        d_model: input and output feature width
        d_ff: hidden feature width
        dropout: dropout rate applied after the ReLU
    '''
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.d_model, self.d_ff = d_model, d_ff
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        '''Expand, activate, regularise, then project back to d_model features.'''
        hidden = torch.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)
class Decoder(nn.Module):
    '''Stack of ``n_layers`` DecoderLayer modules; every layer receives the
    same ``encoder_output``.

    Args:
        d_model: feature width passed to every layer
        d_ff: feed-forward width passed to every layer
        n_heads: number of attention heads passed to every layer
        n_layers: how many DecoderLayer instances to stack
        dropout: dropout rate passed to every layer
    '''
    def __init__(self, d_model, d_ff, n_heads, n_layers, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.n_heads = n_heads
        self.n_layers = n_layers
        # Stored on the module; forward() itself does not call it.
        self.dropout = nn.Dropout(dropout)
        stack = [
            DecoderLayer(d_model, d_ff, n_heads, dropout)
            for _ in range(n_layers)
        ]
        self.layer = nn.ModuleList(stack)

    def forward(self, x, encoder_output):
        '''Feed ``x`` through every stacked layer in order, handing each one ``encoder_output``.'''
        hidden = x
        # The ModuleList holds exactly n_layers entries, so iterate it directly.
        for decoder_layer in self.layer:
            hidden = decoder_layer(hidden, encoder_output)
        return hidden

class DecoderLayer(nn.Module):
    '''One decoder block: self-attention, attention over the encoder output,
    then feed-forward. Each sub-layer is wrapped in dropout, a residual
    connection and a LayerNorm applied after the sum.

    Note: no mask is passed to either attention call.

    Args:
        d_model: feature width of the input and output
        d_ff: hidden width handed to FeedForward
        n_heads: number of heads handed to both MultiHeadAttention modules
        dropout: dropout rate for the sub-layers and the residual branches
    '''
    def __init__(self, d_model, d_ff, n_heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.d_ff = d_ff
        self.n_heads = n_heads
        self.dropout = nn.Dropout(dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.layer_norm3 = nn.LayerNorm(d_model)
        self.multi_head_attention1 = MultiHeadAttention(d_model, n_heads, dropout)
        self.multi_head_attention2 = MultiHeadAttention(d_model, n_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)

    def forward(self, x, encoder_output):
        '''Run the three sub-layers on ``x``, using ``encoder_output`` as key and value of the second.'''
        # 1) self-attention: query, key and value are all the decoder input
        self_attended = self.multi_head_attention1(x, x, x)
        x = self.layer_norm1(x + self.dropout(self_attended))
        # 2) cross-attention: queries from the decoder, keys/values from the encoder
        cross_attended = self.multi_head_attention2(x, encoder_output, encoder_output)
        x = self.layer_norm2(x + self.dropout(cross_attended))
        # 3) feed-forward
        transformed = self.feed_forward(x)
        return self.layer_norm3(x + self.dropout(transformed))

class Generator(nn.Module):
    '''Linear projection from d_model features to vocab_size scores.

    The raw linear output is returned; no softmax is applied here.

    Args:
        d_model: width of the incoming features
        vocab_size: number of output scores per position
    '''
    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        '''Project the last dimension of ``x`` from d_model to vocab_size.'''
        logits = self.linear(x)
        return logits

In [2]:
# code for a template transformer Machine Translation task
import math
import torch
# Tensor.size() reports the same torch.Size as the .shape attribute;
# arange(0, 10, 2) yields the five values 0, 2, 4, 6, 8.
print(torch.arange(start=0, end=10, step=2).size())


torch.Size([5])


In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

# plt.figure(figsize=(15, 5))
# pe = PositionalEncoding(20, 0)
# y = pe.forward(torch.zeros(1, 100, 20))
# plt.plot(np.arange(100), y[0,:,4:8])
# plt.legend(["dim %d"%p for p in [4,5,6,7]])
# Broadcasting demo: a (5, 1) column multiplied by a (1, 6) row gives a (5, 6) grid.
a = torch.randn(5, 1)
b = torch.randn(1, 6)
print(torch.mul(a, b))


tensor([[-0.4041, -0.4654,  0.1344,  0.1758,  0.1266,  0.4327],
        [ 0.5047,  0.5812, -0.1678, -0.2195, -0.1581, -0.5404],
        [ 1.2244,  1.4101, -0.4071, -0.5325, -0.3835, -1.3111],
        [-1.2890, -1.4846,  0.4286,  0.5607,  0.4037,  1.3803],
        [ 0.0981,  0.1130, -0.0326, -0.0427, -0.0307, -0.1050]])
