In [27]:
import torch
import torch.nn as nn
import math


class NaiveEncoderLayer(nn.Module):
    """Minimal single-head Transformer encoder layer (post-LayerNorm).

    The layer is two residual sub-blocks applied in sequence:

    1. scaled dot-product self-attention, then ``LayerNorm(x + attn(x))``
    2. a position-wise feed-forward network ``D -> 4D -> D`` with GELU,
       then ``LayerNorm(x + ffn(x))``

    Deliberately kept naive: there is no dropout, no multi-head split, no
    attention mask and no output projection after the attention.

    :param hidden_dim: size ``D`` of the last (embedding) axis of the input
    """

    def __init__(self, hidden_dim: int):
        super().__init__()
        self.dim = hidden_dim

        # Attention sub-block: bias-free query / key / value projections.
        self.Wq = nn.Linear(self.dim, self.dim, bias=False)
        self.Wk = nn.Linear(self.dim, self.dim, bias=False)
        self.Wv = nn.Linear(self.dim, self.dim, bias=False)
        self.layerNorm_SA = nn.LayerNorm(self.dim)

        # Feed-forward sub-block: expand to 4*D, apply GELU, project back to D.
        self.ffn1 = nn.Linear(self.dim, self.dim * 4)
        self.ffn2 = nn.Linear(self.dim * 4, self.dim)
        self.act = nn.GELU()
        self.layerNorm_ffn = nn.LayerNorm(self.dim)

    def SelfAttention(self, x: torch.Tensor) -> torch.Tensor:
        """Single-head self-attention followed by residual add and LayerNorm.

        :param x: tensor of shape (..., L, D); typically (N, L, D)
        :return: tensor with the same shape as ``x``
        """
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)

        # Transpose the last two axes (instead of hard-coding axes 1 and 2) so
        # unbatched (L, D) inputs and extra leading batch axes work as well.
        attention_score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.dim)
        # Normalise over the key axis. The functional form avoids constructing
        # a new nn.Softmax module on every forward pass.
        attention_score = torch.softmax(attention_score, dim=-1)
        O = torch.matmul(attention_score, V)
        # Residual connection, then normalisation.
        return self.layerNorm_SA(x + O)

    def FFN(self, x: torch.Tensor) -> torch.Tensor:
        """Position-wise feed-forward network with residual add and LayerNorm.

        :param x: tensor of shape (..., L, D)
        :return: tensor with the same shape as ``x``
        """
        hidden = self.act(self.ffn1(x))
        projected = self.ffn2(hidden)
        return self.layerNorm_ffn(x + projected)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the attention sub-block and then the feed-forward sub-block.

        :param x: shape (N, L, D) where N is the batch size, L is the length
            of the sequence and D is the dimension of the word embeddings;
            any number of leading batch axes (including none) is accepted
        :return: tensor with the same shape as ``x``
        """
        x = self.SelfAttention(x)
        x = self.FFN(x)

        return x

# NaiveEncoderLayer 的缺点
1. 没有 dropout
2. 没有 multi-head attention(只有单头注意力)
3. 没有 attention mask(无法屏蔽任何位置,例如 padding)

In [28]:
import numpy as np
# Demo batch: 10 sequences x 50 positions x 200 features of standard-normal noise.
# torch.Tensor(...) copies the NumPy samples into a tensor of torch's default
# tensor type (float32 unless the default has been changed).
X = torch.Tensor(np.random.randn(10, 50, 200))
X

tensor([[[-1.0401, -0.6295, -0.3225,  ..., -0.8235,  1.5486,  0.7692],
         [-0.1269,  1.3418, -0.3858,  ...,  0.6867, -0.3089,  0.0166],
         [-0.2949,  1.0402,  0.1966,  ...,  0.7793, -1.5150,  0.3857],
         ...,
         [-0.6879, -1.0686,  1.2191,  ...,  0.0429, -0.9116, -1.0797],
         [ 0.8545, -0.9302,  0.1264,  ...,  0.0619, -2.1369, -0.5130],
         [ 0.0893, -1.1508,  0.3735,  ...,  2.2304,  0.2492, -1.0066]],

        [[-0.0602,  0.8188,  0.8021,  ..., -0.8600,  0.2093,  1.8612],
         [ 1.8572,  0.9054, -1.1331,  ..., -0.9275, -1.4083,  2.2626],
         [ 0.1644,  0.5029,  1.3739,  ..., -0.4150,  0.0186, -0.1931],
         ...,
         [-0.4816, -0.1627,  0.5056,  ...,  0.7688,  0.4624, -1.2395],
         [-0.8423, -1.4215,  0.3208,  ..., -0.2339,  1.0953, -1.2284],
         [ 2.1312, -0.8551,  0.0459,  ..., -0.6376,  2.3748, -0.0283]],

        [[ 0.9800,  0.1088,  0.3221,  ..., -0.3778,  0.5116,  0.0306],
         [-2.0665, -0.2299,  0.0962,  ..., -0

In [29]:
# hidden_dim must equal the size of X's last axis (200).
naive_encoder = NaiveEncoderLayer(hidden_dim=200)

In [30]:
# One forward pass through the encoder layer.
output = naive_encoder(X)

# Show the output shape together with the values.
(output.shape, output)

(torch.Size([10, 50, 200]),
 tensor([[[-9.1078e-01, -9.4485e-01, -1.6519e-01,  ..., -8.1986e-01,
            1.3281e+00,  7.8501e-01],
          [-7.0449e-02,  9.2821e-01, -4.7323e-01,  ...,  8.5309e-01,
           -3.1893e-01, -1.6464e-01],
          [-4.4850e-02,  1.2840e+00,  2.8053e-02,  ...,  6.2792e-01,
           -1.1707e+00,  4.1003e-01],
          ...,
          [-7.9872e-01, -6.7767e-01,  1.1246e+00,  ..., -3.9275e-01,
           -6.8273e-01, -1.1737e+00],
          [ 1.0243e+00, -7.3214e-01, -2.5325e-02,  ...,  5.3646e-02,
           -2.0490e+00, -5.1975e-01],
          [ 4.9983e-02, -1.4734e+00,  4.7782e-01,  ...,  2.1851e+00,
            3.6147e-01, -1.0625e+00]],
 
         [[ 1.1065e-01,  7.1265e-01,  8.9405e-01,  ..., -7.4141e-01,
            2.1874e-01,  1.7257e+00],
          [ 1.5766e+00,  7.6594e-01, -1.3727e+00,  ..., -8.1249e-01,
           -1.2478e+00,  2.0054e+00],
          [ 1.5012e-01,  6.9972e-02,  1.3647e+00,  ..., -6.2881e-01,
            2.2019e-01,  2.20